Issues (10)

src/Service/Seeker.php (2 issues)

1
<?php
2
3
declare(strict_types=1);
4
5
namespace ReliqArts\Scavenger\Service;
6
7
use Exception;
8
use Goutte\Client as GoutteClient;
9
use Illuminate\Console\Command;
10
use InvalidArgumentException;
11
use Psr\Log\LoggerInterface;
12
use ReliqArts\Scavenger\Concern\Timed;
13
use ReliqArts\Scavenger\Contract\ConfigProvider as ConfigProviderContract;
14
use ReliqArts\Scavenger\Contract\Seeker as SeekerInterface;
15
use ReliqArts\Scavenger\Exception\InvalidTargetDefinition;
16
use ReliqArts\Scavenger\Factory\TargetBuilder;
17
use ReliqArts\Scavenger\Helper\FormattedMessage;
18
use ReliqArts\Scavenger\Helper\NodeProximityAssistant;
19
use ReliqArts\Scavenger\Helper\TargetKey;
20
use ReliqArts\Scavenger\Model\Target;
21
use ReliqArts\Scavenger\OptionSet;
22
use ReliqArts\Scavenger\Result;
23
use ReliqArts\Scavenger\TitleLink;
24
use Symfony\Component\DomCrawler\Crawler;
25
use Symfony\Component\DomCrawler\Form;
26
27
final class Seeker extends Communicator implements SeekerInterface
28
{
29
    use Timed;
30
31
    private const INITIAL_PAGE = 1;
32
33
    protected OptionSet $optionSet;
34
    protected GoutteClient $client;
35
36
    private array $targetDefinitions;
37
    private LoggerInterface $logger;
38
    private NodeProximityAssistant $nodeProximityAssistant;
39
    private int $pageLimit;
40
    private Scrapper $scrapper;
41
42
    /**
     * Create a new seeker.
     *
     * Stores the injected collaborators, copies target definitions, verbosity and hash
     * algorithm from the configuration, and builds the Scrapper used to collect scraps.
     *
     * NOTE(review): `callingCommand`, `verbosity` and `hashAlgorithm` are not declared in
     * this class — presumably they live on Communicator; `callingCommand` is handed to the
     * Scrapper with whatever value the parent constructor left in it. Confirm.
     *
     * @param LoggerInterface        $logger                 logger shared with the Scrapper
     * @param GoutteClient           $client                 HTTP client/browser used for crawling
     * @param ConfigProviderContract $config                 source of targets, verbosity and hash algorithm
     * @param NodeProximityAssistant $nodeProximityAssistant helper used to locate the closest link node
     *
     * @throws Exception
     */
    public function __construct(
        LoggerInterface $logger,
        GoutteClient $client,
        ConfigProviderContract $config,
        NodeProximityAssistant $nodeProximityAssistant
    ) {
        parent::__construct();

        // injected collaborators
        $this->logger = $logger;
        $this->client = $client;
        $this->nodeProximityAssistant = $nodeProximityAssistant;

        // values read from configuration
        $this->targetDefinitions = $config->getTargets();
        $this->verbosity = $config->getVerbosity();
        $this->hashAlgorithm = $config->getHashAlgorithm();

        // the scrapper mirrors this seeker's logger, command, hash algorithm and verbosity
        $this->scrapper = new Scrapper(
            $logger,
            $this->callingCommand,
            $this->hashAlgorithm,
            $this->verbosity
        );
    }
68
69
    /**
     * {@inheritdoc}
     *
     * Crawls every configured target (or only `$targetName` when one is given), then saves
     * and/or converts the collected scraps when the option set asks for it.
     *
     * The result is marked successful, and carries the run statistics as "extra", only when
     * no error was recorded; otherwise the result is returned with its errors alone.
     *
     * @param OptionSet    $options        run options: page limit, save/convert flags (also used for keywords and back-off)
     * @param null|string  $targetName     key of a single configured target; a falsy value means "all targets"
     * @param null|Command $callingCommand NOTE(review): accepted but never referenced in this method — confirm intent
     *
     * @return Result errors for unknown/failed targets, or success plus statistics
     */
    public function seek(
        OptionSet $options,
        ?string $targetName = null,
        ?Command $callingCommand = null
    ): Result {
        $this->optionSet = $options;
        $this->pageLimit = $options->getPages();

        $targetDefinitions = $this->targetDefinitions;
        $targetBuilder = $this->getTargetBuilder();
        $result = new Result();

        $this->startTimer();

        // narrow down to the requested target, or bail out if it is not configured
        if ($targetName && array_key_exists($targetName, $targetDefinitions)) {
            $targetDefinitions = [$targetName => $targetDefinitions[$targetName]];
        } elseif ($targetName) {
            return $result->addError(FormattedMessage::get(FormattedMessage::TARGET_UNKNOWN, $targetName));
        }

        foreach ($targetDefinitions as $currentTargetName => $targetDefinition) {
            try {
                $targetDefinition[TargetKey::NAME] = $currentTargetName;
                $target = $targetBuilder->createFromDefinition($targetDefinition);

                // check for page limit override
                if ($target->hasPages()) {
                    $this->pageLimit = $target->getPages();
                }

                $this->crawlTarget($target);
            } catch (InvalidTargetDefinition $exception) {
                // invalid definitions are reported but do not count as run errors
                $this->tell($exception->getMessage());
            } catch (Exception $exception) {
                $message = FormattedMessage::get(
                    FormattedMessage::TARGET_UNEXPECTED_EXCEPTION,
                    $currentTargetName,
                    $exception->getMessage()
                );

                $result->addError($message);
                $this->tell($message);
            } finally {
                // Always reset the page limit, even when the crawl failed; otherwise a
                // target-specific override would leak into the targets that follow.
                $this->pageLimit = $this->optionSet->getPages();
            }
        }

        if ($this->optionSet->isSaveScraps()) {
            $this->scrapper->saveScraps();
        }

        if ($this->optionSet->isConvertScraps()) {
            $this->scrapper->convertScraps(false, true);
        }

        $totalScraps = $this->scrapper->getScraps()->count();
        $executionTime = $this->elapsedTime();
        $newScraps = $this->scrapper->getNewScrapsCount();
        $convertedScraps = $this->scrapper->getRelatedObjects()->count();

        $extra = (object)[
            'total' => $totalScraps,
            'executionTime' => $executionTime,
            'new' => $newScraps,
            'converted' => $convertedScraps,
            'unconverted' => $totalScraps - $convertedScraps,
            'scrapsSaved' => $this->optionSet->isSaveScraps(),
            'scrapsConverted' => $this->optionSet->isConvertScraps(),
        ];

        if (!$result->hasErrors()) {
            return $result
                ->setSuccess(true)
                ->setExtra($extra);
        }

        return $result;
    }
143
144
    /**
     * Request the target's source page and scrape it, going through the target's
     * search form first when search is enabled for the target.
     *
     * @param Target $target target to crawl
     */
    private function crawlTarget(Target $target): void
    {
        $landingCrawler = $this->client->request('GET', $target->getSource());

        $this->tell(
            FormattedMessage::get(FormattedMessage::TARGET, $target->getName()),
            self::COMM_DIRECTION_IN
        );

        if (!$target->hasSearch()) {
            // no search configured: scrape the landing document directly
            $this->scrape($target, $landingCrawler);
        } else {
            // search is enabled: submit the search form, then scrape the results
            $this->searchAndScrape($target, $landingCrawler);
        }

        $this->printBlankLine();
    }
160
161
    /**
     * Locate the target's search form on the landing document, submit it once for
     * each configured keyword, and scrape every result document.
     *
     * When the form cannot be resolved, an "invalid search form" message is told and
     * nothing is scraped.
     *
     * @param Target  $target  target whose search configuration is used
     * @param Crawler $crawler crawler positioned on the target's landing document
     */
    private function searchAndScrape(Target $target, Crawler $crawler): void
    {
        $logDetails = $this->verbosity >= self::VERBOSITY_MEDIUM;

        if ($logDetails) {
            $this->logger->info('Landing Document', [$crawler->html()]);
        }

        $search = $target->getSearch();
        $formConfig = $search[TargetKey::SEARCH_FORM];
        $buttonConfig = $formConfig[TargetKey::SEARCH_FORM_SUBMIT_BUTTON] ?? [];
        $formCrawler = $crawler->filter($formConfig[TargetKey::SEARCH_FORM_SELECTOR]);

        // the submit button may be identified by id or, failing that, by its text
        $submitButtonIdentifier = null;
        foreach ([TargetKey::SEARCH_FORM_SUBMIT_BUTTON_ID, TargetKey::SEARCH_FORM_SUBMIT_BUTTON_TEXT] as $buttonKey) {
            if (!empty($buttonConfig[$buttonKey])) {
                $submitButtonIdentifier = $buttonConfig[$buttonKey];

                break;
            }
        }

        // Static analysis reports a preliminary null assignment to $form as dead and the
        // instanceof check below as always true; neither holds, because $form is null
        // whenever the form lookup throws.
        try {
            $form = empty($submitButtonIdentifier)
                ? $formCrawler->form()
                : $formCrawler->selectButton((string)$submitButtonIdentifier)->form();
        } catch (InvalidArgumentException $exception) {
            $this->logger->error($exception);
            $form = null;
        }

        if (!$form instanceof Form) {
            $this->tell(FormattedMessage::get(FormattedMessage::TARGET_INVALID_SEARCH_FORM, $target->getName()));

            return;
        }

        if ($logDetails) {
            $this->logger->info('FORM', [$form]);
        }

        // Search each phrase/keyword one at a time.
        foreach ($search[TargetKey::SEARCH_KEYWORDS] as $keyword) {
            // set keyword
            $form[$formConfig[TargetKey::SEARCH_FORM_INPUT]] = $keyword;

            $this->tell(FormattedMessage::get(FormattedMessage::SEARCH_KEYWORD, $keyword), self::COMM_DIRECTION_IN);

            // submit search form
            $resultCrawler = $this->client->submit($form);

            if ($logDetails) {
                $resultMarkup = $resultCrawler === null ? '[NULL]' : $resultCrawler->html();

                $this->logger->info(
                    FormattedMessage::get(
                        FormattedMessage::FIRST_RESULT_PAGE_FOR_KEYWORD_ON_TARGET,
                        $keyword,
                        $target->getName()
                    ),
                    [$resultMarkup]
                );
            }

            // crawl
            $this->scrape($target, $resultCrawler);
        }
    }
220
221
    /**
     * Scrape the given document page by page, collecting one scrap per item found.
     *
     * Items are selected with the item wrapper selector when the markup defines one
     * (first non-empty of the item-wrapper/result/item/wrapper special keys), otherwise
     * with the title link selector. After each page the pager is followed until the
     * page limit is reached, the target has no pager, or no next link is found.
     * The configured back-off is slept after every page.
     *
     * @param Target       $target  target whose markup and pager configuration drive the scrape
     * @param null|Crawler $crawler crawler on the first page; null means there is nothing to scrape
     */
    private function scrape(Target $target, ?Crawler $crawler): void
    {
        if ($crawler === null) {
            return;
        }

        $markup = $target->getMarkup();
        $titleLinkSelector = $markup[TargetKey::MARKUP_TITLE];
        $markupInside = $markup[TargetKey::special(TargetKey::MARKUP_INSIDE)] ?? [];

        // choose link as title link selector if it is not empty and it is not set to a special key
        if (!empty($markup[TargetKey::MARKUP_LINK]) && !ConfigProvider::isSpecialKey($markup[TargetKey::MARKUP_LINK])) {
            $titleLinkSelector = $markup[TargetKey::MARKUP_LINK];
        }

        // determine what item wrapper is
        $itemWrapperSelector = Scanner::firstNonEmpty(
            $markup,
            [
                TargetKey::special(TargetKey::ITEM_WRAPPER),
                TargetKey::special(TargetKey::RESULT),
                TargetKey::special(TargetKey::ITEM),
                TargetKey::special(TargetKey::WRAPPER),
            ]
        );

        // The selector is the same for every page, so it is resolved once here; the title
        // link selector is only queried at page level when no item wrapper is configured.
        $itemSelector = !empty($itemWrapperSelector) ? $itemWrapperSelector : $titleLinkSelector;

        // Page by page we go...
        $page = self::INITIAL_PAGE;

        do {
            $this->tell(
                FormattedMessage::get(FormattedMessage::PROCESSING_PAGE_N, $page),
                self::COMM_DIRECTION_NONE
            );

            // scrape each by carving the insides
            $items = $crawler->filter($itemSelector);

            if ($items->count() === 0) {
                $this->tell(
                    FormattedMessage::get(FormattedMessage::NO_ITEMS_FOUND_ON_PAGE_N, $page)
                );
            }

            $items->each(
                function (Crawler $itemCrawler) use ($target, $titleLinkSelector, $markupInside) {
                    $this->scrapeItem($target, $itemCrawler, $titleLinkSelector, $markupInside);
                }
            );

            if ($page < $this->pageLimit && $target->hasPager()) {
                // null when no 'next' link exists, which ends the loop
                $crawler = $this->followPager($target, $crawler);

                ++$page;
            } else {
                $crawler = null;
            }

            // back-off
            sleep($this->optionSet->getBackOff());
        } while ($crawler !== null); // unless Crawler died...
    }

    /**
     * Resolve the title and link of a single item and hand the scrap to the scrapper.
     *
     * When the markup has an "inside" definition the item's link is clicked and the
     * resulting detail document (optionally narrowed to the focus selector) is what gets
     * collected; otherwise the item node itself is collected. Items whose title link
     * cannot be resolved are reported, logged and skipped without advancing the cursor.
     *
     * @param Target  $target            target the item belongs to
     * @param Crawler $itemCrawler       crawler on the item node
     * @param mixed   $titleLinkSelector selector passed to Crawler::filter() to find the title link
     * @param mixed   $markupInside      "inside" markup definition; empty when the target has none
     */
    private function scrapeItem(Target $target, Crawler $itemCrawler, $titleLinkSelector, $markupInside): void
    {
        $markupHasInside = !empty($markupInside);
        $cursor = $target->getCursor();

        try {
            $titleLinkCrawler = $markupHasInside ? $itemCrawler : $itemCrawler->filter($titleLinkSelector);
            $titleLinkText = Scanner::cleanText($titleLinkCrawler->text());
            $linkCrawler = $titleLinkCrawler->selectLink($titleLinkText);

            // link crawler is empty in this case the link may be a parent element
            // so we must find closest link:
            if (!count($linkCrawler)) {
                $linkCrawler = $this->nodeProximityAssistant->closest('a[href]', $titleLinkCrawler);
            }

            // simply get link from crawler
            $link = $linkCrawler->link();

            $this->tell($titleLinkText, self::COMM_DIRECTION_FLAT);
        } catch (InvalidArgumentException $e) {
            $this->tell(
                FormattedMessage::get(
                    ($markupHasInside
                        ? FormattedMessage::UNABLE_TO_RETRIEVE_SCRAP_FOR_X
                        : FormattedMessage::NO_TITLE_LINK_FOUND_FOR_X),
                    $cursor + 1
                )
            );
            $this->logger->error($e);

            // skip this item
            return;
        }

        $target->incrementCursor();

        if ($markupHasInside) {
            // grab handle on detail
            $itemCrawler = $this->client->click($link);

            // focus detail crawler on section if specified
            if (!empty($markupInside[TargetKey::special(TargetKey::MARKUP_FOCUS)])) {
                $itemCrawler = $itemCrawler->filter(
                    $markupInside[TargetKey::special(TargetKey::MARKUP_FOCUS)]
                );
            }
        }

        // collect the scrap...
        $this->scrapper->collect(
            new TitleLink($titleLinkText, $link->getUri()),
            $target,
            $itemCrawler
        );
    }

    /**
     * Click the target's pager ("next") link on the current page.
     *
     * @param Target  $target  target providing the pager configuration
     * @param Crawler $crawler crawler on the current page
     *
     * @return null|Crawler crawler on the next page, or null when no next link was found
     */
    private function followPager(Target $target, Crawler $crawler): ?Crawler
    {
        // An InvalidArgumentException may be thrown if a 'next' link does not exist.
        try {
            // Select pager
            $pager = $crawler->filter($target->getPager()[TargetKey::PAGER_SELECTOR]);
            // Grab pager/next link
            $nextLink = $pager->link();

            // Click it!
            return $this->client->click($nextLink);
        } catch (InvalidArgumentException $e) {
            // Next link doesn't exist; only reported at high verbosity
            if ($this->verbosity >= self::VERBOSITY_HIGH) {
                $errorMessage = FormattedMessage::get(
                    FormattedMessage::TARGET_PAGER_NEXT_NOT_FOUND,
                    $target->getName(),
                    $target->getPager()[TargetKey::PAGER_SELECTOR],
                    $target->getPager()[TargetKey::PAGER_TEXT]
                );

                $this->tell($errorMessage);
                $this->logger->error($errorMessage);
            }

            return null;
        }
    }
375
376
    /**
     * Build a TargetBuilder from the comma-separated keywords of the current option set.
     *
     * Each keyword is trimmed and blank entries are discarded. Only empty strings are
     * removed, so a keyword such as "0" is kept (a bare array_filter() would drop it as
     * falsy). Array keys from the split are preserved, as before.
     *
     * @return TargetBuilder builder constructed with the cleaned keyword array
     */
    private function getTargetBuilder(): TargetBuilder
    {
        $rawKeywords = explode(',', $this->optionSet->getKeywords() ?? '');
        $trimmedKeywords = array_map('trim', $rawKeywords);

        $keywords = array_filter(
            $trimmedKeywords,
            static fn (string $keyword): bool => $keyword !== ''
        );

        return new TargetBuilder($keywords);
    }
387
}
388