Completed
Push — master ( 080942...71105f )
by Freek
05:33 queued 58s
created

Crawler::handleCrawlFailed()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 10
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 10
rs 9.4285
cc 2
eloc 6
nc 2
nop 2
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use Tree\Node\Node;
7
use GuzzleHttp\Pool;
8
use GuzzleHttp\Client;
9
use GuzzleHttp\Psr7\Uri;
10
use GuzzleHttp\Psr7\Request;
11
use InvalidArgumentException;
12
use GuzzleHttp\RequestOptions;
13
use Psr\Http\Message\UriInterface;
14
use Spatie\Browsershot\Browsershot;
15
use Psr\Http\Message\StreamInterface;
16
use Symfony\Component\DomCrawler\Link;
17
use Psr\Http\Message\ResponseInterface;
18
use Spatie\Crawler\CrawlQueue\CrawlQueue;
19
use Tightenco\Collect\Support\Collection;
20
use GuzzleHttp\Exception\RequestException;
21
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
22
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
23
24
class Crawler
{
    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface Url the crawl was started from. */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserver[] Observers notified of crawl events. */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile Decides which urls should be crawled. */
    protected $crawlProfile;

    /** @var int Number of requests performed concurrently per pool. */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int Number of urls added to the queue so far. */
    protected $crawledUrlCount = 0;

    /** @var int|null Maximum number of urls to crawl; null means no limit. */
    protected $maximumCrawlCount = null;

    /** @var int Only this many bytes of a response body are read (default 2 MB). */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null Maximum link depth to follow; null means no limit. */
    protected $maximumDepth = null;

    /** @var \Tree\Node\Node Tree recording the depth at which each url was found. */
    protected $depthTree;

    /** @var bool Whether pages are rendered with a headless browser before link extraction. */
    protected $executeJavaScript = false;

    /** @var Browsershot Lazily created; see getBrowsershot(). */
    protected $browsershot = null;

    // Guzzle options applied when create() is called without explicit options.
    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];
71
72
    /**
     * Create a crawler backed by a fresh Guzzle client.
     *
     * @param array $clientOptions Guzzle options; when empty, the class defaults apply.
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        if (! count($clientOptions)) {
            $clientOptions = self::$defaultClientOptions;
        }

        return new static(new Client($clientOptions));
    }
87
88
    /**
     * @param \GuzzleHttp\Client $client HTTP client used for every request.
     * @param int $concurrency Number of requests performed in parallel.
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        // Defaults: crawl every url, keep the queue in memory.
        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CollectionCrawlQueue();
    }
98
99
    /**
     * Set how many requests are performed concurrently.
     *
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }
110
111
    /**
     * Limit how many bytes of each response body are read.
     *
     * Responses larger than the specified value are truncated at this size.
     *
     * @param int $maximumResponseSizeInBytes
     *
     * @return $this
     */
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes)
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }
124
125
    /**
     * Stop queueing new urls once this many have been added.
     *
     * @param int $maximumCrawlCount
     *
     * @return $this
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount)
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }
136
137
    /**
     * Limit how deep links are followed from the base url.
     *
     * @param int $maximumDepth
     *
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth)
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }
148
149
    /**
     * Use a custom queue implementation (e.g. a persistent one).
     *
     * @param \Spatie\Crawler\CrawlQueue\CrawlQueue $crawlQueue
     *
     * @return $this
     */
    public function setCrawlQueue(CrawlQueue $crawlQueue)
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }
159
160
    /**
     * Render pages with a headless browser before extracting links.
     *
     * @return $this
     */
    public function executeJavaScript()
    {
        $this->executeJavaScript = true;

        return $this;
    }
169
170
    /**
     * Extract links from the raw response body without rendering JavaScript.
     *
     * @return $this
     */
    public function doNotExecuteJavaScript()
    {
        $this->executeJavaScript = false;

        return $this;
    }
179
180
    /**
     * Set a single observer or an array of observers.
     *
     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers)
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }
193
194
    /**
     * Replace the full set of crawl observers.
     *
     * @param \Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObservers(array $crawlObservers)
    {
        $this->crawlObservers = $crawlObservers;

        return $this;
    }
200
201
    /**
     * Append an observer to the current set.
     *
     * @param \Spatie\Crawler\CrawlObserver $crawlObserver
     *
     * @return $this
     */
    public function addCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObservers[] = $crawlObserver;

        return $this;
    }
207
208
    /**
     * Set the profile that decides which urls get crawled.
     *
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }
219
220
    /**
     * Start the crawl at the given url and block until the queue is drained.
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        // Normalize "example.com" into "http://example.com/" before crawling.
        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->addToCrawlQueue($crawlUrl);

        // Root node of the tree used to enforce setMaximumDepth().
        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        // Tell every observer the crawl is complete.
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }
251
252
    /**
     * Drain the crawl queue using a concurrent Guzzle request pool.
     *
     * Each pool pass may discover new links, which re-populate the queue,
     * so pools are created in a loop until no pending urls remain.
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => function (ResponseInterface $response, int $index) {
                    $crawlUrl = $this->crawlQueue->getUrlById($index);
                    $this->handleCrawled($response, $crawlUrl);

                    // Unless crawling subdomains, only follow links found on the base host.
                    if (! $this->crawlProfile instanceof CrawlSubdomains) {
                        if ($crawlUrl->url->getHost() !== $this->baseUrl->getHost()) {
                            return;
                        }
                    }

                    $body = $this->convertBodyToString($response->getBody(), $this->maximumResponseSize);

                    $this->addAllLinksToCrawlQueue(
                        $body,
                        $crawlUrl->url
                    );
                },
                'rejected' => function (RequestException $exception, int $index) {
                    // handleCrawlFailed() takes exactly two arguments; the previous
                    // code passed $exception a second time as a spurious third argument.
                    $this->handleCrawlFailed(
                        $exception,
                        $this->crawlQueue->getUrlById($index)
                    );
                },
            ]);

            $promise = $pool->promise();
            $promise->wait();
        }
    }
288
289
    /**
     * Determine whether $haystack ends with $needle.
     *
     * The previous implementation used strrpos(), which returns false when
     * the needle does not occur at all; false coerces to 0, so for example
     * endsWith('abc', 'xyz') incorrectly returned true. It also matched any
     * occurrence's last position rather than checking the true suffix.
     *
     * @param string $haystack
     * @param string $needle
     *
     * @return bool
     */
    public function endsWith($haystack, $needle)
    {
        $length = strlen($needle);

        // Every string ends with the empty string.
        if ($length === 0) {
            return true;
        }

        return substr($haystack, -$length) === $needle;
    }
294
295
    /**
     * Read at most $readMaximumBytes from the start of the response body.
     *
     * @param \Psr\Http\Message\StreamInterface $bodyStream
     * @param int $readMaximumBytes Defaults to 2 MB.
     *
     * @return string
     */
    protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string
    {
        // Rewind first: the stream may already have been consumed by Guzzle.
        $bodyStream->rewind();

        return $bodyStream->read($readMaximumBytes);
    }
303
304
    /**
     * Notify every observer that a url was crawled successfully.
     *
     * @param \Psr\Http\Message\ResponseInterface $response
     * @param \Spatie\Crawler\CrawlUrl $crawlUrl
     */
    protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawled(
                $crawlUrl->url,
                $response,
                $crawlUrl->foundOnUrl
            );
        }
    }
318
319
    /**
     * Notify every observer that crawling a url failed.
     *
     * @param \GuzzleHttp\Exception\RequestException $exception
     * @param \Spatie\Crawler\CrawlUrl $crawlUrl
     */
    protected function handleCrawlFailed(RequestException $exception, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawlFailed(
                $crawlUrl->url,
                $exception,
                $crawlUrl->foundOnUrl
            );
        }
    }
333
334
    /**
     * Yield a GET request for every pending url that passes the crawl profile.
     *
     * Keys are crawl-url ids so the pool callbacks can look the url back up
     * via getUrlById(). Urls are marked processed before being yielded so
     * they are never requested twice.
     *
     * @return \Generator
     */
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }
355
356
    /**
     * Extract all links from the html and queue those that should be crawled.
     *
     * @param string $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl Page the links were found on.
     */
    protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl)
    {
        $allLinks = $this->extractAllLinks($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (UriInterface $url) {
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (UriInterface $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function (UriInterface $url) {
                return $this->crawlQueue->has($url);
            })
            ->each(function (UriInterface $url) use ($foundOnUrl) {
                // Record the url in the depth tree even if it ends up not being queued.
                $node = $this->addtoDepthTree($this->depthTree, $url, $foundOnUrl);

                // Skip "tel:" pseudo-links that were parsed as relative paths.
                if (strpos($url->getPath(), '/tel:') === 0) {
                    return;
                }

                // Respect the maximum-depth setting.
                if (! $this->shouldCrawl($node)) {
                    return;
                }

                if ($this->maximumCrawlCountReached()) {
                    return;
                }

                $crawlUrl = CrawlUrl::create($url, $foundOnUrl);

                $this->addToCrawlQueue($crawlUrl);
            });
    }
393
394
    /**
     * Determine whether a depth-tree node is within the configured maximum depth.
     *
     * @param \Tree\Node\Node $node
     *
     * @return bool
     */
    protected function shouldCrawl(Node $node): bool
    {
        // No depth limit configured means everything is crawlable.
        return $this->maximumDepth === null
            ? true
            : $node->getDepth() <= $this->maximumDepth;
    }
402
403
    /**
     * Collect every <a> href in the html as a Uri instance.
     *
     * Links that cannot be parsed into a Uri map to null and are removed
     * by the trailing filter() call.
     *
     * @param string $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl Used to resolve relative links.
     *
     * @return \Tightenco\Collect\Support\Collection
     */
    protected function extractAllLinks(string $html, UriInterface $foundOnUrl): Collection
    {
        if ($this->executeJavaScript) {
            // Re-fetch via a headless browser so JavaScript-generated links are present.
            $html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
        }

        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXpath('//a')->links())
            ->map(function (Link $link) {
                try {
                    return new Uri($link->getUri());
                } catch (InvalidArgumentException $exception) {
                    return;
                }
            })
            ->filter();
    }
421
422
    /**
     * Strip the fragment so urls differing only by anchor are treated as one.
     *
     * @param \Psr\Http\Message\UriInterface $url
     *
     * @return \Psr\Http\Message\UriInterface
     */
    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        $withoutFragment = $url->withFragment('');

        return $withoutFragment;
    }
426
427
    /**
     * Only http and https urls are crawlable (mailto:, tel:, ftp: etc. are not).
     *
     * @param \Psr\Http\Message\UriInterface $uri
     *
     * @return bool
     */
    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        // Strict comparison avoids loose-typing surprises with in_array().
        return in_array($uri->getScheme(), ['http', 'https'], true);
    }
431
432
    /**
     * Insert $url into the depth tree as a child of the node holding $parentUrl.
     *
     * Searches the subtree rooted at $node depth-first for $parentUrl.
     *
     * @param \Tree\Node\Node $node Subtree to search.
     * @param \Psr\Http\Message\UriInterface $url
     * @param \Psr\Http\Message\UriInterface $parentUrl
     *
     * @return \Tree\Node\Node|null The newly added node, or null when
     *                              $parentUrl does not occur in this subtree.
     */
    protected function addtoDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl)
    {
        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            // The recursive call previously used different casing (addToDepthTree);
            // it only worked because PHP method names are case-insensitive.
            $returnNode = $this->addtoDepthTree($currentNode, $url, $parentUrl);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }
454
455
    /**
     * Render the page in a headless browser and return its body html.
     *
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return string Decoded html (entities resolved) after JavaScript ran.
     */
    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
    {
        $browsershot = $this->getBrowsershot();

        $html = $browsershot->setUrl((string) $foundOnUrl)->bodyHtml();

        return html_entity_decode($html);
    }
463
464
    /**
     * Lazily create the Browsershot instance used to render JavaScript pages.
     *
     * @return \Spatie\Browsershot\Browsershot
     */
    protected function getBrowsershot(): Browsershot
    {
        if (! $this->browsershot) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }
474
475
    /**
     * Use a preconfigured Browsershot instance for JavaScript rendering.
     *
     * @param \Spatie\Browsershot\Browsershot $browsershot
     *
     * @return $this
     */
    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }
481
482
    /**
     * Add a url to the queue and count it toward the maximum crawl count.
     *
     * NOTE(review): $crawledUrlCount is incremented when a url is *queued*,
     * not when it is actually crawled — confirm the name matches intent.
     *
     * @param \Spatie\Crawler\CrawlUrl $crawlUrl
     *
     * @return $this
     */
    protected function addToCrawlQueue(CrawlUrl $crawlUrl)
    {
        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }
490
491
    /**
     * Whether the configured crawl limit (if any) has been hit.
     *
     * @return bool
     */
    protected function maximumCrawlCountReached(): bool
    {
        // No limit configured: never reached.
        if ($this->maximumCrawlCount === null) {
            return false;
        }

        return $this->crawledUrlCount >= $this->maximumCrawlCount;
    }
499
}
500