Completed
Push — master (016fca...0d1e60) by Brent
Duration: 03:20 (queued 01:42)

Crawler::mayFollow() (rating: A)

Complexity
    Conditions: 4
    Paths: 4

Size
    Total Lines: 16
    Code Lines: 8

Duplication
    Lines: 16
    Ratio: 100 %

Importance
    Changes: 0
Metric   Value
c        0
b        0
f        0
dl       16
loc      16
rs       9.2
cc       4
eloc     8
nc       4
nop      2

The abbreviated metrics line up with the labeled figures above: loc = total lines (16), eloc = code lines (8), dl = duplicated lines (16), cc = cyclomatic complexity (4), nc = number of paths (4), and nop = number of parameters (2, matching mayFollow()'s two arguments). The values of 4 for cc and nc follow directly from the method body shown below: three guard clauses plus one fall-through give a cyclomatic complexity of 3 + 1 = 4 and four distinct execution paths.
<?php

namespace Spatie\Crawler;

use Generator;
use Tree\Node\Node;
use GuzzleHttp\Pool;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\Psr7\Request;
use Spatie\Robots\RobotsTxt;
use InvalidArgumentException;
use Spatie\Robots\RobotsMeta;
use GuzzleHttp\RequestOptions;
use Spatie\Robots\RobotsHeaders;
use Psr\Http\Message\UriInterface;
use Spatie\Browsershot\Browsershot;
use Psr\Http\Message\StreamInterface;
use Symfony\Component\DomCrawler\Link;
use Psr\Http\Message\ResponseInterface;
use Spatie\Crawler\CrawlQueue\CrawlQueue;
use GuzzleHttp\Exception\RequestException;
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;

class Crawler
{
    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserver[] */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int */
    protected $crawledUrlCount = 0;

    /** @var int|null */
    protected $maximumCrawlCount = null;

    /** @var int */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null */
    protected $maximumDepth = null;

    /** @var bool */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var Browsershot */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt */
    protected $robotsTxt = null;

    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * @param array $clientOptions
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : self::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }
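
Worth noting in create(): the defaults are used only when $clientOptions is completely empty. Supplying any option replaces the whole default set rather than merging with it, so a usage sketch under that assumption looks like this:

use GuzzleHttp\RequestOptions;
use Spatie\Crawler\Crawler;

// Any custom options bypass $defaultClientOptions entirely,
// so the timeouts have to be restated here.
$crawler = Crawler::create([
    RequestOptions::TIMEOUT => 30,
    RequestOptions::CONNECT_TIMEOUT => 10,
    RequestOptions::ALLOW_REDIRECTS => false,
]);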
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CollectionCrawlQueue();
    }

    /**
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }

    /**
     * Responses that are larger than the specified value will be ignored.
     *
     * @param int $maximumResponseSizeInBytes
     *
     * @return $this
     */
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes)
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }

    /**
     * @param int $maximumCrawlCount
     *
     * @return $this
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount)
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }

    /**
     * @param int $maximumDepth
     *
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth)
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }

    /**
     * @return $this
     */
    public function ignoreRobots()
    {
        $this->respectRobots = false;

        return $this;
    }

    /**
     * @return $this
     */
    public function respectRobots()
    {
        $this->respectRobots = true;

        return $this;
    }

    /**
     * @param CrawlQueue $crawlQueue
     *
     * @return $this
     */
    public function setCrawlQueue(CrawlQueue $crawlQueue)
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }

    /**
     * @return $this
     */
    public function executeJavaScript()
    {
        $this->executeJavaScript = true;

        return $this;
    }

    /**
     * @return $this
     */
    public function doNotExecuteJavaScript()
    {
        $this->executeJavaScript = false;

        return $this;
    }

    /**
     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers)
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }

    public function setCrawlObservers(array $crawlObservers)
    {
        $this->crawlObservers = $crawlObservers;

        return $this;
    }

    public function addCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObservers[] = $crawlObserver;

        return $this;
    }
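
The observer hooks invoked throughout this class (willCrawl(), crawled(), crawlFailed(), finishedCrawling()) imply an observer along the following lines. The base CrawlObserver signatures are inferred from the call sites in this file, so treat this as a sketch:

use GuzzleHttp\Exception\RequestException;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\CrawlObserver;

// Minimal observer that logs crawl progress to stdout.
class LoggingCrawlObserver extends CrawlObserver
{
    public function willCrawl(UriInterface $url)
    {
        echo "Crawling: {$url}\n";
    }

    public function crawled(UriInterface $url, ResponseInterface $response, ?UriInterface $foundOnUrl = null)
    {
        echo "Crawled {$url} with status {$response->getStatusCode()}\n";
    }

    public function crawlFailed(UriInterface $url, RequestException $exception, ?UriInterface $foundOnUrl = null)
    {
        echo "Failed {$url}: {$exception->getMessage()}\n";
    }

    public function finishedCrawling()
    {
        echo "Crawl finished.\n";
    }
}

// $crawler->addCrawlObserver(new LoggingCrawlObserver());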
    /**
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    /**
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        if ($this->robotsTxt->allows((string) $crawlUrl->url)) {
            $this->addToCrawlQueue($crawlUrl);
        }

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }
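
Because startCrawling() fills in a missing scheme and path before queueing, string and UriInterface inputs converge on the same normalized base URL, and the crawl only begins if robots.txt allows that URL. For example:

use GuzzleHttp\Psr7\Uri;

// Both are normalized to http://example.com/ before the robots.txt check:
$crawler->startCrawling('http://example.com');      // empty path becomes '/'
$crawler->startCrawling(new Uri('//example.com/')); // empty scheme becomes 'http'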
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => function (ResponseInterface $response, $index) {
                    $crawlUrl = $this->crawlQueue->getUrlById($index);

                    $body = $this->convertBodyToString($response->getBody(), $this->maximumResponseSize);

                    $robotsHeaders = RobotsHeaders::create($response->getHeaders());

                    $robotsMeta = RobotsMeta::create($body);

                    if (! $this->mayIndex($robotsHeaders, $robotsMeta)) {
                        return;
                    }

                    $this->handleCrawled($response, $crawlUrl);

                    if (! $this->crawlProfile instanceof CrawlSubdomains) {
                        if ($crawlUrl->url->getHost() !== $this->baseUrl->getHost()) {
                            return;
                        }
                    }

                    if (! $this->mayFollow($robotsHeaders, $robotsMeta)) {
                        return;
                    }

                    $this->addAllLinksToCrawlQueue(
                        $body,
                        $crawlUrl->url
                    );
                },
                'rejected' => function (RequestException $exception, $index) {
                    $this->handleCrawlFailed(
                        $exception,
                        $this->crawlQueue->getUrlById($index)
                    );
                },
            ]);

            $promise = $pool->promise();
            $promise->wait();
        }
    }
    public function endsWith($haystack, $needle)
    {
        // strrpos() returns false when the needle is absent, and arithmetic
        // would coerce that to 0; comparing the tail directly avoids the
        // resulting false positives.
        return $needle === ''
            || substr($haystack, -strlen($needle)) === $needle;
    }

    protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string
    {
        $bodyStream->rewind();

        $body = $bodyStream->read($readMaximumBytes);

        return $body;
    }
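
A caveat on convertBodyToString(): bodies over the limit are truncated to the first $readMaximumBytes, not skipped, even though setMaximumResponseSize() speaks of responses being "ignored". Raising the 2 MB default is a one-liner:

// Parse links from up to 10 MB of each response body.
$crawler->setMaximumResponseSize(1024 * 1024 * 10);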
    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

    /**
     * @param ResponseInterface $response
     * @param CrawlUrl          $crawlUrl
     */
    protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawled(
                $crawlUrl->url,
                $response,
                $crawlUrl->foundOnUrl
            );
        }
    }

    /**
     * @param RequestException $exception
     * @param CrawlUrl         $crawlUrl
     */
    protected function handleCrawlFailed(RequestException $exception, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawlFailed(
                $crawlUrl->url,
                $exception,
                $crawlUrl->foundOnUrl
            );
        }
    }

    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }
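
getCrawlRequests() yields each pending request keyed by the CrawlUrl id, and Guzzle's Pool hands that key back as $index to the fulfilled and rejected callbacks; that is how the crawler matches a response to the URL it requested. A minimal standalone sketch of the same pattern (client and URL hypothetical):

$client = new \GuzzleHttp\Client();

$requests = function () {
    // The yield key becomes $index in the pool callbacks.
    foreach (['id-1' => 'https://example.com/'] as $id => $url) {
        yield $id => new \GuzzleHttp\Psr7\Request('GET', $url);
    }
};

$pool = new \GuzzleHttp\Pool($client, $requests(), [
    'concurrency' => 10,
    'fulfilled' => function ($response, $index) {
        // $index is 'id-1' here, so the response maps back to its URL.
    },
    'rejected' => function ($reason, $index) {
        // The same key arrives on failure.
    },
]);

$pool->promise()->wait();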
    protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl)
    {
        $allLinks = $this->extractAllLinks($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (UriInterface $url) {
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (UriInterface $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function (UriInterface $url) {
                return $this->crawlQueue->has($url);
            })
            ->each(function (UriInterface $url) use ($foundOnUrl) {
                $node = $this->addToDepthTree($this->depthTree, $url, $foundOnUrl);

                if (strpos($url->getPath(), '/tel:') === 0) {
                    return;
                }

                if (! $this->shouldCrawl($node)) {
                    return;
                }

                if ($this->maximumCrawlCountReached()) {
                    return;
                }

                $crawlUrl = CrawlUrl::create($url, $foundOnUrl);

                $this->addToCrawlQueue($crawlUrl);
            });
    }
    protected function shouldCrawl(Node $node): bool
    {
        if ($this->respectRobots) {
            return $this->robotsTxt->allows($node->getValue());
        }

        if (is_null($this->maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $this->maximumDepth;
    }
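
A subtlety in shouldCrawl(): when $respectRobots is true, the method returns on the robots.txt verdict alone, so $maximumDepth is only enforced while robots are being ignored. If both constraints were meant to apply together, a combined guard would look roughly like this (a sketch, not the package's current behavior):

    protected function shouldCrawl(Node $node): bool
    {
        // Hypothetical variant: apply the robots.txt check AND the depth limit.
        if ($this->respectRobots && ! $this->robotsTxt->allows($node->getValue())) {
            return false;
        }

        if (is_null($this->maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $this->maximumDepth;
    }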
    /**
     * @param string                         $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null
     */
    protected function extractAllLinks(string $html, UriInterface $foundOnUrl)
    {
        if ($this->executeJavaScript) {
            $html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
        }

        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXpath('//a')->links())
            ->reject(function (Link $link) {
                return $link->getNode()->getAttribute('rel') === 'nofollow';
            })
            ->map(function (Link $link) {
                try {
                    return new Uri($link->getUri());
                } catch (InvalidArgumentException $exception) {
                    return;
                }
            })
            ->filter();
    }

    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        return $url->withFragment('');
    }

    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        return in_array($uri->getScheme(), ['http', 'https']);
    }

    protected function addToDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl)
    {
        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($currentNode, $url, $parentUrl);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }
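
addToDepthTree() does a depth-first search for the node whose value matches the page the link was found on and attaches the new URL beneath it; setMaximumDepth() is then enforced against Node::getDepth(). Illustrated with hypothetical URLs:

// Starting from $depthTree = new Node('http://example.com/'):
//
//   http://example.com/            depth 0 (root)
//   └── http://example.com/a       depth 1, found on the root page
//       └── http://example.com/b   depth 2, found on /a
//
// With setMaximumDepth(1) and robots ignored, /b fails
// getDepth() <= 1 in shouldCrawl() and is never queued.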
    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
    {
        $browsershot = $this->getBrowsershot();

        $html = $browsershot->setUrl((string) $foundOnUrl)->bodyHtml();

        return html_entity_decode($html);
    }

    protected function getBrowsershot(): Browsershot
    {
        if ($this->browsershot) {
            return $this->browsershot;
        }

        $this->browsershot = new Browsershot();

        return $this->browsershot;
    }

    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }

    protected function addToCrawlQueue(CrawlUrl $crawlUrl)
    {
        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    protected function maximumCrawlCountReached(): bool
    {
        if (is_null($this->maximumCrawlCount)) {
            return false;
        }

        return $this->crawledUrlCount >= $this->maximumCrawlCount;
    }
    // Inspection: flagged as duplicated code; mayFollow() below mirrors this body.
    protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        if (! $this->respectRobots) {
            return true;
        }

        if (! $robotsHeaders->mayIndex()) {
            return false;
        }

        if (! $robotsMeta->mayIndex()) {
            return false;
        }

        return true;
    }

    // Inspection: flagged as duplicated code (the 100 % duplication ratio in the metrics above).
    protected function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        if (! $this->respectRobots) {
            return true;
        }

        if (! $robotsHeaders->mayFollow()) {
            return false;
        }

        if (! $robotsMeta->mayFollow()) {
            return false;
        }

        return true;
    }
}
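
The duplication flagged above comes from mayFollow() mirroring mayIndex() line for line. One possible extraction, sketched here rather than taken from the package, collapses both into a single dynamic check (the helper name is hypothetical):

    protected function passesRobotChecks(string $check, RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        if (! $this->respectRobots) {
            return true;
        }

        // $check is 'mayIndex' or 'mayFollow'; both methods exist on
        // RobotsHeaders and RobotsMeta, as the original bodies show.
        return $robotsHeaders->{$check}() && $robotsMeta->{$check}();
    }

    protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        return $this->passesRobotChecks('mayIndex', $robotsHeaders, $robotsMeta);
    }

    protected function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        return $this->passesRobotChecks('mayFollow', $robotsHeaders, $robotsMeta);
    }

Whether the indirection is worth it for two small methods is a judgment call; the check's own guidance only insists on consolidation once the same code appears in three or more places.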