Completed
Pull Request — master (#145)
Created by Freek at 01:32

Crawler::mayIndex() (rated A)

Complexity

    Conditions 4
    Paths      4

Size

    Total Lines 16
    Code Lines  8

Duplication

    Lines 16
    Ratio 100 %

Importance

    Changes 0

    Metric  Value
    c       0
    b       0
    f       0
    dl      16
    loc     16
    rs      9.2
    cc      4
    eloc    8
    nc      4
    nop     2
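
For context, a minimal usage sketch of the class under review; MyObserver is a hypothetical implementation of Spatie\Crawler\CrawlObserver and not part of this pull request:

    Crawler::create()
        ->setCrawlObserver(new MyObserver()) // hypothetical observer
        ->setConcurrency(5)
        ->setMaximumDepth(2)
        ->respectRobots()
        ->startCrawling('https://example.com');

The full file under review follows.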
<?php

namespace Spatie\Crawler;

use Generator;
use Tree\Node\Node;
use GuzzleHttp\Pool;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\Psr7\Request;
use Spatie\Robots\RobotsTxt;
use InvalidArgumentException;
use Spatie\Robots\RobotsMeta;
use GuzzleHttp\RequestOptions;
use Spatie\Robots\RobotsHeaders;
use Psr\Http\Message\UriInterface;
use Spatie\Browsershot\Browsershot;
use Psr\Http\Message\StreamInterface;
use Symfony\Component\DomCrawler\Link;
use Psr\Http\Message\ResponseInterface;
use Spatie\Crawler\CrawlQueue\CrawlQueue;
use GuzzleHttp\Exception\RequestException;
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;

class Crawler
{
    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserver[] */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int */
    protected $crawledUrlCount = 0;

    /** @var int|null */
    protected $maximumCrawlCount = null;

    /** @var int */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null */
    protected $maximumDepth = null;

    /** @var bool */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var Browsershot */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt */
    protected $robotsTxt = null;

    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * @param array $clientOptions
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : self::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }

    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CollectionCrawlQueue();
    }

    /**
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }

    /**
     * Responses that are larger than the specified value will be ignored.
     *
     * @param int $maximumResponseSizeInBytes
     *
     * @return $this
     */
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes)
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }

    /**
     * @param int $maximumCrawlCount
     *
     * @return $this
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount)
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }

    /**
     * @param int $maximumDepth
     *
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth)
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }

    /**
     * @return $this
     */
    public function ignoreRobots()
    {
        $this->respectRobots = false;

        return $this;
    }

    /**
     * @return $this
     */
    public function respectRobots()
    {
        $this->respectRobots = true;

        return $this;
    }

    /**
     * @param CrawlQueue $crawlQueue
     *
     * @return $this
     */
    public function setCrawlQueue(CrawlQueue $crawlQueue)
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }

    /**
     * @return $this
     */
    public function executeJavaScript()
    {
        $this->executeJavaScript = true;

        return $this;
    }

    /**
     * @return $this
     */
    public function doNotExecuteJavaScript()
    {
        $this->executeJavaScript = false;

        return $this;
    }

    /**
     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers)
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }

    public function setCrawlObservers(array $crawlObservers)
    {
        $this->crawlObservers = $crawlObservers;

        return $this;
    }

    public function addCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObservers[] = $crawlObserver;

        return $this;
    }

    /**
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    /**
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        $this->addToCrawlQueue($crawlUrl);

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }

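    /*
     * The crawl loop: startCrawlingQueue() drives a Guzzle Pool that keeps up
     * to $concurrency requests in flight. Each fulfilled response is checked
     * against robots headers/meta directives before it is handed to the
     * observers and before its links are queued; failed requests are reported
     * through handleCrawlFailed().
     */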
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => function (ResponseInterface $response, $index) {
                    $crawlUrl = $this->crawlQueue->getUrlById($index);

                    $body = $this->convertBodyToString($response->getBody(), $this->maximumResponseSize);

                    $robotsHeaders = RobotsHeaders::create($response->getHeaders());

                    $robotsMeta = RobotsMeta::create($body);

                    if (! $this->mayIndex($robotsHeaders, $robotsMeta)) {
                        return;
                    }

                    $this->handleCrawled($response, $crawlUrl);

                    if (! $this->crawlProfile instanceof CrawlSubdomains) {
                        if ($crawlUrl->url->getHost() !== $this->baseUrl->getHost()) {
                            return;
                        }
                    }

                    if (! $this->mayFollow($robotsHeaders, $robotsMeta)) {
                        return;
                    }

                    $this->addAllLinksToCrawlQueue(
                        $body,
                        $crawlUrl->url
                    );
                },
                'rejected' => function (RequestException $exception, $index) {
                    $this->handleCrawlFailed(
                        $exception,
                        $this->crawlQueue->getUrlById($index)
                    );
                },
            ]);

            $promise = $pool->promise();
            $promise->wait();
        }
    }

    public function endsWith($haystack, $needle)
    {
        // strrpos() returns false when $needle is absent, which made the
        // previous arithmetic comparison unreliable; compare the suffix directly.
        return substr($haystack, -strlen($needle)) === $needle;
    }

    protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string
    {
        $bodyStream->rewind();

        $body = $bodyStream->read($readMaximumBytes);

        return $body;
    }

    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

    /**
     * @param ResponseInterface $response
     * @param CrawlUrl          $crawlUrl
     */
    protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawled(
                $crawlUrl->url,
                $response,
                $crawlUrl->foundOnUrl
            );
        }
    }

    /**
     * @param RequestException $exception
     * @param CrawlUrl         $crawlUrl
     */
    protected function handleCrawlFailed(RequestException $exception, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawlFailed(
                $crawlUrl->url,
                $exception,
                $crawlUrl->foundOnUrl
            );
        }
    }

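    /*
     * getCrawlRequests() is a generator: the Pool in startCrawlingQueue()
     * consumes it lazily, so URLs added to the crawl queue while responses
     * are being handled are still yielded during the same crawl.
     */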
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }

    protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl)
    {
        $allLinks = $this->extractAllLinks($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (UriInterface $url) {
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (UriInterface $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function (UriInterface $url) {
                return $this->crawlQueue->has($url);
            })
            ->each(function (UriInterface $url) use ($foundOnUrl) {
                $node = $this->addToDepthTree($this->depthTree, $url, $foundOnUrl);

                if (strpos($url->getPath(), '/tel:') === 0) {
                    return;
                }

                if (! $this->shouldCrawl($node)) {
                    return;
                }

                if ($this->maximumCrawlCountReached()) {
                    return;
                }

                $crawlUrl = CrawlUrl::create($url, $foundOnUrl);

                $this->addToCrawlQueue($crawlUrl);
            });
    }

    protected function shouldCrawl(Node $node): bool
    {
        if ($this->respectRobots) {
            return $this->robotsTxt->allows($node->getValue());
        }

        if (is_null($this->maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $this->maximumDepth;
    }

    /**
     * @param string                         $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null
     */
    protected function extractAllLinks(string $html, UriInterface $foundOnUrl)
    {
        if ($this->executeJavaScript) {
            $html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
        }

        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXpath('//a')->links())
            ->reject(function (Link $link) {
                return $link->getNode()->getAttribute('rel') === 'nofollow';
            })
            ->map(function (Link $link) {
                try {
                    return new Uri($link->getUri());
                } catch (InvalidArgumentException $exception) {
                    return;
                }
            })
            ->filter();
    }

    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        return $url->withFragment('');
    }

    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        return in_array($uri->getScheme(), ['http', 'https']);
    }

    protected function addToDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl)
    {
        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($currentNode, $url, $parentUrl);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }

    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
    {
        $browsershot = $this->getBrowsershot();

        $html = $browsershot->setUrl((string) $foundOnUrl)->bodyHtml();

        return html_entity_decode($html);
    }

    protected function getBrowsershot(): Browsershot
    {
        if ($this->browsershot) {
            return $this->browsershot;
        }

        $this->browsershot = new Browsershot();

        return $this->browsershot;
    }

    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }

    protected function addToCrawlQueue(CrawlUrl $crawlUrl)
    {
        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    protected function maximumCrawlCountReached(): bool
    {
        if (is_null($this->maximumCrawlCount)) {
            return false;
        }

        return $this->crawledUrlCount >= $this->maximumCrawlCount;
    }

    // Flagged by the review as duplicated: mayIndex() and mayFollow() are
    // structurally identical. See the extraction sketch after the listing.
    protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        // When robots are ignored, everything may be indexed.
        if (! $this->respectRobots) {
            return true;
        }

        if (! $robotsHeaders->mayIndex()) {
            return false;
        }

        if (! $robotsMeta->mayIndex()) {
            return false;
        }

        return true;
    }

    protected function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        // When robots are ignored, every link may be followed.
        if (! $this->respectRobots) {
            return true;
        }

        if (! $robotsHeaders->mayFollow()) {
            return false;
        }

        if (! $robotsMeta->mayFollow()) {
            return false;
        }

        return true;
    }
}
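
The report flags mayIndex() with a duplication ratio of 100 %: it and mayFollow() differ only in which robots check they call. A minimal sketch of one way to extract the shared structure; the helper name satisfiesRobots() is hypothetical and not part of the package:

    protected function satisfiesRobots(callable $check): bool
    {
        // Robots directives only apply while the crawler respects robots.
        if (! $this->respectRobots) {
            return true;
        }

        return $check();
    }

    protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        return $this->satisfiesRobots(function () use ($robotsHeaders, $robotsMeta) {
            return $robotsHeaders->mayIndex() && $robotsMeta->mayIndex();
        });
    }

    protected function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        return $this->satisfiesRobots(function () use ($robotsHeaders, $robotsMeta) {
            return $robotsHeaders->mayFollow() && $robotsMeta->mayFollow();
        });
    }

This preserves the behavior of both methods while removing the duplicated lines the report measures.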