Completed
Pull Request — master (#145)
by Brent, created 02:06

Crawler::mayIndex() (rated A)

Complexity
    Conditions: 3
    Paths: 3

Size
    Total Lines: 8
    Code Lines: 4

Duplication
    Duplicated Lines: 0
    Duplication Ratio: 0 %

Importance
    Changes: 0

Metric                            Value
c                                 0
b                                 0
f                                 0
dl                                0
loc (lines of code)               8
rs                                9.4285
cc (cyclomatic complexity)        3
eloc (executable lines of code)   4
nc (NPath complexity)             3
nop (number of parameters)        2
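The A rating can be read straight off the method body, reproduced here from the end of the listing below with annotations added for illustration: the if guard contributes one decision and the short-circuiting && operator a second, so the cyclomatic complexity is 2 + 1 = 3 (cc), there are exactly three execution paths (nc), and the method takes two parameters (nop).

    protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        if ($this->ignoreRobots) {   // decision 1; path A: robots are ignored, always index
            return true;
        }

        // decision 2 (the &&); path B: headers forbid indexing, right side skipped;
        // path C: headers allow it, so the robots meta tag decides
        return $robotsHeaders->mayIndex() && $robotsMeta->mayIndex();
    }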
<?php

namespace Spatie\Crawler;

use Generator;
use Tree\Node\Node;
use GuzzleHttp\Pool;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\Psr7\Request;
use Spatie\Robots\RobotsTxt;
use InvalidArgumentException;
use Spatie\Robots\RobotsMeta;
use GuzzleHttp\RequestOptions;
use Spatie\Robots\RobotsHeaders;
use Psr\Http\Message\UriInterface;
use Spatie\Browsershot\Browsershot;
use Psr\Http\Message\StreamInterface;
use Symfony\Component\DomCrawler\Link;
use Psr\Http\Message\ResponseInterface;
use Spatie\Crawler\CrawlQueue\CrawlQueue;
use GuzzleHttp\Exception\RequestException;
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;

class Crawler
{
    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserver[] */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int */
    protected $crawledUrlCount = 0;

    /** @var int|null */
    protected $maximumCrawlCount = null;

    /** @var int */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null */
    protected $maximumDepth = null;

    /** @var bool */
    protected $ignoreRobots = false;

    /** @var \Tree\Node\Node */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var Browsershot */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt */
    protected $robotsTxt = null;

    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * @param array $clientOptions
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : self::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }

    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CollectionCrawlQueue();
    }

    /**
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }

    /**
     * Responses larger than the specified value will be truncated to this size.
     *
     * @param int $maximumResponseSizeInBytes
     *
     * @return $this
     */
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes)
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }

    /**
     * @param int $maximumCrawlCount
     *
     * @return $this
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount)
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }

    /**
     * @param int $maximumDepth
     *
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth)
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }

    /**
     * @return $this
     */
    public function ignoreRobots()
    {
        $this->ignoreRobots = true;

        return $this;
    }

    /**
     * @return $this
     */
    public function respectRobots()
    {
        $this->ignoreRobots = false;

        return $this;
    }

    /**
     * @param CrawlQueue $crawlQueue
     *
     * @return $this
     */
    public function setCrawlQueue(CrawlQueue $crawlQueue)
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }

    /**
     * @return $this
     */
    public function executeJavaScript()
    {
        $this->executeJavaScript = true;

        return $this;
    }

    /**
     * @return $this
     */
    public function doNotExecuteJavaScript()
    {
        $this->executeJavaScript = false;

        return $this;
    }

    /**
     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers)
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }

    public function setCrawlObservers(array $crawlObservers)
    {
        $this->crawlObservers = $crawlObservers;

        return $this;
    }

    public function addCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObservers[] = $crawlObserver;

        return $this;
    }

    /**
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    /**
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        $this->addToCrawlQueue($crawlUrl);

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }

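    // The crawl loop below drains the queue in waves: each pass builds a Guzzle
    // Pool over the lazy generator returned by getCrawlRequests(), runs up to
    // $this->concurrency requests at a time, and routes every outcome through
    // the 'fulfilled' / 'rejected' callbacks before re-checking the queue.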
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => function (ResponseInterface $response, $index) {
                    $crawlUrl = $this->crawlQueue->getUrlById($index);

                    $body = $this->convertBodyToString($response->getBody(), $this->maximumResponseSize);

                    $robotsHeaders = RobotsHeaders::create($response->getHeaders());

                    $robotsMeta = RobotsMeta::create($body);

                    if (! $this->mayIndex($robotsHeaders, $robotsMeta)) {
                        return;
                    }

                    $this->handleCrawled($response, $crawlUrl);

                    if (! $this->crawlProfile instanceof CrawlSubdomains) {
                        if ($crawlUrl->url->getHost() !== $this->baseUrl->getHost()) {
                            return;
                        }
                    }

                    if (! $this->mayFollow($robotsHeaders, $robotsMeta)) {
                        return;
                    }

                    $this->addAllLinksToCrawlQueue(
                        $body,
                        $crawlUrl->url
                    );
                },
                'rejected' => function (RequestException $exception, $index) {
                    $this->handleCrawlFailed(
                        $exception,
                        $this->crawlQueue->getUrlById($index)
                    );
                },
            ]);

            $promise = $pool->promise();
            $promise->wait();
        }
    }

    public function endsWith($haystack, $needle)
    {
        // A plain strrpos() comparison can report a false positive when the
        // needle does not occur in the haystack at all, so compare the tail
        // of the haystack directly.
        return $needle === '' || substr($haystack, -strlen($needle)) === $needle;
    }

    protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string
    {
        $bodyStream->rewind();

        // Read at most $readMaximumBytes so oversized responses are truncated
        // instead of being buffered into memory in full.
        $body = $bodyStream->read($readMaximumBytes);

        return $body;
    }

    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

    /**
     * @param ResponseInterface $response
     * @param CrawlUrl          $crawlUrl
     */
    protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawled(
                $crawlUrl->url,
                $response,
                $crawlUrl->foundOnUrl
            );
        }
    }

    /**
     * @param RequestException $exception
     * @param CrawlUrl         $crawlUrl
     */
    protected function handleCrawlFailed(RequestException $exception, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawlFailed(
                $crawlUrl->url,
                $exception,
                $crawlUrl->foundOnUrl
            );
        }
    }

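    // getCrawlRequests() yields each request keyed by its CrawlUrl id; that id
    // is the $index the Pool hands back to the fulfilled/rejected callbacks
    // above, which recover the originating URL via getUrlById($index).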
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }

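    // Extracted links are filtered step by step below: keep only http(s)
    // schemes, strip fragments, consult the crawl profile, skip URLs already
    // queued, then enqueue the survivors subject to the robots, depth and
    // crawl-count checks inside the closure.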
    protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl)
    {
        $allLinks = $this->extractAllLinks($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (UriInterface $url) {
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (UriInterface $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function (UriInterface $url) {
                return $this->crawlQueue->has($url);
            })
            ->each(function (UriInterface $url) use ($foundOnUrl) {
                $node = $this->addToDepthTree($this->depthTree, $url, $foundOnUrl);

                if (strpos($url->getPath(), '/tel:') === 0) {
                    return;
                }

                if (! $this->shouldCrawl($node)) {
                    return;
                }

                if ($this->maximumCrawlCountReached()) {
                    return;
                }

                $crawlUrl = CrawlUrl::create($url, $foundOnUrl);

                $this->addToCrawlQueue($crawlUrl);
            });
    }

    protected function shouldCrawl(Node $node): bool
    {
        if (! $this->ignoreRobots && ! $this->robotsTxt->allows($node->getValue())) {
            return false;
        }

        if (is_null($this->maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $this->maximumDepth;
    }

    /**
     * @param string                         $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection
     */
    protected function extractAllLinks(string $html, UriInterface $foundOnUrl)
    {
        if ($this->executeJavaScript) {
            $html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
        }

        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXpath('//a')->links())
            ->reject(function (Link $link) {
                return $link->getNode()->getAttribute('rel') === 'nofollow';
            })
            ->map(function (Link $link) {
                try {
                    return new Uri($link->getUri());
                } catch (InvalidArgumentException $exception) {
                    return;
                }
            })
            ->filter();
    }

    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        return $url->withFragment('');
    }

    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        return in_array($uri->getScheme(), ['http', 'https']);
    }

    protected function addToDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl)
    {
        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($currentNode, $url, $parentUrl);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }

    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
    {
        $browsershot = $this->getBrowsershot();

        $html = $browsershot->setUrl((string) $foundOnUrl)->bodyHtml();

        return html_entity_decode($html);
    }

    protected function getBrowsershot(): Browsershot
    {
        if ($this->browsershot) {
            return $this->browsershot;
        }

        $this->browsershot = new Browsershot();

        return $this->browsershot;
    }

    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }

    protected function addToCrawlQueue(CrawlUrl $crawlUrl)
    {
        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    protected function maximumCrawlCountReached(): bool
    {
        if (is_null($this->maximumCrawlCount)) {
            return false;
        }

        return $this->crawledUrlCount >= $this->maximumCrawlCount;
    }

    protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        if ($this->ignoreRobots) {
            return true;
        }

        return $robotsHeaders->mayIndex() && $robotsMeta->mayIndex();
    }

    protected function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        if ($this->ignoreRobots) {
            return true;
        }

        return $robotsHeaders->mayFollow() && $robotsMeta->mayFollow();
    }
}
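For context, a minimal usage sketch of the API above. It assumes Spatie\Crawler\CrawlObserver is the observer base type whose hooks (willCrawl, crawled, crawlFailed, finishedCrawling) the listing invokes; the EchoObserver class, the exact hook signatures, and the target URL are illustrative rather than taken from this pull request.

    <?php

    use GuzzleHttp\Exception\RequestException;
    use Psr\Http\Message\ResponseInterface;
    use Psr\Http\Message\UriInterface;
    use Spatie\Crawler\Crawler;
    use Spatie\Crawler\CrawlObserver;

    // Hypothetical observer that just echoes what the crawler reports.
    class EchoObserver extends CrawlObserver
    {
        public function willCrawl(UriInterface $url)
        {
            echo "About to crawl: {$url}\n";
        }

        public function crawled(UriInterface $url, ResponseInterface $response, UriInterface $foundOnUrl = null)
        {
            echo "Crawled {$url} (status {$response->getStatusCode()})\n";
        }

        public function crawlFailed(UriInterface $url, RequestException $exception, UriInterface $foundOnUrl = null)
        {
            echo "Failed {$url}: {$exception->getMessage()}\n";
        }

        public function finishedCrawling()
        {
            echo "Finished crawling.\n";
        }
    }

    Crawler::create()
        ->setCrawlObserver(new EchoObserver())
        ->setConcurrency(5)
        ->setMaximumDepth(2)
        ->respectRobots()
        ->startCrawling('https://example.com');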