Pull Request — master (#145)
Created by Brent · 01:54 · Completed

Crawler::mayIndex()   Grade: A

Complexity
    Conditions: 3
    Paths: 3

Size
    Total Lines: 8
    Code Lines: 4

Duplication
    Duplicated Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric    Value
c         0
b         0
f         0
dl        0
loc       8
rs        9.4285
cc        3
eloc      4
nc        3
nop       2
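
For reference, the grade above describes the mayIndex() method reviewed near the end of this file. Assuming the analyzer counts the if guard plus each operand of the && as a condition, the three conditions and three paths line up with the method body like so:

    protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        if ($this->ignoreRobots) {
            return true;            // path 1: robots rules are ignored outright
        }

        // path 2: the headers forbid indexing and the && short-circuits to false
        // path 3: the headers allow indexing, so the meta tag decides
        return $robotsHeaders->mayIndex() && $robotsMeta->mayIndex();
    }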
<?php

namespace Spatie\Crawler;

use Generator;
use Tree\Node\Node;
use GuzzleHttp\Pool;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\Psr7\Request;
use Spatie\Robots\RobotsTxt;
use InvalidArgumentException;
use Spatie\Robots\RobotsMeta;
use GuzzleHttp\RequestOptions;
use Spatie\Robots\RobotsHeaders;
use Psr\Http\Message\UriInterface;
use Spatie\Browsershot\Browsershot;
use Psr\Http\Message\StreamInterface;
use Symfony\Component\DomCrawler\Link;
use Psr\Http\Message\ResponseInterface;
use Spatie\Crawler\CrawlQueue\CrawlQueue;
use GuzzleHttp\Exception\RequestException;
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;

class Crawler
{
    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserver[] */
    protected $crawlObservers = [];

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int */
    protected $crawledUrlCount = 0;

    /** @var int|null */
    protected $maximumCrawlCount = null;

    /** @var int */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null */
    protected $maximumDepth = null;

    /** @var bool */
    protected $ignoreRobots = false;

    /** @var \Tree\Node\Node */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var Browsershot */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt */
    protected $robotsTxt = null;

    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * @param array $clientOptions
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : self::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }

    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CollectionCrawlQueue();
    }

    /**
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }

    /**
     * Responses that are larger than the specified value will be ignored.
     *
     * @param int $maximumResponseSizeInBytes
     *
     * @return $this
     */
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes)
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }

    /**
     * @param int $maximumCrawlCount
     *
     * @return $this
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount)
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }

    /**
     * @param int $maximumDepth
     *
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth)
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }

    /**
     * @param bool $ignoreRobots
     *
     * @return $this
     */
    public function ignoreRobots(bool $ignoreRobots = true)
    {
        $this->ignoreRobots = $ignoreRobots;

        return $this;
    }

    /**
     * @param CrawlQueue $crawlQueue
     *
     * @return $this
     */
    public function setCrawlQueue(CrawlQueue $crawlQueue)
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }

    /**
     * @return $this
     */
    public function executeJavaScript()
    {
        $this->executeJavaScript = true;

        return $this;
    }

    /**
     * @return $this
     */
    public function doNotExecuteJavaScript()
    {
        $this->executeJavaScript = false;

        return $this;
    }

    /**
     * @param \Spatie\Crawler\CrawlObserver|array[\Spatie\Crawler\CrawlObserver] $crawlObservers
Documentation issue introduced by this line:
The doc-type \Spatie\Crawler\CrawlObserver|array[\Spatie\Crawler\CrawlObserver] could not be parsed: Expected "]" at position 4, but found "\Spatie\Crawler\CrawlObserver". This check marks PHPDoc comments that could not be parsed by the analyzer's parser.
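A doc-type the parser can handle expresses "one observer or an array of observers" with PHPDoc's postfix array notation rather than array[...]; a corrected line would read:

     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers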
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers)
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }

    public function setCrawlObservers(array $crawlObservers)
    {
        $this->crawlObservers = $crawlObservers;

        return $this;
    }

    public function addCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObservers[] = $crawlObserver;

        return $this;
    }

    /**
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    /**
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        $this->addToCrawlQueue($crawlUrl);

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }

    // Drains the crawl queue with a Guzzle request pool until no pending URLs remain.
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => function (ResponseInterface $response, $index) {
                    $crawlUrl = $this->crawlQueue->getUrlById($index);

                    $body = $this->convertBodyToString($response->getBody(), $this->maximumResponseSize);

                    $robotsHeaders = RobotsHeaders::create($response->getHeaders());

                    $robotsMeta = RobotsMeta::create($body);

                    if (! $this->mayIndex($robotsHeaders, $robotsMeta)) {
                        return;
                    }

                    $this->handleCrawled($response, $crawlUrl);

                    if (! $this->crawlProfile instanceof CrawlSubdomains) {
                        if ($crawlUrl->url->getHost() !== $this->baseUrl->getHost()) {
                            return;
                        }
                    }

                    if (! $this->mayFollow($robotsHeaders, $robotsMeta)) {
                        return;
                    }

                    $this->addAllLinksToCrawlQueue(
                        $body,
                        $crawlUrl->url
                    );
                },
                'rejected' => function (RequestException $exception, $index) {
                    $this->handleCrawlFailed(
                        $exception,
                        $this->crawlQueue->getUrlById($index),
                        $exception
Unused-code issue introduced by this line:
The call to Crawler::handleCrawlFailed() has too many arguments, starting with the duplicated $exception. This check compares calls to functions or methods with their respective definitions; if a call has more arguments than are defined, it raises an issue. If a function is defined several times with a different number of parameters, the check may pick up the wrong definition and report false positives. One codebase where this has been known to happen is WordPress. In that case you can add the @ignore PHPDoc annotation to the duplicate definition and it will be ignored.
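handleCrawlFailed() is defined further down with exactly two parameters, the exception and the failing CrawlUrl, so the fix suggested by this check is simply to drop the duplicated third argument:

                'rejected' => function (RequestException $exception, $index) {
                    $this->handleCrawlFailed(
                        $exception,
                        $this->crawlQueue->getUrlById($index)
                    );
                },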
                    );
                },
            ]);

            $promise = $pool->promise();
            $promise->wait();
        }
    }

    public function endsWith($haystack, $needle)
    {
        // True when $haystack ends with $needle.
        return substr($haystack, -strlen($needle)) === $needle;
    }

    protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string
    {
        $bodyStream->rewind();

        $body = $bodyStream->read($readMaximumBytes);

        return $body;
    }

    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

    /**
     * @param ResponseInterface $response
     * @param CrawlUrl          $crawlUrl
     */
    protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawled(
                $crawlUrl->url,
                $response,
                $crawlUrl->foundOnUrl
            );
        }
    }

    /**
     * @param RequestException $exception
     * @param CrawlUrl         $crawlUrl
     */
    protected function handleCrawlFailed(RequestException $exception, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawlFailed(
                $crawlUrl->url,
                $exception,
                $crawlUrl->foundOnUrl
            );
        }
    }

    // Lazily yields a GET request per pending URL, keyed by its queue id.
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }

    protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl)
    {
        $allLinks = $this->extractAllLinks($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (UriInterface $url) {
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (UriInterface $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function (UriInterface $url) {
                return $this->crawlQueue->has($url);
            })
            ->each(function (UriInterface $url) use ($foundOnUrl) {
                $node = $this->addToDepthTree($this->depthTree, $url, $foundOnUrl);

                if (strpos($url->getPath(), '/tel:') === 0) {
                    return;
                }

                if (! $this->shouldCrawl($node)) {
                    return;
                }

                if ($this->maximumCrawlCountReached()) {
                    return;
                }

                $crawlUrl = CrawlUrl::create($url, $foundOnUrl);

                $this->addToCrawlQueue($crawlUrl);
            });
    }

    protected function shouldCrawl(Node $node): bool
    {
        if (! $this->ignoreRobots && ! $this->robotsTxt->allows($node->getValue())) {
            return false;
        }

        if (is_null($this->maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $this->maximumDepth;
    }

    /**
     * @param string                         $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null
     */
    protected function extractAllLinks(string $html, UriInterface $foundOnUrl)
    {
        if ($this->executeJavaScript) {
            $html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
        }

        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXpath('//a')->links())
            ->reject(function (Link $link) {
                return $link->getNode()->getAttribute('rel') === 'nofollow';
            })
            ->map(function (Link $link) {
                try {
                    return new Uri($link->getUri());
                } catch (InvalidArgumentException $exception) {
                    return;
                }
            })
            ->filter();
    }

    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        return $url->withFragment('');
    }

    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        return in_array($uri->getScheme(), ['http', 'https']);
    }

    // Recursively finds the node holding $parentUrl and attaches $url beneath it.
    protected function addToDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl)
    {
        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($currentNode, $url, $parentUrl);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }

    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
    {
        $browsershot = $this->getBrowsershot();

        $html = $browsershot->setUrl((string) $foundOnUrl)->bodyHtml();

        return html_entity_decode($html);
    }

    protected function getBrowsershot(): Browsershot
    {
        if ($this->browsershot) {
            return $this->browsershot;
        }

        $this->browsershot = new Browsershot();

        return $this->browsershot;
    }

    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }

    protected function addToCrawlQueue(CrawlUrl $crawlUrl)
    {
        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    protected function maximumCrawlCountReached(): bool
    {
        if (is_null($this->maximumCrawlCount)) {
            return false;
        }

        return $this->crawledUrlCount >= $this->maximumCrawlCount;
    }

    protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        if ($this->ignoreRobots) {
            return true;
        }

        return $robotsHeaders->mayIndex() && $robotsMeta->mayIndex();
    }

    protected function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        if ($this->ignoreRobots) {
            return true;
        }

        return $robotsHeaders->mayFollow() && $robotsMeta->mayFollow();
    }
}
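
For context, a minimal usage sketch of the fluent API defined above; MyCrawlObserver and the target URL are hypothetical stand-ins:

    use Spatie\Crawler\Crawler;

    Crawler::create()
        ->setConcurrency(5)                          // crawl up to 5 URLs in parallel
        ->setMaximumDepth(3)                         // do not follow links deeper than 3 levels
        ->setMaximumResponseSize(1024 * 1024 * 3)    // skip response bodies larger than ~3 MB
        ->setCrawlObservers([new MyCrawlObserver()]) // hypothetical CrawlObserver implementation
        ->startCrawling('https://example.com');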