Completed
Pull Request — master (#145)
by Brent
10:17 queued 06:31
created

Crawler   Rating: F

Complexity

Total Complexity 69

Size/Duplication

Total Lines 546
Duplicated Lines 0 %

Coupling/Cohesion

Components 2
Dependencies 20

Importance

Changes 6
Bugs 2 Features 0
Metric                                 Value
wmc (weighted method count)            69
c (changes)                            6
b (bugs)                               2
f (features)                           0
lcom (lack of cohesion of methods)     2
cbo (coupling between objects)         20
dl (duplicated lines)                  0
loc (lines of code)                    546
rs                                     2.1568

35 Methods

Rating   Name   Duplication   Size   Complexity  
A create() 0 10 2
A setConcurrency() 0 6 1
A setMaximumResponseSize() 0 6 1
A setMaximumCrawlCount() 0 6 1
A setMaximumDepth() 0 6 1
A ignoreRobots() 0 6 1
A setCrawlQueue() 0 6 1
A executeJavaScript() 0 6 1
A doNotExecuteJavaScript() 0 6 1
A setCrawlObserver() 0 8 2
A setCrawlObservers() 0 6 1
A addCrawlObserver() 0 6 1
A setCrawlProfile() 0 6 1
B startCrawlingQueue() 0 49 6
A endsWith() 0 5 1
A convertBodyToString() 0 8 1
A createRobotsTxt() 0 4 1
A handleCrawled() 0 10 2
A handleCrawlFailed() 0 10 2
B getCrawlRequests() 0 21 5
B addAllLinksToCrawlQueue() 0 37 4
A shouldCrawl() 0 12 4
A extractAllLinks() 0 21 3
A normalizeUrl() 0 4 1
A hasCrawlableScheme() 0 4 1
B addtoDepthTree() 0 22 4
A getBodyAfterExecutingJavaScript() 0 8 1
A getBrowsershot() 0 10 2
A setBrowsershot() 0 6 1
A addToCrawlQueue() 0 8 1
A maximumCrawlCountReached() 0 8 2
A mayIndex() 0 8 3
A mayFollow() 0 8 3
B startCrawling() 0 30 5
A __construct() 0 10 1

How to fix: Complexity

Complex Class

Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes. You can also look at the cohesion graph to spot unconnected or weakly connected components.

Once you have determined which fields belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use Crawler and, based on those observations, apply Extract Interface as well.
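
As one illustration of such a split, the robots-related behaviour of Crawler (the $robotsTxt and $ignoreRobots fields together with mayIndex(), mayFollow() and the robots check inside shouldCrawl()) looks like a cohesive component. The sketch below shows what an Extract Class plus Extract Interface could produce; the RobotsPolicy and ObeysRobots names and the mayCrawl() method are hypothetical, assumed for illustration, and are not part of this pull request:

<?php

namespace Spatie\Crawler;

use Psr\Http\Message\UriInterface;
use Spatie\Robots\RobotsHeaders;
use Spatie\Robots\RobotsMeta;
use Spatie\Robots\RobotsTxt;

// Hypothetical interface extracted from the robots-related behaviour of Crawler.
interface RobotsPolicy
{
    public function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool;

    public function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool;

    public function mayCrawl(UriInterface $url): bool;
}

// Hypothetical extracted class: owns $robotsTxt and $ignoreRobots, which Crawler
// would then delegate to instead of carrying the fields itself.
class ObeysRobots implements RobotsPolicy
{
    /** @var \Spatie\Robots\RobotsTxt */
    protected $robotsTxt;

    /** @var bool */
    protected $ignoreRobots;

    public function __construct(RobotsTxt $robotsTxt, bool $ignoreRobots = false)
    {
        $this->robotsTxt = $robotsTxt;
        $this->ignoreRobots = $ignoreRobots;
    }

    public function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        // Same rule as the current Crawler::mayIndex(), relocated.
        return $this->ignoreRobots || ($robotsHeaders->mayIndex() && $robotsMeta->mayIndex());
    }

    public function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        // Same rule as the current Crawler::mayFollow(), relocated.
        return $this->ignoreRobots || ($robotsHeaders->mayFollow() && $robotsMeta->mayFollow());
    }

    public function mayCrawl(UriInterface $url): bool
    {
        // Covers the robots.txt check currently embedded in Crawler::shouldCrawl().
        return $this->ignoreRobots || $this->robotsTxt->allows((string) $url);
    }
}

Crawler would then hold a single RobotsPolicy collaborator instead of the $robotsTxt/$ignoreRobots pair and delegate to it from mayIndex(), mayFollow() and shouldCrawl(), which trims its field count and lets the robots rules be tested in isolation.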

1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use Tree\Node\Node;
7
use GuzzleHttp\Pool;
8
use GuzzleHttp\Client;
9
use GuzzleHttp\Psr7\Uri;
10
use Spatie\Robots\Robots;
11
use GuzzleHttp\Psr7\Request;
12
use Spatie\Robots\RobotsTxt;
13
use InvalidArgumentException;
14
use Spatie\Robots\RobotsMeta;
15
use GuzzleHttp\RequestOptions;
16
use Spatie\Robots\RobotsHeaders;
17
use Psr\Http\Message\UriInterface;
18
use Spatie\Browsershot\Browsershot;
19
use Psr\Http\Message\StreamInterface;
20
use Symfony\Component\DomCrawler\Link;
21
use Psr\Http\Message\ResponseInterface;
22
use Spatie\Crawler\CrawlQueue\CrawlQueue;
23
use GuzzleHttp\Exception\RequestException;
24
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
25
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
26
27
class Crawler
28
{
29
    /** @var \GuzzleHttp\Client */
30
    protected $client;
31
32
    /** @var \Psr\Http\Message\UriInterface */
33
    protected $baseUrl;
34
35
    /** @var array[\Spatie\Crawler\CrawlObserver] */
36
    protected $crawlObservers;
37
38
    /** @var \Spatie\Crawler\CrawlProfile */
39
    protected $crawlProfile;
40
41
    /** @var int */
42
    protected $concurrency;
43
44
    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
45
    protected $crawlQueue;
46
47
    /** @var int */
48
    protected $crawledUrlCount = 0;
49
50
    /** @var int|null */
51
    protected $maximumCrawlCount = null;
52
53
    /** @var int */
54
    protected $maximumResponseSize = 1024 * 1024 * 2;
55
56
    /** @var int|null */
57
    protected $maximumDepth = null;
58
59
    /** @var bool */
60
    protected $ignoreRobots = false;
61
62
    /** @var \Tree\Node\Node */
63
    protected $depthTree;
64
65
    /** @var bool */
66
    protected $executeJavaScript = false;
67
68
    /** @var Browsershot */
69
    protected $browsershot = null;
70
71
    /** @var \Spatie\Robots\RobotsTxt */
72
    private $robotsTxt = null;
73
74
    protected static $defaultClientOptions = [
75
        RequestOptions::COOKIES => true,
76
        RequestOptions::CONNECT_TIMEOUT => 10,
77
        RequestOptions::TIMEOUT => 10,
78
        RequestOptions::ALLOW_REDIRECTS => false,
79
    ];
80
81
    /**
82
     * @param array $clientOptions
83
     *
84
     * @return static
85
     */
86
    public static function create(array $clientOptions = [])
87
    {
88
        $clientOptions = (count($clientOptions))
89
            ? $clientOptions
90
            : self::$defaultClientOptions;
91
92
        $client = new Client($clientOptions);
93
94
        return new static($client);
95
    }
96
97
    public function __construct(Client $client, int $concurrency = 10)
98
    {
99
        $this->client = $client;
100
101
        $this->concurrency = $concurrency;
102
103
        $this->crawlProfile = new CrawlAllUrls();
104
105
        $this->crawlQueue = new CollectionCrawlQueue();
106
    }
107
108
    /**
109
     * @param int $concurrency
110
     *
111
     * @return $this
112
     */
113
    public function setConcurrency(int $concurrency)
114
    {
115
        $this->concurrency = $concurrency;
116
117
        return $this;
118
    }
119
120
    /**
121
     * Responses that are larger that then specified value will be ignored.
122
     *
123
     * @param int $maximumResponseSizeInBytes
124
     *
125
     * @return $this
126
     */
127
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes)
128
    {
129
        $this->maximumResponseSize = $maximumResponseSizeInBytes;
130
131
        return $this;
132
    }
133
134
    /**
135
     * @param int $maximumCrawlCount
136
     *
137
     * @return $this
138
     */
139
    public function setMaximumCrawlCount(int $maximumCrawlCount)
140
    {
141
        $this->maximumCrawlCount = $maximumCrawlCount;
142
143
        return $this;
144
    }
145
146
    /**
147
     * @param int $maximumDepth
148
     *
149
     * @return $this
150
     */
151
    public function setMaximumDepth(int $maximumDepth)
152
    {
153
        $this->maximumDepth = $maximumDepth;
154
155
        return $this;
156
    }
157
158
    /**
159
     * @param bool $ignoreRobots
160
     *
161
     * @return $this
162
     */
163
    public function ignoreRobots(bool $ignoreRobots = true)
164
    {
165
        $this->ignoreRobots = $ignoreRobots;
166
167
        return $this;
168
    }
169
170
    /**
171
     * @param CrawlQueue $crawlQueue
172
     *
173
     * @return $this
174
     */
175
    public function setCrawlQueue(CrawlQueue $crawlQueue)
176
    {
177
        $this->crawlQueue = $crawlQueue;
178
179
        return $this;
180
    }
181
182
    /**
183
     * @return $this
184
     */
185
    public function executeJavaScript()
186
    {
187
        $this->executeJavaScript = true;
188
189
        return $this;
190
    }
191
192
    /**
193
     * @return $this
194
     */
195
    public function doNotExecuteJavaScript()
196
    {
197
        $this->executeJavaScript = false;
198
199
        return $this;
200
    }
201
202
    /**
203
     * @param \Spatie\Crawler\CrawlObserver|array[\Spatie\Crawler\CrawlObserver] $crawlObservers
[Issue] Documentation: The doc-type \Spatie\Crawler\CrawlObs...\Crawler\CrawlObserver] could not be parsed: Expected "]" at position 4, but found "\Spatie\Crawler\CrawlObserver". This check marks PHPDoc comments that could not be parsed by the analyser; its documentation on supported doc-types lists the comment annotations it can parse.
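A doc-type written in the conventional PHPDoc array notation (Type[]) would presumably satisfy the parser; the line below is a suggested rewrite, not something present in the pull request:

     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers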
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers)
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }

    public function setCrawlObservers(array $crawlObservers)
    {
        $this->crawlObservers = $crawlObservers;

        return $this;
    }

    public function addCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObservers[] = $crawlObserver;

        return $this;
    }

    /**
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    /**
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        $this->addToCrawlQueue($crawlUrl);

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }

    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => function (ResponseInterface $response, $index) {
                    $crawlUrl = $this->crawlQueue->getUrlById($index);

                    $body = $this->convertBodyToString($response->getBody(), $this->maximumResponseSize);

                    $robotsHeaders = RobotsHeaders::create($response->getHeaders());

                    $robotsMeta = RobotsMeta::create($body);

                    if (! $this->mayIndex($robotsHeaders, $robotsMeta)) {
                        return;
                    }

                    $this->handleCrawled($response, $crawlUrl);

                    if (! $this->crawlProfile instanceof CrawlSubdomains) {
                        if ($crawlUrl->url->getHost() !== $this->baseUrl->getHost()) {
                            return;
                        }
                    }

                    if (! $this->mayFollow($robotsHeaders, $robotsMeta)) {
                        return;
                    }

                    $this->addAllLinksToCrawlQueue(
                        $body,
                        $crawlUrl->url
                    );
                },
                'rejected' => function (RequestException $exception, $index) {
                    $this->handleCrawlFailed(
                        $exception,
                        $this->crawlQueue->getUrlById($index),
                        $exception
[Issue] Unused code: The call to Crawler::handleCrawlFailed() has too many arguments, starting with $exception. This check compares calls to functions or methods with their definitions and raises an issue when a call passes more arguments than are defined. If a function is defined several times with different numbers of parameters, the check may pick up the wrong definition and report a false positive (WordPress is one codebase where this is known to happen); in that case, adding the @ignore PhpDoc annotation to the duplicate definition will suppress the report.
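Since Crawler::handleCrawlFailed() is defined with only two parameters (the exception and the crawl URL), dropping the duplicated third argument would presumably clear this issue; the call below is a suggested rewrite, not something present in the pull request:

                    $this->handleCrawlFailed($exception, $this->crawlQueue->getUrlById($index));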
                    );
                },
            ]);

            $promise = $pool->promise();
            $promise->wait();
        }
    }

    public function endsWith($haystack, $needle)
    {
        return strrpos($haystack, $needle) + strlen($needle) ===
            strlen($haystack);
    }

    protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string
    {
        $bodyStream->rewind();

        $body = $bodyStream->read($readMaximumBytes);

        return $body;
    }

    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

    /**
     * @param ResponseInterface|null $response
     * @param CrawlUrl               $crawlUrl
     */
    protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawled(
                $crawlUrl->url,
                $response,
                $crawlUrl->foundOnUrl
            );
        }
    }

    /**
     * @param RequestException $exception
     * @param CrawlUrl         $crawlUrl
     */
    protected function handleCrawlFailed(RequestException $exception, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawlFailed(
                $crawlUrl->url,
                $exception,
                $crawlUrl->foundOnUrl
            );
        }
    }

    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }

    protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl)
    {
        $allLinks = $this->extractAllLinks($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (UriInterface $url) {
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (UriInterface $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function (UriInterface $url) {
                return $this->crawlQueue->has($url);
            })
            ->each(function (UriInterface $url) use ($foundOnUrl) {
                $node = $this->addtoDepthTree($this->depthTree, $url, $foundOnUrl);

                if (strpos($url->getPath(), '/tel:') === 0) {
                    return;
                }

                if (! $this->shouldCrawl($node)) {
                    return;
                }

                if ($this->maximumCrawlCountReached()) {
                    return;
                }

                $crawlUrl = CrawlUrl::create($url, $foundOnUrl);

                $this->addToCrawlQueue($crawlUrl);
            });
    }

    protected function shouldCrawl(Node $node): bool
    {
        if (! $this->ignoreRobots && ! $this->robotsTxt->allows($node->getValue())) {
            return false;
        }

        if (is_null($this->maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $this->maximumDepth;
    }

    /**
     * @param string                         $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null
     */
    protected function extractAllLinks(string $html, UriInterface $foundOnUrl)
    {
        if ($this->executeJavaScript) {
            $html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
        }

        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXpath('//a')->links())
            ->reject(function (Link $link) {
                return $link->getNode()->getAttribute('rel') === 'nofollow';
            })
            ->map(function (Link $link) {
                try {
                    return new Uri($link->getUri());
                } catch (InvalidArgumentException $exception) {
                    return;
                }
            })
            ->filter();
    }

    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        return $url->withFragment('');
    }

    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        return in_array($uri->getScheme(), ['http', 'https']);
    }

    protected function addtoDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl)
    {
        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($currentNode, $url, $parentUrl);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }

    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
    {
        $browsershot = $this->getBrowsershot();

        $html = $browsershot->setUrl((string) $foundOnUrl)->bodyHtml();

        return html_entity_decode($html);
    }

    protected function getBrowsershot(): Browsershot
    {
        if ($this->browsershot) {
            return $this->browsershot;
        }

        $this->browsershot = new Browsershot();

        return $this->browsershot;
    }

    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }

    protected function addToCrawlQueue(CrawlUrl $crawlUrl)
    {
        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    protected function maximumCrawlCountReached(): bool
    {
        if (is_null($this->maximumCrawlCount)) {
            return false;
        }

        return $this->crawledUrlCount >= $this->maximumCrawlCount;
    }

    protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        if ($this->ignoreRobots) {
            return true;
        }

        return $robotsHeaders->mayIndex() && $robotsMeta->mayIndex();
    }

    protected function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        if ($this->ignoreRobots) {
            return true;
        }

        return $robotsHeaders->mayFollow() && $robotsMeta->mayFollow();
    }
}