Completed
Push — master ( 530533...08338b )
by Brent
11s
created

Crawler   F

Complexity

Total Complexity 65

Size/Duplication

Total Lines 469
Duplicated Lines 0 %

Coupling/Cohesion

Components 4
Dependencies 17

Importance

Changes 0
Metric Value
wmc 65
lcom 4
cbo 17
dl 0
loc 469
rs 3.2
c 0
b 0
f 0

42 Methods

Rating   Name   Duplication   Size   Complexity  
A create() 0 10 2
A __construct() 0 16 1
A setConcurrency() 0 6 1
A setMaximumResponseSize() 0 6 1
A getMaximumResponseSize() 0 4 1
A setMaximumCrawlCount() 0 6 1
A getMaximumCrawlCount() 0 4 1
A getCrawlerUrlCount() 0 4 1
A setMaximumDepth() 0 6 1
A getMaximumDepth() 0 4 1
A setDelayBetweenRequests() 0 6 1
A getDelayBetweenRequests() 0 4 1
A ignoreRobots() 0 6 1
A respectRobots() 0 6 1
A mustRespectRobots() 0 4 1
A getRobotsTxt() 0 4 1
A setCrawlQueue() 0 6 1
A getCrawlQueue() 0 4 1
A executeJavaScript() 0 6 1
A doNotExecuteJavaScript() 0 6 1
A mayExecuteJavascript() 0 4 1
A setCrawlObserver() 0 8 2
A setCrawlObservers() 0 6 1
A addCrawlObserver() 0 6 1
A getCrawlObservers() 0 4 1
A setCrawlProfile() 0 6 1
A getCrawlProfile() 0 4 1
A setCrawlFulfilledHandlerClass() 0 12 2
A setCrawlFailedHandlerClass() 0 12 2
A setBrowsershot() 0 6 1
A getUserAgent() 0 4 1
A getBrowsershot() 0 8 2
A getBaseUrl() 0 4 1
B startCrawling() 0 34 7
A addToDepthTree() 0 28 5
A startCrawlingQueue() 0 15 2
A endsWith() 0 5 1
A createRobotsTxt() 0 4 1
A getCrawlRequests() 0 21 5
A addToCrawlQueue() 0 16 3
A maximumCrawlCountReached() 0 10 2
A setUserAgent() 0 9 1

How to fix   Complexity   

Complex Class

Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes. You can also have a look at the cohesion graph to spot any un-connected, or weakly-connected components.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use Crawler, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use Tree\Node\Node;
7
use GuzzleHttp\Pool;
8
use GuzzleHttp\Client;
9
use GuzzleHttp\Psr7\Uri;
10
use GuzzleHttp\Psr7\Request;
11
use Spatie\Robots\RobotsTxt;
12
use GuzzleHttp\RequestOptions;
13
use Psr\Http\Message\UriInterface;
14
use Spatie\Browsershot\Browsershot;
15
use Spatie\Crawler\CrawlQueue\CrawlQueue;
16
use Spatie\Crawler\Handlers\CrawlRequestFailed;
17
use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
18
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
19
use Spatie\Crawler\Exception\InvalidCrawlRequestHandler;
20
21
class Crawler
{
    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface Normalized base url set by startCrawling(). */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserverCollection */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile Decides which urls may be crawled. */
    protected $crawlProfile;

    /** @var int Maximum number of concurrent requests per Guzzle pool. */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int Number of urls accepted into the crawl queue so far. */
    protected $crawledUrlCount = 0;

    /** @var int|null Null means no limit. */
    protected $maximumCrawlCount = null;

    /** @var int Maximum response size in bytes (default 2 MB). */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null Null means unlimited depth. */
    protected $maximumDepth = null;

    /** @var bool Whether robots.txt rules are honoured. */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node Tree used to track the link depth of discovered urls. */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var \Spatie\Browsershot\Browsershot|null Lazily created in getBrowsershot(). */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt|null Populated when crawling starts. */
    protected $robotsTxt = null;

    /** @var string Handler class instantiated for fulfilled requests. */
    protected $crawlRequestFulfilledClass;

    /** @var string Handler class instantiated for failed requests. */
    protected $crawlRequestFailedClass;

    /** @var int Delay between requests, stored in microseconds. */
    protected $delayBetweenRequests = 0;

    /** @var array Default Guzzle client options used by create(). */
    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * Create a crawler backed by a new Guzzle client.
     *
     * @param array $clientOptions Guzzle options; falls back to the defaults above when empty.
     */
    public static function create(array $clientOptions = []): Crawler
    {
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : static::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }

    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CollectionCrawlQueue();

        $this->crawlObservers = new CrawlObserverCollection();

        $this->crawlRequestFulfilledClass = CrawlRequestFulfilled::class;

        $this->crawlRequestFailedClass = CrawlRequestFailed::class;
    }

    /** Set how many requests may run concurrently. */
    public function setConcurrency(int $concurrency): Crawler
    {
        $this->concurrency = $concurrency;

        return $this;
    }

    public function setMaximumResponseSize(int $maximumResponseSizeInBytes): Crawler
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }

    public function getMaximumResponseSize(): ?int
    {
        return $this->maximumResponseSize;
    }

    public function setMaximumCrawlCount(int $maximumCrawlCount): Crawler
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }

    public function getMaximumCrawlCount(): ?int
    {
        return $this->maximumCrawlCount;
    }

    /** Number of urls that have been accepted into the crawl queue. */
    public function getCrawlerUrlCount(): int
    {
        return $this->crawledUrlCount;
    }

    public function setMaximumDepth(int $maximumDepth): Crawler
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }

    public function getMaximumDepth(): ?int
    {
        return $this->maximumDepth;
    }

    /**
     * @param int $delay The delay in milliseconds.
     *
     * @return Crawler
     */
    public function setDelayBetweenRequests(int $delay): Crawler
    {
        // Stored in microseconds (suitable for usleep()).
        $this->delayBetweenRequests = ($delay * 1000);

        return $this;
    }

    /**
     * @return int The delay in microseconds (the value passed to
     *             setDelayBetweenRequests() multiplied by 1000).
     */
    public function getDelayBetweenRequests(): int
    {
        return $this->delayBetweenRequests;
    }

    public function ignoreRobots(): Crawler
    {
        $this->respectRobots = false;

        return $this;
    }

    public function respectRobots(): Crawler
    {
        $this->respectRobots = true;

        return $this;
    }

    public function mustRespectRobots(): bool
    {
        return $this->respectRobots;
    }

    /**
     * NOTE(review): the return type is non-nullable, but $robotsTxt is only
     * populated by startCrawling(); calling this earlier raises a TypeError.
     */
    public function getRobotsTxt(): RobotsTxt
    {
        return $this->robotsTxt;
    }

    public function setCrawlQueue(CrawlQueue $crawlQueue): Crawler
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }

    public function getCrawlQueue(): CrawlQueue
    {
        return $this->crawlQueue;
    }

    public function executeJavaScript(): Crawler
    {
        $this->executeJavaScript = true;

        return $this;
    }

    public function doNotExecuteJavaScript(): Crawler
    {
        $this->executeJavaScript = false;

        return $this;
    }

    public function mayExecuteJavascript(): bool
    {
        return $this->executeJavaScript;
    }

    /**
     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers): Crawler
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }

    public function setCrawlObservers(array $crawlObservers): Crawler
    {
        // Replaces any previously registered observers.
        $this->crawlObservers = new CrawlObserverCollection($crawlObservers);

        return $this;
    }

    public function addCrawlObserver(CrawlObserver $crawlObserver): Crawler
    {
        $this->crawlObservers->addObserver($crawlObserver);

        return $this;
    }

    public function getCrawlObservers(): CrawlObserverCollection
    {
        return $this->crawlObservers;
    }

    public function setCrawlProfile(CrawlProfile $crawlProfile): Crawler
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    public function getCrawlProfile(): CrawlProfile
    {
        return $this->crawlProfile;
    }

    /**
     * Override the handler used for fulfilled requests.
     *
     * @throws \Spatie\Crawler\Exception\InvalidCrawlRequestHandler When the
     *         given class does not extend CrawlRequestFulfilled.
     */
    public function setCrawlFulfilledHandlerClass(string $crawlRequestFulfilledClass): Crawler
    {
        $baseClass = CrawlRequestFulfilled::class;

        if (! is_subclass_of($crawlRequestFulfilledClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFulfilledClass, $baseClass);
        }

        $this->crawlRequestFulfilledClass = $crawlRequestFulfilledClass;

        return $this;
    }

    /**
     * Override the handler used for failed requests.
     *
     * @throws \Spatie\Crawler\Exception\InvalidCrawlRequestHandler When the
     *         given class does not extend CrawlRequestFailed.
     */
    public function setCrawlFailedHandlerClass(string $crawlRequestFailedClass): Crawler
    {
        $baseClass = CrawlRequestFailed::class;

        if (! is_subclass_of($crawlRequestFailedClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFailedClass, $baseClass);
        }

        $this->crawlRequestFailedClass = $crawlRequestFailedClass;

        return $this;
    }

    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }

    /**
     * Set the User-Agent header sent with every request.
     *
     * The agent string is used verbatim: user agents are case-sensitive
     * product identifiers, so the previous strtolower() call corrupted them.
     */
    public function setUserAgent(string $userAgent): Crawler
    {
        $clientOptions = $this->client->getConfig();
        $clientOptions['headers']['User-Agent'] = $userAgent;

        // Guzzle config is immutable, so a new client must be built.
        $this->client = new Client($clientOptions);

        return $this;
    }

    /**
     * NOTE(review): errors when no 'User-Agent' header was configured on the
     * client — confirm callers always set one (or rely on Guzzle's default).
     */
    public function getUserAgent(): string
    {
        return $this->client->getConfig('headers')['User-Agent'];
    }

    /** Lazily create the Browsershot instance used for JavaScript execution. */
    public function getBrowsershot(): Browsershot
    {
        if (! $this->browsershot) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }

    public function getBaseUrl(): UriInterface
    {
        return $this->baseUrl;
    }

    /**
     * Crawl everything reachable from the given base url, then notify all
     * observers that crawling has finished.
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        // Normalize: default to http and a root path.
        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        if ($this->robotsTxt->allows((string) $crawlUrl->url, (string) $this->getUserAgent()) ||
            ! $this->respectRobots
        ) {
            $this->addToCrawlQueue($crawlUrl);
        }

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }

    /**
     * Insert $url into the depth tree as a child of $parentUrl.
     *
     * Returns the newly created node, or null when $parentUrl is not found in
     * the subtree rooted at $node. When no maximum depth is configured the
     * tree is not maintained and a detached node is returned instead.
     */
    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node
    {
        if (is_null($this->maximumDepth)) {
            return new Node((string) $url);
        }

        $node = $node ?? $this->depthTree;

        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        // Depth-first search for the parent url; stop at the first match.
        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }

    /**
     * Drain the crawl queue: each pool run may discover new urls, so keep
     * pooling until no pending urls remain.
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => new $this->crawlRequestFulfilledClass($this),
                'rejected' => new $this->crawlRequestFailedClass($this),
            ]);

            $promise = $pool->promise();

            $promise->wait();
        }
    }

    /**
     * @deprecated This function will be removed in the next major version
     */
    public function endsWith($haystack, $needle)
    {
        // The previous strrpos() implementation produced a false positive
        // when the needle was absent but the same length as the haystack:
        // strrpos() returns false, which was silently coerced to 0.
        return $needle === ''
            || substr($haystack, -strlen($needle)) === $needle;
    }

    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

    /**
     * Lazily yield a GET request per pending url, keyed by crawl-url id so
     * the pool handlers can map responses back to their CrawlUrl.
     */
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }

    /**
     * Queue a url unless the profile rejects it or it is already queued.
     * Accepted urls count towards the maximum crawl count.
     */
    public function addToCrawlQueue(CrawlUrl $crawlUrl): Crawler
    {
        if (! $this->getCrawlProfile()->shouldCrawl($crawlUrl->url)) {
            return $this;
        }

        if ($this->getCrawlQueue()->has($crawlUrl->url)) {
            return $this;
        }

        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    public function maximumCrawlCountReached(): bool
    {
        $maximumCrawlCount = $this->getMaximumCrawlCount();

        if (is_null($maximumCrawlCount)) {
            return false;
        }

        return $this->getCrawlerUrlCount() >= $maximumCrawlCount;
    }
}