Completed
Pull Request — master (#237)
by Benjamin
01:16
created

Crawler::getRetryProfile()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 4
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 0
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use Tree\Node\Node;
7
use GuzzleHttp\Pool;
8
use GuzzleHttp\Client;
9
use GuzzleHttp\Psr7\Uri;
10
use GuzzleHttp\Psr7\Request;
11
use Spatie\Robots\RobotsTxt;
12
use GuzzleHttp\RequestOptions;
13
use Psr\Http\Message\UriInterface;
14
use Spatie\Browsershot\Browsershot;
15
use Spatie\Crawler\CrawlQueue\CrawlQueue;
16
use Spatie\Crawler\Handlers\CrawlRequestFailed;
17
use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
18
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
19
use Spatie\Crawler\Exception\InvalidCrawlRequestHandler;
20
use Spatie\Crawler\RetryProfile\NoRetry;
21
22
class Crawler
{
    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserverCollection */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var \Spatie\Crawler\RetryProfile\RetryProfile */
    protected $retryProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int */
    protected $crawledUrlCount = 0;

    /** @var int|null */
    protected $maximumCrawlCount = null;

    /** @var int Maximum response size in bytes (defaults to 2 MB). */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null */
    protected $maximumDepth = null;

    /** @var bool */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var \Spatie\Browsershot\Browsershot|null Lazily created in getBrowsershot(). */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt|null Populated by startCrawling(). */
    protected $robotsTxt = null;

    /** @var string Class name of the fulfilled-request handler. */
    protected $crawlRequestFulfilledClass;

    /** @var string Class name of the failed-request handler. */
    protected $crawlRequestFailedClass;

    /** @var float Delay between requests, stored in microseconds. */
    protected $delayBetweenRequests = 0;

    /** @var array Default Guzzle client options used by create(). */
    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * Create a crawler with a fresh Guzzle client.
     *
     * Note: a non-empty $clientOptions array fully replaces (does not merge
     * with) the defaults in static::$defaultClientOptions.
     */
    public static function create(array $clientOptions = []): Crawler
    {
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : static::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }

    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        $this->crawlProfile = new CrawlAllUrls();

        $this->retryProfile = new NoRetry();

        $this->crawlQueue = new CollectionCrawlQueue();

        $this->crawlObservers = new CrawlObserverCollection();

        $this->crawlRequestFulfilledClass = CrawlRequestFulfilled::class;

        $this->crawlRequestFailedClass = CrawlRequestFailed::class;
    }

    public function setConcurrency(int $concurrency): Crawler
    {
        $this->concurrency = $concurrency;

        return $this;
    }

    public function setMaximumResponseSize(int $maximumResponseSizeInBytes): Crawler
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }

    public function getMaximumResponseSize(): ?int
    {
        return $this->maximumResponseSize;
    }

    public function setMaximumCrawlCount(int $maximumCrawlCount): Crawler
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }

    public function getMaximumCrawlCount(): ?int
    {
        return $this->maximumCrawlCount;
    }

    public function getCrawlerUrlCount(): int
    {
        return $this->crawledUrlCount;
    }

    public function setMaximumDepth(int $maximumDepth): Crawler
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }

    public function getMaximumDepth(): ?int
    {
        return $this->maximumDepth;
    }

    /**
     * @param int $delay Delay in milliseconds; stored internally in microseconds.
     */
    public function setDelayBetweenRequests(int $delay): Crawler
    {
        // Explicit cast: the property is documented as float, but int * int is int.
        $this->delayBetweenRequests = (float) ($delay * 1000);

        return $this;
    }

    /**
     * @return float Delay in microseconds.
     */
    public function getDelayBetweenRequests(): float
    {
        return $this->delayBetweenRequests;
    }

    public function ignoreRobots(): Crawler
    {
        $this->respectRobots = false;

        return $this;
    }

    public function respectRobots(): Crawler
    {
        $this->respectRobots = true;

        return $this;
    }

    public function mustRespectRobots(): bool
    {
        return $this->respectRobots;
    }

    /**
     * NOTE(review): $robotsTxt is only set by startCrawling(); calling this
     * earlier returns null and will trigger a TypeError on the declared
     * RobotsTxt return type — confirm intended usage order.
     */
    public function getRobotsTxt(): RobotsTxt
    {
        return $this->robotsTxt;
    }

    public function setCrawlQueue(CrawlQueue $crawlQueue): Crawler
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }

    public function getCrawlQueue(): CrawlQueue
    {
        return $this->crawlQueue;
    }

    public function executeJavaScript(): Crawler
    {
        $this->executeJavaScript = true;

        return $this;
    }

    public function doNotExecuteJavaScript(): Crawler
    {
        $this->executeJavaScript = false;

        return $this;
    }

    public function mayExecuteJavascript(): bool
    {
        return $this->executeJavaScript;
    }

    /**
     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers): Crawler
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }

    public function setCrawlObservers(array $crawlObservers): Crawler
    {
        $this->crawlObservers = new CrawlObserverCollection($crawlObservers);

        return $this;
    }

    public function addCrawlObserver(CrawlObserver $crawlObserver): Crawler
    {
        $this->crawlObservers->addObserver($crawlObserver);

        return $this;
    }

    public function getCrawlObservers(): CrawlObserverCollection
    {
        return $this->crawlObservers;
    }

    public function setCrawlProfile(CrawlProfile $crawlProfile): Crawler
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    public function getCrawlProfile(): CrawlProfile
    {
        return $this->crawlProfile;
    }

    public function setRetryProfile(RetryProfile $retryProfile): Crawler
    {
        $this->retryProfile = $retryProfile;

        return $this;
    }

    public function getRetryProfile(): RetryProfile
    {
        return $this->retryProfile;
    }

    /**
     * @throws \Spatie\Crawler\Exception\InvalidCrawlRequestHandler When the
     *         given class does not extend CrawlRequestFulfilled.
     */
    public function setCrawlFulfilledHandlerClass(string $crawlRequestFulfilledClass): Crawler
    {
        $baseClass = CrawlRequestFulfilled::class;

        if (! is_subclass_of($crawlRequestFulfilledClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFulfilledClass, $baseClass);
        }

        $this->crawlRequestFulfilledClass = $crawlRequestFulfilledClass;

        return $this;
    }

    /**
     * @throws \Spatie\Crawler\Exception\InvalidCrawlRequestHandler When the
     *         given class does not extend CrawlRequestFailed.
     */
    public function setCrawlFailedHandlerClass(string $crawlRequestFailedClass): Crawler
    {
        $baseClass = CrawlRequestFailed::class;

        if (! is_subclass_of($crawlRequestFailedClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFailedClass, $baseClass);
        }

        $this->crawlRequestFailedClass = $crawlRequestFailedClass;

        return $this;
    }

    /**
     * @return $this
     */
    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }

    public function getBrowsershot(): Browsershot
    {
        // Lazily create a default instance when none has been injected.
        if (! $this->browsershot) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }

    public function getBaseUrl(): UriInterface
    {
        return $this->baseUrl;
    }

    /**
     * Crawl starting from the given URL, blocking until the queue is drained.
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        // Normalize scheme-less and path-less URLs before crawling.
        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        // Seed the queue unless robots.txt disallows the base URL (and we respect it).
        if ($this->robotsTxt->allows((string) $crawlUrl->url) ||
            ! $this->respectRobots
        ) {
            $this->addToCrawlQueue($crawlUrl);
        }

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }

    /**
     * Record $url as a child of $parentUrl in the depth tree.
     *
     * Returns the new node, or null when $parentUrl is not found in the
     * subtree rooted at $node. When no maximum depth is configured the tree
     * is not maintained and a detached node is returned.
     */
    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node
    {
        if (is_null($this->maximumDepth)) {
            return new Node((string) $url);
        }

        $node = $node ?? $this->depthTree;

        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }

    /**
     * Drain the crawl queue with a concurrent Guzzle request pool.
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => new $this->crawlRequestFulfilledClass($this),
                'rejected' => new $this->crawlRequestFailedClass($this),
            ]);

            $promise = $pool->promise();

            $promise->wait();
        }
    }

    /**
     * @deprecated This function will be removed in the next major version
     */
    public function endsWith($haystack, $needle)
    {
        // Fixed: the old strrpos-based check coerced a not-found `false` to 0,
        // reporting a false positive whenever strlen($needle) === strlen($haystack).
        return $needle === ''
            || substr($haystack, -strlen($needle)) === $needle;
    }

    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

    /**
     * Yield GET requests for pending URLs, keyed by crawl-URL id.
     *
     * URLs rejected by the crawl profile or already processed are skipped;
     * each yielded URL is marked processed and has its attempt count bumped.
     */
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $crawlUrl->incrementAttempts();
            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }

    /**
     * Queue a URL unless the profile rejects it or it is already queued.
     * Increments the crawled-URL counter for every URL accepted.
     */
    public function addToCrawlQueue(CrawlUrl $crawlUrl): Crawler
    {
        if (! $this->getCrawlProfile()->shouldCrawl($crawlUrl->url)) {
            return $this;
        }

        if ($this->getCrawlQueue()->has($crawlUrl->url)) {
            return $this;
        }

        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    public function maximumCrawlCountReached(): bool
    {
        $maximumCrawlCount = $this->getMaximumCrawlCount();

        if (is_null($maximumCrawlCount)) {
            return false;
        }

        return $this->getCrawlerUrlCount() >= $maximumCrawlCount;
    }
}
486