Completed
Pull Request — master (#210)
by
unknown
04:11
created

Crawler::addToDepthArray()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 16

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 16
rs 9.7333
c 0
b 0
f 0
cc 3
nc 3
nop 2
1
<?php

namespace Spatie\Crawler;

use Generator;
use GuzzleHttp\Client;
use GuzzleHttp\Pool;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\RequestOptions;
use Psr\Http\Message\UriInterface;
use Spatie\Browsershot\Browsershot;
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
use Spatie\Crawler\CrawlQueue\CrawlQueue;
use Spatie\Crawler\Exception\InvalidCrawlRequestHandler;
use Spatie\Crawler\Handlers\CrawlRequestFailed;
use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
use Spatie\Robots\RobotsTxt;
class Crawler
{
    /** @var \GuzzleHttp\Client HTTP client used for all crawl requests. */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface Normalized root URL the crawl started from. */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserverCollection Observers notified of crawl lifecycle events. */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile Decides which discovered URLs should be crawled. */
    protected $crawlProfile;

    /** @var int Maximum number of requests Guzzle runs in parallel. */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue Queue of pending/processed crawl URLs. */
    protected $crawlQueue;

    /** @var int Number of URLs added to the queue so far (used for the crawl-count limit). */
    protected $crawledUrlCount = 0;

    /** @var int|null Stop crawling after this many URLs; null means unlimited. */
    protected $maximumCrawlCount = null;

    /** @var int Maximum response body size in bytes (default 2 MB). */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null Maximum link depth from the base URL; null means unlimited. */
    protected $maximumDepth = null;

    /** @var bool Whether robots.txt / robots meta directives are honored. */
    protected $respectRobots = true;

    /** @var array Map of URL string => depth relative to the base URL. */
    protected $depthArray;

    /** @var bool Whether pages are rendered with a headless browser before parsing. */
    protected $executeJavaScript = false;

    /** @var Browsershot|null Lazily created headless-browser wrapper. */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt|null Parsed robots.txt of the base host; set in startCrawling(). */
    protected $robotsTxt = null;

    /** @var string Class name of the fulfilled-response handler (must extend CrawlRequestFulfilled). */
    protected $crawlRequestFulfilledClass;

    /** @var string Class name of the failed-response handler (must extend CrawlRequestFailed). */
    protected $crawlRequestFailedClass;

    /** @var float Delay between requests in microseconds. */
    protected $delayBetweenRequests = 0;

    /** @var array Guzzle options used when create() is called without custom options. */
    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * Create a crawler with a fresh Guzzle client.
     *
     * Note: a non-empty $clientOptions array fully REPLACES the defaults,
     * it is not merged with them.
     *
     * @param array $clientOptions Guzzle request options; empty array uses the defaults.
     */
    public static function create(array $clientOptions = []): Crawler
    {
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : static::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }

    /**
     * @param \GuzzleHttp\Client $client HTTP client used for all requests.
     * @param int $concurrency Number of parallel requests (default 10).
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        // Sensible defaults; all of these can be overridden via the fluent setters.
        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CollectionCrawlQueue();

        $this->crawlObservers = new CrawlObserverCollection();

        $this->crawlRequestFulfilledClass = CrawlRequestFulfilled::class;

        $this->crawlRequestFailedClass = CrawlRequestFailed::class;
    }

    /** Set the number of requests executed in parallel. */
    public function setConcurrency(int $concurrency): Crawler
    {
        $this->concurrency = $concurrency;

        return $this;
    }

    /** Set the maximum response body size in bytes. */
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes): Crawler
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }

    public function getMaximumResponseSize(): ?int
    {
        return $this->maximumResponseSize;
    }

    /** Limit the total number of URLs that will be crawled. */
    public function setMaximumCrawlCount(int $maximumCrawlCount): Crawler
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }

    public function getMaximumCrawlCount(): ?int
    {
        return $this->maximumCrawlCount;
    }

    /** Number of URLs added to the crawl queue so far. */
    public function getCrawlerUrlCount(): int
    {
        return $this->crawledUrlCount;
    }

    /** Limit how many links deep (from the base URL) the crawler follows. */
    public function setMaximumDepth(int $maximumDepth): Crawler
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }

    public function getMaximumDepth(): ?int
    {
        return $this->maximumDepth;
    }

    /**
     * Set the pause between two consecutive requests.
     *
     * @param int $delay Delay in milliseconds; stored internally in microseconds.
     */
    public function setDelayBetweenRequests(int $delay): Crawler
    {
        // Explicit cast: the property is declared float, while $delay * 1000 is int.
        $this->delayBetweenRequests = (float) ($delay * 1000);

        return $this;
    }

    /** @return float Delay between requests in microseconds. */
    public function getDelayBetweenRequests(): float
    {
        return $this->delayBetweenRequests;
    }

    /** Crawl pages even when robots.txt disallows them. */
    public function ignoreRobots(): Crawler
    {
        $this->respectRobots = false;

        return $this;
    }

    /** Honor robots.txt directives (the default). */
    public function respectRobots(): Crawler
    {
        $this->respectRobots = true;

        return $this;
    }

    public function mustRespectRobots(): bool
    {
        return $this->respectRobots;
    }

    /**
     * Parsed robots.txt of the base host.
     *
     * NOTE: only populated once startCrawling() has run; calling this earlier
     * returns null and triggers a TypeError because of the return type.
     */
    public function getRobotsTxt(): RobotsTxt
    {
        return $this->robotsTxt;
    }

    /** Swap in a custom queue implementation (e.g. a persistent queue). */
    public function setCrawlQueue(CrawlQueue $crawlQueue): Crawler
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }

    public function getCrawlQueue(): CrawlQueue
    {
        return $this->crawlQueue;
    }

    /** Render pages with a headless browser (Browsershot) before parsing links. */
    public function executeJavaScript(): Crawler
    {
        $this->executeJavaScript = true;

        return $this;
    }

    /** Parse raw HTML responses without JavaScript rendering (the default). */
    public function doNotExecuteJavaScript(): Crawler
    {
        $this->executeJavaScript = false;

        return $this;
    }

    public function mayExecuteJavascript(): bool
    {
        return $this->executeJavaScript;
    }

    /**
     * Register one observer, or several at once.
     *
     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers): Crawler
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }

    /**
     * Replace all registered observers.
     *
     * @param \Spatie\Crawler\CrawlObserver[] $crawlObservers
     */
    public function setCrawlObservers(array $crawlObservers): Crawler
    {
        $this->crawlObservers = new CrawlObserverCollection($crawlObservers);

        return $this;
    }

    /** Add a single observer to the existing collection. */
    public function addCrawlObserver(CrawlObserver $crawlObserver): Crawler
    {
        $this->crawlObservers->addObserver($crawlObserver);

        return $this;
    }

    public function getCrawlObservers(): CrawlObserverCollection
    {
        return $this->crawlObservers;
    }

    /** Set the profile that decides which URLs get crawled. */
    public function setCrawlProfile(CrawlProfile $crawlProfile): Crawler
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    public function getCrawlProfile(): CrawlProfile
    {
        return $this->crawlProfile;
    }

    /**
     * Use a custom handler for fulfilled responses.
     *
     * @throws \Spatie\Crawler\Exception\InvalidCrawlRequestHandler When the class does not extend CrawlRequestFulfilled.
     */
    public function setCrawlFulfilledHandlerClass(string $crawlRequestFulfilledClass): Crawler
    {
        $baseClass = CrawlRequestFulfilled::class;

        if (! is_subclass_of($crawlRequestFulfilledClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFulfilledClass, $baseClass);
        }

        $this->crawlRequestFulfilledClass = $crawlRequestFulfilledClass;

        return $this;
    }

    /**
     * Use a custom handler for failed responses.
     *
     * @throws \Spatie\Crawler\Exception\InvalidCrawlRequestHandler When the class does not extend CrawlRequestFailed.
     */
    public function setCrawlFailedHandlerClass(string $crawlRequestFailedClass): Crawler
    {
        $baseClass = CrawlRequestFailed::class;

        if (! is_subclass_of($crawlRequestFailedClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFailedClass, $baseClass);
        }

        $this->crawlRequestFailedClass = $crawlRequestFailedClass;

        return $this;
    }

    /**
     * Inject a pre-configured Browsershot instance for JavaScript rendering.
     *
     * @return $this
     */
    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }

    /** Lazily create a default Browsershot instance when none was injected. */
    public function getBrowsershot(): Browsershot
    {
        if (! $this->browsershot) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }

    public function getBaseUrl(): UriInterface
    {
        return $this->baseUrl;
    }

    /**
     * Crawl starting at the given URL and block until the queue is exhausted.
     *
     * The URL is normalized (http scheme, "/" path) before crawling starts;
     * robots.txt is fetched once and consulted for the base URL unless
     * ignoreRobots() was called. Observers get finishedCrawling() at the end.
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        if ($this->robotsTxt->allows((string) $crawlUrl->url) ||
            ! $this->respectRobots
        ) {
            $this->addToCrawlQueue($crawlUrl);
        }

        // The base URL is depth 0; children computed in addToDepthArray().
        $this->depthArray = [(string) $this->baseUrl => 0];

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }

    /**
     * Record (and return) the depth of $url as one level below $parentUrl.
     *
     * Depth tracking is skipped entirely (always 0) when no maximum depth is
     * configured. A URL keeps its first recorded depth even if rediscovered
     * later via a shorter path.
     *
     * @return int Depth of $url relative to the base URL.
     */
    public function addToDepthArray(UriInterface $url, UriInterface $parentUrl): int
    {
        if (is_null($this->maximumDepth)) {
            return 0;
        }

        if (isset($this->depthArray[(string) $url])) {
            return $this->depthArray[(string) $url];
        }

        // Fall back to 0 for an unknown parent instead of raising a notice
        // and computing a depth from null.
        $depth = ($this->depthArray[(string) $parentUrl] ?? 0) + 1;

        $this->depthArray[(string) $url] = $depth;

        return $depth;
    }

    /**
     * Drain the crawl queue: repeatedly run a Guzzle pool over the pending
     * requests until no pending URLs remain (handlers may enqueue new URLs).
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => new $this->crawlRequestFulfilledClass($this),
                'rejected' => new $this->crawlRequestFailedClass($this),
            ]);

            $promise = $pool->promise();

            $promise->wait();
        }
    }

    /**
     * @deprecated This function will be removed in the next major version
     *
     * NOTE(review): when $needle does not occur in $haystack, strrpos()
     * returns false (treated as 0), so this can report a false positive.
     * Kept as-is because the method is deprecated.
     */
    public function endsWith($haystack, $needle)
    {
        return strrpos($haystack, $needle) + strlen($needle) ===
            strlen($haystack);
    }

    /** Fetch and parse /robots.txt on the host of the given URI. */
    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

    /**
     * Lazily yield a GET request per pending URL, keyed by its queue id.
     *
     * URLs rejected by the crawl profile or already processed are marked
     * processed / skipped; observers are told via willCrawl() just before a
     * request is yielded.
     */
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }

    /**
     * Queue a URL for crawling unless the profile rejects it or it is already
     * queued. Increments the crawled-URL counter used by the crawl-count cap.
     */
    public function addToCrawlQueue(CrawlUrl $crawlUrl): Crawler
    {
        if (! $this->getCrawlProfile()->shouldCrawl($crawlUrl->url)) {
            return $this;
        }

        if ($this->getCrawlQueue()->has($crawlUrl->url)) {
            return $this;
        }

        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    /** Whether the configured maximum crawl count (if any) has been hit. */
    public function maximumCrawlCountReached(): bool
    {
        $maximumCrawlCount = $this->getMaximumCrawlCount();

        if (is_null($maximumCrawlCount)) {
            return false;
        }

        return $this->getCrawlerUrlCount() >= $maximumCrawlCount;
    }
}
454