Completed
Pull Request — master (#192)
by
unknown
01:29
created

Crawler::getCrawlRequests()   B

Complexity

Conditions 7
Paths 6

Size

Total Lines 30

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 30
rs 8.5066
c 0
b 0
f 0
cc 7
nc 6
nop 0
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use Tree\Node\Node;
7
use GuzzleHttp\Pool;
8
use GuzzleHttp\Client;
9
use GuzzleHttp\Psr7\Uri;
10
use GuzzleHttp\Psr7\Request;
11
use Spatie\Robots\RobotsTxt;
12
use GuzzleHttp\RequestOptions;
13
use Psr\Http\Message\UriInterface;
14
use Spatie\Browsershot\Browsershot;
15
use Spatie\Crawler\CrawlQueue\CrawlQueue;
16
use Spatie\Crawler\Handlers\CrawlRequestFailed;
17
use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
18
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
19
use Spatie\Crawler\Exception\InvalidCrawlRequestHandler;
20
21
class Crawler
22
{
23
    /** @var \GuzzleHttp\Client HTTP client handed to the request pool. */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface URL the crawl starts from; assigned in startCrawling(). */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserverCollection Observers notified about crawl events. */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile Decides whether a given URL should be crawled. */
    protected $crawlProfile;

    /** @var int Value passed to the request pool as its 'concurrency' option. */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue Queue of pending and processed URLs. */
    protected $crawlQueue;

    /** @var int Incremented every time addToCrawlQueue() adds a URL to the queue. */
    protected $crawledUrlCount = 0;

    /** @var int|null Upper bound compared against $crawledUrlCount; null means no bound. */
    protected $maximumCrawlCount = null;

    /** @var int Maximum response size in bytes (default: 2 * 1024 * 1024). */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null Maximum crawl depth; null disables depth tracking in addToDepthTree(). */
    protected $maximumDepth = null;

    /** @var int|null Maximum number of requests yielded per pool; null means no limit. */
    protected $poolItemLimit = null;

    /** @var bool Whether robots.txt rules are honoured when queueing the base URL. */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node Root of the URL depth tree; created in startCrawling(). */
    protected $depthTree;

    /** @var bool Flag reported by mayExecuteJavascript(). */
    protected $executeJavaScript = false;

    /** @var \Spatie\Browsershot\Browsershot|null Lazily created by getBrowsershot(). */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt|null Created in startCrawling(). */
    protected $robotsTxt = null;

    /** @var string Class name instantiated as the pool's 'fulfilled' handler. */
    protected $crawlRequestFulfilledClass;

    /** @var string Class name instantiated as the pool's 'rejected' handler. */
    protected $crawlRequestFailedClass;

    /** @var float Delay between requests as stored by setDelayBetweenRequests(). */
    protected $delayBetweenRequests = 0;

    /** @var array Guzzle client options used by create() when none are supplied. */
    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * Proxy settings written by setProxies() and read by getProxyConfig(),
     * which expects the keys 'ips', 'username', 'password' and 'port'.
     *
     * Declared under the name those methods actually use, so they no longer
     * create a dynamic property at runtime.
     *
     * @var array|null
     */
    protected $proxyConfig = null;

    /** @var bool True once setProxies() has been called. */
    protected $usingProxies = false;
93
94
    /**
     * Build a crawler around a freshly constructed Guzzle client.
     *
     * @param array $clientOptions Guzzle client options; when empty, the
     *                             options in $defaultClientOptions are used.
     *
     * @return static
     */
    public static function create(array $clientOptions = []): Crawler
    {
        if (count($clientOptions) === 0) {
            $clientOptions = static::$defaultClientOptions;
        }

        return new static(new Client($clientOptions));
    }
104
105
    /**
     * Wire up the crawler with its default collaborators.
     *
     * @param \GuzzleHttp\Client $client      Client used for all requests.
     * @param int                $concurrency Concurrency passed to the request pool.
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;
        $this->concurrency = $concurrency;

        // Default handler classes; replaceable through the dedicated setters.
        $this->crawlRequestFulfilledClass = CrawlRequestFulfilled::class;
        $this->crawlRequestFailedClass = CrawlRequestFailed::class;

        // Default collaborators: crawl every URL, in-memory queue, no observers.
        $this->crawlProfile = new CrawlAllUrls();
        $this->crawlQueue = new CollectionCrawlQueue();
        $this->crawlObservers = new CrawlObserverCollection();
    }
121
122
    /**
     * Set the concurrency passed to the request pool.
     *
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency): Crawler
    {
        $this->concurrency = $concurrency;

        return $this;
    }
128
129
    /**
     * Set the maximum response size.
     *
     * @param int $maximumResponseSizeInBytes Limit in bytes.
     *
     * @return $this
     */
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes): Crawler
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }
135
136
    /**
     * Get the configured maximum response size.
     *
     * @return int|null
     */
    public function getMaximumResponseSize(): ?int
    {
        return $this->maximumResponseSize;
    }
140
141
    /**
     * Set the bound checked by maximumCrawlCountReached().
     *
     * @param int $maximumCrawlCount
     *
     * @return $this
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount): Crawler
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }
147
148
    /**
     * Get the configured maximum crawl count.
     *
     * @return int|null Null when no maximum has been set.
     */
    public function getMaximumCrawlCount(): ?int
    {
        return $this->maximumCrawlCount;
    }
152
153
    /**
     * Get the number of URLs that addToCrawlQueue() has added to the queue.
     *
     * @return int
     */
    public function getCrawlerUrlCount(): int
    {
        return $this->crawledUrlCount;
    }
157
158
    /**
     * Set the maximum crawl depth.
     *
     * @param int $maximumDepth
     *
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth): Crawler
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }
164
165
    /**
     * Get the maximum number of requests yielded per pool.
     *
     * @return int|null Null when no limit has been set.
     */
    public function getPoolItemLimit(): ?int
    {
        return $this->poolItemLimit;
    }
169
170
    /**
     * Set the maximum number of requests yielded per pool.
     *
     * @param int $poolItemLimit
     *
     * @return $this
     */
    public function setPoolItemLimit(int $poolItemLimit): Crawler
    {
        $this->poolItemLimit = $poolItemLimit;

        return $this;
    }
176
177
    /**
     * Get the configured maximum crawl depth.
     *
     * @return int|null Null when no maximum depth has been set.
     */
    public function getMaximumDepth(): ?int
    {
        return $this->maximumDepth;
    }
181
182
    /**
     * Set the pause between requests.
     *
     * The value is stored multiplied by 1000.
     * NOTE(review): this looks like a milliseconds-to-microseconds conversion;
     * confirm against the code that consumes getDelayBetweenRequests().
     *
     * @param int $delay
     *
     * @return $this
     */
    public function setDelayBetweenRequests(int $delay): Crawler
    {
        // Cast explicitly so the stored value matches the float type documented
        // on the property and returned by getDelayBetweenRequests().
        $this->delayBetweenRequests = (float) ($delay * 1000);

        return $this;
    }
188
189
    /**
     * Get the delay between requests as stored by setDelayBetweenRequests()
     * (i.e. the value passed to the setter multiplied by 1000).
     *
     * @return float
     */
    public function getDelayBetweenRequests(): float
    {
        return $this->delayBetweenRequests;
    }
193
194
    /**
     * Switch off the flag reported by mustRespectRobots().
     *
     * @return $this
     */
    public function ignoreRobots(): Crawler
    {
        $this->respectRobots = false;

        return $this;
    }
200
201
    /**
     * Switch on the flag reported by mustRespectRobots() (the default).
     *
     * @return $this
     */
    public function respectRobots(): Crawler
    {
        $this->respectRobots = true;

        return $this;
    }
207
208
    /**
     * Tell whether robots.txt rules are to be honoured.
     *
     * @return bool
     */
    public function mustRespectRobots(): bool
    {
        return $this->respectRobots;
    }
212
213
    /**
     * Get the robots.txt wrapper created by startCrawling().
     *
     * The property is null until startCrawling() has run, which the declared
     * return type does not allow, so only call this during or after a crawl.
     *
     * @return \Spatie\Robots\RobotsTxt
     */
    public function getRobotsTxt(): RobotsTxt
    {
        return $this->robotsTxt;
    }
217
218
    /**
     * Replace the queue that holds pending and processed URLs.
     *
     * @param \Spatie\Crawler\CrawlQueue\CrawlQueue $crawlQueue
     *
     * @return $this
     */
    public function setCrawlQueue(CrawlQueue $crawlQueue): Crawler
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }
224
225
    /**
     * Get the queue that holds pending and processed URLs.
     *
     * @return \Spatie\Crawler\CrawlQueue\CrawlQueue
     */
    public function getCrawlQueue(): CrawlQueue
    {
        return $this->crawlQueue;
    }
229
230
    /**
     * Switch on the flag reported by mayExecuteJavascript().
     *
     * @return $this
     */
    public function executeJavaScript(): Crawler
    {
        $this->executeJavaScript = true;

        return $this;
    }
236
237
    /**
     * Switch off the flag reported by mayExecuteJavascript() (the default).
     *
     * @return $this
     */
    public function doNotExecuteJavaScript(): Crawler
    {
        $this->executeJavaScript = false;

        return $this;
    }
243
244
    /**
     * Tell whether JavaScript execution has been enabled.
     *
     * @return bool
     */
    public function mayExecuteJavascript(): bool
    {
        return $this->executeJavaScript;
    }
248
249
    /**
     * Replace all observers with the given observer or list of observers.
     *
     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers): Crawler
    {
        // Accept a single observer by wrapping it in a one-element list.
        $observers = is_array($crawlObservers) ? $crawlObservers : [$crawlObservers];

        return $this->setCrawlObservers($observers);
    }
262
263
    /**
     * Replace the observer collection with one built from the given array.
     *
     * @param array $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObservers(array $crawlObservers): Crawler
    {
        $this->crawlObservers = new CrawlObserverCollection($crawlObservers);

        return $this;
    }
269
270
    /**
     * Add one observer to the existing collection.
     *
     * @param \Spatie\Crawler\CrawlObserver $crawlObserver
     *
     * @return $this
     */
    public function addCrawlObserver(CrawlObserver $crawlObserver): Crawler
    {
        $this->crawlObservers->addObserver($crawlObserver);

        return $this;
    }
276
277
    /**
     * Get the collection of registered observers.
     *
     * @return \Spatie\Crawler\CrawlObserverCollection
     */
    public function getCrawlObservers(): CrawlObserverCollection
    {
        return $this->crawlObservers;
    }
281
282
    /**
     * Replace the profile that decides which URLs are crawled.
     *
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile): Crawler
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }
288
289
    /**
     * Get the profile that decides which URLs are crawled.
     *
     * @return \Spatie\Crawler\CrawlProfile
     */
    public function getCrawlProfile(): CrawlProfile
    {
        return $this->crawlProfile;
    }
293
294
    /**
     * Set the class instantiated as the pool's 'fulfilled' handler.
     *
     * @param string $crawlRequestFulfilledClass Must be a subclass of CrawlRequestFulfilled.
     *
     * @return $this
     *
     * @throws \Spatie\Crawler\Exception\InvalidCrawlRequestHandler When the class is not such a subclass.
     */
    public function setCrawlFulfilledHandlerClass(string $crawlRequestFulfilledClass): Crawler
    {
        $requiredParent = CrawlRequestFulfilled::class;
        $isValidHandler = is_subclass_of($crawlRequestFulfilledClass, $requiredParent);

        if (! $isValidHandler) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFulfilledClass, $requiredParent);
        }

        $this->crawlRequestFulfilledClass = $crawlRequestFulfilledClass;

        return $this;
    }
306
307
    /**
     * Set the class instantiated as the pool's 'rejected' handler.
     *
     * @param string $crawlRequestFailedClass Must be a subclass of CrawlRequestFailed.
     *
     * @return $this
     *
     * @throws \Spatie\Crawler\Exception\InvalidCrawlRequestHandler When the class is not such a subclass.
     */
    public function setCrawlFailedHandlerClass(string $crawlRequestFailedClass): Crawler
    {
        $requiredParent = CrawlRequestFailed::class;
        $isValidHandler = is_subclass_of($crawlRequestFailedClass, $requiredParent);

        if (! $isValidHandler) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFailedClass, $requiredParent);
        }

        $this->crawlRequestFailedClass = $crawlRequestFailedClass;

        return $this;
    }
319
320
    /**
     * Provide the Browsershot instance returned by getBrowsershot().
     *
     * @param \Spatie\Browsershot\Browsershot $browsershot
     *
     * @return $this
     */
    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }
326
327
    /**
     * Get the Browsershot instance, creating a default one on first use.
     *
     * @return \Spatie\Browsershot\Browsershot
     */
    public function getBrowsershot(): Browsershot
    {
        if ($this->browsershot) {
            return $this->browsershot;
        }

        // Nothing injected yet: create the instance and remember it.
        return $this->browsershot = new Browsershot();
    }
335
336
    /**
     * Get the URL the crawl started from.
     *
     * The property is only assigned in startCrawling(), so this is meaningful
     * during or after a crawl.
     *
     * @return \Psr\Http\Message\UriInterface
     */
    public function getBaseUrl(): UriInterface
    {
        return $this->baseUrl;
    }
340
341
    /**
     * Enable proxying and store the proxy settings.
     *
     * getProxyConfig() reads the keys 'ips', 'username', 'password' and 'port'
     * from the stored array.
     *
     * @param array $proxyConfig
     *
     * @return $this
     */
    public function setProxies(array $proxyConfig): Crawler
    {
        $this->usingProxies = true;

        // NOTE(review): this writes $this->proxyConfig; make sure the class
        // declares a property with exactly that name.
        $this->proxyConfig = $proxyConfig;

        return $this;
    }
348
349
    /**
     * Crawl starting at the given URL and notify observers when done.
     *
     * A missing scheme defaults to 'http' and an empty path to '/'.
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        $baseUrl = $baseUrl instanceof UriInterface ? $baseUrl : new Uri($baseUrl);

        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        // Queue the base URL when robots.txt allows it or robots are ignored.
        $mayQueueBaseUrl = $this->robotsTxt->allows((string) $crawlUrl->url)
            || ! $this->respectRobots;

        if ($mayQueueBaseUrl) {
            $this->addToCrawlQueue($crawlUrl);
        }

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $observer) {
            $observer->finishedCrawling();
        }
    }
386
387
    /**
     * Attach a URL to the depth tree underneath its parent URL.
     *
     * When no maximum depth is configured the tree is not used and a detached
     * node is returned. Otherwise the tree is searched depth-first, starting
     * at $node (or the tree root), for a node whose value equals $parentUrl.
     *
     * @param \Psr\Http\Message\UriInterface $url       URL to add.
     * @param \Psr\Http\Message\UriInterface $parentUrl URL the new one was found on.
     * @param \Tree\Node\Node|null           $node      Subtree to search; defaults to the root.
     *
     * @return \Tree\Node\Node|null The new node, or null when the parent is not in the searched subtree.
     */
    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node $node = null): ?Node
    {
        if (is_null($this->maximumDepth)) {
            return new Node((string) $url);
        }

        $node = $node ?? $this->depthTree;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        // Not the parent: search each child subtree and stop at the first hit.
        foreach ($node->getChildren() as $childNode) {
            $addedNode = $this->addToDepthTree($url, $parentUrl, $childNode);

            if (! is_null($addedNode)) {
                return $addedNode;
            }
        }

        return null;
    }
415
416
    /**
     * Run request pools until the queue has no pending URLs left.
     *
     * Each iteration builds a new pool fed by getCrawlRequests() and blocks
     * until that pool's promise has settled.
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $requests = $this->getCrawlRequests();

            $poolConfig = [
                'concurrency' => $this->concurrency,
                'options' => $this->getConfig(),
                'fulfilled' => new $this->crawlRequestFulfilledClass($this),
                'rejected' => new $this->crawlRequestFailedClass($this),
            ];

            $pool = new Pool($this->client, $requests, $poolConfig);

            $pool->promise()->wait();
        }
    }
431
432
    /**
     * Get the request options for the pool: the client's configuration, plus
     * a 'proxy' entry when proxies have been enabled.
     *
     * @return mixed The client configuration, as returned by the client's getConfig().
     */
    protected function getConfig()
    {
        $config = $this->client->getConfig();

        if (! $this->usingProxies) {
            return $config;
        }

        $config['proxy'] = $this->getProxyConfig();

        return $config;
    }
440
441
    /**
     * Build a proxy URL from the stored proxy settings, picking one of the
     * configured IPs at random.
     *
     * Reads the keys 'ips', 'username', 'password' and 'port'.
     *
     * @return string URL of the form http://user:pass@ip:port
     */
    protected function getProxyConfig()
    {
        $availableIps = collect($this->proxyConfig['ips']);

        $username = $this->proxyConfig['username'];
        $password = $this->proxyConfig['password'];
        $port = $this->proxyConfig['port'];

        $proxyIp = $availableIps->random();

        return "http://{$username}:{$password}@{$proxyIp}:{$port}";
    }
451
452
    /**
     * Tell whether $haystack ends with $needle (byte-wise comparison).
     *
     * An empty needle matches every haystack.
     *
     * @deprecated This function will be removed in the next major version
     *
     * @param string $haystack
     * @param string $needle
     *
     * @return bool
     */
    public function endsWith($haystack, $needle)
    {
        $haystack = (string) $haystack;
        $needle = (string) $needle;

        $needleLength = strlen($needle);

        if ($needleLength === 0) {
            return true;
        }

        // Compare the tail directly. The previous strrpos() arithmetic treated
        // "not found" (false) as offset 0, so any two different strings of the
        // same length were reported as a match.
        return substr($haystack, -$needleLength) === $needle;
    }
460
461
    /**
     * Create the robots.txt wrapper for the host of the given URI.
     *
     * @param \Psr\Http\Message\UriInterface $uri Its path is replaced with '/robots.txt'.
     *
     * @return \Spatie\Robots\RobotsTxt
     */
    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        $robotsTxtUrl = $uri->withPath('/robots.txt');

        return RobotsTxt::create($robotsTxtUrl);
    }
465
466
    /**
     * Lazily yield one GET request per pending URL, keyed by the crawl URL's id.
     *
     * URLs rejected by the crawl profile are marked as processed and skipped.
     * Observers get willCrawl() and the URL is marked as processed before its
     * request is yielded. When a positive pool item limit is set, the
     * generator stops after yielding that many requests.
     *
     * @return \Generator
     */
    protected function getCrawlRequests(): Generator
    {
        $poolItemLimit = $this->getPoolItemLimit();

        // Only a positive limit caps the batch; null and 0 mean "no limit", as
        // before. A negative limit used to end every batch before its first
        // request while URLs were still pending, which made the caller's
        // "while has pending URLs" loop spin forever.
        $hasPoolItemLimit = $poolItemLimit !== null && $poolItemLimit > 0;

        $yieldedCount = 0;

        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);

                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            if ($hasPoolItemLimit && $yieldedCount >= $poolItemLimit) {
                break;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $yieldedCount++;

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }
496
497
    /**
     * Queue a URL unless the crawl profile rejects it or the queue already has it.
     *
     * Each URL that is actually added also increments the crawled URL counter.
     *
     * @param \Spatie\Crawler\CrawlUrl $crawlUrl
     *
     * @return $this
     */
    public function addToCrawlQueue(CrawlUrl $crawlUrl): Crawler
    {
        $isRejected = ! $this->getCrawlProfile()->shouldCrawl($crawlUrl->url);

        // The queue is only consulted for URLs the profile accepts.
        if ($isRejected || $this->getCrawlQueue()->has($crawlUrl->url)) {
            return $this;
        }

        $this->crawledUrlCount++;
        $this->crawlQueue->add($crawlUrl);

        return $this;
    }
513
514
    /**
     * Tell whether the crawled URL counter has reached the configured maximum.
     *
     * @return bool Always false when no maximum crawl count is set.
     */
    public function maximumCrawlCountReached(): bool
    {
        $limit = $this->getMaximumCrawlCount();

        return ! is_null($limit) && $this->getCrawlerUrlCount() >= $limit;
    }
524
}
525