Completed
Push — master ( cc9dbe...7325eb )
by Freek
08:09
created

src/Crawler.php (1 issue)

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use Tree\Node\Node;
7
use GuzzleHttp\Pool;
8
use GuzzleHttp\Client;
9
use GuzzleHttp\Psr7\Uri;
10
use GuzzleHttp\Psr7\Request;
11
use Spatie\Robots\RobotsTxt;
12
use GuzzleHttp\RequestOptions;
13
use Psr\Http\Message\UriInterface;
14
use Spatie\Browsershot\Browsershot;
15
use Spatie\Crawler\CrawlQueue\CrawlQueue;
16
use Spatie\Crawler\Handlers\CrawlRequestFailed;
17
use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
18
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
19
use Spatie\Crawler\Exception\InvalidCrawlRequestHandler;
20
21
/**
 * Crawls URLs starting from a base URL.
 *
 * Pending URLs are kept in a CrawlQueue and are requested through a Guzzle
 * Pool. Each response (or failure) is passed to an instance of the configured
 * fulfilled/failed handler class, which receives this crawler in its
 * constructor. All setters return $this so that configuration can be chained.
 */
class Crawler
{
    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface|null Null until startCrawling() has been called. */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserverCollection */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int Number of URLs accepted by addToCrawlQueue() so far. */
    protected $crawledUrlCount = 0;

    /** @var int|null Null means "no limit" (see maximumCrawlCountReached()). */
    protected $maximumCrawlCount = null;

    /** @var int Size in bytes; defaults to 2 MiB. */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null Null disables depth tracking (see addToDepthTree()). */
    protected $maximumDepth = null;

    /** @var bool */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node|null Root node for the base URL; created by startCrawling(). */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var \Spatie\Browsershot\Browsershot|null Lazily created by getBrowsershot(). */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt|null Null until startCrawling() has been called. */
    protected $robotsTxt = null;

    /** @var string Class name instantiated as the pool's "fulfilled" callback. */
    protected $crawlRequestFulfilledClass;

    /** @var string Class name instantiated as the pool's "rejected" callback. */
    protected $crawlRequestFailedClass;

    /** @var float The value passed to setDelayBetweenRequests() multiplied by 1000. */
    protected $delayBetweenRequests = 0.0;

    /** @var array Guzzle client options used by create() when none are given. */
    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * Build a crawler around a new Guzzle client.
     *
     * @param array $clientOptions Guzzle client options. When empty, the
     *                             default options are used; when non-empty
     *                             they replace the defaults entirely (no merge).
     *
     * @return static
     */
    public static function create(array $clientOptions = []): Crawler
    {
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : static::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }

    /**
     * @param \GuzzleHttp\Client $client      Client used for every request.
     * @param int                $concurrency Value passed as the pool's "concurrency" option.
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CollectionCrawlQueue();

        $this->crawlObservers = new CrawlObserverCollection();

        $this->crawlRequestFulfilledClass = CrawlRequestFulfilled::class;

        $this->crawlRequestFailedClass = CrawlRequestFailed::class;
    }

    /**
     * @return $this
     */
    public function setConcurrency(int $concurrency): Crawler
    {
        $this->concurrency = $concurrency;

        return $this;
    }

    /**
     * @return $this
     */
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes): Crawler
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }

    public function getMaximumResponseSize(): ?int
    {
        return $this->maximumResponseSize;
    }

    /**
     * @return $this
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount): Crawler
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }

    public function getMaximumCrawlCount(): ?int
    {
        return $this->maximumCrawlCount;
    }

    /**
     * Number of URLs that addToCrawlQueue() has accepted so far.
     */
    public function getCrawlerUrlCount(): int
    {
        return $this->crawledUrlCount;
    }

    /**
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth): Crawler
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }

    public function getMaximumDepth(): ?int
    {
        return $this->maximumDepth;
    }

    /**
     * Store the delay between requests as $delay * 1000.
     *
     * NOTE(review): presumably $delay is in milliseconds and the stored value
     * is in microseconds; confirm against the code that consumes
     * getDelayBetweenRequests().
     *
     * @return $this
     */
    public function setDelayBetweenRequests(int $delay): Crawler
    {
        // Explicit cast: the property is documented as float and the getter
        // declares a float return type.
        $this->delayBetweenRequests = (float) ($delay * 1000);

        return $this;
    }

    public function getDelayBetweenRequests(): float
    {
        return $this->delayBetweenRequests;
    }

    /**
     * @return $this
     */
    public function ignoreRobots(): Crawler
    {
        $this->respectRobots = false;

        return $this;
    }

    /**
     * @return $this
     */
    public function respectRobots(): Crawler
    {
        $this->respectRobots = true;

        return $this;
    }

    public function mustRespectRobots(): bool
    {
        return $this->respectRobots;
    }

    /**
     * The robots.txt wrapper created by startCrawling().
     *
     * The property is null until startCrawling() has run, so calling this
     * earlier violates the declared return type.
     */
    public function getRobotsTxt(): RobotsTxt
    {
        return $this->robotsTxt;
    }

    /**
     * @return $this
     */
    public function setCrawlQueue(CrawlQueue $crawlQueue): Crawler
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }

    public function getCrawlQueue(): CrawlQueue
    {
        return $this->crawlQueue;
    }

    /**
     * @return $this
     */
    public function executeJavaScript(): Crawler
    {
        $this->executeJavaScript = true;

        return $this;
    }

    /**
     * @return $this
     */
    public function doNotExecuteJavaScript(): Crawler
    {
        $this->executeJavaScript = false;

        return $this;
    }

    public function mayExecuteJavascript(): bool
    {
        return $this->executeJavaScript;
    }

    /**
     * Replace all registered observers with the given observer(s).
     *
     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers): Crawler
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }

    /**
     * Replace all registered observers with a new collection built from the array.
     *
     * @param \Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObservers(array $crawlObservers): Crawler
    {
        $this->crawlObservers = new CrawlObserverCollection($crawlObservers);

        return $this;
    }

    /**
     * Add one observer to the current collection.
     *
     * @return $this
     */
    public function addCrawlObserver(CrawlObserver $crawlObserver): Crawler
    {
        $this->crawlObservers->addObserver($crawlObserver);

        return $this;
    }

    public function getCrawlObservers(): CrawlObserverCollection
    {
        return $this->crawlObservers;
    }

    /**
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile): Crawler
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    public function getCrawlProfile(): CrawlProfile
    {
        return $this->crawlProfile;
    }

    /**
     * Set the class instantiated as the pool's "fulfilled" callback.
     *
     * is_subclass_of() is false for the base class itself, so only strict
     * subclasses of CrawlRequestFulfilled are accepted here.
     *
     * @throws \Spatie\Crawler\Exception\InvalidCrawlRequestHandler
     *
     * @return $this
     */
    public function setCrawlFulfilledHandlerClass(string $crawlRequestFulfilledClass): Crawler
    {
        $baseClass = CrawlRequestFulfilled::class;

        if (! is_subclass_of($crawlRequestFulfilledClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFulfilledClass, $baseClass);
        }

        $this->crawlRequestFulfilledClass = $crawlRequestFulfilledClass;

        return $this;
    }

    /**
     * Set the class instantiated as the pool's "rejected" callback.
     *
     * is_subclass_of() is false for the base class itself, so only strict
     * subclasses of CrawlRequestFailed are accepted here.
     *
     * @throws \Spatie\Crawler\Exception\InvalidCrawlRequestHandler
     *
     * @return $this
     */
    public function setCrawlFailedHandlerClass(string $crawlRequestFailedClass): Crawler
    {
        $baseClass = CrawlRequestFailed::class;

        if (! is_subclass_of($crawlRequestFailedClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFailedClass, $baseClass);
        }

        $this->crawlRequestFailedClass = $crawlRequestFailedClass;

        return $this;
    }

    /**
     * @return $this
     */
    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }

    /**
     * Return the configured Browsershot instance, creating one on first use.
     */
    public function getBrowsershot(): Browsershot
    {
        if (! $this->browsershot) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }

    /**
     * The normalised base URL stored by startCrawling().
     *
     * The property is unset until startCrawling() has run, so calling this
     * earlier violates the declared return type.
     */
    public function getBaseUrl(): UriInterface
    {
        return $this->baseUrl;
    }

    /**
     * Crawl starting at $baseUrl and notify every observer once the queue is drained.
     *
     * A missing scheme defaults to "http" and an empty path to "/".
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        // Always created, even when robots are ignored, so that
        // getRobotsTxt() has a value to return afterwards.
        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        if ($this->robotsTxt->allows((string) $crawlUrl->url) ||
            ! $this->respectRobots
        ) {
            $this->addToCrawlQueue($crawlUrl);
        }

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }

    /**
     * Attach $url to the depth tree as a child of the node whose value is $parentUrl.
     *
     * When no maximum depth is configured the tree is not touched and a
     * detached node is returned. Otherwise the tree is searched depth-first,
     * starting at $node (or at the root when $node is null); the new node is
     * added under the first matching parent.
     *
     * @param \Psr\Http\Message\UriInterface $url
     * @param \Psr\Http\Message\UriInterface $parentUrl
     * @param \Tree\Node\Node|null           $node Subtree to search; null means the root.
     *
     * @return \Tree\Node\Node|null The new node, or null when no node for $parentUrl was found.
     */
    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node $node = null): ?Node
    {
        if (is_null($this->maximumDepth)) {
            return new Node((string) $url);
        }

        $node = $node ?? $this->depthTree;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode);

            // Stop at the first subtree that contained the parent.
            if (! is_null($returnNode)) {
                return $returnNode;
            }
        }

        return null;
    }

    /**
     * Run request pools until the queue reports no pending URLs.
     *
     * The loop repeats because a finished pool may leave new pending URLs in
     * the queue; each pass creates a fresh pool and fresh handler instances.
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => new $this->crawlRequestFulfilledClass($this),
                'rejected' => new $this->crawlRequestFailedClass($this),
            ]);

            $promise = $pool->promise();

            $promise->wait();
        }
    }

    /**
     * Tell whether $haystack ends with $needle (byte-wise comparison).
     *
     * An empty needle is considered a match. A needle that does not occur in
     * the haystack is never a match; the previous strrpos() arithmetic treated
     * the "not found" result (false) as offset 0 and so reported a match
     * whenever both strings had the same length.
     *
     * @deprecated This function will be removed in the next major version
     *
     * @param string $haystack
     * @param string $needle
     *
     * @return bool
     */
    public function endsWith($haystack, $needle)
    {
        $haystack = (string) $haystack;
        $needle = (string) $needle;

        $needleLength = strlen($needle);

        if ($needleLength === 0) {
            return true;
        }

        if ($needleLength > strlen($haystack)) {
            return false;
        }

        return substr($haystack, -$needleLength) === $needle;
    }

    /**
     * Create the RobotsTxt wrapper for "/robots.txt" on the host of $uri.
     */
    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

    /**
     * Lazily yield one GET request per pending URL, keyed by the crawl URL's id.
     *
     * URLs rejected by the crawl profile are marked as processed without being
     * requested. Observers are notified through willCrawl() right before the
     * URL is marked as processed and its request is yielded.
     */
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }

    /**
     * Queue $crawlUrl unless the crawl profile rejects it or the queue already has it.
     *
     * The crawled-URL counter is incremented for every URL that is queued.
     *
     * @return $this
     */
    public function addToCrawlQueue(CrawlUrl $crawlUrl): Crawler
    {
        if (! $this->getCrawlProfile()->shouldCrawl($crawlUrl->url)) {
            return $this;
        }

        if ($this->getCrawlQueue()->has($crawlUrl->url)) {
            return $this;
        }

        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    /**
     * Tell whether the number of queued URLs has reached the configured maximum.
     *
     * Always false when no maximum crawl count is set.
     */
    public function maximumCrawlCountReached(): bool
    {
        $maximumCrawlCount = $this->getMaximumCrawlCount();

        if (is_null($maximumCrawlCount)) {
            return false;
        }

        return $this->getCrawlerUrlCount() >= $maximumCrawlCount;
    }
}
467