Crawler::addToDepthTree()   A
last analyzed

Complexity

Conditions 5
Paths 5

Size

Total Lines 28

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 28
rs 9.1608
cc 5
nc 5
nop 3
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use GuzzleHttp\Client;
7
use GuzzleHttp\Pool;
8
use GuzzleHttp\Psr7\Request;
9
use GuzzleHttp\Psr7\Uri;
10
use GuzzleHttp\RequestOptions;
11
use Psr\Http\Message\UriInterface;
12
use Spatie\Browsershot\Browsershot;
13
use Spatie\Crawler\CrawlQueue\ArrayCrawlQueue;
14
use Spatie\Crawler\CrawlQueue\CrawlQueue;
15
use Spatie\Crawler\Exception\InvalidCrawlRequestHandler;
16
use Spatie\Crawler\Handlers\CrawlRequestFailed;
17
use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
18
use Spatie\Robots\RobotsTxt;
19
use Tree\Node\Node;
20
21
class Crawler
{
    public const DEFAULT_USER_AGENT = '*';

    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserverCollection */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile Decides which urls may be crawled. */
    protected $crawlProfile;

    /** @var int Maximum number of concurrent requests per pool pass. */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int Number of urls added to the crawl queue so far. */
    protected $crawledUrlCount = 0;

    /** @var int|null Null means no limit. */
    protected $maximumCrawlCount = null;

    /** @var int Maximum response size in bytes (default 2 MB). */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null Null means no depth limit (and no depth tree is kept). */
    protected $maximumDepth = null;

    /** @var bool Whether robots.txt is honoured when queueing urls. */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node Root node; initialised in startCrawling(). */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var \Spatie\Browsershot\Browsershot|null Lazily created in getBrowsershot(). */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt|null Populated in startCrawling(). */
    protected $robotsTxt = null;

    /** @var string Class name of the fulfilled-request handler. */
    protected $crawlRequestFulfilledClass;

    /** @var string Class name of the failed-request handler. */
    protected $crawlRequestFailedClass;

    /** @var int Stored in microseconds (input is milliseconds, see setDelayBetweenRequests()). */
    protected $delayBetweenRequests = 0;

    /** @var array Default Guzzle client options used by create() when none are given. */
    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
        RequestOptions::HEADERS => [
            'User-Agent' => self::DEFAULT_USER_AGENT,
        ],
    ];
90
    /**
     * Build a crawler backed by a fresh Guzzle client.
     *
     * @param array $clientOptions Guzzle options; the class defaults are used when empty.
     */
    public static function create(array $clientOptions = []): Crawler
    {
        if (! count($clientOptions)) {
            $clientOptions = static::$defaultClientOptions;
        }

        return new static(new Client($clientOptions));
    }
100
101
    /**
     * @param \GuzzleHttp\Client $client
     * @param int $concurrency Maximum number of concurrent requests.
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;
        $this->concurrency = $concurrency;

        // Defaults for all collaborators; each can be replaced via its setter.
        $this->crawlProfile = new CrawlAllUrls();
        $this->crawlQueue = new ArrayCrawlQueue();
        $this->crawlObservers = new CrawlObserverCollection();
        $this->crawlRequestFulfilledClass = CrawlRequestFulfilled::class;
        $this->crawlRequestFailedClass = CrawlRequestFailed::class;
    }
117
118
    /**
     * Set the maximum number of concurrent requests.
     */
    public function setConcurrency(int $concurrency): Crawler
    {
        $this->concurrency = $concurrency;

        return $this;
    }

    /**
     * Limit the number of response bytes handled per url.
     */
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes): Crawler
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }

    /**
     * @return int|null Maximum response size in bytes.
     */
    public function getMaximumResponseSize(): ?int
    {
        return $this->maximumResponseSize;
    }

    /**
     * Stop crawling once this many urls have been queued.
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount): Crawler
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }

    /**
     * @return int|null The crawl limit, or null when unlimited.
     */
    public function getMaximumCrawlCount(): ?int
    {
        return $this->maximumCrawlCount;
    }

    /**
     * @return int Number of urls added to the crawl queue so far.
     */
    public function getCrawlerUrlCount(): int
    {
        return $this->crawledUrlCount;
    }

    /**
     * Limit how deep links are followed from the base url.
     */
    public function setMaximumDepth(int $maximumDepth): Crawler
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }

    /**
     * @return int|null The depth limit, or null when unlimited.
     */
    public function getMaximumDepth(): ?int
    {
        return $this->maximumDepth;
    }
165
166
    /**
     * @param int $delay The delay in milliseconds.
     *
     * @return Crawler
     */
    public function setDelayBetweenRequests(int $delay): Crawler
    {
        // Stored internally in microseconds (presumably consumed by usleep()
        // in the request handlers — TODO confirm).
        $this->delayBetweenRequests = ($delay * 1000);

        return $this;
    }

    /**
     * @return int The delay in microseconds — i.e. the millisecond value
     *             passed to setDelayBetweenRequests() multiplied by 1000.
     */
    public function getDelayBetweenRequests(): int
    {
        return $this->delayBetweenRequests;
    }
185
186
    /**
     * Skip the robots.txt check when queueing urls.
     */
    public function ignoreRobots(): Crawler
    {
        $this->respectRobots = false;

        return $this;
    }

    /**
     * Honour robots.txt when queueing urls (the default).
     */
    public function respectRobots(): Crawler
    {
        $this->respectRobots = true;

        return $this;
    }

    public function mustRespectRobots(): bool
    {
        return $this->respectRobots;
    }

    /**
     * NOTE(review): $robotsTxt is only populated inside startCrawling();
     * calling this earlier returns null and triggers a TypeError because of
     * the non-nullable return type — confirm the intended call order.
     */
    public function getRobotsTxt(): RobotsTxt
    {
        return $this->robotsTxt;
    }
209
210
    /**
     * Use a custom crawl queue implementation.
     */
    public function setCrawlQueue(CrawlQueue $crawlQueue): Crawler
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }

    public function getCrawlQueue(): CrawlQueue
    {
        return $this->crawlQueue;
    }

    /**
     * Enable JavaScript execution while crawling.
     */
    public function executeJavaScript(): Crawler
    {
        $this->executeJavaScript = true;

        return $this;
    }

    /**
     * Disable JavaScript execution (the default).
     */
    public function doNotExecuteJavaScript(): Crawler
    {
        $this->executeJavaScript = false;

        return $this;
    }

    public function mayExecuteJavascript(): bool
    {
        return $this->executeJavaScript;
    }
240
241
    /**
     * Set a single observer, or an array of observers.
     *
     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers): Crawler
    {
        $observers = is_array($crawlObservers)
            ? $crawlObservers
            : [$crawlObservers];

        return $this->setCrawlObservers($observers);
    }
254
255
    /**
     * Replace all registered observers with the given array.
     *
     * @param \Spatie\Crawler\CrawlObserver[] $crawlObservers
     */
    public function setCrawlObservers(array $crawlObservers): Crawler
    {
        $this->crawlObservers = new CrawlObserverCollection($crawlObservers);

        return $this;
    }

    /**
     * Register an additional observer alongside the existing ones.
     */
    public function addCrawlObserver(CrawlObserver $crawlObserver): Crawler
    {
        $this->crawlObservers->addObserver($crawlObserver);

        return $this;
    }

    public function getCrawlObservers(): CrawlObserverCollection
    {
        return $this->crawlObservers;
    }
273
274
    /**
     * Set the profile that decides which urls should be crawled.
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile): Crawler
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    public function getCrawlProfile(): CrawlProfile
    {
        return $this->crawlProfile;
    }
285
286
    /**
     * Use a custom handler class for fulfilled requests.
     *
     * @param string $crawlRequestFulfilledClass Class name; must extend CrawlRequestFulfilled.
     *
     * @throws \Spatie\Crawler\Exception\InvalidCrawlRequestHandler
     */
    public function setCrawlFulfilledHandlerClass(string $crawlRequestFulfilledClass): Crawler
    {
        $baseClass = CrawlRequestFulfilled::class;

        if (! is_subclass_of($crawlRequestFulfilledClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFulfilledClass, $baseClass);
        }

        $this->crawlRequestFulfilledClass = $crawlRequestFulfilledClass;

        return $this;
    }

    /**
     * Use a custom handler class for failed requests.
     *
     * @param string $crawlRequestFailedClass Class name; must extend CrawlRequestFailed.
     *
     * @throws \Spatie\Crawler\Exception\InvalidCrawlRequestHandler
     */
    public function setCrawlFailedHandlerClass(string $crawlRequestFailedClass): Crawler
    {
        $baseClass = CrawlRequestFailed::class;

        if (! is_subclass_of($crawlRequestFailedClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFailedClass, $baseClass);
        }

        $this->crawlRequestFailedClass = $crawlRequestFailedClass;

        return $this;
    }
311
312
    /**
     * Inject the Browsershot instance used when JavaScript execution is enabled.
     *
     * Return type declaration added for consistency with the other fluent
     * setters in this class; existing callers of the fluent interface are
     * unaffected (subclasses overriding this method must add the type too).
     */
    public function setBrowsershot(Browsershot $browsershot): Crawler
    {
        $this->browsershot = $browsershot;

        return $this;
    }
318
319
    /**
     * Replace the User-Agent header by rebuilding the underlying Guzzle client.
     */
    public function setUserAgent(string $userAgent): Crawler
    {
        $options = $this->client->getConfig();

        // Lower-case all existing header names so any previously configured
        // user agent is overwritten regardless of its casing.
        $headers = array_change_key_case($options['headers']);
        $headers['user-agent'] = $userAgent;
        $options['headers'] = $headers;

        // Guzzle client config is fixed at construction, so build a new client.
        $this->client = new Client($options);

        return $this;
    }
332
333
    /**
     * Return the configured User-Agent header (matched case-insensitively),
     * falling back to the class default when none is set.
     */
    public function getUserAgent(): string
    {
        $headers = $this->client->getConfig('headers');

        foreach ($headers as $name => $value) {
            if (strtolower($name) === 'user-agent') {
                return (string) $value;
            }
        }

        return static::DEFAULT_USER_AGENT;
    }
345
346
    /**
     * Lazily create a default Browsershot instance when none was injected.
     */
    public function getBrowsershot(): Browsershot
    {
        if ($this->browsershot === null) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }

    public function getBaseUrl(): UriInterface
    {
        return $this->baseUrl;
    }
359
360
    /**
     * Start crawling from the given base url. Blocks until the crawl queue is
     * drained, then notifies every observer via finishedCrawling().
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        // Normalize: default to http and ensure a non-empty path.
        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        // Must be fetched before any url is queued; the base url itself is
        // only queued when allowed (or when robots are ignored).
        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        if ($this->robotsTxt->allows((string) $crawlUrl->url, $this->getUserAgent()) ||
            ! $this->respectRobots
        ) {
            $this->addToCrawlQueue($crawlUrl);
        }

        // Root of the depth tree used by addToDepthTree() when a maximum
        // depth is configured.
        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }
397
398
    /**
     * Insert $url into the depth tree as a child of the node whose value
     * equals $parentUrl, searching depth-first from $node (tree root by default).
     *
     * @return \Tree\Node\Node|null The newly created node, or null when the
     *                              parent url was not found in this subtree.
     */
    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node
    {
        // Without a depth limit no tree is maintained: hand back a detached node.
        if (is_null($this->maximumDepth)) {
            return new Node((string) $url);
        }

        $node = $node ?? $this->depthTree;

        if ($node->getValue() === (string) $parentUrl) {
            $child = new Node((string) $url);
            $node->addChild($child);

            return $child;
        }

        // Recurse into each child until one subtree contains the parent url.
        foreach ($node->getChildren() as $childNode) {
            $added = $this->addToDepthTree($url, $parentUrl, $childNode);

            if ($added !== null) {
                return $added;
            }
        }

        return null;
    }
426
427
    /**
     * Drain the crawl queue: each pass sends up to $concurrency requests in
     * parallel through a Guzzle pool, and loops until no pending urls remain
     * (handlers may enqueue new urls while a pass is in flight).
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => new $this->crawlRequestFulfilledClass($this),
                'rejected' => new $this->crawlRequestFailedClass($this),
            ]);

            $pool->promise()->wait();
        }
    }
442
443
    /**
     * Determine whether $haystack ends with $needle.
     *
     * Fixes a false-positive bug: the previous implementation used strrpos(),
     * which returns false when the needle is absent; `false + strlen($needle)`
     * then coerces to strlen($needle), so e.g. endsWith('hello', 'world')
     * incorrectly returned true whenever the needle and haystack had equal length.
     *
     * @deprecated This function will be removed in the next major version
     */
    public function endsWith($haystack, $needle)
    {
        // An empty needle is a suffix of every string.
        return $needle === ''
            || substr($haystack, -strlen($needle)) === $needle;
    }
451
452
    /**
     * Fetch and parse the robots.txt located at the root of the given uri's host.
     */
    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        $robotsUri = $uri->withPath('/robots.txt');

        return RobotsTxt::create($robotsUri);
    }
456
457
    /**
     * Generator yielding one GET request per pending url, keyed by the crawl
     * url's id. Consumed lazily by the Guzzle pool in startCrawlingQueue(),
     * so urls enqueued while crawling are picked up on later iterations.
     */
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            // Urls rejected by the profile are marked processed and skipped.
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            // Give observers a chance to react before the request is sent.
            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            // Mark as processed *before* yielding so the same url is never
            // yielded twice while its request is in flight.
            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }
478
479
    /**
     * Queue a url for crawling, unless the profile rejects it or it is
     * already queued. Increments the crawled-url counter when queued.
     */
    public function addToCrawlQueue(CrawlUrl $crawlUrl): Crawler
    {
        $shouldQueue = $this->getCrawlProfile()->shouldCrawl($crawlUrl->url)
            && ! $this->getCrawlQueue()->has($crawlUrl->url);

        if ($shouldQueue) {
            $this->crawledUrlCount++;

            $this->crawlQueue->add($crawlUrl);
        }

        return $this;
    }
495
496
    /**
     * Whether the configured crawl limit has been reached.
     * A null limit means the crawler may continue indefinitely.
     */
    public function maximumCrawlCountReached(): bool
    {
        $maximumCrawlCount = $this->getMaximumCrawlCount();

        return $maximumCrawlCount !== null
            && $this->getCrawlerUrlCount() >= $maximumCrawlCount;
    }
506
}
507