Completed
Pull Request — master (#173)
by Freek
04:48 queued 01:28
created

Crawler::addToDepthTree()   A

Complexity

Conditions 4
Paths 4

Size

Total Lines 24

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 24
rs 9.536
cc 4
nc 4
nop 3
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use Tree\Node\Node;
7
use GuzzleHttp\Pool;
8
use GuzzleHttp\Client;
9
use GuzzleHttp\Psr7\Uri;
10
use GuzzleHttp\Psr7\Request;
11
use Spatie\Robots\RobotsTxt;
12
use GuzzleHttp\RequestOptions;
13
use Psr\Http\Message\UriInterface;
14
use Spatie\Browsershot\Browsershot;
15
use Spatie\Crawler\CrawlQueue\CrawlQueue;
16
use Spatie\Crawler\Handlers\CrawlRequestFailed;
17
use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
18
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
19
use Spatie\Crawler\Exception\InvalidCrawlRequestHandler;
20
21
class Crawler
{
    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserverCollection */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int */
    protected $crawledUrlCount = 0;

    /** @var int|null */
    protected $maximumCrawlCount = null;

    /** @var int */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null */
    protected $maximumDepth = null;

    /** @var bool */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var \Spatie\Browsershot\Browsershot|null */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt|null */
    protected $robotsTxt = null;

    /** @var string */
    protected $crawlRequestFulfilledClass;

    /** @var string */
    protected $crawlRequestFailedClass;

    /** @var array */
    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * Create a Crawler with a fresh Guzzle client.
     *
     * @param array $clientOptions Guzzle client options; when empty, the
     *                             static defaults above are used instead.
     *
     * @return \Spatie\Crawler\Crawler
     */
    public static function create(array $clientOptions = []): Crawler
    {
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : static::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }

    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CollectionCrawlQueue();

        $this->crawlObservers = new CrawlObserverCollection();

        $this->crawlRequestFulfilledClass = CrawlRequestFulfilled::class;

        $this->crawlRequestFailedClass = CrawlRequestFailed::class;
    }

    public function setConcurrency(int $concurrency): Crawler
    {
        $this->concurrency = $concurrency;

        return $this;
    }

    public function setMaximumResponseSize(int $maximumResponseSizeInBytes): Crawler
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }

    public function getMaximumResponseSize(): ?int
    {
        return $this->maximumResponseSize;
    }

    public function setMaximumCrawlCount(int $maximumCrawlCount): Crawler
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }

    public function getMaximumCrawlCount(): ?int
    {
        return $this->maximumCrawlCount;
    }

    public function getCrawlerUrlCount(): int
    {
        return $this->crawledUrlCount;
    }

    public function setMaximumDepth(int $maximumDepth): Crawler
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }

    public function getMaximumDepth(): ?int
    {
        return $this->maximumDepth;
    }

    public function ignoreRobots(): Crawler
    {
        $this->respectRobots = false;

        return $this;
    }

    public function respectRobots(): Crawler
    {
        $this->respectRobots = true;

        return $this;
    }

    public function mustRespectRobots(): bool
    {
        return $this->respectRobots;
    }

    /**
     * NOTE(review): $robotsTxt is only populated inside startCrawling();
     * calling this before a crawl has started will return null and trigger
     * a TypeError on the non-nullable return type — confirm intended usage.
     */
    public function getRobotsTxt(): RobotsTxt
    {
        return $this->robotsTxt;
    }

    public function setCrawlQueue(CrawlQueue $crawlQueue): Crawler
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }

    public function getCrawlQueue(): CrawlQueue
    {
        return $this->crawlQueue;
    }

    public function executeJavaScript(): Crawler
    {
        $this->executeJavaScript = true;

        return $this;
    }

    public function doNotExecuteJavaScript(): Crawler
    {
        $this->executeJavaScript = false;

        return $this;
    }

    public function mayExecuteJavascript(): bool
    {
        return $this->executeJavaScript;
    }

    /**
     * Set the observer(s) for this crawl. A single observer is wrapped in an
     * array before being handed to setCrawlObservers().
     *
     * @param \Spatie\Crawler\CrawlObserver|array<\Spatie\Crawler\CrawlObserver> $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers): Crawler
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }

    public function setCrawlObservers(array $crawlObservers): Crawler
    {
        $this->crawlObservers = new CrawlObserverCollection($crawlObservers);

        return $this;
    }

    public function addCrawlObserver(CrawlObserver $crawlObserver): Crawler
    {
        $this->crawlObservers->addObserver($crawlObserver);

        return $this;
    }

    public function getCrawlObservers(): CrawlObserverCollection
    {
        return $this->crawlObservers;
    }

    public function setCrawlProfile(CrawlProfile $crawlProfile): Crawler
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    public function getCrawlProfile(): CrawlProfile
    {
        return $this->crawlProfile;
    }

    /**
     * Swap in a custom handler class for fulfilled requests.
     *
     * @param string $crawlRequestFulfilledClass Fully-qualified class name;
     *                                           must extend CrawlRequestFulfilled.
     *
     * @throws \Spatie\Crawler\Exception\InvalidCrawlRequestHandler
     */
    public function setCrawlFulfilledHandlerClass(string $crawlRequestFulfilledClass): Crawler
    {
        $baseClass = CrawlRequestFulfilled::class;

        if (! is_subclass_of($crawlRequestFulfilledClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFulfilledClass, $baseClass);
        }

        $this->crawlRequestFulfilledClass = $crawlRequestFulfilledClass;

        return $this;
    }

    /**
     * Swap in a custom handler class for failed requests.
     *
     * @param string $crawlRequestFailedClass Fully-qualified class name;
     *                                        must extend CrawlRequestFailed.
     *
     * @throws \Spatie\Crawler\Exception\InvalidCrawlRequestHandler
     */
    public function setCrawlFailedHandlerClass(string $crawlRequestFailedClass): Crawler
    {
        $baseClass = CrawlRequestFailed::class;

        if (! is_subclass_of($crawlRequestFailedClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFailedClass, $baseClass);
        }

        $this->crawlRequestFailedClass = $crawlRequestFailedClass;

        return $this;
    }

    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }

    public function getBrowsershot(): Browsershot
    {
        // Lazily create a default instance when none was injected.
        if (! $this->browsershot) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }

    public function getBaseUrl(): UriInterface
    {
        return $this->baseUrl;
    }

    /**
     * Normalize the base URL, seed the queue (robots.txt permitting), run the
     * crawl loop, and notify every observer when crawling has finished.
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        // Default to http and a root path so the Uri is always fully formed.
        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        // Only seed the queue if robots.txt allows it (or robots are ignored).
        if ($this->robotsTxt->allows((string) $crawlUrl->url) ||
            ! $this->respectRobots
        ) {
            $this->addToCrawlQueue($crawlUrl);
        }

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }

    /**
     * Attach $url as a child of the tree node whose value equals $parentUrl,
     * searching depth-first from $node (the tree root by default).
     *
     * @return \Tree\Node\Node|null The newly created node, or null when the
     *                              parent URL is not present in the tree.
     */
    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node
    {
        $node = $node ?? $this->depthTree;

        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        // Recurse into children; stop at the first match.
        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }

    /**
     * Drain the crawl queue with a concurrent Guzzle request pool. Handlers
     * may enqueue new URLs, so keep pooling until the queue is empty.
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => new $this->crawlRequestFulfilledClass($this),
                'rejected' => new $this->crawlRequestFailedClass($this),
            ]);

            $promise = $pool->promise();

            $promise->wait();
        }
    }

    /**
     * Determine whether $haystack ends with $needle.
     *
     * Fix: the previous implementation ignored strrpos() returning false for
     * an absent needle, so e.g. endsWith('abc', 'xyz') evaluated
     * false + 3 === 3 and wrongly returned true.
     *
     * @deprecated This function will be removed in the next major version
     */
    public function endsWith($haystack, $needle)
    {
        $position = strrpos($haystack, $needle);

        return $position !== false
            && $position + strlen($needle) === strlen($haystack);
    }

    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

    /**
     * Lazily yield a GET request for each pending URL that the crawl profile
     * accepts, marking each as processed and notifying observers first.
     */
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }

    /**
     * Queue a URL for crawling unless the profile rejects it or it is
     * already queued. Increments the crawled-URL counter on success.
     */
    public function addToCrawlQueue(CrawlUrl $crawlUrl): Crawler
    {
        if (! $this->getCrawlProfile()->shouldCrawl($crawlUrl->url)) {
            return $this;
        }

        if ($this->getCrawlQueue()->has($crawlUrl->url)) {
            return $this;
        }

        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    public function maximumCrawlCountReached(): bool
    {
        $maximumCrawlCount = $this->getMaximumCrawlCount();

        // No limit configured means the crawl can continue indefinitely.
        if (is_null($maximumCrawlCount)) {
            return false;
        }

        return $this->getCrawlerUrlCount() >= $maximumCrawlCount;
    }
}
448