Completed
Pull Request — master (#173)
by Freek
03:36
created

Crawler::setCrawlFulfilledHandlerClass()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 12

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 12
rs 9.8666
cc 2
nc 2
nop 1
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use Spatie\Crawler\Exception\InvalidCrawlRequestHandler;
7
use Spatie\Crawler\Handlers\CrawlRequestFailed;
8
use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
9
use Tree\Node\Node;
10
use GuzzleHttp\Pool;
11
use GuzzleHttp\Client;
12
use GuzzleHttp\Psr7\Uri;
13
use GuzzleHttp\Psr7\Request;
14
use Spatie\Robots\RobotsTxt;
15
use GuzzleHttp\RequestOptions;
16
use Psr\Http\Message\UriInterface;
17
use Spatie\Browsershot\Browsershot;
18
use Spatie\Crawler\CrawlQueue\CrawlQueue;
19
use Spatie\Crawler\Handlers\DefaultCrawlRequestFailed;
20
use Spatie\Crawler\Handlers\DefaultCrawlRequestFulfilled;
21
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
22
23
class Crawler
{
    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserverCollection */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int */
    protected $crawledUrlCount = 0;

    /** @var int|null */
    protected $maximumCrawlCount = null;

    /** @var int */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null */
    protected $maximumDepth = null;

    /** @var bool */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var \Spatie\Browsershot\Browsershot|null */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt|null */
    protected $robotsTxt = null;

    /** @var string Class name used to handle fulfilled crawl requests. */
    protected $crawlRequestFulfilledClass;

    /** @var string Class name used to handle failed crawl requests. */
    protected $crawlRequestFailedClass;

    /** @var array Guzzle client options applied when none are passed to create(). */
    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * Create a crawler backed by a fresh Guzzle client.
     *
     * @param array $clientOptions Guzzle client options; falls back to
     *                             static::$defaultClientOptions when empty.
     */
    public static function create(array $clientOptions = []): Crawler
    {
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : static::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }

    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CollectionCrawlQueue();

        $this->crawlObservers = new CrawlObserverCollection();

        $this->crawlRequestFulfilledClass = CrawlRequestFulfilled::class;

        $this->crawlRequestFailedClass = CrawlRequestFailed::class;
    }

    /**
     * Set the number of concurrent requests per Guzzle pool.
     */
    public function setConcurrency(int $concurrency): Crawler
    {
        $this->concurrency = $concurrency;

        return $this;
    }

    /**
     * Set the maximum response body size (in bytes) that will be processed.
     */
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes): Crawler
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }

    public function getMaximumResponseSize(): ?int
    {
        return $this->maximumResponseSize;
    }

    /**
     * Limit the total number of URLs that will be crawled.
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount): Crawler
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }

    public function getMaximumCrawlCount(): ?int
    {
        return $this->maximumCrawlCount;
    }

    /**
     * Number of URLs that have been added to the crawl queue so far.
     */
    public function getCrawlerUrlCount(): int
    {
        return $this->crawledUrlCount;
    }

    /**
     * Limit how many links deep the crawler will follow from the base URL.
     */
    public function setMaximumDepth(int $maximumDepth): Crawler
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }

    public function getMaximumDepth(): ?int
    {
        return $this->maximumDepth;
    }

    public function ignoreRobots(): Crawler
    {
        $this->respectRobots = false;

        return $this;
    }

    public function respectRobots(): Crawler
    {
        $this->respectRobots = true;

        return $this;
    }

    public function mustRespectRobots(): bool
    {
        return $this->respectRobots;
    }

    /**
     * The parsed robots.txt of the crawled site.
     *
     * NOTE(review): $robotsTxt is only populated inside startCrawling();
     * calling this earlier will trigger a TypeError because of the
     * non-nullable return type — confirm callers only use it mid-crawl.
     */
    public function getRobotsTxt(): RobotsTxt
    {
        return $this->robotsTxt;
    }

    public function setCrawlQueue(CrawlQueue $crawlQueue): Crawler
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }

    public function getCrawlQueue(): CrawlQueue
    {
        return $this->crawlQueue;
    }

    public function executeJavaScript(): Crawler
    {
        $this->executeJavaScript = true;

        return $this;
    }

    public function doNotExecuteJavaScript(): Crawler
    {
        $this->executeJavaScript = false;

        return $this;
    }

    public function mayExecuteJavascript(): bool
    {
        return $this->executeJavaScript;
    }

    /**
     * @param \Spatie\Crawler\CrawlObserver|array<\Spatie\Crawler\CrawlObserver> $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers): Crawler
    {
        // Normalize a single observer to an array so both call styles work.
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }

    public function setCrawlObservers(array $crawlObservers): Crawler
    {
        $this->crawlObservers = new CrawlObserverCollection($crawlObservers);

        return $this;
    }

    public function addCrawlObserver(CrawlObserver $crawlObserver): Crawler
    {
        $this->crawlObservers->addObserver($crawlObserver);

        return $this;
    }

    public function getCrawlObservers(): CrawlObserverCollection
    {
        return $this->crawlObservers;
    }

    public function setCrawlProfile(CrawlProfile $crawlProfile): Crawler
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    public function getCrawlProfile(): CrawlProfile
    {
        return $this->crawlProfile;
    }

    /**
     * Set the handler class invoked for each fulfilled request.
     *
     * @throws \Spatie\Crawler\Exception\InvalidCrawlRequestHandler when the
     *         given class does not extend CrawlRequestFulfilled.
     */
    public function setCrawlFulfilledHandlerClass(string $crawlRequestFulfilledClass): Crawler
    {
        $baseClass = CrawlRequestFulfilled::class;

        if (! is_subclass_of($crawlRequestFulfilledClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFulfilledClass, $baseClass);
        }

        $this->crawlRequestFulfilledClass = $crawlRequestFulfilledClass;

        return $this;
    }

    /**
     * Set the handler class invoked for each failed request.
     *
     * @throws \Spatie\Crawler\Exception\InvalidCrawlRequestHandler when the
     *         given class does not extend CrawlRequestFailed.
     */
    public function setCrawlFailedHandlerClass(string $crawlRequestFailedClass): Crawler
    {
        $baseClass = CrawlRequestFailed::class;

        if (! is_subclass_of($crawlRequestFailedClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFailedClass, $baseClass);
        }

        $this->crawlRequestFailedClass = $crawlRequestFailedClass;

        return $this;
    }

    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }

    /**
     * Lazily instantiate a Browsershot instance when none was injected.
     */
    public function getBrowsershot(): Browsershot
    {
        if (! $this->browsershot) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }

    public function getBaseUrl(): UriInterface
    {
        return $this->baseUrl;
    }

    /**
     * Start crawling from the given URL and block until the queue is drained.
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        // Default to http and "/" so relative link resolution has a sane base.
        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        if ($this->robotsTxt->allows((string) $crawlUrl->url) ||
            ! $this->respectRobots
        ) {
            $this->addToCrawlQueue($crawlUrl);
        }

        // Root of the depth tree used to enforce setMaximumDepth().
        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }

    /**
     * Recursively attach $url under $parentUrl in the depth tree.
     *
     * @return \Tree\Node\Node|null the newly created node, or null when the
     *                              parent URL does not occur in the tree.
     */
    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node
    {
        $node = $node ?? $this->depthTree;

        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode);

            // Stop searching once a subtree has accepted the URL.
            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }

    /**
     * Drain the crawl queue, one concurrent Guzzle pool per batch.
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => new $this->crawlRequestFulfilledClass($this),
                'rejected' => new $this->crawlRequestFailedClass($this),
            ]);

            $promise = $pool->promise();

            $promise->wait();
        }
    }

    /**
     * @deprecated This function will be removed in the next major version
     */
    public function endsWith($haystack, $needle)
    {
        // The previous implementation used strrpos(), which returns false
        // (coerced to 0) when the needle is absent, so any needle with the
        // same length as the haystack was wrongly reported as a suffix.
        if ($needle === '') {
            return true;
        }

        return substr($haystack, -strlen($needle)) === $needle;
    }

    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

    /**
     * Yield a GET request for every pending URL that the profile allows.
     *
     * Keys are the crawl-url ids so pool callbacks can map responses back.
     */
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }

    /**
     * Queue a URL unless the profile rejects it or it is already queued.
     */
    public function addToCrawlQueue(CrawlUrl $crawlUrl): Crawler
    {
        if (! $this->getCrawlProfile()->shouldCrawl($crawlUrl->url)) {
            return $this;
        }

        if ($this->getCrawlQueue()->has($crawlUrl->url)) {
            return $this;
        }

        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    public function maximumCrawlCountReached(): bool
    {
        $maximumCrawlCount = $this->getMaximumCrawlCount();

        // No limit configured means the crawl can continue indefinitely.
        if (is_null($maximumCrawlCount)) {
            return false;
        }

        return $this->getCrawlerUrlCount() >= $maximumCrawlCount;
    }
}
452