Completed
Pull Request — master (#168)
by
unknown
01:49 queued 11s
created

Crawler::setCrawlFulfilledHandlerClass()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 10
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 10
rs 9.4285
cc 2
eloc 5
nc 2
nop 1
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use Spatie\Crawler\Exception\InvalidCrawlRequestHandlerException;
7
use Spatie\Crawler\Handlers\CrawlRequestFailedAbstract;
8
use Spatie\Crawler\Handlers\CrawlRequestFulfilledAbstract;
9
use Tree\Node\Node;
10
use GuzzleHttp\Pool;
11
use GuzzleHttp\Client;
12
use GuzzleHttp\Psr7\Uri;
13
use GuzzleHttp\Psr7\Request;
14
use Spatie\Robots\RobotsTxt;
15
use GuzzleHttp\RequestOptions;
16
use Psr\Http\Message\UriInterface;
17
use Spatie\Browsershot\Browsershot;
18
use Spatie\Crawler\CrawlQueue\CrawlQueue;
19
use Spatie\Crawler\Handlers\CrawlRequestFailed;
20
use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
21
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
22
23
class Crawler
{
    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface Base URL of the current crawl; set in startCrawling(). */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserverCollection */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile Decides which URLs may be crawled. */
    protected $crawlProfile;

    /** @var int Number of requests Guzzle's pool runs concurrently. */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int Count of URLs added to the queue so far. */
    protected $crawledUrlCount = 0;

    /** @var int|null Null means no limit. */
    protected $maximumCrawlCount = null;

    /** @var int Maximum response size in bytes (default 2 MB). */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null Null means unlimited depth. */
    protected $maximumDepth = null;

    /** @var bool */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node Tree tracking link depth relative to the base URL. */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var \Spatie\Browsershot\Browsershot|null Lazily created in getBrowsershot(). */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt|null Set in startCrawling(). */
    protected $robotsTxt = null;

    /** @var string|null Custom fulfilled-handler class name, or null for the default. */
    protected $crawlRequestFulfilledClass = null;

    /** @var string|null Custom failed-handler class name, or null for the default. */
    protected $crawlRequestFailedClass = null;

    /** @var array Guzzle client options used by create() when none are given. */
    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];
83
84
    /**
     * Build a Crawler around a freshly configured Guzzle client.
     *
     * @param array $clientOptions Guzzle request options; when empty, the class defaults are used.
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        if (count($clientOptions) === 0) {
            $clientOptions = static::$defaultClientOptions;
        }

        return new static(new Client($clientOptions));
    }
99
100
    /**
     * @param \GuzzleHttp\Client $client      Client used to perform all requests.
     * @param int                $concurrency How many requests run in parallel (default 10).
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        // Defaults; each can be replaced through its setter before crawling starts.
        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CollectionCrawlQueue();

        $this->crawlObservers = new CrawlObserverCollection();
    }
112
113
    /** Set how many requests the pool performs concurrently. */
    public function setConcurrency(int $concurrency): Crawler
    {
        $this->concurrency = $concurrency;

        return $this;
    }
119
120
    /** Limit the number of response bytes processed per request. */
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes): Crawler
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }
126
127
    /** Maximum response size in bytes. Never null in practice (property defaults to 2 MB). */
    public function getMaximumResponseSize(): ?int
    {
        return $this->maximumResponseSize;
    }
131
132
    /** Stop queueing new URLs once this many have been added to the crawl. */
    public function setMaximumCrawlCount(int $maximumCrawlCount): Crawler
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }
138
139
    /** @return int|null Null when no crawl-count limit has been set. */
    public function getMaximumCrawlCount(): ?int
    {
        return $this->maximumCrawlCount;
    }
143
144
    /** Number of URLs added to the crawl queue so far (incremented in addToCrawlQueue()). */
    public function getCrawlerUrlCount(): int
    {
        return $this->crawledUrlCount;
    }
148
149
    /** Limit how many links deep the crawler follows from the base URL. */
    public function setMaximumDepth(int $maximumDepth): Crawler
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }
155
156
    /** @return int|null Null when depth is unlimited. */
    public function getMaximumDepth(): ?int
    {
        return $this->maximumDepth;
    }
160
161
    /** Crawl URLs even when robots.txt disallows them. */
    public function ignoreRobots(): Crawler
    {
        $this->respectRobots = false;

        return $this;
    }
167
168
    /** Honour robots.txt rules (the default). */
    public function respectRobots(): Crawler
    {
        $this->respectRobots = true;

        return $this;
    }
174
175
    /** Whether robots.txt rules are currently honoured. */
    public function mustRespectRobots(): bool
    {
        return $this->respectRobots;
    }
179
180
    /**
     * The parsed robots.txt of the current crawl.
     *
     * NOTE(review): $robotsTxt is only assigned inside startCrawling(); calling
     * this earlier returns null, which violates the non-nullable return type —
     * confirm callers only invoke this during a crawl.
     */
    public function getRobotsTxt(): RobotsTxt
    {
        return $this->robotsTxt;
    }
184
185
    /** Replace the queue implementation (e.g. for persistence across processes). */
    public function setCrawlQueue(CrawlQueue $crawlQueue): Crawler
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }
191
192
    /** The queue holding pending and processed URLs. */
    public function getCrawlQueue(): CrawlQueue
    {
        return $this->crawlQueue;
    }
196
197
    /** Render pages with Browsershot so JavaScript-generated links are found. */
    public function executeJavaScript(): Crawler
    {
        $this->executeJavaScript = true;

        return $this;
    }
203
204
    /** Disable JavaScript rendering (the default). */
    public function doNotExecuteJavaScript(): Crawler
    {
        $this->executeJavaScript = false;

        return $this;
    }
210
211
    /** Whether JavaScript rendering is currently enabled. */
    public function mayExecuteJavascript(): bool
    {
        return $this->executeJavaScript;
    }
215
216
    /**
     * Set a single observer, or an array of observers, replacing any existing ones.
     *
     * @param \Spatie\Crawler\CrawlObserver|array<\Spatie\Crawler\CrawlObserver> $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers): Crawler
    {
        if (!is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }
229
230
    /**
     * Replace all observers with the given array.
     *
     * @param array<\Spatie\Crawler\CrawlObserver> $crawlObservers
     */
    public function setCrawlObservers(array $crawlObservers): Crawler
    {
        $this->crawlObservers = new CrawlObserverCollection($crawlObservers);

        return $this;
    }
236
237
    /** Append an observer without removing the existing ones. */
    public function addCrawlObserver(CrawlObserver $crawlObserver): Crawler
    {
        $this->crawlObservers->addObserver($crawlObserver);

        return $this;
    }
243
244
    /** All registered observers. */
    public function getCrawlObservers(): CrawlObserverCollection
    {
        return $this->crawlObservers;
    }
248
249
    /** Set the profile that decides which URLs should be crawled. */
    public function setCrawlProfile(CrawlProfile $crawlProfile): Crawler
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }
255
256
    /** The active crawl profile. */
    public function getCrawlProfile(): CrawlProfile
    {
        return $this->crawlProfile;
    }
260
261
    /**
     * Set the Browsershot instance used when JavaScript execution is enabled.
     *
     * Fix: declares the `: Crawler` return type that every other fluent setter
     * in this class already has (the method already returned $this, so this is
     * backward compatible for callers).
     */
    public function setBrowsershot(Browsershot $browsershot): Crawler
    {
        $this->browsershot = $browsershot;

        return $this;
    }
267
268
    /** Get the Browsershot instance, lazily creating a default one on first use. */
    public function getBrowsershot(): Browsershot
    {
        if (is_null($this->browsershot)) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }
276
277
    /** Resolve the handler invoked for each fulfilled (successful) request. */
    public function getCrawlFulfilledHandler(): CrawlRequestFulfilledAbstract
    {
        $handlerClass = $this->crawlRequestFulfilledClass;

        if (is_null($handlerClass)) {
            return new CrawlRequestFulfilled($this);
        }

        return new $handlerClass($this);
    }
282
283
    /**
     * Register the class instantiated to handle fulfilled requests.
     *
     * @throws InvalidCrawlRequestHandlerException When the class does not extend the abstract handler.
     */
    public function setCrawlFulfilledHandlerClass(string $crawlRequestFulfilledClass): Crawler
    {
        $abstract = CrawlRequestFulfilledAbstract::class;

        if (! is_subclass_of($crawlRequestFulfilledClass, $abstract)) {
            throw new InvalidCrawlRequestHandlerException("Fulfilled handler class must extend {$abstract}");
        }

        $this->crawlRequestFulfilledClass = $crawlRequestFulfilledClass;

        return $this;
    }
293
294
    /** Resolve the handler invoked for each failed (rejected) request. */
    public function getCrawlFailedHandler(): CrawlRequestFailedAbstract
    {
        $handlerClass = $this->crawlRequestFailedClass;

        if (is_null($handlerClass)) {
            return new CrawlRequestFailed($this);
        }

        return new $handlerClass($this);
    }
299
300
    /**
     * Register the class instantiated to handle failed requests.
     *
     * @throws InvalidCrawlRequestHandlerException When the class does not extend the abstract handler.
     */
    public function setCrawlFailedHandlerClass(string $crawlRequestFailedClass): Crawler
    {
        $abstract = CrawlRequestFailedAbstract::class;

        if (! is_subclass_of($crawlRequestFailedClass, $abstract)) {
            throw new InvalidCrawlRequestHandlerException("Failed handler class must extend {$abstract}");
        }

        $this->crawlRequestFailedClass = $crawlRequestFailedClass;

        return $this;
    }
310
311
    /** Base URL of the current crawl; only set after startCrawling() has begun. */
    public function getBaseUrl(): UriInterface
    {
        return $this->baseUrl;
    }
315
316
    /**
     * Crawl the given URL and everything reachable from it.
     *
     * Normalizes the base URL (defaults the scheme to http and the path to "/"),
     * fetches and parses robots.txt, seeds the queue and the depth tree, runs the
     * crawl loop, then notifies every observer that crawling finished.
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        // Seed the queue unless robots.txt forbids the base URL while robots are respected.
        if ($this->robotsTxt->allows((string) $crawlUrl->url) ||
            ! $this->respectRobots
        ) {
            $this->addToCrawlQueue($crawlUrl);
        }

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }
353
354
    /**
     * Attach $url to the depth tree as a child of the node holding $parentUrl.
     *
     * Depth-first search: if the current node matches the parent URL, the new
     * child node is created and returned; otherwise the children are searched
     * recursively. Returns null when the parent URL is not in the tree.
     */
    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node
    {
        if (is_null($node)) {
            $node = $this->depthTree;
        }

        if ($node->getValue() === (string) $parentUrl) {
            $child = new Node((string) $url);

            $node->addChild($child);

            return $child;
        }

        foreach ($node->getChildren() as $childNode) {
            $match = $this->addToDepthTree($url, $parentUrl, $childNode);

            if ($match !== null) {
                return $match;
            }
        }

        return null;
    }
378
379
    /**
     * Drain the crawl queue: run pooled requests until no pending URLs remain.
     *
     * Each iteration builds a fresh Guzzle Pool fed by the lazy request
     * generator; fulfilled/rejected responses go to the configured handlers,
     * which may enqueue newly discovered URLs (hence the outer loop).
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $poolConfig = [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => $this->getCrawlFulfilledHandler(),
                'rejected' => $this->getCrawlFailedHandler(),
            ];

            $pool = new Pool($this->client, $this->getCrawlRequests(), $poolConfig);

            $pool->promise()->wait();
        }
    }
394
395
    /**
     * Determine whether $haystack ends with $needle.
     *
     * Fixes a bug in the previous strrpos-based check: when the needle did not
     * occur at all, strrpos() returned false, which was coerced to 0, producing
     * a false positive whenever strlen($needle) === strlen($haystack)
     * (e.g. endsWith('abc', 'xyz') was true).
     *
     * @deprecated This function will be removed in the next major version
     *
     * @param string $haystack
     * @param string $needle
     *
     * @return bool
     */
    public function endsWith($haystack, $needle)
    {
        return $needle === '' || substr($haystack, -strlen($needle)) === $needle;
    }
403
404
    /**
     * Build a RobotsTxt instance for the /robots.txt of the given URI's host.
     *
     * NOTE(review): a Uri object is passed to RobotsTxt::create(); presumably
     * it accepts a URL source and fetches it — confirm against spatie/robots.
     */
    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }
408
409
    /**
     * Lazily yield a GET request for every pending URL in the queue.
     *
     * Requests are keyed by the CrawlUrl id so the pool handlers can map each
     * response back to its CrawlUrl. URLs rejected by the crawl profile are
     * marked processed and skipped; observers are notified via willCrawl()
     * just before a URL is yielded.
     */
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            // NOTE(review): this branch skips without marking the URL processed;
            // assumes the queue never reports an already-processed URL as pending,
            // otherwise this loop would never advance — confirm in CrawlQueue impls.
            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }
430
431
    /**
     * Queue a URL for crawling unless the profile rejects it or it is already queued.
     *
     * Increments the crawled-URL counter for every URL actually added.
     */
    public function addToCrawlQueue(CrawlUrl $crawlUrl): Crawler
    {
        // Short-circuit keeps the original call order: profile check first, queue check second.
        $shouldAdd = $this->getCrawlProfile()->shouldCrawl($crawlUrl->url)
            && ! $this->getCrawlQueue()->has($crawlUrl->url);

        if ($shouldAdd) {
            $this->crawledUrlCount++;

            $this->crawlQueue->add($crawlUrl);
        }

        return $this;
    }
447
448
    /** Whether the configured crawl-count limit (if any) has been reached. */
    public function maximumCrawlCountReached(): bool
    {
        $maximumCrawlCount = $this->getMaximumCrawlCount();

        return ! is_null($maximumCrawlCount)
            && $this->getCrawlerUrlCount() >= $maximumCrawlCount;
    }
458
}
459