Completed
Pull Request — master (#246)
by
unknown
01:24
created

Crawler::setUserAgent()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 6
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 1
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use Tree\Node\Node;
7
use GuzzleHttp\Pool;
8
use GuzzleHttp\Client;
9
use GuzzleHttp\Psr7\Uri;
10
use GuzzleHttp\Psr7\Request;
11
use Spatie\Robots\RobotsTxt;
12
use GuzzleHttp\RequestOptions;
13
use Psr\Http\Message\UriInterface;
14
use Spatie\Browsershot\Browsershot;
15
use Spatie\Crawler\CrawlQueue\CrawlQueue;
16
use Spatie\Crawler\Handlers\CrawlRequestFailed;
17
use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
18
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
19
use Spatie\Crawler\Exception\InvalidCrawlRequestHandler;
20
21
class Crawler
{
    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserverCollection */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int */
    protected $crawledUrlCount = 0;

    /** @var int|null */
    protected $maximumCrawlCount = null;

    /** @var int */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null */
    protected $maximumDepth = null;

    /** @var bool */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var \Spatie\Browsershot\Browsershot|null */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt|null Parsed robots.txt; stays null until startCrawling() runs. */
    protected $robotsTxt = null;

    /** @var string Class name instantiated to handle fulfilled crawl requests. */
    protected $crawlRequestFulfilledClass;

    /** @var string Class name instantiated to handle failed crawl requests. */
    protected $crawlRequestFailedClass;

    /** @var int Delay between requests, stored in microseconds (see setDelayBetweenRequests()). */
    protected $delayBetweenRequests = 0;

    /** @var string User agent used for robots.txt matching; '*' matches any agent. */
    protected $userAgent = '*';

    /** @var array Default Guzzle client options applied by create() when none are given. */
    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * Create a crawler backed by a fresh Guzzle client.
     *
     * @param array $clientOptions Guzzle options; when empty, the class defaults are used.
     *
     * @return \Spatie\Crawler\Crawler
     */
    public static function create(array $clientOptions = []): Crawler
    {
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : static::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }

    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CollectionCrawlQueue();

        $this->crawlObservers = new CrawlObserverCollection();

        $this->crawlRequestFulfilledClass = CrawlRequestFulfilled::class;

        $this->crawlRequestFailedClass = CrawlRequestFailed::class;
    }

    public function setConcurrency(int $concurrency): Crawler
    {
        $this->concurrency = $concurrency;

        return $this;
    }

    public function setMaximumResponseSize(int $maximumResponseSizeInBytes): Crawler
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }

    public function getMaximumResponseSize(): ?int
    {
        return $this->maximumResponseSize;
    }

    public function setMaximumCrawlCount(int $maximumCrawlCount): Crawler
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }

    public function getMaximumCrawlCount(): ?int
    {
        return $this->maximumCrawlCount;
    }

    public function getCrawlerUrlCount(): int
    {
        return $this->crawledUrlCount;
    }

    public function setMaximumDepth(int $maximumDepth): Crawler
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }

    public function getMaximumDepth(): ?int
    {
        return $this->maximumDepth;
    }

    /**
     * @param int $delay The delay in milliseconds.
     *
     * @return Crawler
     */
    public function setDelayBetweenRequests(int $delay): Crawler
    {
        // Stored as microseconds, presumably for direct use with usleep() in
        // the request handlers — confirm against the handler classes.
        $this->delayBetweenRequests = ($delay * 1000);

        return $this;
    }

    /**
     * @return int The delay in microseconds (the milliseconds passed to
     *             setDelayBetweenRequests() multiplied by 1000).
     */
    public function getDelayBetweenRequests(): int
    {
        return $this->delayBetweenRequests;
    }

    public function ignoreRobots(): Crawler
    {
        $this->respectRobots = false;

        return $this;
    }

    public function respectRobots(): Crawler
    {
        $this->respectRobots = true;

        return $this;
    }

    public function mustRespectRobots(): bool
    {
        return $this->respectRobots;
    }

    /**
     * NOTE(review): $robotsTxt is null until startCrawling() has run; calling
     * this earlier will raise a TypeError because of the non-nullable return
     * type — confirm this is intended before widening the return type.
     */
    public function getRobotsTxt(): RobotsTxt
    {
        return $this->robotsTxt;
    }

    public function setCrawlQueue(CrawlQueue $crawlQueue): Crawler
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }

    public function getCrawlQueue(): CrawlQueue
    {
        return $this->crawlQueue;
    }

    public function executeJavaScript(): Crawler
    {
        $this->executeJavaScript = true;

        return $this;
    }

    public function doNotExecuteJavaScript(): Crawler
    {
        $this->executeJavaScript = false;

        return $this;
    }

    public function mayExecuteJavascript(): bool
    {
        return $this->executeJavaScript;
    }

    /**
     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers): Crawler
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }

    public function setCrawlObservers(array $crawlObservers): Crawler
    {
        $this->crawlObservers = new CrawlObserverCollection($crawlObservers);

        return $this;
    }

    public function addCrawlObserver(CrawlObserver $crawlObserver): Crawler
    {
        $this->crawlObservers->addObserver($crawlObserver);

        return $this;
    }

    public function getCrawlObservers(): CrawlObserverCollection
    {
        return $this->crawlObservers;
    }

    public function setCrawlProfile(CrawlProfile $crawlProfile): Crawler
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    public function getCrawlProfile(): CrawlProfile
    {
        return $this->crawlProfile;
    }

    public function setCrawlFulfilledHandlerClass(string $crawlRequestFulfilledClass): Crawler
    {
        $baseClass = CrawlRequestFulfilled::class;

        if (! is_subclass_of($crawlRequestFulfilledClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFulfilledClass, $baseClass);
        }

        $this->crawlRequestFulfilledClass = $crawlRequestFulfilledClass;

        return $this;
    }

    public function setCrawlFailedHandlerClass(string $crawlRequestFailedClass): Crawler
    {
        $baseClass = CrawlRequestFailed::class;

        if (! is_subclass_of($crawlRequestFailedClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFailedClass, $baseClass);
        }

        $this->crawlRequestFailedClass = $crawlRequestFailedClass;

        return $this;
    }

    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }

    public function setUserAgent(string $userAgent): Crawler
    {
        // Lowercased — presumably so it matches robots.txt user-agent
        // directives case-insensitively; confirm against Spatie\Robots.
        $this->userAgent = strtolower($userAgent);

        return $this;
    }

    public function getUserAgent(): string
    {
        return $this->userAgent;
    }

    public function getBrowsershot(): Browsershot
    {
        // Lazily created so callers that never execute JavaScript don't pay
        // for a Browsershot instance.
        if (! $this->browsershot) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }

    public function getBaseUrl(): UriInterface
    {
        return $this->baseUrl;
    }

    /**
     * Start crawling from the given URL and block until the queue is drained.
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        // Normalize: default to http and a root path so robots.txt lookup
        // and queue de-duplication see a canonical URL.
        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        if ($this->robotsTxt->allows((string) $crawlUrl->url, (string) $this->userAgent) ||
            ! $this->respectRobots
        ) {
            $this->addToCrawlQueue($crawlUrl);
        }

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }

    /**
     * Record $url as a child of $parentUrl in the depth tree and return the
     * new node, or null when $parentUrl is not found in the (sub)tree.
     *
     * When no maximum depth is configured the tree is not maintained and a
     * detached node is returned.
     */
    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node
    {
        if (is_null($this->maximumDepth)) {
            return new Node((string) $url);
        }

        $node = $node ?? $this->depthTree;

        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        // Depth-first search for the parent; stop at the first match.
        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }

    /**
     * Drain the crawl queue: keep dispatching concurrent request pools until
     * no pending URLs remain (fulfilled responses may enqueue new URLs).
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => new $this->crawlRequestFulfilledClass($this),
                'rejected' => new $this->crawlRequestFailedClass($this),
            ]);

            $promise = $pool->promise();

            $promise->wait();
        }
    }

    /**
     * Determine whether $haystack ends with $needle.
     *
     * @deprecated This function will be removed in the next major version
     *
     * @param string $haystack
     * @param string $needle
     *
     * @return bool
     */
    public function endsWith($haystack, $needle)
    {
        // strrpos() returns false when the needle never occurs; the previous
        // implementation coerced that to 0, so an absent needle whose length
        // equals the haystack length was wrongly reported as a suffix.
        $position = strrpos($haystack, $needle);

        if ($position === false) {
            return false;
        }

        return $position + strlen($needle) === strlen($haystack);
    }

    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

    /**
     * Yield one GET request per pending URL that the profile allows, keyed by
     * the crawl-url id so pool callbacks can map responses back to queue
     * entries. Each yielded URL is marked processed before the request runs.
     */
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }

    /**
     * Add a URL to the queue unless the profile rejects it or it is already
     * queued. Increments the crawled-url counter on every successful add.
     */
    public function addToCrawlQueue(CrawlUrl $crawlUrl): Crawler
    {
        if (! $this->getCrawlProfile()->shouldCrawl($crawlUrl->url)) {
            return $this;
        }

        if ($this->getCrawlQueue()->has($crawlUrl->url)) {
            return $this;
        }

        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    public function maximumCrawlCountReached(): bool
    {
        $maximumCrawlCount = $this->getMaximumCrawlCount();

        // No limit configured means the crawl never stops on count.
        if (is_null($maximumCrawlCount)) {
            return false;
        }

        return $this->getCrawlerUrlCount() >= $maximumCrawlCount;
    }
}
490