Completed
Pull Request — master (#139)
by
unknown
01:57
created

Crawler::addCrawlObserver()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 6
rs 9.4285
cc 1
eloc 3
nc 1
nop 1
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use Tree\Node\Node;
7
use GuzzleHttp\Pool;
8
use GuzzleHttp\Client;
9
use GuzzleHttp\Psr7\Uri;
10
use GuzzleHttp\Psr7\Request;
11
use InvalidArgumentException;
12
use GuzzleHttp\RequestOptions;
13
use Psr\Http\Message\UriInterface;
14
use Spatie\Browsershot\Browsershot;
15
use Psr\Http\Message\StreamInterface;
16
use Symfony\Component\DomCrawler\Link;
17
use Psr\Http\Message\ResponseInterface;
18
use Spatie\Crawler\CrawlQueue\CrawlQueue;
19
use GuzzleHttp\Exception\RequestException;
20
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
21
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
22
23
class Crawler
{
    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserver[] */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int Number of requests dispatched in parallel. */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int How many urls have been added to the queue so far. */
    protected $crawledUrlCount = 0;

    /** @var int|null Stop queueing new urls once this many were added; null means unlimited. */
    protected $maximumCrawlCount = null;

    /** @var int Response bodies are read up to this many bytes (default 2 MB). */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null Maximum link depth from the base url; null means unlimited. */
    protected $maximumDepth = null;

    /** @var \Tree\Node\Node Tree rooted at the base url, tracking link depth. */
    protected $depthTree;

    /** @var bool Render pages through Browsershot before extracting links. */
    protected $executeJavaScript = false;

    /** @var bool Skip link extraction on pages with a "nofollow" robots meta tag. */
    protected $honorMetaRobotsFollowPolicy = true;

    /** @var \Spatie\Browsershot\Browsershot|null Lazily created in getBrowsershot(). */
    protected $browsershot = null;

    /** @var array Default Guzzle options used when none are passed to create(). */
    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];
    /**
     * Build a crawler backed by a fresh Guzzle client.
     *
     * @param array $clientOptions Guzzle request options; an empty array
     *                             falls back to the class defaults.
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        // An empty options array means "use the default client configuration".
        $options = $clientOptions ?: self::$defaultClientOptions;

        return new static(new Client($options));
    }
    /**
     * @param \GuzzleHttp\Client $client
     * @param int $concurrency Number of requests crawled in parallel.
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;
        $this->concurrency = $concurrency;

        // Defaults: crawl every url, keep the queue in memory.
        $this->crawlProfile = new CrawlAllUrls();
        $this->crawlQueue = new CollectionCrawlQueue();
    }
    /**
     * Set how many requests may be in flight at once.
     *
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }
    /**
     * Responses larger than the specified value will only be read up to
     * that many bytes when extracting links.
     *
     * @param int $maximumResponseSizeInBytes
     *
     * @return $this
     */
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes)
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }
    /**
     * Limit the total number of urls that will be queued for crawling.
     *
     * @param int $maximumCrawlCount
     *
     * @return $this
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount)
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }
    /**
     * Limit how many links deep (from the base url) the crawler will go.
     *
     * @param int $maximumDepth
     *
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth)
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }
    /**
     * Swap in a custom crawl queue implementation.
     *
     * @param \Spatie\Crawler\CrawlQueue\CrawlQueue $crawlQueue
     *
     * @return $this
     */
    public function setCrawlQueue(CrawlQueue $crawlQueue)
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }
    /**
     * Render pages with Browsershot before extracting links.
     *
     * @return $this
     */
    public function executeJavaScript()
    {
        $this->executeJavaScript = true;

        return $this;
    }
    /**
     * Extract links from the raw response body without rendering JavaScript.
     *
     * @return $this
     */
    public function doNotExecuteJavaScript()
    {
        $this->executeJavaScript = false;

        return $this;
    }
    /**
     * Respect a "nofollow" robots meta tag: such pages yield no links.
     *
     * @return $this
     */
    public function honorMetaRobotsFollowPolicy()
    {
        $this->honorMetaRobotsFollowPolicy = true;

        return $this;
    }
    /**
     * Extract links even from pages carrying a "nofollow" robots meta tag.
     *
     * @return $this
     */
    public function doNotHonorMetaRobotsFollowPolicy()
    {
        $this->honorMetaRobotsFollowPolicy = false;

        return $this;
    }
    /**
     * Register one or more crawl observers, replacing any existing ones.
     *
     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers)
    {
        // A single observer is wrapped so setCrawlObservers() always gets a list.
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }
    /**
     * Replace the full list of crawl observers.
     *
     * @param \Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObservers(array $crawlObservers)
    {
        $this->crawlObservers = $crawlObservers;

        return $this;
    }
    /**
     * Append a single observer to the list of crawl observers.
     *
     * @param \Spatie\Crawler\CrawlObserver $crawlObserver
     *
     * @return $this
     */
    public function addCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObservers[] = $crawlObserver;

        return $this;
    }
    /**
     * Set the profile that decides which urls should be crawled.
     *
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }
    /**
     * Crawl the given url and everything reachable from it, then notify
     * every observer that crawling has finished.
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        // Normalise the base url: default to http and ensure a path exists.
        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $this->addToCrawlQueue(CrawlUrl::create($this->baseUrl));

        // Root the depth tree at the base url; link depth is measured from here.
        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }
    /**
     * Drain the crawl queue, dispatching pending requests in concurrent
     * batches until no pending urls remain.
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => function (ResponseInterface $response, int $index) {
                    $crawlUrl = $this->crawlQueue->getUrlById($index);
                    $this->handleCrawled($response, $crawlUrl);

                    // Unless the profile crawls subdomains, never follow links
                    // found on a page outside the base url's host.
                    if (! $this->crawlProfile instanceof CrawlSubdomains) {
                        if ($crawlUrl->url->getHost() !== $this->baseUrl->getHost()) {
                            return;
                        }
                    }

                    $body = $this->convertBodyToString($response->getBody(), $this->maximumResponseSize);

                    $this->addAllLinksToCrawlQueue(
                        $body,
                        $crawlUrl->url
                    );
                },
                'rejected' => function (RequestException $exception, int $index) {
                    // Fix: handleCrawlFailed() takes two parameters; the
                    // exception was previously passed a second, ignored time.
                    $this->handleCrawlFailed(
                        $exception,
                        $this->crawlQueue->getUrlById($index)
                    );
                },
            ]);

            $promise = $pool->promise();
            $promise->wait();
        }
    }
    /**
     * Determine whether $haystack ends with $needle.
     *
     * The previous implementation compared `strrpos() + strlen($needle)`
     * against the haystack length; when the needle did not occur at all,
     * strrpos() returned false (coerced to 0), producing false positives
     * whenever needle and haystack merely had the same length.
     *
     * @param string $haystack
     * @param string $needle
     *
     * @return bool
     */
    public function endsWith($haystack, $needle)
    {
        // Every string ends with the empty string.
        if ($needle === '') {
            return true;
        }

        return substr($haystack, -strlen($needle)) === $needle;
    }
    /**
     * Read at most $readMaximumBytes from the response body.
     *
     * @param \Psr\Http\Message\StreamInterface $bodyStream
     * @param int $readMaximumBytes
     *
     * @return string
     */
    protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string
    {
        // Rewind first: the stream may already have been consumed.
        $bodyStream->rewind();

        return $bodyStream->read($readMaximumBytes);
    }
    /**
     * Notify every observer that a url was crawled successfully.
     *
     * The old docblock claimed $response could be null, but the signature
     * requires a ResponseInterface; the documentation now matches.
     *
     * @param \Psr\Http\Message\ResponseInterface $response
     * @param \Spatie\Crawler\CrawlUrl $crawlUrl
     */
    protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawled(
                $crawlUrl->url,
                $response,
                $crawlUrl->foundOnUrl
            );
        }
    }
    /**
     * Notify every observer that crawling a url failed.
     *
     * @param \GuzzleHttp\Exception\RequestException $exception
     * @param \Spatie\Crawler\CrawlUrl $crawlUrl
     */
    protected function handleCrawlFailed(RequestException $exception, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->crawlFailed(
                $crawlUrl->url,
                $exception,
                $crawlUrl->foundOnUrl
            );
        }
    }
    /**
     * Lazily yield one GET request per pending url, keyed by crawl-url id.
     *
     * Urls rejected by the crawl profile are marked processed and skipped;
     * urls already processed are skipped outright. Observers are told about
     * each url before its request is yielded into the pool.
     */
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            // Give observers a chance to react before the request goes out.
            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }
    /**
     * Extract every link from the html and queue the ones worth crawling.
     *
     * A link is queued when it has an http(s) scheme, passes the crawl
     * profile, is not already queued, is within the depth limit, and the
     * maximum crawl count has not been reached.
     *
     * @param string $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     */
    protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl)
    {
        $allLinks = $this->extractAllLinks($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (UriInterface $url) {
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (UriInterface $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function (UriInterface $url) {
                return $this->crawlQueue->has($url);
            })
            ->each(function (UriInterface $url) use ($foundOnUrl) {
                $node = $this->addtoDepthTree($this->depthTree, $url, $foundOnUrl);

                // Skip tel: style pseudo-links that survive as url paths.
                if (strpos($url->getPath(), '/tel:') === 0) {
                    return;
                }

                if (! $this->shouldCrawl($node)) {
                    return;
                }

                if ($this->maximumCrawlCountReached()) {
                    return;
                }

                $this->addToCrawlQueue(CrawlUrl::create($url, $foundOnUrl));
            });
    }
    /**
     * A node is crawlable when no depth limit is set, or its depth in the
     * tree does not exceed the configured maximum.
     */
    protected function shouldCrawl(Node $node): bool
    {
        return is_null($this->maximumDepth)
            || $node->getDepth() <= $this->maximumDepth;
    }
    /**
     * Collect every followable link found in the given html.
     *
     * When JavaScript execution is enabled, the page is re-rendered through
     * Browsershot first. When the meta-robots follow policy is honored and
     * the page carries a robots meta tag containing "nofollow", null is
     * returned and no links are extracted. Anchor links with rel="nofollow"
     * are dropped, and unparseable uris are filtered out.
     *
     * @param string $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null
     */
    protected function extractAllLinks(string $html, UriInterface $foundOnUrl)
    {
        if ($this->executeJavaScript) {
            $html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
        }

        $domCrawler = new DomCrawler($html, $foundOnUrl);

        if ($this->honorMetaRobotsFollowPolicy === true) {
            // Follow links only when no robots meta tag contains "nofollow".
            // The unused $i closure parameter was removed.
            $shouldFollow = $domCrawler
                    ->filterXPath('//head/meta[@name="robots"]/@content')
                    ->reduce(function (DomCrawler $node) {
                        return strpos($node->text(), 'nofollow') !== false;
                    })
                    ->count() === 0;

            if ($shouldFollow === false) {
                return;
            }
        }

        return collect($domCrawler->filterXPath('//a')->links())
            ->reject(function (Link $link) {
                return $link->getNode()->getAttribute('rel') === 'nofollow';
            })
            ->map(function (Link $link) {
                try {
                    return new Uri($link->getUri());
                } catch (InvalidArgumentException $exception) {
                    // Invalid uris become null and are removed by filter() below.
                    return;
                }
            })
            ->filter();
    }
    /**
     * Strip the fragment: urls differing only by fragment are the same page.
     */
    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        return $url->withFragment('');
    }
    /**
     * Only http and https urls are worth requesting.
     */
    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        // Strict comparison: getScheme() always returns a string, and
        // loose in_array() comparisons are a classic PHP footgun.
        return in_array($uri->getScheme(), ['http', 'https'], true);
    }
    /**
     * Insert $url into the depth tree beneath the node whose value equals
     * $parentUrl, returning the created node, or null when the parent is
     * not found in this subtree.
     *
     * The definition was named `addtoDepthTree` while the recursive call
     * already used `addToDepthTree`; method names are case-insensitive in
     * PHP, so unifying on the camel-cased spelling is safe for callers.
     *
     * @param \Tree\Node\Node $node
     * @param \Psr\Http\Message\UriInterface $url
     * @param \Psr\Http\Message\UriInterface $parentUrl
     *
     * @return \Tree\Node\Node|null
     */
    protected function addToDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl)
    {
        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        // Depth-first search: stop at the first subtree that accepts the url.
        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($currentNode, $url, $parentUrl);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }
    /**
     * Render the page through Browsershot and return the decoded body html.
     */
    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
    {
        $html = $this->getBrowsershot()
            ->setUrl((string) $foundOnUrl)
            ->bodyHtml();

        return html_entity_decode($html);
    }
    /**
     * Lazily create the Browsershot instance used for JavaScript rendering.
     */
    protected function getBrowsershot(): Browsershot
    {
        if (! $this->browsershot) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }
    /**
     * Use a preconfigured Browsershot instance for JavaScript rendering.
     *
     * @param \Spatie\Browsershot\Browsershot $browsershot
     *
     * @return $this
     */
    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }
    /**
     * Add the url to the queue and bump the crawled-url counter.
     *
     * @param \Spatie\Crawler\CrawlUrl $crawlUrl
     *
     * @return $this
     */
    protected function addToCrawlQueue(CrawlUrl $crawlUrl)
    {
        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }
    /**
     * Whether the configured crawl limit (if any) has been hit.
     */
    protected function maximumCrawlCountReached(): bool
    {
        return ! is_null($this->maximumCrawlCount)
            && $this->crawledUrlCount >= $this->maximumCrawlCount;
    }
}
543