Completed
Push — master ( 4627b3...ac75db )
by Freek
04:07
created

Crawler::setCrawlObservers()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 6
rs 9.4285
cc 1
eloc 3
nc 1
nop 1
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use Tree\Node\Node;
7
use GuzzleHttp\Pool;
8
use GuzzleHttp\Client;
9
use GuzzleHttp\Psr7\Uri;
10
use GuzzleHttp\Psr7\Request;
11
use InvalidArgumentException;
12
use GuzzleHttp\RequestOptions;
13
use Illuminate\Support\Collection;
14
use Psr\Http\Message\UriInterface;
15
use Spatie\Browsershot\Browsershot;
16
use Psr\Http\Message\StreamInterface;
17
use Symfony\Component\DomCrawler\Link;
18
use Psr\Http\Message\ResponseInterface;
19
use Spatie\Crawler\CrawlQueue\CrawlQueue;
20
use GuzzleHttp\Exception\RequestException;
21
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
22
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
23
24
class Crawler
25
{
26
    /** @var \GuzzleHttp\Client HTTP client used to send the crawl requests */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface URL the current crawl started from */
    protected $baseUrl;

    /**
     * Observers notified about crawl events.
     *
     * Initialised to an empty array so the notification loops can iterate
     * safely when no observer has been registered.
     *
     * @var \Spatie\Crawler\CrawlObserver[]
     */
    protected $crawlObservers = [];

    /** @var \Spatie\Crawler\CrawlProfile decides which URLs are crawled */
    protected $crawlProfile;

    /** @var int value handed to the request pool as its 'concurrency' option */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int number of URLs that have been added to the crawl queue */
    protected $crawledUrlCount = 0;

    /** @var int|null upper bound for $crawledUrlCount; null means no limit */
    protected $maximumCrawlCount = null;

    /** @var int maximum number of bytes read from a response body */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null deepest depth-tree level that is still crawled; null means no limit */
    protected $maximumDepth = null;

    /** @var \Tree\Node\Node root of the tree used to track link depth */
    protected $depthTree;

    /** @var bool whether links are extracted from the Browsershot-rendered HTML */
    protected $executeJavaScript = false;

    /** @var \Spatie\Browsershot\Browsershot|null created on first use when not injected */
    protected $browsershot = null;

    /** @var array client options used by create() when none are passed in */
    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];
71
72
    /**
     * Build a crawler around a freshly created Guzzle client.
     *
     * The given options are passed to the client unchanged; when the array is
     * empty the class defaults ($defaultClientOptions) are used instead.
     *
     * @param array $clientOptions
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        if (count($clientOptions) === 0) {
            $clientOptions = self::$defaultClientOptions;
        }

        return new static(new Client($clientOptions));
    }
87
88
    /**
     * Starts out with a CrawlAllUrls profile and an empty CollectionCrawlQueue.
     *
     * @param \GuzzleHttp\Client $client      client used to send the crawl requests
     * @param int                $concurrency value handed to the request pool as 'concurrency'
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->crawlQueue = new CollectionCrawlQueue();
        $this->crawlProfile = new CrawlAllUrls();

        $this->concurrency = $concurrency;
        $this->client = $client;
    }
98
99
    /**
     * Set the value handed to the request pool as its 'concurrency' option.
     *
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }
110
111
    /**
     * Set the maximum number of bytes that is read from a response body
     * when it is scanned for links.
     *
     * @param int $maximumResponseSizeInBytes
     *
     * @return $this
     */
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes)
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }
124
125
    /**
     * Cap the number of URLs that may be added to the crawl queue.
     *
     * @param int $maximumCrawlCount
     *
     * @return $this
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount)
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }
136
137
    /**
     * Set the deepest level of the depth tree that is still crawled.
     *
     * @param int $maximumDepth
     *
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth)
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }
148
149
    /**
     * Replace the queue that holds the URLs to crawl.
     *
     * @param \Spatie\Crawler\CrawlQueue\CrawlQueue $crawlQueue
     *
     * @return $this
     */
    public function setCrawlQueue(CrawlQueue $crawlQueue)
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }
159
160
    /**
     * Turn JavaScript execution on: links are then extracted from the HTML
     * obtained through Browsershot instead of the raw response body.
     *
     * @return $this
     */
    public function executeJavaScript()
    {
        $this->executeJavaScript = true;

        return $this;
    }
169
170
    /**
     * Turn JavaScript execution off: links are extracted from the raw
     * response body.
     *
     * @return $this
     */
    public function doNotExecuteJavaScript()
    {
        $this->executeJavaScript = false;

        return $this;
    }
179
180
    /**
     * Replace the registered observers with the given observer(s).
     *
     * A single observer is wrapped in an array before being handed to
     * setCrawlObservers(); an array is passed through unchanged.
     *
     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers)
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }
193
194
    /**
     * Replace the registered observers with the given list.
     *
     * @param \Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObservers(array $crawlObservers)
    {
        $this->crawlObservers = $crawlObservers;

        return $this;
    }
200
201
    /**
     * Register one more observer next to the ones already set.
     *
     * @param \Spatie\Crawler\CrawlObserver $crawlObserver
     *
     * @return $this
     */
    public function addCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObservers[] = $crawlObserver;

        return $this;
    }
207
208
    /**
     * Set the profile whose shouldCrawl() decides which URLs are crawled.
     *
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }
219
220
    /**
     * Crawl starting at the given URL, then tell every observer that the
     * crawl has finished.
     *
     * A URL without a scheme gets 'http'; a URL without a path gets '/'.
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        $uri = $baseUrl instanceof UriInterface ? $baseUrl : new Uri($baseUrl);

        if ($uri->getScheme() === '') {
            $uri = $uri->withScheme('http');
        }

        if ($uri->getPath() === '') {
            $uri = $uri->withPath('/');
        }

        $this->baseUrl = $uri;

        // The start URL is the first queue entry and the root of the depth tree
        $this->addToCrawlQueue(CrawlUrl::create($this->baseUrl));

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $observer) {
            $observer->finishedCrawling();
        }
    }
251
252
    /**
     * Keep sending pooled requests for as long as the queue reports pending URLs.
     *
     * Each pass builds a Guzzle pool fed by getCrawlRequests() and waits for it
     * to settle. The index passed to the callbacks is the key yielded by
     * getCrawlRequests(), i.e. the id of the crawl URL.
     */
    protected function startCrawlingQueue()
    {
        $onFulfilled = function (ResponseInterface $response, int $index) {
            $crawlUrl = $this->crawlQueue->getUrlById($index);

            $this->handleResponse($response, $crawlUrl);

            // Unless subdomains are being crawled, links found on pages of
            // another host than the base URL are not followed
            if (
                ! ($this->crawlProfile instanceof CrawlSubdomains)
                && $crawlUrl->url->getHost() !== $this->baseUrl->getHost()
            ) {
                return;
            }

            $html = $this->convertBodyToString($response->getBody(), $this->maximumResponseSize);

            $this->addAllLinksToCrawlQueue($html, $crawlUrl->url);
        };

        $onRejected = function (RequestException $exception, int $index) {
            // Observers are still notified; the response may be null here
            $this->handleResponse(
                $exception->getResponse(),
                $this->crawlQueue->getUrlById($index)
            );
        };

        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => $onFulfilled,
                'rejected' => $onRejected,
            ]);

            $pool->promise()->wait();
        }
    }
287
288
    /**
     * Tell whether $haystack ends with $needle (byte-wise comparison).
     *
     * The tail of the haystack is compared directly. An arithmetic check on
     * strrpos() is unsafe because a failed search returns false, which counts
     * as 0 and reports a match for any two different strings of equal length.
     *
     * An empty needle is considered to match every haystack.
     *
     * @param string $haystack
     * @param string $needle
     *
     * @return bool
     */
    public function endsWith($haystack, $needle)
    {
        $haystack = (string) $haystack;
        $needle = (string) $needle;

        $needleLength = strlen($needle);

        if ($needleLength === 0) {
            return true;
        }

        if ($needleLength > strlen($haystack)) {
            return false;
        }

        return substr($haystack, -$needleLength) === $needle;
    }
293
294
    /**
     * Read the body stream from its start, up to the given number of bytes.
     *
     * @param \Psr\Http\Message\StreamInterface $bodyStream
     * @param int                               $readMaximumBytes
     *
     * @return string
     */
    protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string
    {
        // Start at the first byte regardless of earlier reads
        $bodyStream->rewind();

        return $bodyStream->read($readMaximumBytes);
    }
302
303
    /**
     * Tell every observer that a URL has been crawled.
     *
     * @param \Psr\Http\Message\ResponseInterface|null $response null when no response is available
     * @param \Spatie\Crawler\CrawlUrl                 $crawlUrl
     */
    protected function handleResponse($response, CrawlUrl $crawlUrl)
    {
        $url = $crawlUrl->url;
        $foundOnUrl = $crawlUrl->foundOnUrl;

        foreach ($this->crawlObservers as $observer) {
            $observer->hasBeenCrawled($url, $response, $foundOnUrl);
        }
    }
313
314
    /**
     * Lazily yield a GET request for every pending URL that should be crawled.
     *
     * Each request is keyed by the id of its crawl URL, so the pool callbacks
     * can look the URL up again. Observers are told about a URL right before
     * its request is yielded.
     *
     * @return \Generator
     */
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                // A rejected URL must leave the pending set, otherwise the
                // queue can hand out the same URL again and this loop never
                // ends (reachable for the start URL, which is queued without
                // consulting the profile).
                // NOTE(review): assumes markAsProcessed() makes
                // getFirstPendingUrl() skip the URL; confirm against the
                // CrawlQueue implementation in use.
                $this->crawlQueue->markAsProcessed($crawlUrl);

                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }
334
335
    /**
     * Queue the links found in the given HTML.
     *
     * Links are kept only when they have a crawlable scheme, are accepted by
     * the crawl profile (checked after normalisation) and are not in the queue
     * yet. Every remaining link is added to the depth tree; it is queued only
     * when it is within the maximum depth and the maximum crawl count has not
     * been reached.
     *
     * @param string                         $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl URL of the page the HTML came from
     */
    protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl)
    {
        $candidates = collect($this->extractAllLinks($html, $foundOnUrl))
            ->filter(function (UriInterface $url) {
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (UriInterface $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function (UriInterface $url) {
                return $this->crawlQueue->has($url);
            });

        foreach ($candidates as $url) {
            // Added to the tree even when it ends up not being queued
            $node = $this->addtoDepthTree($this->depthTree, $url, $foundOnUrl);

            if (! $this->shouldCrawl($node) || $this->maximumCrawlCountReached()) {
                continue;
            }

            $this->addToCrawlQueue(CrawlUrl::create($url, $foundOnUrl));
        }
    }
368
369
    /**
     * Tell whether the node lies within the configured maximum depth.
     * Always true when no maximum depth is set.
     *
     * @param \Tree\Node\Node $node
     *
     * @return bool
     */
    protected function shouldCrawl(Node $node): bool
    {
        return is_null($this->maximumDepth)
            || $node->getDepth() <= $this->maximumDepth;
    }
377
378
    /**
     * Collect the URIs of all <a> elements on a page.
     *
     * With JavaScript execution enabled, the HTML obtained through Browsershot
     * for $foundOnUrl is parsed instead of the given $html. Links whose URI
     * cannot be turned into a Uri object are left out.
     *
     * @param string                         $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return \Illuminate\Support\Collection
     */
    protected function extractAllLinks(string $html, UriInterface $foundOnUrl): Collection
    {
        $source = $this->executeJavaScript
            ? $this->getBodyAfterExecutingJavaScript($foundOnUrl)
            : $html;

        $links = (new DomCrawler($source, $foundOnUrl))
            ->filterXpath('//a')
            ->links();

        $toUri = function (Link $link) {
            try {
                return new Uri($link->getUri());
            } catch (InvalidArgumentException $exception) {
                // Unusable URI: mapped to null and removed by filter() below
                return null;
            }
        };

        return collect($links)->map($toUri)->filter();
    }
396
397
    /**
     * Return the URL with its fragment removed.
     *
     * @param \Psr\Http\Message\UriInterface $url
     *
     * @return \Psr\Http\Message\UriInterface
     */
    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        $withoutFragment = $url->withFragment('');

        return $withoutFragment;
    }
401
402
    /**
     * Tell whether the URI uses a scheme that can be crawled (http or https).
     *
     * @param \Psr\Http\Message\UriInterface $uri
     *
     * @return bool
     */
    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        // Strict comparison: only the exact strings 'http' and 'https' match
        return in_array($uri->getScheme(), ['http', 'https'], true);
    }
406
407
    /**
     * Attach $url as a child of the node whose value equals $parentUrl.
     *
     * The given node is checked first, then its children are searched
     * recursively; the search stops at the first match.
     *
     * @param \Tree\Node\Node                $node      node to start searching from
     * @param \Psr\Http\Message\UriInterface $url
     * @param \Psr\Http\Message\UriInterface $parentUrl
     *
     * @return \Tree\Node\Node|null the newly created node, or null when no node matches $parentUrl
     */
    protected function addtoDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl)
    {
        if ($node->getValue() === (string) $parentUrl) {
            $child = new Node((string) $url);

            $node->addChild($child);

            return $child;
        }

        foreach ($node->getChildren() as $childNode) {
            $added = $this->addtoDepthTree($childNode, $url, $parentUrl);

            if (! is_null($added)) {
                return $added;
            }
        }

        return null;
    }
429
430
    /**
     * Load the URL through Browsershot and return its body HTML with HTML
     * entities decoded.
     *
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return string
     */
    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
    {
        $renderedHtml = $this->getBrowsershot()
            ->url((string) $foundOnUrl)
            ->bodyHtml();

        return html_entity_decode($renderedHtml);
    }
438
439
    /**
     * Return the Browsershot instance, creating one on first use when none
     * has been set.
     *
     * @return \Spatie\Browsershot\Browsershot
     */
    protected function getBrowsershot(): Browsershot
    {
        if (! $this->browsershot) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }
449
450
    /**
     * Use the given Browsershot instance instead of a default one.
     *
     * @param \Spatie\Browsershot\Browsershot $browsershot
     *
     * @return $this
     */
    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }
456
457
    /**
     * Add a URL to the crawl queue and count it towards the crawl total.
     *
     * @param \Spatie\Crawler\CrawlUrl $crawlUrl
     *
     * @return $this
     */
    protected function addToCrawlQueue(CrawlUrl $crawlUrl)
    {
        ++$this->crawledUrlCount;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }
465
466
    /**
     * Tell whether the number of queued URLs has reached the configured
     * maximum. Always false when no maximum is set.
     *
     * @return bool
     */
    protected function maximumCrawlCountReached(): bool
    {
        return ! is_null($this->maximumCrawlCount)
            && $this->crawledUrlCount >= $this->maximumCrawlCount;
    }
474
}
475