Completed
Pull Request — master (#118)
by Thomas
05:05
created

Crawler::noSandbox()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 6
rs 9.4285
cc 1
eloc 3
nc 1
nop 0
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use Tree\Node\Node;
7
use GuzzleHttp\Pool;
8
use GuzzleHttp\Client;
9
use GuzzleHttp\Psr7\Uri;
10
use GuzzleHttp\Psr7\Request;
11
use InvalidArgumentException;
12
use GuzzleHttp\RequestOptions;
13
use Psr\Http\Message\UriInterface;
14
use Spatie\Browsershot\Browsershot;
15
use Psr\Http\Message\StreamInterface;
16
use Symfony\Component\DomCrawler\Link;
17
use Psr\Http\Message\ResponseInterface;
18
use Spatie\Crawler\CrawlQueue\CrawlQueue;
19
use Tightenco\Collect\Support\Collection;
20
use GuzzleHttp\Exception\RequestException;
21
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
22
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
23
24
class Crawler
25
{
26
    /** @var \GuzzleHttp\Client Client every crawl request is sent through. */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface URL the crawl started from; assigned by startCrawling(). */
    protected $baseUrl;

    /**
     * Observers that are notified about crawl events.
     *
     * Defaults to an empty array so the observer loops in this class can run
     * even when no observer was ever registered.
     *
     * @var \Spatie\Crawler\CrawlObserver[]
     */
    protected $crawlObservers = [];

    /** @var \Spatie\Crawler\CrawlProfile Decides per URL whether it should be crawled. */
    protected $crawlProfile;

    /** @var int Value passed to the request pool as its 'concurrency' option. */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue Queue holding pending and processed URLs. */
    protected $crawlQueue;

    /** @var int Number of URLs that have been added to the crawl queue so far. */
    protected $crawledUrlCount = 0;

    /** @var int|null Upper bound for $crawledUrlCount; null means no limit. */
    protected $maximumCrawlCount = null;

    /** @var int Maximum number of bytes read from a response body (defaults to 2 * 1024 * 1024). */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null Deepest tree level that is still crawled; null means no limit. */
    protected $maximumDepth = null;

    /** @var \Tree\Node\Node Root of the tree that records which URL was found on which page. */
    protected $depthTree;

    /** @var bool Whether pages are rendered through Browsershot before links are extracted. */
    protected $executeJavaScript = false;

    /** @var \Spatie\Browsershot\Browsershot|null Created on first use by getBrowsershot() unless injected. */
    protected $browsershot = null;

    /** @var bool Whether noSandbox() is called on Browsershot when rendering a page. */
    protected $noSandbox = false;

    /** @var array Client options used by create() when the caller passes none. */
    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];
74
75
    /**
     * Build a crawler around a newly created Guzzle client.
     *
     * An empty options array selects the class defaults. Any non-empty array
     * replaces the defaults completely; the two are not merged.
     *
     * @param array $clientOptions Options handed to the Guzzle client constructor.
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        if (count($clientOptions) === 0) {
            $clientOptions = self::$defaultClientOptions;
        }

        return new static(new Client($clientOptions));
    }
90
91
    /**
     * Set up the crawler with a CrawlAllUrls profile and a CollectionCrawlQueue;
     * both can be replaced afterwards through their setters.
     *
     * @param \GuzzleHttp\Client $client      Client used to send the crawl requests.
     * @param int                $concurrency Value passed to the request pool as 'concurrency'.
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->crawlQueue = new CollectionCrawlQueue();
        $this->crawlProfile = new CrawlAllUrls();

        $this->concurrency = $concurrency;
        $this->client = $client;
    }
101
102
    /**
     * Change the 'concurrency' value handed to the request pool.
     *
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }
113
114
    /**
     * Limit how much of a response body is read when looking for links.
     *
     * The value is used as the read limit for the body stream, so at most this
     * many bytes of each response are read for link extraction.
     *
     * @param int $maximumResponseSizeInBytes
     *
     * @return $this
     */
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes)
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }
127
128
    /**
     * Cap the number of URLs that may be added to the crawl queue.
     *
     * Once the running count of queued URLs reaches this value, newly
     * discovered links are no longer queued.
     *
     * @param int $maximumCrawlCount
     *
     * @return $this
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount)
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }
139
140
    /**
     * Limit how deep the crawler follows links.
     *
     * A discovered URL is only queued when the depth of its node in the depth
     * tree does not exceed this value.
     *
     * @param int $maximumDepth
     *
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth)
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }
151
152
    /**
     * Replace the queue that keeps track of pending and processed URLs.
     *
     * @param \Spatie\Crawler\CrawlQueue\CrawlQueue $crawlQueue
     *
     * @return $this
     */
    public function setCrawlQueue(CrawlQueue $crawlQueue)
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }
162
163
    /**
     * Render pages through Browsershot before their links are extracted.
     *
     * @return $this
     */
    public function executeJavaScript()
    {
        $this->executeJavaScript = true;

        return $this;
    }
172
173
    /**
     * Extract links from the raw response body instead of a Browsershot rendering.
     *
     * @return $this
     */
    public function doNotExecuteJavaScript()
    {
        $this->executeJavaScript = false;

        return $this;
    }
182
183
    /**
     * Make the crawler call noSandbox() on Browsershot when it renders a page.
     *
     * Only has an effect when JavaScript execution is enabled.
     *
     * @return $this
     */
    public function noSandbox()
    {
        $this->noSandbox = true;

        return $this;
    }
192
193
    /**
194
     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
0 ignored issues
show
Documentation introduced by
The doc-type \Spatie\Crawler\CrawlObs...\Crawler\CrawlObserver] could not be parsed: Expected "]" at position 4, but found "\Spatie\Crawler\CrawlObserver". (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
195
     *
196
     * @return $this
197
     */
198
    public function setCrawlObserver($crawlObservers)
    {
        // A single observer is wrapped so the list setter always receives an array.
        $observerList = is_array($crawlObservers) ? $crawlObservers : [$crawlObservers];

        return $this->setCrawlObservers($observerList);
    }
206
207
    /**
     * Replace all registered observers with the given list.
     *
     * @param \Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObservers(array $crawlObservers)
    {
        $this->crawlObservers = $crawlObservers;

        return $this;
    }
213
214
    /**
     * Register one more observer next to the ones already present.
     *
     * @param \Spatie\Crawler\CrawlObserver $crawlObserver
     *
     * @return $this
     */
    public function addCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObservers[] = $crawlObserver;

        return $this;
    }
220
221
    /**
     * Replace the profile that decides which URLs get crawled.
     *
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }
232
233
    /**
     * Crawl starting from the given URL and notify the observers once the queue is drained.
     *
     * A URL without a scheme gets 'http', and an empty path becomes '/'. The
     * resulting URL is stored as the base URL, queued as the first crawl URL
     * and used as the root of the depth tree.
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        $startUrl = $baseUrl instanceof UriInterface ? $baseUrl : new Uri($baseUrl);

        if ('' === $startUrl->getScheme()) {
            $startUrl = $startUrl->withScheme('http');
        }

        if ('' === $startUrl->getPath()) {
            $startUrl = $startUrl->withPath('/');
        }

        $this->baseUrl = $startUrl;

        $this->addToCrawlQueue(CrawlUrl::create($this->baseUrl));

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $observer) {
            $observer->finishedCrawling();
        }
    }
264
265
    /**
     * Keep running request pools until the queue reports no more pending URLs.
     *
     * Each pool is fed by getCrawlRequests(), whose keys are crawl URL ids; the
     * callbacks use that id to look the crawl URL up again.
     */
    protected function startCrawlingQueue()
    {
        // Successful response: notify observers, then queue the links found in the body.
        $onFulfilled = function (ResponseInterface $response, int $index) {
            $crawlUrl = $this->crawlQueue->getUrlById($index);
            $this->handleResponse($response, $crawlUrl);

            // Unless subdomains are being crawled, links on pages served by another host are not followed.
            if (! ($this->crawlProfile instanceof CrawlSubdomains)
                && $crawlUrl->url->getHost() !== $this->baseUrl->getHost()
            ) {
                return;
            }

            $body = $this->convertBodyToString($response->getBody(), $this->maximumResponseSize);

            $this->addAllLinksToCrawlQueue($body, $crawlUrl->url);
        };

        // Failed request: observers still get notified, with whatever response the exception carries.
        $onRejected = function (RequestException $exception, int $index) {
            $this->handleResponse(
                $exception->getResponse(),
                $this->crawlQueue->getUrlById($index)
            );
        };

        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => $onFulfilled,
                'rejected' => $onRejected,
            ]);

            $pool->promise()->wait();
        }
    }
300
301
    /**
     * Tell whether $haystack ends with $needle (byte-wise, case-sensitive).
     *
     * Both arguments are compared as strings. An empty needle always matches.
     *
     * @param string $haystack
     * @param string $needle
     *
     * @return bool
     */
    public function endsWith($haystack, $needle)
    {
        $haystack = (string) $haystack;
        $needle = (string) $needle;

        $needleLength = strlen($needle);

        if ($needleLength === 0) {
            return true;
        }

        // Compare the tail directly: a strrpos() based check treats "not found" (false)
        // as offset 0 and so reports a match for any needle as long as the haystack.
        return substr($haystack, -$needleLength) === $needle;
    }
306
307
    /**
     * Read the start of a body stream into a string.
     *
     * The stream is rewound first, then a single read of at most
     * $readMaximumBytes bytes is performed.
     *
     * @param \Psr\Http\Message\StreamInterface $bodyStream
     * @param int                               $readMaximumBytes
     *
     * @return string
     */
    protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string
    {
        $bodyStream->rewind();

        return $bodyStream->read($readMaximumBytes);
    }
315
316
    /**
     * Report a crawled URL to every registered observer.
     *
     * @param \Psr\Http\Message\ResponseInterface|null $response Null when a failed request produced no response.
     * @param \Spatie\Crawler\CrawlUrl                 $crawlUrl
     */
    protected function handleResponse($response, CrawlUrl $crawlUrl)
    {
        foreach ($this->crawlObservers as $observer) {
            $observer->hasBeenCrawled($crawlUrl->url, $response, $crawlUrl->foundOnUrl);
        }
    }
326
327
    /**
     * Lazily turn pending queue entries into GET requests.
     *
     * Each request is yielded under the id of its crawl URL. URLs rejected by
     * the crawl profile are marked as processed without being requested, and
     * URLs the queue already considers processed are skipped.
     *
     * @return \Generator
     */
    protected function getCrawlRequests(): Generator
    {
        while ($pendingUrl = $this->crawlQueue->getFirstPendingUrl()) {
            $allowedByProfile = $this->crawlProfile->shouldCrawl($pendingUrl->url);

            if (! $allowedByProfile) {
                $this->crawlQueue->markAsProcessed($pendingUrl);

                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($pendingUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $observer) {
                $observer->willCrawl($pendingUrl->url);
            }

            // Mark before yielding so the next iteration does not pick the same URL again.
            $this->crawlQueue->markAsProcessed($pendingUrl);

            $request = new Request('GET', $pendingUrl->url);

            yield $pendingUrl->getId() => $request;
        }
    }
348
349
    /**
     * Queue every link found in $html that is eligible for crawling.
     *
     * Links are kept when they use a crawlable scheme, are accepted by the
     * crawl profile (after normalisation) and are not in the queue yet. Each
     * remaining link is recorded in the depth tree and queued unless the depth
     * limit or the crawl count limit forbids it.
     *
     * @param string                         $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl Page the links were found on.
     */
    protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl)
    {
        $allLinks = $this->extractAllLinks($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (UriInterface $url) {
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (UriInterface $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function (UriInterface $url) {
                return $this->crawlQueue->has($url);
            })
            ->each(function (UriInterface $url) use ($foundOnUrl) {
                $node = $this->addtoDepthTree($this->depthTree, $url, $foundOnUrl);

                if (is_null($node)) {
                    // The parent page is not part of the depth tree (for example when the
                    // queue was filled outside of this crawl), so the depth is unknown.
                    // Passing null on to shouldCrawl() would raise a TypeError; instead the
                    // link is skipped only when a depth limit has to be honoured.
                    if (! is_null($this->maximumDepth)) {
                        return;
                    }
                } elseif (! $this->shouldCrawl($node)) {
                    return;
                }

                if ($this->maximumCrawlCountReached()) {
                    return;
                }

                $crawlUrl = CrawlUrl::create($url, $foundOnUrl);

                $this->addToCrawlQueue($crawlUrl);
            });
    }
382
383
    /**
     * Check a depth-tree node against the configured maximum depth.
     *
     * @param \Tree\Node\Node $node
     *
     * @return bool True when no maximum depth is set or the node's depth does not exceed it.
     */
    protected function shouldCrawl(Node $node): bool
    {
        return is_null($this->maximumDepth)
            || $node->getDepth() <= $this->maximumDepth;
    }
391
392
    /**
     * Collect the URIs of all anchor elements of a page.
     *
     * With JavaScript execution enabled, the given HTML is discarded and the
     * page is rendered again through Browsershot. Links whose URI is rejected
     * by the Uri constructor are left out of the result.
     *
     * @param string                         $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return \Tightenco\Collect\Support\Collection
     */
    protected function extractAllLinks(string $html, UriInterface $foundOnUrl): Collection
    {
        $markup = $this->executeJavaScript
            ? $this->getBodyAfterExecutingJavaScript($foundOnUrl)
            : $html;

        $anchors = (new DomCrawler($markup, $foundOnUrl))
            ->filterXpath('//a')
            ->links();

        $toUri = function (Link $link) {
            try {
                return new Uri($link->getUri());
            } catch (InvalidArgumentException $exception) {
                // Yield null so the trailing filter() drops this entry.
                return null;
            }
        };

        return collect($anchors)->map($toUri)->filter();
    }
410
411
    /**
     * Return the URL with an empty fragment, so URLs that differ only in
     * their fragment end up identical.
     *
     * @param \Psr\Http\Message\UriInterface $url
     *
     * @return \Psr\Http\Message\UriInterface
     */
    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        $withoutFragment = $url->withFragment('');

        return $withoutFragment;
    }
415
416
    /**
     * Tell whether the URI uses a scheme this crawler requests (http or https).
     *
     * @param \Psr\Http\Message\UriInterface $uri
     *
     * @return bool
     */
    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        $crawlableSchemes = ['http', 'https'];

        $scheme = $uri->getScheme();

        return in_array($scheme, $crawlableSchemes);
    }
420
421
    /**
     * Attach $url as a child of the first node (searched depth-first from
     * $node) whose value equals $parentUrl.
     *
     * @param \Tree\Node\Node                $node      Subtree to search.
     * @param \Psr\Http\Message\UriInterface $url       URL to add.
     * @param \Psr\Http\Message\UriInterface $parentUrl URL of the page $url was found on.
     *
     * @return \Tree\Node\Node|null The newly created node, or null when no node matches $parentUrl.
     */
    protected function addtoDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl)
    {
        if ($node->getValue() === (string) $parentUrl) {
            $child = new Node((string) $url);

            $node->addChild($child);

            return $child;
        }

        foreach ($node->getChildren() as $subtree) {
            $added = $this->addToDepthTree($subtree, $url, $parentUrl);

            // Stop at the first subtree that contained the parent.
            if (! is_null($added)) {
                return $added;
            }
        }

        return null;
    }
443
444
    /**
     * Fetch the body HTML of a page through Browsershot.
     *
     * noSandbox() is applied to the Browsershot call when that flag was
     * enabled on the crawler. The result is passed through
     * html_entity_decode() before it is returned.
     *
     * @param \Psr\Http\Message\UriInterface $foundOnUrl URL of the page to render.
     *
     * @return string
     */
    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
    {
        $page = $this->getBrowsershot()->url((string) $foundOnUrl);

        if ($this->noSandbox) {
            $page = $page->noSandbox();
        }

        return html_entity_decode($page->bodyHtml());
    }
454
455
    /**
     * Return the Browsershot instance, creating and remembering one on first use.
     *
     * @return \Spatie\Browsershot\Browsershot
     */
    protected function getBrowsershot(): Browsershot
    {
        if (! $this->browsershot) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }
465
466
    /**
     * Use the given Browsershot instance instead of one created on demand.
     *
     * @param \Spatie\Browsershot\Browsershot $browsershot
     *
     * @return $this
     */
    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }
472
473
    /**
     * Add a URL to the queue and count it towards the maximum crawl count.
     *
     * @param \Spatie\Crawler\CrawlUrl $crawlUrl
     *
     * @return $this
     */
    protected function addToCrawlQueue(CrawlUrl $crawlUrl)
    {
        // The counter is raised at queueing time, not when the URL is actually requested.
        $this->crawledUrlCount += 1;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }
481
482
    /**
     * Tell whether the number of queued URLs has hit the configured maximum.
     *
     * @return bool Always false when no maximum crawl count is set.
     */
    protected function maximumCrawlCountReached(): bool
    {
        return ! is_null($this->maximumCrawlCount)
            && $this->crawledUrlCount >= $this->maximumCrawlCount;
    }
490
}
491