Completed
Pull Request — master (#150)
by Brent
01:40
created

Crawler::getFailedHandler()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 7
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 7
rs 9.4285
cc 1
eloc 4
nc 1
nop 0
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use Spatie\Crawler\Handlers\CrawlRequestFailed;
7
use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
8
use Tree\Node\Node;
9
use GuzzleHttp\Pool;
10
use GuzzleHttp\Client;
11
use GuzzleHttp\Psr7\Uri;
12
use GuzzleHttp\Psr7\Request;
13
use Spatie\Robots\RobotsTxt;
14
use InvalidArgumentException;
15
use GuzzleHttp\RequestOptions;
16
use Psr\Http\Message\UriInterface;
17
use Spatie\Browsershot\Browsershot;
18
use Symfony\Component\DomCrawler\Link;
19
use Spatie\Crawler\CrawlQueue\CrawlQueue;
20
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
21
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
22
23
class Crawler
{
    /** @var \GuzzleHttp\Client Client used to perform all crawl requests. */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface Root url the crawl started from. */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserver[] Observers notified of crawl events. */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile Decides which discovered urls get crawled. */
    protected $crawlProfile;

    /** @var int Number of requests executed concurrently per pool. */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue Queue of pending/processed urls. */
    protected $crawlQueue;

    /** @var int Number of urls added to the crawl queue so far. */
    protected $crawledUrlCount = 0;

    /** @var int|null Maximum number of urls to crawl; null means unlimited. */
    protected $maximumCrawlCount = null;

    /** @var int Maximum response body size in bytes (defaults to 2 MB). */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null Maximum link depth from the base url; null means unlimited. */
    protected $maximumDepth = null;

    /** @var bool Whether robots.txt rules are honoured for discovered urls. */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node Tree recording at which depth each url was found. */
    protected $depthTree;

    /** @var bool Whether pages are rendered in a headless browser before parsing. */
    protected $executeJavaScript = false;

    /** @var Browsershot|null Lazily created headless-browser wrapper. */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt|null Parsed robots.txt of the base host. */
    protected $robotsTxt = null;

    // Used by create() when the caller passes no client options.
    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];
76
77
    /**
     * Build a Crawler whose Guzzle client is configured from the given
     * options, falling back to the defaults when none are supplied.
     *
     * @param array $clientOptions
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        if (! count($clientOptions)) {
            $clientOptions = self::$defaultClientOptions;
        }

        return new static(new Client($clientOptions));
    }
92
93
    /**
     * @param \GuzzleHttp\Client $client      client used to perform the requests
     * @param int                $concurrency how many requests run in parallel
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        // Defaults: crawl every url, keep the queue in memory.
        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CollectionCrawlQueue();
    }
103
104
    /**
     * Set how many requests may run in parallel.
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency): self
    {
        $this->concurrency = $concurrency;

        return $this;
    }
110
111
    /**
     * Cap the size of response bodies that will be processed, in bytes.
     *
     * @return $this
     */
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes): self
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }
117
118
    /**
     * Limit the total number of urls that will be crawled.
     *
     * @return $this
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount): self
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }
124
125
    /**
     * Limit how many links deep from the base url the crawler will follow.
     *
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth): self
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }
131
132
    /**
     * Disable robots.txt checks for discovered urls.
     *
     * @return $this
     */
    public function ignoreRobots(): self
    {
        $this->respectRobots = false;

        return $this;
    }
138
139
    /**
     * Re-enable robots.txt checks (this is the default behaviour).
     *
     * @return $this
     */
    public function respectRobots(): self
    {
        $this->respectRobots = true;

        return $this;
    }
145
146
    /**
     * Replace the default in-memory queue with a custom implementation.
     *
     * @return $this
     */
    public function setCrawlQueue(CrawlQueue $crawlQueue): self
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }
152
153
    /**
     * Render pages with a headless browser before extracting links.
     *
     * @return $this
     */
    public function executeJavaScript(): self
    {
        $this->executeJavaScript = true;

        return $this;
    }
159
160
    /**
     * Parse the raw response body without rendering JavaScript (the default).
     *
     * @return $this
     */
    public function doNotExecuteJavaScript(): self
    {
        $this->executeJavaScript = false;

        return $this;
    }
166
167
    /**
     * Set one crawl observer, or several at once.
     *
     * The previous doc-type `array[\Spatie\Crawler\CrawlObserver]` was not
     * valid PHPDoc; `CrawlObserver[]` is the parseable equivalent.
     *
     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers)
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }
180
181
    /**
     * Replace all registered crawl observers.
     *
     * @param \Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObservers(array $crawlObservers): self
    {
        $this->crawlObservers = $crawlObservers;

        return $this;
    }
187
188
    /**
     * Register an additional crawl observer.
     *
     * @return $this
     */
    public function addCrawlObserver(CrawlObserver $crawlObserver): self
    {
        $this->crawlObservers[] = $crawlObserver;

        return $this;
    }
194
195
    /**
     * Replace the profile that decides which urls should be crawled.
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile): self
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }
201
202
    /**
     * Crawl the given url and everything reachable from it, then notify
     * all observers that crawling finished.
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        // Default scheme/path so relative links on the page resolve correctly.
        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        // Fetch robots.txt up front; it is consulted for every discovered url.
        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        // NOTE(review): the respectRobots flag is not consulted here, so the
        // base url is skipped when robots.txt disallows it even after
        // ignoreRobots() was called — confirm this is intended.
        if ($this->robotsTxt->allows((string) $crawlUrl->url)) {
            $this->addToCrawlQueue($crawlUrl);
        }

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }
237
238
    /**
     * Drain the crawl queue: keep building Guzzle request pools until no
     * pending urls remain. The pool handlers may enqueue newly discovered
     * urls while a pool is running, hence the outer loop.
     */
    protected function startCrawlingQueue()
    {
        $fulfilledHandler = $this->getFulfilledHandler();

        // Fixed: missing space around `=` in the original assignment.
        $failedHandler = $this->getFailedHandler();

        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => $fulfilledHandler,
                'rejected' => $failedHandler,
            ]);

            $promise = $pool->promise();

            $promise->wait();
        }
    }
257
258
    /**
     * Determine whether $haystack ends with $needle.
     *
     * The previous implementation used strrpos(), which returns false when
     * the needle does not occur at all; `false + strlen($needle)` could then
     * accidentally equal strlen($haystack) — e.g. endsWith('abc', 'xyz')
     * wrongly returned true. Comparing the trailing substring avoids that.
     *
     * @param string $haystack
     * @param string $needle
     *
     * @return bool
     */
    public function endsWith($haystack, $needle)
    {
        if ($needle === '') {
            return true;
        }

        return substr($haystack, -strlen($needle)) === $needle;
    }
263
264
    /**
     * Fetch and parse robots.txt from the root of the given uri's host.
     */
    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }
268
269
    /**
     * Lazily yield a GET request for every pending url in the queue.
     *
     * Yields are keyed by the crawl url id so the pool handlers can map a
     * response back to its CrawlUrl. Urls rejected by the crawl profile are
     * marked processed and skipped.
     *
     * @return \Generator
     */
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            // Mark processed before the request runs so the same url is not
            // yielded again while its response is still in flight.
            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }
290
291
    /**
     * Extract every link from the given html and queue those that should
     * be crawled (crawlable scheme, allowed by the profile, not yet queued,
     * within depth/robots/count limits).
     *
     * @param string                         $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl url the html came from
     */
    public function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl)
    {
        $allLinks = $this->extractAllLinks($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (UriInterface $url) {
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (UriInterface $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function (UriInterface $url) {
                return $this->crawlQueue->has($url);
            })
            ->each(function (UriInterface $url) use ($foundOnUrl) {
                // Record the url in the depth tree even when it ends up not
                // being queued, so depth bookkeeping stays complete.
                $node = $this->addToDepthTree($this->depthTree, $url, $foundOnUrl);

                // Skip "tel:" hrefs that were resolved relative to the base
                // url and therefore ended up in the path — presumably to
                // avoid requesting phone-number pseudo-links; TODO confirm.
                if (strpos($url->getPath(), '/tel:') === 0) {
                    return;
                }

                if (! $this->shouldCrawl($node)) {
                    return;
                }

                if ($this->maximumCrawlCountReached()) {
                    return;
                }

                $crawlUrl = CrawlUrl::create($url, $foundOnUrl);

                $this->addToCrawlQueue($crawlUrl);
            });
    }
328
329
    /**
     * Decide whether the url held by this depth-tree node should be crawled,
     * honouring both robots.txt (when enabled) and the maximum depth.
     *
     * The previous implementation returned early when robots were respected
     * (the default mode), which silently ignored any configured maximum
     * depth; both constraints are now applied.
     *
     * @param \Tree\Node\Node $node
     *
     * @return bool
     */
    protected function shouldCrawl(Node $node): bool
    {
        if ($this->respectRobots && ! $this->robotsTxt->allows($node->getValue())) {
            return false;
        }

        if (is_null($this->maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $this->maximumDepth;
    }
341
342
    /**
     * Collect all <a> hrefs from the html, skipping rel="nofollow" links
     * and hrefs that cannot be parsed into a Uri.
     *
     * When JavaScript execution is enabled the page is re-rendered through
     * Browsershot first and the passed-in $html is discarded.
     *
     * @param string                         $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null
     */
    protected function extractAllLinks(string $html, UriInterface $foundOnUrl)
    {
        if ($this->executeJavaScript) {
            $html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
        }

        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXpath('//a')->links())
            ->reject(function (Link $link) {
                return $link->getNode()->getAttribute('rel') === 'nofollow';
            })
            ->map(function (Link $link) {
                try {
                    return new Uri($link->getUri());
                } catch (InvalidArgumentException $exception) {
                    // Unparseable href: yields null, removed by filter() below.
                    return;
                }
            })
            ->filter();
    }
369
370
    /**
     * Strip the fragment so urls differing only by #anchor are treated
     * as the same page.
     */
    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        return $url->withFragment('');
    }
374
375
    /**
     * Only http and https urls are crawlable; mailto:, tel:, ftp:, etc.
     * are not. Strict comparison avoids in_array() type juggling.
     */
    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        return in_array($uri->getScheme(), ['http', 'https'], true);
    }
379
380
    /**
     * Insert $url into the depth tree as a child of the node whose value
     * equals $parentUrl, searching depth-first. Returns the newly added
     * node, or null when the parent url is not present in this subtree.
     *
     * @param \Tree\Node\Node                $node      subtree to search
     * @param \Psr\Http\Message\UriInterface $url
     * @param \Psr\Http\Message\UriInterface $parentUrl
     *
     * @return \Tree\Node\Node|null
     */
    protected function addToDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl)
    {
        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($currentNode, $url, $parentUrl);

            // Stop at the first subtree that contained the parent url.
            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }
402
403
    /**
     * Render the page in a headless browser and return the resulting body
     * html with entities decoded.
     */
    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
    {
        $browsershot = $this->getBrowsershot();

        $html = $browsershot->setUrl((string) $foundOnUrl)->bodyHtml();

        return html_entity_decode($html);
    }
411
412
    /**
     * Lazily create the Browsershot instance used for JavaScript rendering.
     */
    protected function getBrowsershot(): Browsershot
    {
        if ($this->browsershot === null) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }
420
421
    /**
     * Use a pre-configured Browsershot instance for JavaScript rendering.
     *
     * @return $this
     */
    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }
427
428
    /**
     * Add the url to the queue and count it towards the crawl limit.
     *
     * Note: crawledUrlCount is incremented when a url is *queued*, not when
     * its response arrives, so maximumCrawlCount effectively caps queued urls.
     *
     * @return $this
     */
    protected function addToCrawlQueue(CrawlUrl $crawlUrl)
    {
        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }
436
437
    /**
     * Whether the configured crawl limit (if any) has been reached.
     */
    protected function maximumCrawlCountReached(): bool
    {
        return ! is_null($this->maximumCrawlCount)
            && $this->crawledUrlCount >= $this->maximumCrawlCount;
    }
445
446
    /**
     * Build the handler invoked for every successful pool response.
     */
    protected function getFulfilledHandler(): CrawlRequestFulfilled
    {
        return new CrawlRequestFulfilled(
            $this,
            $this->baseUrl,
            $this->crawlQueue,
            $this->crawlProfile,
            $this->crawlObservers,
            $this->maximumResponseSize,
            $this->respectRobots
        );
    }
458
459
    /**
     * Build the handler invoked for every failed/rejected pool request.
     */
    protected function getFailedHandler(): CrawlRequestFailed
    {
        return new CrawlRequestFailed(
            $this->crawlQueue,
            $this->crawlObservers
        );
    }
466
}
467