Completed
Pull Request — master (#102)
by Brent
01:25
created

Crawler::hasCrawlableScheme()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 4
rs 10
cc 1
eloc 2
nc 1
nop 1
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use Tree\Node\Node;
7
use GuzzleHttp\Pool;
8
use GuzzleHttp\Client;
9
use GuzzleHttp\Psr7\Uri;
10
use GuzzleHttp\Psr7\Request;
11
use GuzzleHttp\RequestOptions;
12
use Illuminate\Support\Collection;
13
use Psr\Http\Message\UriInterface;
14
use Spatie\Browsershot\Browsershot;
15
use Symfony\Component\DomCrawler\Link;
16
use Psr\Http\Message\ResponseInterface;
17
use Spatie\Crawler\CrawlQueue\CrawlQueue;
18
use GuzzleHttp\Exception\RequestException;
19
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
20
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
21
22
class Crawler
23
{
24
    /** @var \GuzzleHttp\Client */
25
    protected $client;
26
27
    /** @var \Psr\Http\Message\UriInterface */
28
    protected $baseUrl;
29
30
    /** @var \Spatie\Crawler\CrawlObserver */
31
    protected $crawlObserver;
32
33
    /** @var \Spatie\Crawler\CrawlProfile */
34
    protected $crawlProfile;
35
36
    /** @var int */
37
    protected $concurrency;
38
39
    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
40
    protected $crawlQueue;
41
42
    /** @var int */
43
    protected $crawledUrlCount = 0;
44
45
    /** @var int|null */
46
    protected $maximumCrawlCount = null;
47
48
    /** @var int|null */
49
    protected $maximumDepth = null;
50
51
    /** @var \Tree\Node\Node */
52
    protected $depthTree;
53
54
    /** @var bool */
55
    protected $executeJavaScript = false;
56
57
    /** @var \Spatie\Browsershot\Browsershot|null */
58
    protected $browsershot = null;
59
60
    protected static $defaultClientOptions = [
61
        RequestOptions::COOKIES => true,
62
        RequestOptions::CONNECT_TIMEOUT => 10,
63
        RequestOptions::TIMEOUT => 10,
64
        RequestOptions::ALLOW_REDIRECTS => false,
65
    ];
66
67
    /**
     * Build a crawler around a freshly constructed Guzzle client.
     *
     * @param array $clientOptions Guzzle client options. When the array is empty,
     *                             self::$defaultClientOptions is used instead.
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        // A non-empty option array replaces the defaults wholesale; nothing is merged.
        if (count($clientOptions) === 0) {
            $clientOptions = self::$defaultClientOptions;
        }

        return new static(new Client($clientOptions));
    }
82
83
    /**
     * @param \GuzzleHttp\Client $client      Client used to send every crawl request.
     * @param int                $concurrency Value handed to the request pool as its 'concurrency' option.
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->concurrency = $concurrency;
        $this->client = $client;

        // Defaults; both can be swapped out via setCrawlProfile() / setCrawlQueue().
        $this->crawlProfile = new CrawlAllUrls();
        $this->crawlQueue = new CollectionCrawlQueue();
    }
93
94
    /**
     * Set the value passed to the request pool as its 'concurrency' option.
     *
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }
105
106
    /**
     * Limit how many URLs may be added to the crawl queue.
     *
     * The limit is compared against the counter that addToCrawlQueue()
     * increments, so the start URL counts towards it.
     *
     * @param int $maximumCrawlCount
     *
     * @return $this
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount)
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }
117
118
    /**
     * Limit the depth-tree depth up to which discovered links are queued.
     *
     * A link is queued only when the depth of its node in the depth tree is
     * less than or equal to this value.
     *
     * @param int $maximumDepth
     *
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth)
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }
129
130
    /**
     * Replace the queue that tracks pending and processed URLs.
     *
     * @param \Spatie\Crawler\CrawlQueue\CrawlQueue $crawlQueue
     *
     * @return $this
     */
    public function setCrawlQueue(CrawlQueue $crawlQueue)
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }
140
141
    /**
142
     * @return $this
143
     */
144
    public function executeJavaScript()
145
    {
146
        $this->executeJavaScript = true;
0 ignored issues
show
Documentation Bug introduced by
The property $executeJavaScript was declared of type false, but true is of type boolean. Maybe add a type cast?

This check looks for assignments to scalar types that may be of the wrong type.

To ensure the code behaves as expected, it may be a good idea to add an explicit type cast.

$answer = 42;

$correct = false;

$correct = (bool) $answer;
Loading history...
147
148
        return $this;
149
    }
150
151
    /**
     * Turn JavaScript execution off, so links are extracted from the HTML of
     * the HTTP response body as received.
     *
     * @return $this
     */
    public function doNotExecuteJavaScript()
    {
        $this->executeJavaScript = false;

        return $this;
    }
160
161
    /**
     * Set the observer that is notified via willCrawl(), hasBeenCrawled()
     * and finishedCrawling() during a crawl.
     *
     * @param \Spatie\Crawler\CrawlObserver $crawlObserver
     *
     * @return $this
     */
    public function setCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObserver = $crawlObserver;

        return $this;
    }
172
173
    /**
     * Set the profile whose shouldCrawl() decides which URLs are queued
     * and requested.
     *
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }
184
185
    /**
     * Crawl from the given URL until the queue has no pending URLs left, then
     * call finishedCrawling() on the crawl observer.
     *
     * A missing scheme is filled in as "http" and an empty path as "/".
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        $startUrl = ($baseUrl instanceof UriInterface) ? $baseUrl : new Uri($baseUrl);

        if ($startUrl->getScheme() === '') {
            $startUrl = $startUrl->withScheme('http');
        }

        if ($startUrl->getPath() === '') {
            $startUrl = $startUrl->withPath('/');
        }

        $this->baseUrl = $startUrl;

        $this->addToCrawlQueue(CrawlUrl::create($this->baseUrl));

        // Root of the depth tree; links discovered later are attached below it.
        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        $this->crawlObserver->finishedCrawling();
    }
214
215
    /**
     * Run request pools until the crawl queue reports no pending URLs.
     *
     * Every pool draws its requests from getCrawlRequests(), which keys each
     * request by the CrawlUrl id; the callbacks use that index to look the
     * CrawlUrl up again.
     */
    protected function startCrawlingQueue()
    {
        $onFulfilled = function (ResponseInterface $response, int $index) {
            $crawlUrl = $this->crawlQueue->getUrlById($index);
            $this->handleResponse($response, $crawlUrl);

            // A page on another host is reported to the observer but not
            // searched for links, unless the profile is a CrawlSubdomains.
            if (
                ! ($this->crawlProfile instanceof CrawlSubdomains)
                && $crawlUrl->url->getHost() !== $this->baseUrl->getHost()
            ) {
                return;
            }

            $this->addAllLinksToCrawlQueue((string) $response->getBody(), $crawlUrl->url);
        };

        $onRejected = function (RequestException $exception, int $index) {
            // getResponse() is forwarded as-is; handleResponse() accepts null.
            $this->handleResponse(
                $exception->getResponse(),
                $this->crawlQueue->getUrlById($index)
            );
        };

        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => $onFulfilled,
                'rejected' => $onRejected,
            ]);

            // Block until every request of this pool has settled.
            $pool->promise()->wait();
        }
    }
248
249
    /**
     * Report a finished request to the crawl observer.
     *
     * @param \Psr\Http\Message\ResponseInterface|null $response Null is passed through unchanged.
     * @param \Spatie\Crawler\CrawlUrl                 $crawlUrl Supplies the crawled URL and the URL it was found on.
     */
    protected function handleResponse($response, CrawlUrl $crawlUrl)
    {
        $this->crawlObserver->hasBeenCrawled(
            $crawlUrl->url,
            $response,
            $crawlUrl->foundOnUrl
        );
    }
257
258
    /**
     * Lazily yield one GET request per pending URL, keyed by the CrawlUrl id.
     *
     * For every URL that is yielded, the observer's willCrawl() is called and
     * the URL is marked as processed before the request is handed out.
     *
     * @return \Generator
     */
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                // Take the rejected URL out of the pending set. Without this,
                // nothing in the loop changes the queue, so the same URL would
                // presumably be returned again on the next iteration and the
                // loop would never end (queue implementation not visible here).
                // Discovered links are filtered by the profile before they are
                // queued, so this mainly concerns a start URL the profile rejects.
                $this->crawlQueue->markAsProcessed($crawlUrl);

                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            $this->crawlObserver->willCrawl($crawlUrl->url);

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }
276
277
    /**
     * Extract the links from a page and queue the ones that qualify.
     *
     * A link is dropped when its scheme is not crawlable, when the crawl
     * profile rejects it, or when the queue already has it. Each remaining
     * link is added to the depth tree and is queued only if it is within the
     * maximum depth and the maximum crawl count has not been reached.
     *
     * @param string                         $html       Page markup to scan for links.
     * @param \Psr\Http\Message\UriInterface $foundOnUrl URL of the page the markup belongs to.
     */
    protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl)
    {
        // Each pipeline stage runs over the whole collection before the next
        // one starts, so the "already queued" check is evaluated for every
        // link before any link of this page is added.
        $candidates = collect($this->extractAllLinks($html, $foundOnUrl))
            ->filter(function (UriInterface $link) {
                return $this->hasCrawlableScheme($link);
            })
            ->map(function (UriInterface $link) {
                return $this->normalizeUrl($link);
            })
            ->filter(function (UriInterface $link) {
                return $this->crawlProfile->shouldCrawl($link);
            })
            ->reject(function (UriInterface $link) {
                return $this->crawlQueue->has($link);
            });

        $candidates->each(function (UriInterface $link) use ($foundOnUrl) {
            // The link is recorded in the depth tree even when it ends up not being queued.
            $node = $this->addtoDepthTree($this->depthTree, $link, $foundOnUrl);

            if (! $this->shouldCrawl($node) || $this->maximumCrawlCountReached()) {
                return;
            }

            $this->addToCrawlQueue(CrawlUrl::create($link, $foundOnUrl));
        });
    }
310
311
    /**
     * Tell whether a depth-tree node is within the configured maximum depth.
     *
     * @param \Tree\Node\Node $node
     *
     * @return bool Always true when no maximum depth has been set.
     */
    protected function shouldCrawl(Node $node): bool
    {
        return is_null($this->maximumDepth)
            || $node->getDepth() <= $this->maximumDepth;
    }
319
320
    /**
     * Collect the URI of every <a> element in a page.
     *
     * When JavaScript execution is enabled, the given markup is ignored and
     * the body obtained from getBodyAfterExecutingJavaScript() is used instead.
     *
     * @param string                         $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl Passed to the DOM crawler as the document URI.
     *
     * @return \Illuminate\Support\Collection Collection of \GuzzleHttp\Psr7\Uri instances.
     */
    protected function extractAllLinks(string $html, UriInterface $foundOnUrl): Collection
    {
        $markup = $this->executeJavaScript
            ? $this->getBodyAfterExecutingJavaScript($foundOnUrl)
            : $html;

        $anchors = (new DomCrawler($markup, $foundOnUrl))
            ->filterXpath('//a')
            ->links();

        return collect($anchors)->map(function (Link $anchor) {
            return new Uri($anchor->getUri());
        });
    }
333
334
    /**
     * Return the URL with its fragment set to the empty string.
     *
     * @param \Psr\Http\Message\UriInterface $url
     *
     * @return \Psr\Http\Message\UriInterface
     */
    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        $withoutFragment = $url->withFragment('');

        return $withoutFragment;
    }
338
339
    /**
     * Tell whether the URI uses a scheme this crawler requests: http or https.
     *
     * @param \Psr\Http\Message\UriInterface $uri
     *
     * @return bool
     */
    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        // Strict comparison: only the exact strings 'http' and 'https' match,
        // with no type juggling of the scheme value.
        return in_array($uri->getScheme(), ['http', 'https'], true);
    }
343
344
    /**
     * Attach a URL to the depth tree below the node of the page it was found on.
     *
     * The tree is searched depth-first starting at $node; the URL is appended
     * as a child of the first node whose value equals the parent URL string.
     *
     * The method is declared with the same spelling the recursive call uses;
     * PHP resolves method names case-insensitively, so callers using the
     * older "addtoDepthTree" spelling keep working.
     *
     * @param \Tree\Node\Node                $node      Subtree to search.
     * @param \Psr\Http\Message\UriInterface $url       URL to add.
     * @param \Psr\Http\Message\UriInterface $parentUrl URL of the page the link was found on.
     *
     * @return \Tree\Node\Node|null The newly created node, or null when no node
     *                              in the subtree matches the parent URL.
     */
    protected function addToDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl)
    {
        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $childNode) {
            $addedNode = $this->addToDepthTree($childNode, $url, $parentUrl);

            // Stop at the first subtree that contained the parent.
            if (! is_null($addedNode)) {
                return $addedNode;
            }
        }

        return null;
    }
366
367
    /**
     * Load the URL through Browsershot and return its body HTML with HTML
     * entities decoded.
     *
     * @param \Psr\Http\Message\UriInterface $foundOnUrl URL to load.
     *
     * @return string
     */
    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
    {
        $renderedBody = $this->getBrowsershot()
            ->url((string) $foundOnUrl)
            ->bodyHtml();

        return html_entity_decode($renderedBody);
    }
375
376
    /**
     * Return the Browsershot instance, creating and storing a default one on
     * first use when none has been set.
     *
     * @return \Spatie\Browsershot\Browsershot
     */
    protected function getBrowsershot(): Browsershot
    {
        if (! $this->browsershot) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }
386
387
    /**
     * Use the given Browsershot instance instead of a default one.
     *
     * @param \Spatie\Browsershot\Browsershot $browsershot
     *
     * @return $this
     */
    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }
393
394
    /**
     * Count the URL towards the crawl total and add it to the crawl queue.
     *
     * @param \Spatie\Crawler\CrawlUrl $crawlUrl
     *
     * @return $this
     */
    protected function addToCrawlQueue(CrawlUrl $crawlUrl)
    {
        // The counter is the value maximumCrawlCountReached() compares
        // against; it is raised before the queue is touched.
        $this->crawledUrlCount += 1;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }
402
403
    /**
     * Tell whether the number of queued URLs has reached the configured limit.
     *
     * @return bool Always false when no maximum crawl count has been set.
     */
    protected function maximumCrawlCountReached(): bool
    {
        return ! is_null($this->maximumCrawlCount)
            && $this->crawledUrlCount >= $this->maximumCrawlCount;
    }
411
}
412