Completed
Pull Request — master (#102)
by Brent
01:25
created

Crawler::maximumCrawlCountReached()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 8
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 8
rs 9.4285
cc 2
eloc 4
nc 2
nop 0
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use Tree\Node\Node;
7
use GuzzleHttp\Pool;
8
use GuzzleHttp\Client;
9
use GuzzleHttp\Psr7\Uri;
10
use GuzzleHttp\Psr7\Request;
11
use GuzzleHttp\RequestOptions;
12
use Illuminate\Support\Collection;
13
use Psr\Http\Message\UriInterface;
14
use Spatie\Browsershot\Browsershot;
15
use Symfony\Component\DomCrawler\Link;
16
use Psr\Http\Message\ResponseInterface;
17
use Spatie\Crawler\CrawlQueue\CrawlQueue;
18
use GuzzleHttp\Exception\RequestException;
19
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
20
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
21
22
class Crawler
23
{
24
    /** @var \GuzzleHttp\Client */
25
    protected $client;
26
27
    /** @var \Psr\Http\Message\UriInterface */
28
    protected $baseUrl;
29
30
    /** @var \Spatie\Crawler\CrawlObserver */
31
    protected $crawlObserver;
32
33
    /** @var \Spatie\Crawler\CrawlProfile */
34
    protected $crawlProfile;
35
36
    /** @var int */
37
    protected $concurrency;
38
39
    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
40
    protected $crawlQueue;
41
42
    /** @var int */
43
    protected $crawledUrlCount = 0;
44
45
    /** @var int|null */
46
    protected $maximumCrawlCount = null;
47
48
    /** @var int|null */
49
    protected $maximumDepth = null;
50
51
    /** @var \Tree\Node\Node */
52
    protected $depthTree;
53
54
    /** @var bool */
55
    protected $executeJavaScript = false;
56
57
    /** @var \Spatie\Browsershot\Browsershot|null */
58
    protected $browsershot = null;
59
60
    protected static $defaultClientOptions = [
61
        RequestOptions::COOKIES => true,
62
        RequestOptions::CONNECT_TIMEOUT => 10,
63
        RequestOptions::TIMEOUT => 10,
64
        RequestOptions::ALLOW_REDIRECTS => false,
65
    ];
66
67
    /**
     * Build a crawler backed by a newly constructed Guzzle client.
     *
     * An empty options array selects self::$defaultClientOptions; any
     * non-empty array is handed to the client as-is (it is not merged with
     * the defaults).
     *
     * @param array $clientOptions Options for the Guzzle client constructor.
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        if (empty($clientOptions)) {
            $clientOptions = self::$defaultClientOptions;
        }

        return new static(new Client($clientOptions));
    }
82
83
    /**
     * @param \GuzzleHttp\Client $client      HTTP client used for every crawl request.
     * @param int                $concurrency Value later passed as the request pool's 'concurrency' option.
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        // Default collaborators; both can be replaced through their setters.
        $this->crawlQueue = new CollectionCrawlQueue();
        $this->crawlProfile = new CrawlAllUrls();

        $this->concurrency = $concurrency;
        $this->client = $client;
    }
93
94
    /**
     * Set the value used as the 'concurrency' option of the request pool.
     *
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }
105
106
    /**
     * Cap the number of URLs that may be added to the crawl queue.
     *
     * The start URL counts towards this limit. Once the limit is reached,
     * links found on crawled pages are no longer queued.
     *
     * @param int $maximumCrawlCount
     *
     * @return $this
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount)
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }
117
118
    /**
     * Limit how deep in the depth tree a link may sit and still be queued.
     *
     * Links whose depth-tree node is deeper than this value are skipped.
     *
     * @param int $maximumDepth
     *
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth)
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }
129
130
    /**
     * Replace the queue that holds pending and processed URLs.
     *
     * @param \Spatie\Crawler\CrawlQueue\CrawlQueue $crawlQueue
     *
     * @return $this
     */
    public function setCrawlQueue(CrawlQueue $crawlQueue)
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }
140
141
    /**
     * Turn on JavaScript execution.
     *
     * While enabled, link extraction works on the HTML returned by
     * Browsershot for the page instead of the raw response body.
     *
     * @return $this
     */
    public function executeJavaScript()
    {
        $this->executeJavaScript = true;

        return $this;
    }
150
151
    /**
     * Turn off JavaScript execution, so links are extracted from the raw
     * response body again.
     *
     * @return $this
     */
    public function doNotExecuteJavaScript()
    {
        $this->executeJavaScript = false;

        return $this;
    }
160
161
    /**
     * Register the observer that is notified before a URL is crawled, after
     * it has been crawled, and when the whole crawl has finished.
     *
     * @param \Spatie\Crawler\CrawlObserver $crawlObserver
     *
     * @return $this
     */
    public function setCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObserver = $crawlObserver;

        return $this;
    }
172
173
    /**
     * Replace the profile that decides, via shouldCrawl(), whether a URL is
     * crawled.
     *
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }
184
185
    /**
     * Crawl starting from the given URL and notify the observer once the
     * queue has been drained.
     *
     * The constructor does not assign an observer, so one has to be set with
     * setCrawlObserver() before this method is called.
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        $uri = $baseUrl instanceof UriInterface ? $baseUrl : new Uri($baseUrl);

        // An empty path is replaced by the root path.
        $this->baseUrl = $uri->getPath() === '' ? $uri->withPath('/') : $uri;

        $this->addToCrawlQueue(CrawlUrl::create($this->baseUrl));

        // The start URL is the root of the tree used for depth limiting.
        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        $this->crawlObserver->finishedCrawling();
    }
210
211
    /**
     * Drain the crawl queue, running one Guzzle request pool per pass.
     *
     * A new pool is created for as long as the queue reports pending URLs,
     * since handling a fulfilled response can add further URLs to the queue.
     */
    protected function startCrawlingQueue()
    {
        $onFulfilled = function (ResponseInterface $response, int $index) {
            $crawlUrl = $this->crawlQueue->getUrlById($index);

            $this->handleResponse($response, $crawlUrl);

            // Unless the profile is CrawlSubdomains, links are only collected
            // from pages whose host equals the base URL's host.
            $isForeignHost = ! $this->crawlProfile instanceof CrawlSubdomains
                && $crawlUrl->url->getHost() !== $this->baseUrl->getHost();

            if ($isForeignHost) {
                return;
            }

            $this->addAllLinksToCrawlQueue((string) $response->getBody(), $crawlUrl->url);
        };

        $onRejected = function (RequestException $exception, int $index) {
            // A failed request is still reported to the observer, together
            // with whatever response the exception carries.
            $response = $exception->getResponse();
            $crawlUrl = $this->crawlQueue->getUrlById($index);

            $this->handleResponse($response, $crawlUrl);
        };

        while ($this->crawlQueue->hasPendingUrls()) {
            $poolConfig = [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => $onFulfilled,
                'rejected' => $onRejected,
            ];

            $pool = new Pool($this->client, $this->getCrawlRequests(), $poolConfig);

            $pool->promise()->wait();
        }
    }
244
245
    /**
     * Report a finished request to the crawl observer.
     *
     * @param \Psr\Http\Message\ResponseInterface|null $response Response to report, or null when there is none.
     * @param \Spatie\Crawler\CrawlUrl                 $crawlUrl The crawled URL and the URL it was found on.
     */
    protected function handleResponse($response, CrawlUrl $crawlUrl)
    {
        $url = $crawlUrl->url;
        $foundOnUrl = $crawlUrl->foundOnUrl;

        $this->crawlObserver->hasBeenCrawled($url, $response, $foundOnUrl);
    }
253
254
    /**
     * Lazily yield one GET request per pending URL that should be crawled.
     *
     * Each request is yielded under the CrawlUrl's id as key; the pool
     * callbacks in startCrawlingQueue() use that key to look the URL up
     * again through the queue's getUrlById().
     *
     * @return \Generator
     */
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                // A URL rejected by the profile must leave the pending set as
                // well; otherwise getFirstPendingUrl() would presumably keep
                // returning it and this loop would never terminate.
                // NOTE(review): assumes markAsProcessed() is what removes a URL
                // from the pending set - confirm against the CrawlQueue in use.
                $this->crawlQueue->markAsProcessed($crawlUrl);

                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            $this->crawlObserver->willCrawl($crawlUrl->url);

            // Mark before yielding so the URL is not handed out a second time.
            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }
272
273
    /**
     * Extract the links from a page and queue those that qualify.
     *
     * The stages run one after another over the whole link list: keep
     * http(s) links, strip fragments, apply the crawl profile, drop URLs the
     * queue already has, then record each remaining URL in the depth tree
     * and queue it unless the depth or crawl-count limit forbids it.
     *
     * @param string                         $html       Body of the crawled page.
     * @param \Psr\Http\Message\UriInterface $foundOnUrl URL the body was fetched from.
     */
    protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl)
    {
        $hasCrawlableScheme = function (UriInterface $url) {
            return $this->hasCrawlableScheme($url);
        };

        $normalize = function (UriInterface $url) {
            return $this->normalizeUrl($url);
        };

        $allowedByProfile = function (UriInterface $url) {
            return $this->crawlProfile->shouldCrawl($url);
        };

        $alreadyQueued = function (UriInterface $url) {
            return $this->crawlQueue->has($url);
        };

        $enqueue = function (UriInterface $url) use ($foundOnUrl) {
            // The node is added to the tree even when the URL ends up not being queued.
            $node = $this->addtoDepthTree($this->depthTree, $url, $foundOnUrl);

            if (! $this->shouldCrawl($node) || $this->maximumCrawlCountReached()) {
                return;
            }

            $this->addToCrawlQueue(CrawlUrl::create($url, $foundOnUrl));
        };

        collect($this->extractAllLinks($html, $foundOnUrl))
            ->filter($hasCrawlableScheme)
            ->map($normalize)
            ->filter($allowedByProfile)
            ->reject($alreadyQueued)
            ->each($enqueue);
    }
306
307
    /**
     * Tell whether a depth-tree node lies within the configured depth limit.
     *
     * Always true when no maximum depth has been set.
     *
     * @param \Tree\Node\Node $node
     *
     * @return bool
     */
    protected function shouldCrawl(Node $node): bool
    {
        return is_null($this->maximumDepth)
            || $node->getDepth() <= $this->maximumDepth;
    }
315
316
    /**
     * Collect the target of every <a> element on a page as a Uri.
     *
     * With JavaScript execution enabled, the passed HTML is discarded and
     * replaced by the markup Browsershot returns for $foundOnUrl.
     *
     * @param string                         $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl Passed to the DOM crawler as the document's URI.
     *
     * @return \Illuminate\Support\Collection
     */
    protected function extractAllLinks(string $html, UriInterface $foundOnUrl): Collection
    {
        $markup = $this->executeJavaScript
            ? $this->getBodyAfterExecutingJavaScript($foundOnUrl)
            : $html;

        $anchors = (new DomCrawler($markup, $foundOnUrl))->filterXpath('//a');

        $toUri = function (Link $link) {
            return new Uri($link->getUri());
        };

        return collect($anchors->links())->map($toUri);
    }
329
330
    /**
     * Return the URL with its fragment set to the empty string.
     *
     * @param \Psr\Http\Message\UriInterface $url
     *
     * @return \Psr\Http\Message\UriInterface
     */
    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        $withoutFragment = $url->withFragment('');

        return $withoutFragment;
    }
334
335
    /**
     * Tell whether the URI uses a scheme the crawler can request.
     *
     * Only 'http' and 'https' qualify.
     *
     * @param \Psr\Http\Message\UriInterface $uri
     *
     * @return bool
     */
    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        // Strict comparison: the scheme has to be exactly one of these strings.
        return in_array($uri->getScheme(), ['http', 'https'], true);
    }
339
340
    /**
     * Attach $url as a child of the first node whose value equals $parentUrl.
     *
     * The tree is searched depth-first starting at $node. The method name is
     * now spelled in camelCase like its recursive call; PHP method names are
     * case-insensitive, so callers using the previous spelling keep working.
     *
     * @param \Tree\Node\Node                $node      Root of the (sub)tree to search.
     * @param \Psr\Http\Message\UriInterface $url       URL to record in the tree.
     * @param \Psr\Http\Message\UriInterface $parentUrl URL on which $url was found.
     *
     * @return \Tree\Node\Node|null The newly created node, or null when no node matches $parentUrl.
     */
    protected function addToDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl)
    {
        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $childNode) {
            $addedNode = $this->addToDepthTree($childNode, $url, $parentUrl);

            // Stop at the first subtree that contained the parent.
            if (! is_null($addedNode)) {
                return $addedNode;
            }
        }

        return null;
    }
362
363
    /**
     * Fetch a page's body HTML through Browsershot and decode its HTML entities.
     *
     * @param \Psr\Http\Message\UriInterface $foundOnUrl URL handed to Browsershot.
     *
     * @return string
     */
    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
    {
        $renderedBody = $this->getBrowsershot()
            ->url((string) $foundOnUrl)
            ->bodyHtml();

        return html_entity_decode($renderedBody);
    }
371
372
    /**
     * Return the Browsershot instance, creating a default one on first use
     * when none has been set.
     *
     * @return \Spatie\Browsershot\Browsershot
     */
    protected function getBrowsershot(): Browsershot
    {
        if (! $this->browsershot) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }
382
383
    /**
     * Use the given Browsershot instance for JavaScript execution.
     *
     * @param \Spatie\Browsershot\Browsershot $browsershot
     *
     * @return $this
     */
    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }
389
390
    /**
     * Add a URL to the crawl queue and count it towards the crawl limit.
     *
     * @param \Spatie\Crawler\CrawlUrl $crawlUrl
     *
     * @return $this
     */
    protected function addToCrawlQueue(CrawlUrl $crawlUrl)
    {
        // The counter is bumped before the queue is touched.
        $this->crawledUrlCount += 1;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }
398
399
    /**
     * Tell whether the number of queued URLs has hit the configured limit.
     *
     * Always false when no maximum crawl count has been set.
     *
     * @return bool
     */
    protected function maximumCrawlCountReached(): bool
    {
        return ! is_null($this->maximumCrawlCount)
            && $this->crawledUrlCount >= $this->maximumCrawlCount;
    }
407
}
408