Completed
Pull Request — master (#86)
by Peter
01:05
created

Crawler::shouldCrawl()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 8
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 8
rs 9.4285
cc 2
eloc 4
nc 2
nop 1
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use Tree\Node\Node;
7
use GuzzleHttp\Pool;
8
use GuzzleHttp\Client;
9
use GuzzleHttp\Psr7\Request;
10
use GuzzleHttp\RequestOptions;
11
use Illuminate\Support\Collection;
12
use Spatie\Browsershot\Browsershot;
13
use Symfony\Component\DomCrawler\Link;
14
use Psr\Http\Message\ResponseInterface;
15
use GuzzleHttp\Exception\RequestException;
16
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
17
18
class Crawler
19
{
20
    /** @var \GuzzleHttp\Client */
21
    protected $client;
22
23
    /** @var \Spatie\Crawler\Url */
24
    protected $baseUrl;
25
26
    /** @var \Spatie\Crawler\CrawlObserver */
27
    protected $crawlObserver;
28
29
    /** @var \Spatie\Crawler\CrawlProfile */
30
    protected $crawlProfile;
31
32
    /** @var int */
33
    protected $concurrency;
34
35
    /** @var \Spatie\Crawler\CrawlQueue */
36
    protected $crawlQueue;
37
38
    /** @var int */
39
    protected $crawledUrlCount = 0;
40
41
    /** @var int|null */
42
    protected $maximumCrawlCount = null;
43
44
    /** @var int|null */
45
    protected $maximumDepth = null;
46
47
    /** @var \Tree\Node\Node */
48
    protected $depthTree;
49
50
    /** @var bool */
51
    protected $executeJavaScript = false;
52
53
    /** @var string|null */
54
    protected $pathToChromeBinary = null;
55
56
    protected static $defaultClientOptions = [
57
        RequestOptions::COOKIES => true,
58
        RequestOptions::CONNECT_TIMEOUT => 10,
59
        RequestOptions::TIMEOUT => 10,
60
        RequestOptions::ALLOW_REDIRECTS => false,
61
    ];
62
63
    /**
     * Build a crawler around a freshly constructed Guzzle client.
     *
     * An empty $clientOptions array selects the class defaults; a non-empty
     * array is used as-is and replaces the defaults entirely (the two are
     * not merged).
     *
     * @param array $clientOptions Options handed to the GuzzleHttp\Client constructor.
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        // Resolve the defaults through late static binding so that a subclass
        // redeclaring $defaultClientOptions is honoured, in line with the
        // "new static" instantiation below.
        if (count($clientOptions) === 0) {
            $clientOptions = static::$defaultClientOptions;
        }

        return new static(new Client($clientOptions));
    }
78
79
    /**
     * @param \GuzzleHttp\Client $client      HTTP client used to send the crawl requests.
     * @param int                $concurrency Value later passed as the request pool's "concurrency" option.
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        // Every crawler starts with an empty queue and the CrawlAllUrls
        // profile; setCrawlProfile() can swap the profile afterwards.
        $this->crawlQueue = new CrawlQueue();
        $this->crawlProfile = new CrawlAllUrls();

        $this->concurrency = $concurrency;
        $this->client = $client;
    }
89
90
    /**
     * Set the value passed as the "concurrency" option of the request pool.
     *
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }
101
102
    /**
     * Cap the number of URLs that may be added to the crawl queue.
     *
     * The counter this limit is compared against is incremented for every
     * URL queued through addToCrawlQueue(), the start URL included.
     *
     * @param int $maximumCrawlCount
     *
     * @return $this
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount)
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }
113
114
    /**
     * Limit how deep discovered links may sit in the depth tree.
     *
     * A discovered link is only queued when the depth of its tree node is
     * less than or equal to this value.
     *
     * @param int $maximumDepth
     *
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth)
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }
125
126
    /**
     * Enable link extraction from the Browsershot-rendered page body.
     *
     * @param string|null $pathToChromeBinary Stored and, when truthy, later passed to
     *                                        Browsershot::setChromePath().
     *
     * @return $this
     */
    public function executeJavaScript($pathToChromeBinary = null)
    {
        $this->pathToChromeBinary = $pathToChromeBinary;

        // The flag is a plain bool: this method switches it on and
        // doNotExecuteJavaScript() switches it off again.
        $this->executeJavaScript = true;

        return $this;
    }
137
138
    /**
     * Disable JavaScript execution, so links are extracted from the fetched
     * response body as-is.
     *
     * The stored Chrome binary path is left untouched.
     *
     * @return $this
     */
    public function doNotExecuteJavaScript()
    {
        $this->executeJavaScript = false;

        return $this;
    }
147
148
    /**
     * Register the observer that is notified before a URL is requested,
     * after it has been crawled and once crawling has finished.
     *
     * @param \Spatie\Crawler\CrawlObserver $crawlObserver
     *
     * @return $this
     */
    public function setCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObserver = $crawlObserver;

        return $this;
    }
159
160
    /**
     * Replace the profile whose shouldCrawl() decides which URLs are requested
     * and which discovered links are queued.
     *
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }
171
172
    /**
     * Crawl starting at the given URL.
     *
     * The start URL is queued, the depth tree is seeded with it as root, the
     * queue is processed until nothing is pending, and finally the observer's
     * finishedCrawling() is called. The observer is used unconditionally, so
     * setCrawlObserver() has to be called before this method.
     *
     * @param \Spatie\Crawler\Url|string $baseUrl Anything that is not a Url instance
     *                                            is converted with Url::create().
     */
    public function startCrawling($baseUrl)
    {
        $this->baseUrl = ($baseUrl instanceof Url)
            ? $baseUrl
            : Url::create($baseUrl);

        $this->addToCrawlQueue(CrawlUrl::create($this->baseUrl));

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        $this->crawlObserver->finishedCrawling();
    }
193
194
    /**
     * Process the crawl queue until it reports no more pending URLs.
     *
     * Each pass sends the requests produced by getCrawlRequests() through a
     * Guzzle pool, waits for the pool to settle and then removes the processed
     * URLs from the pending list. Links found in fulfilled responses are queued
     * during the pass.
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => function (ResponseInterface $response, int $index) {
                    // Notify the observer first, then decide whether to harvest links.
                    $this->handleResponse($response, $index);

                    $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($index);

                    // Unless subdomains are being crawled, links are only
                    // collected from pages served by the base URL's host.
                    if (! ($this->crawlProfile instanceof CrawlSubdomains)
                        && $crawlUrl->url->host !== $this->baseUrl->host
                    ) {
                        return;
                    }

                    // Reuse the entry fetched above instead of querying the
                    // queue a second time for the same index.
                    $this->addAllLinksToCrawlQueue(
                        (string) $response->getBody(),
                        $crawlUrl->url
                    );
                },
                'rejected' => function (RequestException $exception, int $index) {
                    // The exception may carry no response; handleResponse() accepts null.
                    $this->handleResponse($exception->getResponse(), $index);
                },
            ]);

            $pool->promise()->wait();

            $this->crawlQueue->removeProcessedUrlsFromPending();
        }
    }
227
228
    /**
     * Tell the observer that the pending URL at the given index has been crawled.
     *
     * @param ResponseInterface|null $response Response received for the URL, or null when none is available.
     * @param int $index Index used to look the URL up in the pending queue.
     */
    protected function handleResponse($response, int $index)
    {
        $pending = $this->crawlQueue->getPendingUrlAtIndex($index);

        $this->crawlObserver->hasBeenCrawled(
            $pending->url,
            $response,
            $pending->foundOnUrl
        );
    }
238
239
    /**
     * Lazily produce a GET request for every pending URL that still has to be fetched.
     *
     * Pending entries are visited by ascending queue index until
     * getPendingUrlAtIndex() returns a falsy value. Entries rejected by the
     * crawl profile or already marked as processed are skipped. For every
     * other entry the observer's willCrawl() is called and the entry is marked
     * as processed before its request is yielded.
     *
     * Each request is yielded under the queue index of its URL. Guzzle's pool
     * hands the iterator key to the fulfilled/rejected callbacks as $index,
     * and those callbacks use it with getPendingUrlAtIndex(). The generator's
     * automatic keys would number only the yielded requests, so they would
     * drift away from the queue index as soon as one entry is skipped.
     *
     * @return \Generator
     */
    protected function getCrawlRequests(): Generator
    {
        // "continue" still runs the loop increment, so skipped entries advance
        // the index exactly like yielded ones.
        for ($i = 0; $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($i); $i++) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            $this->crawlObserver->willCrawl($crawlUrl->url);

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $i => new Request('GET', (string) $crawlUrl->url);
        }
    }
262
263
    /**
     * Queue the links found in a page that pass every filter.
     *
     * A link is kept when it has a crawlable scheme, is accepted by the crawl
     * profile after normalisation and is not yet known to the queue. Every
     * kept link gets a node in the depth tree; it is queued only if that node
     * satisfies the depth limit and the maximum crawl count is not reached.
     *
     * @param string $html Markup of the page the links are taken from.
     * @param \Spatie\Crawler\Url $foundOnUrl URL of that page.
     */
    protected function addAllLinksToCrawlQueue(string $html, Url $foundOnUrl)
    {
        $queueable = collect($this->extractAllLinks($html, $foundOnUrl))
            ->filter(function (Url $link) {
                return $link->hasCrawlableScheme();
            })
            ->map(function (Url $link) {
                return $this->normalizeUrl($link);
            })
            ->filter(function (Url $link) {
                return $this->crawlProfile->shouldCrawl($link);
            })
            ->reject(function ($link) {
                return $this->crawlQueue->has($link);
            });

        foreach ($queueable as $link) {
            // The node is added to the tree even when the link ends up not being queued.
            $node = $this->addtoDepthTree($this->depthTree, (string) $link, (string) $foundOnUrl);

            if (! $this->shouldCrawl($node) || $this->maximumCrawlCountReached()) {
                continue;
            }

            $this->addToCrawlQueue(CrawlUrl::create($link, $foundOnUrl));
        }
    }
296
297
    /**
     * Check a depth-tree node against the configured maximum depth.
     *
     * @param \Tree\Node\Node $node
     *
     * @return bool True when no maximum depth is set or the node's depth does not exceed it.
     */
    protected function shouldCrawl(Node $node): bool
    {
        return is_null($this->maximumDepth)
            || $node->getDepth() <= $this->maximumDepth;
    }
305
306
    /**
     * Collect the target of every <a> link in a page as a Url object.
     *
     * With JavaScript execution enabled the given markup is ignored and the
     * page is rendered again through getBodyAfterExecutingJavaScript().
     *
     * @param string $html Markup to scan when JavaScript execution is disabled.
     * @param \Spatie\Crawler\Url $foundOnUrl URL of the page, passed to the DOM crawler as its URI.
     *
     * @return \Illuminate\Support\Collection Collection of \Spatie\Crawler\Url instances.
     */
    protected function extractAllLinks(string $html, Url $foundOnUrl): Collection
    {
        $markup = $this->executeJavaScript
            ? $this->getBodyAfterExecutingJavaScript($foundOnUrl)
            : $html;

        $anchors = (new DomCrawler($markup, $foundOnUrl))
            ->filterXpath('//a')
            ->links();

        return collect($anchors)->map(function (Link $anchor) {
            return Url::create($anchor->getUri());
        });
    }
319
320
    /**
     * Normalise a URL before it is filtered and queued; currently this only
     * removes the fragment.
     *
     * @param \Spatie\Crawler\Url $url
     *
     * @return \Spatie\Crawler\Url The result of Url::removeFragment().
     */
    protected function normalizeUrl(Url $url): Url
    {
        $withoutFragment = $url->removeFragment();

        return $withoutFragment;
    }
324
325
    /**
     * Attach a URL to the depth tree as a child of its parent URL.
     *
     * The tree is searched depth-first starting at $node; the new child is
     * added to the first node whose value equals $parentUrl.
     *
     * @param \Tree\Node\Node $node Root of the (sub)tree to search.
     * @param string $url Value of the node to create.
     * @param string $parentUrl Value of the node the new one is attached to.
     *
     * @return \Tree\Node\Node|null The newly created node, or null when no node holds $parentUrl.
     */
    protected function addtoDepthTree(Node $node, string $url, string $parentUrl)
    {
        if ($node->getValue() === $parentUrl) {
            $child = new Node($url);

            $node->addChild($child);

            return $child;
        }

        foreach ($node->getChildren() as $subtree) {
            $created = $this->addtoDepthTree($subtree, $url, $parentUrl);

            // Stop at the first subtree that contained the parent.
            if ($created !== null) {
                return $created;
            }
        }

        return null;
    }
347
348
    /**
     * Render a URL with Browsershot and return the resulting body markup.
     *
     * @param \Spatie\Crawler\Url $foundOnUrl URL handed to Browsershot::url().
     *
     * @return string Browsershot's bodyHtml() output passed through html_entity_decode().
     */
    protected function getBodyAfterExecutingJavaScript(Url $foundOnUrl): string
    {
        $renderer = Browsershot::url((string) $foundOnUrl);

        // Truthiness check: null and '' both skip setChromePath(), whereas a
        // strict "!== null" comparison would forward an empty path.
        if ($this->pathToChromeBinary) {
            $renderer->setChromePath($this->pathToChromeBinary);
        }

        return html_entity_decode($renderer->bodyHtml());
    }
360
361
    /**
     * Add a URL to the crawl queue and count it towards the maximum crawl count.
     *
     * The counter is incremented before the queue is asked to add the URL.
     *
     * @param \Spatie\Crawler\CrawlUrl $crawlUrl
     *
     * @return $this
     */
    protected function addToCrawlQueue(CrawlUrl $crawlUrl)
    {
        $this->crawledUrlCount += 1;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }
369
370
    /**
     * Tell whether the number of queued URLs has hit the configured maximum.
     *
     * @return bool Always false when no maximum crawl count is set.
     */
    protected function maximumCrawlCountReached(): bool
    {
        return ! is_null($this->maximumCrawlCount)
            && $this->crawledUrlCount >= $this->maximumCrawlCount;
    }
378
}
379