Completed
Pull Request — master (#87)
by Peter
01:11
created

Crawler::getCrawlRequests()   A

Complexity

Conditions 4
Paths 4

Size

Total Lines 18
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 18
rs 9.2
c 0
b 0
f 0
cc 4
eloc 9
nc 4
nop 0
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use Tree\Node\Node;
7
use GuzzleHttp\Pool;
8
use GuzzleHttp\Client;
9
use GuzzleHttp\Psr7\Request;
10
use GuzzleHttp\RequestOptions;
11
use Illuminate\Support\Collection;
12
use Spatie\Browsershot\Browsershot;
13
use Symfony\Component\DomCrawler\Link;
14
use Psr\Http\Message\ResponseInterface;
15
use GuzzleHttp\Exception\RequestException;
16
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
17
18
class Crawler
19
{
20
    /** @var \GuzzleHttp\Client */
21
    protected $client;
22
23
    /** @var \Spatie\Crawler\Url */
24
    protected $baseUrl;
25
26
    /** @var \Spatie\Crawler\CrawlObserver */
27
    protected $crawlObserver;
28
29
    /** @var \Spatie\Crawler\CrawlProfile */
30
    protected $crawlProfile;
31
32
    /** @var int */
33
    protected $concurrency;
34
35
    /** @var \Spatie\Crawler\CrawlQueue */
36
    protected $crawlQueue;
37
38
    /** @var int */
39
    protected $crawledUrlCount = 0;
40
41
    /** @var int|null */
42
    protected $maximumCrawlCount = null;
43
44
    /** @var int|null */
45
    protected $maximumDepth = null;
46
47
    /** @var \Tree\Node\Node */
48
    protected $depthTree;
49
50
    /** @var bool */
51
    protected $executeJavaScript = false;
52
53
    /** @var string|null */
54
    protected $pathToChromeBinary = null;
55
56
    protected static $defaultClientOptions = [
57
        RequestOptions::COOKIES => true,
58
        RequestOptions::CONNECT_TIMEOUT => 10,
59
        RequestOptions::TIMEOUT => 10,
60
        RequestOptions::ALLOW_REDIRECTS => false,
61
    ];
62
63
    /**
     * Build a crawler around a freshly constructed Guzzle client.
     *
     * When $clientOptions is empty the class-level default options are used;
     * otherwise the given options are handed to the client as-is (they replace
     * the defaults, they are not merged with them).
     *
     * @param array $clientOptions Options passed to the Guzzle client constructor.
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        // Late static binding keeps this consistent with `new static` below:
        // a subclass that redeclares $defaultClientOptions gets its own defaults.
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : static::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }
78
79
    /**
     * Store the HTTP client and concurrency, and install the default
     * crawl profile (CrawlAllUrls) and crawl queue (CollectionCrawlQueue).
     *
     * @param \GuzzleHttp\Client $client
     * @param int $concurrency Concurrency later handed to the request pool.
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->concurrency = $concurrency;
        $this->client = $client;

        $this->crawlProfile = new CrawlAllUrls();
        $this->crawlQueue = new CollectionCrawlQueue();
    }
89
90
    /**
     * Set the concurrency value that is passed to the request pool.
     *
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }
101
102
    /**
     * Cap the number of URLs that may be added to the crawl queue.
     *
     * The cap is compared against the running count of queued URLs before
     * links found on a page are queued.
     *
     * @param int $maximumCrawlCount
     *
     * @return $this
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount)
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }
113
114
    /**
     * Set the deepest depth-tree level whose links are still queued.
     *
     * Links whose node in the depth tree lies deeper than this value are
     * not added to the crawl queue.
     *
     * @param int $maximumDepth
     *
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth)
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }
125
126
    /**
     * Replace the queue that holds the URLs to crawl.
     *
     * @param \Spatie\Crawler\CrawlQueue $crawlQueue
     *
     * @return $this
     */
    public function setCrawlQueue(CrawlQueue $crawlQueue)
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }
136
137
    /**
     * Turn on JavaScript execution and remember which Chrome binary to use.
     *
     * @param string|null $pathToChromeBinary Path handed to Browsershot when set;
     *                                        null leaves Browsershot's own choice untouched.
     *
     * @return $this
     */
    public function executeJavaScript($pathToChromeBinary = null)
    {
        $this->pathToChromeBinary = $pathToChromeBinary;

        $this->executeJavaScript = true;

        return $this;
    }
148
149
    /**
     * Turn JavaScript execution off again; links are then extracted from the
     * response body as received.
     *
     * @return $this
     */
    public function doNotExecuteJavaScript()
    {
        $this->executeJavaScript = false;

        return $this;
    }
158
159
    /**
     * Set the observer that is notified before a URL is crawled, after it
     * has been crawled, and once crawling has finished.
     *
     * @param \Spatie\Crawler\CrawlObserver $crawlObserver
     *
     * @return $this
     */
    public function setCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObserver = $crawlObserver;

        return $this;
    }
170
171
    /**
     * Set the profile that decides, per URL, whether it should be crawled.
     *
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }
182
183
    /**
     * Crawl starting from the given URL and notify the observer when done.
     *
     * The base URL is queued, used as the root of the depth tree, and the
     * queue is then processed until it has no pending URLs left.
     *
     * @param \Spatie\Crawler\Url|string $baseUrl
     *
     * @throws \LogicException When no crawl observer has been set.
     */
    public function startCrawling($baseUrl)
    {
        // The observer is called unconditionally while crawling and again at the
        // end of this method, so fail early with a clear message instead of a
        // "call to a member function on null" error in the middle of the crawl.
        if (is_null($this->crawlObserver)) {
            throw new \LogicException(
                'A crawl observer must be set with setCrawlObserver() before crawling.'
            );
        }

        if (! $baseUrl instanceof Url) {
            $baseUrl = Url::create($baseUrl);
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($baseUrl);

        $this->addToCrawlQueue($crawlUrl);

        // Root of the tree used to compute link depth for the maximum-depth check.
        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        $this->crawlObserver->finishedCrawling();
    }
204
205
    /**
     * Drain the crawl queue, one request pool at a time.
     *
     * A new pool is created for as long as the queue reports pending URLs,
     * because handling a fulfilled response can add more URLs to the queue.
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $requests = $this->getCrawlRequests();

            $onFulfilled = function (ResponseInterface $response, int $index) {
                $crawlUrl = $this->crawlQueue->getUrlById($index);

                $this->handleResponse($response, $crawlUrl);

                // Pages on a host other than the base URL's are reported to the
                // observer but their links are not followed, unless a
                // CrawlSubdomains profile is in use.
                $followsSubdomains = $this->crawlProfile instanceof CrawlSubdomains;

                if (! $followsSubdomains && $crawlUrl->url->host !== $this->baseUrl->host) {
                    return;
                }

                $body = (string) $response->getBody();

                $this->addAllLinksToCrawlQueue($body, $crawlUrl->url);
            };

            $onRejected = function (RequestException $exception, int $index) {
                // The exception's response (possibly null) is still reported.
                $response = $exception->getResponse();
                $crawlUrl = $this->crawlQueue->getUrlById($index);

                $this->handleResponse($response, $crawlUrl);
            };

            $pool = new Pool($this->client, $requests, [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => $onFulfilled,
                'rejected' => $onRejected,
            ]);

            $pool->promise()->wait();
        }
    }
238
239
    /**
     * Report a crawled URL, its response and the URL it was found on to the
     * crawl observer.
     *
     * @param \Psr\Http\Message\ResponseInterface|null $response
     * @param \Spatie\Crawler\CrawlUrl $crawlUrl
     */
    protected function handleResponse($response, CrawlUrl $crawlUrl)
    {
        $url = $crawlUrl->url;
        $foundOnUrl = $crawlUrl->foundOnUrl;

        $this->crawlObserver->hasBeenCrawled($url, $response, $foundOnUrl);
    }
247
248
    /**
     * Lazily yield a GET request for every pending URL that should be crawled.
     *
     * Each request is keyed by the crawl URL's id; the pool callbacks use that
     * key to look the crawl URL up in the queue again. The observer's
     * willCrawl() is called and the URL is marked as processed right before
     * its request is yielded.
     *
     * @return \Generator
     */
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                // Mark rejected URLs as processed too; otherwise a queue that
                // keeps returning the same first pending URL would make this
                // loop spin forever.
                // NOTE(review): assumes markAsProcessed() is what takes a URL
                // out of the pending set — confirm against the CrawlQueue
                // implementation in use.
                $this->crawlQueue->markAsProcessed($crawlUrl);

                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            $this->crawlObserver->willCrawl($crawlUrl->url);

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', (string) $crawlUrl->url);
        }
    }
266
267
    /**
     * Queue every link found in the given HTML that is eligible for crawling.
     *
     * Links are kept only if they have a crawlable scheme, pass the crawl
     * profile after normalization and are not in the queue yet. Each remaining
     * link is added to the depth tree; it is queued unless it exceeds the
     * maximum depth or the maximum crawl count has been reached.
     *
     * @param string $html
     * @param \Spatie\Crawler\Url $foundOnUrl URL of the page the HTML belongs to.
     */
    protected function addAllLinksToCrawlQueue(string $html, Url $foundOnUrl)
    {
        $candidates = collect($this->extractAllLinks($html, $foundOnUrl))
            ->filter(function (Url $url) {
                return $url->hasCrawlableScheme();
            })
            ->map(function (Url $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (Url $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function ($url) {
                return $this->crawlQueue->has($url);
            });

        $candidates->each(function (Url $url) use ($foundOnUrl) {
            // The node is added to the tree even when the link ends up not being queued.
            $node = $this->addtoDepthTree($this->depthTree, (string) $url, $foundOnUrl);

            if (! $this->shouldCrawl($node) || $this->maximumCrawlCountReached()) {
                return;
            }

            $this->addToCrawlQueue(CrawlUrl::create($url, $foundOnUrl));
        });
    }
300
301
    /**
     * Tell whether a depth-tree node is within the configured maximum depth.
     *
     * @param \Tree\Node\Node $node
     *
     * @return bool Always true when no maximum depth is set.
     */
    protected function shouldCrawl(Node $node): bool
    {
        return is_null($this->maximumDepth)
            || $node->getDepth() <= $this->maximumDepth;
    }
309
310
    /**
     * Collect the URLs of all anchor elements in a page.
     *
     * When JavaScript execution is enabled, the given HTML is replaced by the
     * body rendered for $foundOnUrl before links are extracted.
     *
     * @param string $html
     * @param \Spatie\Crawler\Url $foundOnUrl Passed to the DOM crawler as the page's URI.
     *
     * @return \Illuminate\Support\Collection Collection of Url instances.
     */
    protected function extractAllLinks(string $html, Url $foundOnUrl): Collection
    {
        if ($this->executeJavaScript) {
            $html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
        }

        $domCrawler = new DomCrawler($html, $foundOnUrl);

        $links = $domCrawler->filterXpath('//a')->links();

        return collect($links)->map(function (Link $link) {
            return Url::create($link->getUri());
        });
    }
323
324
    /**
     * Normalize a URL before it is checked against the profile and the queue;
     * currently this only removes the fragment.
     *
     * @param \Spatie\Crawler\Url $url
     *
     * @return \Spatie\Crawler\Url
     */
    protected function normalizeUrl(Url $url): Url
    {
        $withoutFragment = $url->removeFragment();

        return $withoutFragment;
    }
328
329
    /**
     * Attach $url as a child of the first node whose value equals $parentUrl.
     *
     * The tree is searched depth-first starting at $node.
     *
     * @param \Tree\Node\Node $node Root of the (sub)tree to search.
     * @param string $url
     * @param string $parentUrl
     *
     * @return \Tree\Node\Node|null The newly created node, or null when no
     *                              node with value $parentUrl was found.
     */
    protected function addtoDepthTree(Node $node, string $url, string $parentUrl)
    {
        if ($node->getValue() === $parentUrl) {
            $child = new Node($url);

            $node->addChild($child);

            return $child;
        }

        foreach ($node->getChildren() as $subtree) {
            $added = $this->addtoDepthTree($subtree, $url, $parentUrl);

            // Stop at the first subtree that accepted the new node.
            if ($added !== null) {
                return $added;
            }
        }

        return null;
    }
351
352
    /**
     * Fetch the body HTML of a URL through Browsershot and return it with
     * HTML entities decoded.
     *
     * @param \Spatie\Crawler\Url $foundOnUrl URL to load in the browser.
     *
     * @return string
     */
    protected function getBodyAfterExecutingJavaScript(Url $foundOnUrl): string
    {
        $browsershot = Browsershot::url((string) $foundOnUrl);

        $chromePath = $this->pathToChromeBinary;

        // Explicit check instead of a loose truthiness test on a string|null
        // value: only a non-empty string counts as a configured path, so a
        // path such as '0' is no longer silently ignored.
        if (is_string($chromePath) && $chromePath !== '') {
            $browsershot->setChromePath($chromePath);
        }

        $html = $browsershot->bodyHtml();

        return html_entity_decode($html);
    }
364
365
    /**
     * Add a crawl URL to the queue and count it towards the maximum crawl count.
     *
     * @param \Spatie\Crawler\CrawlUrl $crawlUrl
     *
     * @return $this
     */
    protected function addToCrawlQueue(CrawlUrl $crawlUrl)
    {
        // Counted before the queue is touched, matching the order callers observe.
        $this->crawledUrlCount += 1;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }
373
374
    /**
     * Tell whether the number of queued URLs has hit the configured maximum.
     *
     * @return bool Always false when no maximum crawl count is set.
     */
    protected function maximumCrawlCountReached(): bool
    {
        return ! is_null($this->maximumCrawlCount)
            && $this->crawledUrlCount >= $this->maximumCrawlCount;
    }
382
}
383