Completed
Pull Request — master (#99)
by Brent
03:55
created

Crawler::hasCrawlableScheme()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 4
rs 10
cc 1
eloc 2
nc 1
nop 1
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use GuzzleHttp\Psr7\Uri;
7
use Psr\Http\Message\UriInterface;
8
use Tree\Node\Node;
9
use GuzzleHttp\Pool;
10
use GuzzleHttp\Client;
11
use GuzzleHttp\Psr7\Request;
12
use GuzzleHttp\RequestOptions;
13
use Illuminate\Support\Collection;
14
use Spatie\Browsershot\Browsershot;
15
use Symfony\Component\DomCrawler\Link;
16
use Psr\Http\Message\ResponseInterface;
17
use Spatie\Crawler\CrawlQueue\CrawlQueue;
18
use GuzzleHttp\Exception\RequestException;
19
use Spatie\Crawler\CrawlQueue\CollectionCrawlQueue;
20
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
21
22
class Crawler
23
{
24
    /** @var \GuzzleHttp\Client HTTP client used to build the request pool. */
25
    protected $client;
26
27
    /** @var \Psr\Http\Message\UriInterface URL the crawl was started from; assigned in startCrawling(). */
28
    protected $baseUrl;
29
30
    /** @var \Spatie\Crawler\CrawlObserver Receives willCrawl / hasBeenCrawled / finishedCrawling calls. */
31
    protected $crawlObserver;
32
33
    /** @var \Spatie\Crawler\CrawlProfile Consulted via shouldCrawl() before a URL is queued or requested. */
34
    protected $crawlProfile;
35
36
    /** @var int Passed as the 'concurrency' option of the request pool. */
37
    protected $concurrency;
38
39
    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue Holds the URLs that are pending or already processed. */
40
    protected $crawlQueue;
41
42
    /** @var int Incremented every time a URL is added to the crawl queue. */
43
    protected $crawledUrlCount = 0;
44
45
    /** @var int|null Upper bound compared against $crawledUrlCount; null means no limit. */
46
    protected $maximumCrawlCount = null;
47
48
    /** @var int|null Deepest node depth that is still queued; null means no limit. */
49
    protected $maximumDepth = null;
50
51
    /** @var \Tree\Node\Node Root of the tree recording which URL was found on which page. */
52
    protected $depthTree;
53
54
    /** @var bool Whether link extraction renders the page with Browsershot first (toggled by executeJavaScript() / doNotExecuteJavaScript()). */
55
    protected $executeJavaScript = false;
56
57
    /** @var string|null Chrome binary path handed to Browsershot when set. */
58
    protected $pathToChromeBinary = null;
59
60
    /** @var array Client options used by create() when the caller passes none. */
    protected static $defaultClientOptions = [
61
        RequestOptions::COOKIES => true,
62
        RequestOptions::CONNECT_TIMEOUT => 10,
63
        RequestOptions::TIMEOUT => 10,
64
        RequestOptions::ALLOW_REDIRECTS => false,
65
    ];
66
67
    /**
     * Build a crawler around a freshly constructed Guzzle client.
     *
     * An empty options array selects the class-level default client options;
     * a non-empty array is used as-is (it is not merged with the defaults).
     *
     * @param array $clientOptions Options handed to the Guzzle client constructor.
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        if (! count($clientOptions)) {
            $clientOptions = self::$defaultClientOptions;
        }

        return new static(new Client($clientOptions));
    }
82
83
    /**
     * @param \GuzzleHttp\Client $client      HTTP client used for every crawl request.
     * @param int                $concurrency Value used as the request pool's 'concurrency' option.
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        // Default collaborators; both can be replaced through their setters.
        $this->crawlQueue = new CollectionCrawlQueue();
        $this->crawlProfile = new CrawlAllUrls();

        $this->concurrency = $concurrency;
        $this->client = $client;
    }
93
94
    /**
     * Set the value used as the 'concurrency' option of the request pool.
     *
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }
105
106
    /**
     * Limit how many URLs may be queued during a crawl.
     *
     * Once the internal counter of queued URLs reaches this value, newly
     * discovered links are no longer added to the crawl queue.
     *
     * @param int $maximumCrawlCount
     *
     * @return $this
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount)
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }
117
118
    /**
     * Limit how deep the crawl may go.
     *
     * Links whose node depth in the depth tree is greater than this value
     * are not added to the crawl queue.
     *
     * @param int $maximumDepth
     *
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth)
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }
129
130
    /**
     * Replace the queue that tracks pending and processed URLs.
     *
     * @param \Spatie\Crawler\CrawlQueue\CrawlQueue $crawlQueue
     *
     * @return $this
     */
    public function setCrawlQueue(CrawlQueue $crawlQueue)
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }
140
141
    /**
     * Enable rendering of pages with Browsershot before links are extracted.
     *
     * @param string|null $pathToChromeBinary Stored for the Browsershot call; it overwrites
     *                                        any previously stored path, including with null.
     *
     * @return $this
     */
    public function executeJavaScript($pathToChromeBinary = null)
    {
        $this->pathToChromeBinary = $pathToChromeBinary;
        $this->executeJavaScript = true;

        return $this;
    }
152
153
    /**
     * Disable JavaScript rendering for link extraction.
     *
     * Only the flag is cleared; a previously stored Chrome binary path is
     * left untouched.
     *
     * @return $this
     */
    public function doNotExecuteJavaScript()
    {
        $this->executeJavaScript = false;

        return $this;
    }
162
163
    /**
     * Register the observer that is notified before a URL is crawled,
     * after it has been crawled, and when the whole crawl has finished.
     *
     * @param \Spatie\Crawler\CrawlObserver $crawlObserver
     *
     * @return $this
     */
    public function setCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObserver = $crawlObserver;

        return $this;
    }
174
175
    /**
     * Replace the profile whose shouldCrawl() decides which URLs are
     * queued and requested.
     *
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }
186
187
    /**
     * Crawl starting from the given URL, then tell the observer the crawl is done.
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl Strings are wrapped in a Guzzle Uri.
     */
    public function startCrawling($baseUrl)
    {
        $startUrl = $baseUrl instanceof UriInterface ? $baseUrl : new Uri($baseUrl);

        // A URL without a path is given the root path "/".
        if ($startUrl->getPath() === '') {
            $startUrl = $startUrl->withPath('/');
        }

        $this->baseUrl = $startUrl;

        // The start URL is queued directly, without consulting the crawl profile.
        $this->addToCrawlQueue(CrawlUrl::create($startUrl));

        // The start URL is the root of the tree used for depth limiting.
        $this->depthTree = new Node((string) $startUrl);

        $this->startCrawlingQueue();

        $this->crawlObserver->finishedCrawling();
    }
212
213
    /**
     * Drain the crawl queue.
     *
     * As long as the queue reports pending URLs, a Guzzle pool is built over
     * the generator of pending requests and awaited. Links found while a pool
     * runs are added to the queue.
     */
    protected function startCrawlingQueue()
    {
        $onFulfilled = function (ResponseInterface $response, int $index) {
            $crawlUrl = $this->crawlQueue->getUrlById($index);
            $this->handleResponse($response, $crawlUrl);

            // Unless subdomains are being crawled, pages on another host are
            // reported to the observer but their links are not followed.
            $followsOtherHosts = $this->crawlProfile instanceof CrawlSubdomains;

            if (! $followsOtherHosts && $crawlUrl->url->getHost() !== $this->baseUrl->getHost()) {
                return;
            }

            $this->addAllLinksToCrawlQueue((string) $response->getBody(), $crawlUrl->url);
        };

        $onRejected = function (RequestException $exception, int $index) {
            // The response may be null when the request failed without one.
            $response = $exception->getResponse();
            $crawlUrl = $this->crawlQueue->getUrlById($index);

            $this->handleResponse($response, $crawlUrl);
        };

        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => $onFulfilled,
                'rejected' => $onRejected,
            ]);

            $pool->promise()->wait();
        }
    }
246
247
    /**
     * Report a crawled URL, its response and the page it was found on to the observer.
     *
     * @param \Psr\Http\Message\ResponseInterface|null $response Null when no response is available.
     * @param \Spatie\Crawler\CrawlUrl                 $crawlUrl
     */
    protected function handleResponse($response, CrawlUrl $crawlUrl)
    {
        $crawledUrl = $crawlUrl->url;
        $foundOnUrl = $crawlUrl->foundOnUrl;

        $this->crawlObserver->hasBeenCrawled($crawledUrl, $response, $foundOnUrl);
    }
255
256
    /**
     * Lazily produce one GET request per pending URL that should be crawled.
     *
     * Each yielded key is the CrawlUrl id, which the pool callbacks use to
     * look the URL up again. The observer's willCrawl() is called and the URL
     * is marked as processed before its request is yielded.
     *
     * @return \Generator
     */
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                // A URL rejected by the profile must still leave the pending
                // set, otherwise this loop would fetch it again and never end.
                // This is reachable: the start URL is queued without asking
                // the profile.
                // NOTE(review): presumes getFirstPendingUrl() keeps returning
                // an entry until markAsProcessed() is called for it — confirm
                // against the CrawlQueue implementation in use.
                $this->crawlQueue->markAsProcessed($crawlUrl);

                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            $this->crawlObserver->willCrawl($crawlUrl->url);

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }
274
275
    /**
     * Queue every crawlable link found in the given HTML.
     *
     * Links are kept only if they use a crawlable scheme, are accepted by the
     * crawl profile (after normalisation) and are not already in the queue.
     * Each remaining link is recorded in the depth tree, then queued unless
     * the depth or crawl-count limit forbids it.
     *
     * @param string                         $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl URL of the page the HTML came from.
     */
    protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl)
    {
        // The whole pipeline is evaluated before anything is queued, so the
        // "already in the queue" test sees the queue as it was beforehand.
        $candidateUrls = collect($this->extractAllLinks($html, $foundOnUrl))
            ->filter(function (UriInterface $url) {
                return $this->hasCrawlableScheme($url);
            })
            ->map(function (UriInterface $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (UriInterface $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function (UriInterface $url) {
                return $this->crawlQueue->has($url);
            });

        foreach ($candidateUrls as $url) {
            // The link is added to the depth tree even if it ends up not being queued.
            $node = $this->addtoDepthTree($this->depthTree, $url, $foundOnUrl);

            if (! $this->shouldCrawl($node) || $this->maximumCrawlCountReached()) {
                continue;
            }

            $this->addToCrawlQueue(CrawlUrl::create($url, $foundOnUrl));
        }
    }
308
309
    /**
     * Tell whether a depth-tree node is within the configured maximum depth.
     *
     * Always true when no maximum depth has been set.
     */
    protected function shouldCrawl(Node $node): bool
    {
        return is_null($this->maximumDepth)
            || $node->getDepth() <= $this->maximumDepth;
    }
317
318
    /**
     * Collect the target of every <a> element in the given HTML as a Uri.
     *
     * When JavaScript execution is enabled, the passed HTML is replaced by the
     * body rendered for $foundOnUrl before links are extracted. Links whose
     * URI cannot be parsed are left out of the result.
     *
     * @param string                         $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl Used as the base for resolving links.
     *
     * @return \Illuminate\Support\Collection Collection of \GuzzleHttp\Psr7\Uri instances.
     */
    protected function extractAllLinks(string $html, UriInterface $foundOnUrl): Collection
    {
        if ($this->executeJavaScript) {
            $html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
        }

        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXpath('//a')->links())
            ->map(function (Link $link) {
                try {
                    return new Uri($link->getUri());
                } catch (\InvalidArgumentException $exception) {
                    // A single malformed href must not abort the whole crawl;
                    // the link is dropped by the filter() below.
                    return null;
                }
            })
            ->filter();
    }
331
332
    /**
     * Return the URL with its fragment part emptied.
     */
    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        $withoutFragment = $url->withFragment('');

        return $withoutFragment;
    }
336
337
    /**
     * Tell whether the URI uses the http or https scheme.
     *
     * The scheme is compared strictly, so only the exact strings "http" and
     * "https" match.
     */
    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        return in_array($uri->getScheme(), ['http', 'https'], true);
    }
341
342
    /**
     * Attach $url as a child of the first node whose value equals $parentUrl.
     *
     * The search is depth-first, starting at $node itself.
     *
     * @param \Tree\Node\Node                $node      Node to start searching from.
     * @param \Psr\Http\Message\UriInterface $url       URL to add.
     * @param \Psr\Http\Message\UriInterface $parentUrl URL of the page the link was found on.
     *
     * @return \Tree\Node\Node|null The newly created node, or null when no node matches $parentUrl.
     */
    protected function addtoDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl)
    {
        if ($node->getValue() === (string) $parentUrl) {
            $child = new Node((string) $url);

            $node->addChild($child);

            return $child;
        }

        foreach ($node->getChildren() as $childNode) {
            $created = $this->addtoDepthTree($childNode, $url, $parentUrl);

            if ($created !== null) {
                return $created;
            }
        }

        return null;
    }
364
365
    /**
     * Render the given URL with Browsershot and return the entity-decoded body HTML.
     *
     * @param \Psr\Http\Message\UriInterface $foundOnUrl URL to render.
     *
     * @return string
     */
    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string
    {
        $browsershot = Browsershot::url((string) $foundOnUrl);

        $chromePath = $this->pathToChromeBinary;

        // Truthiness check: null, '' and '0' all leave the Chrome path unset.
        if ($chromePath) {
            $browsershot->setChromePath($chromePath);
        }

        return html_entity_decode($browsershot->bodyHtml());
    }
377
378
    /**
     * Add a URL to the crawl queue and count it towards the crawl limit.
     *
     * The counter is increased on every call, before the queue is asked to
     * add the URL.
     *
     * @param \Spatie\Crawler\CrawlUrl $crawlUrl
     *
     * @return $this
     */
    protected function addToCrawlQueue(CrawlUrl $crawlUrl)
    {
        $this->crawledUrlCount += 1;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }
386
387
    /**
     * Tell whether the number of queued URLs has reached the configured maximum.
     *
     * Always false when no maximum crawl count has been set.
     */
    protected function maximumCrawlCountReached(): bool
    {
        return ! is_null($this->maximumCrawlCount)
            && $this->crawledUrlCount >= $this->maximumCrawlCount;
    }
395
}
396