Completed
Push — master ( e3f379...17fb44 )
by Freek
01:13
created

Crawler::doNotExecuteJavaScript()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 6
rs 9.4285
cc 1
eloc 3
nc 1
nop 0
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use GuzzleHttp\Pool;
7
use GuzzleHttp\Client;
8
use GuzzleHttp\Psr7\Request;
9
use GuzzleHttp\RequestOptions;
10
use Illuminate\Support\Collection;
11
use Spatie\Browsershot\Browsershot;
12
use Symfony\Component\DomCrawler\Link;
13
use Psr\Http\Message\ResponseInterface;
14
use GuzzleHttp\Exception\RequestException;
15
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
16
17
/**
 * Crawls a website starting from a base URL.
 *
 * Requests are sent concurrently through a Guzzle Pool. Every crawled URL is
 * reported to the configured CrawlObserver (if any), and links found on pages
 * of the base URL's host are queued for crawling when the CrawlProfile allows
 * them.
 */
class Crawler
{
    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Spatie\Crawler\Url */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserver|null */
    protected $crawlObserver;

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue */
    protected $crawlQueue;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var string|null */
    protected $pathToChromeBinary = null;

    /**
     * Build a crawler around a freshly created Guzzle client.
     *
     * When $clientOptions is empty the client is configured with the defaults
     * listed below; otherwise the given options are used as-is (they replace
     * the defaults, they are not merged with them).
     *
     * @param array $clientOptions
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        if (count($clientOptions) === 0) {
            $clientOptions = [
                RequestOptions::COOKIES => true,
                RequestOptions::CONNECT_TIMEOUT => 10,
                RequestOptions::TIMEOUT => 10,
                RequestOptions::ALLOW_REDIRECTS => false,
            ];
        }

        return new static(new Client($clientOptions));
    }

    /**
     * @param \GuzzleHttp\Client $client
     * @param int $concurrency Value passed as the Pool's 'concurrency' option.
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CrawlQueue();
    }

    /**
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }

    /**
     * Extract links from the HTML produced by Browsershot instead of from the
     * raw response body.
     *
     * @param string|null $pathToChromeBinary Chrome binary handed to
     *                                        Browsershot; null or '' leaves
     *                                        Browsershot's own choice in place.
     *
     * @return $this
     */
    public function executeJavaScript($pathToChromeBinary = null)
    {
        $this->executeJavaScript = true;

        $this->pathToChromeBinary = $pathToChromeBinary;

        return $this;
    }

    /**
     * Extract links from the raw response body (the default).
     *
     * The stored Chrome binary path is left untouched.
     *
     * @return $this
     */
    public function doNotExecuteJavaScript()
    {
        $this->executeJavaScript = false;

        return $this;
    }

    /**
     * @param \Spatie\Crawler\CrawlObserver $crawlObserver
     *
     * @return $this
     */
    public function setCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObserver = $crawlObserver;

        return $this;
    }

    /**
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    /**
     * Queue the base URL, crawl until the queue has no pending URLs left and
     * then notify the observer (when one is set) that crawling has finished.
     *
     * @param \Spatie\Crawler\Url|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof Url) {
            $baseUrl = Url::create($baseUrl);
        }

        $this->baseUrl = $baseUrl;

        $this->crawlQueue->add(CrawlUrl::create($baseUrl));

        $this->startCrawlingQueue();

        // The observer is optional: without this guard a crawl started before
        // setCrawlObserver() was called would end in a call on null.
        if ($this->crawlObserver !== null) {
            $this->crawlObserver->finishedCrawling();
        }
    }

    /**
     * Run request pools until the crawl queue reports no pending URLs.
     *
     * The $index received by the pool callbacks is the key yielded by
     * getCrawlRequests(), i.e. the position of the URL in the pending queue.
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => function (ResponseInterface $response, int $index) {
                    $this->handleResponse($response, $index);

                    $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($index);

                    // Links are only followed from pages on the base URL's host.
                    if ($crawlUrl->url->host !== $this->baseUrl->host) {
                        return;
                    }

                    $this->addAllLinksToCrawlQueue((string) $response->getBody(), $crawlUrl->url);
                },
                'rejected' => function ($reason, int $index) {
                    // Not every rejection reason carries a response (or is a
                    // RequestException at all), so the observer may get null.
                    $response = $reason instanceof RequestException ? $reason->getResponse() : null;

                    $this->handleResponse($response, $index);
                },
            ]);

            $pool->promise()->wait();

            $this->crawlQueue->removeProcessedUrlsFromPending();
        }
    }

    /**
     * Report the outcome of one request to the observer, when one is set.
     *
     * @param ResponseInterface|null $response
     * @param int $index Position of the crawled URL in the pending queue.
     */
    protected function handleResponse($response, int $index)
    {
        if ($this->crawlObserver === null) {
            return;
        }

        $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($index);

        $this->crawlObserver->hasBeenCrawled($crawlUrl->url, $response, $crawlUrl->foundOnUrl);
    }

    /**
     * Lazily yield a GET request for every pending URL that the crawl profile
     * accepts and that has not been processed yet.
     *
     * Each request is keyed by its position in the pending queue. With implicit
     * keys the generator would count yields only, so after a skipped URL the
     * pool callbacks would look up the wrong queue entry.
     *
     * NOTE(review): a URL skipped here is never marked as processed; confirm
     * that CrawlQueue cannot keep reporting it as pending forever.
     *
     * @return \Generator
     */
    protected function getCrawlRequests(): Generator
    {
        for ($i = 0; $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($i); $i++) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            if ($this->crawlObserver !== null) {
                $this->crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $i => new Request('GET', (string) $crawlUrl->url);
        }
    }

    /**
     * Queue every link found in $html that has a crawlable scheme, is accepted
     * by the crawl profile and is not in the queue yet.
     *
     * @param string $html
     * @param \Spatie\Crawler\Url $foundOnUrl Page the links were found on.
     */
    protected function addAllLinksToCrawlQueue(string $html, Url $foundOnUrl)
    {
        $allLinks = $this->extractAllLinks($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (Url $url) {
                return $url->hasCrawlableScheme();
            })
            ->map(function (Url $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (Url $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function ($url) {
                return $this->crawlQueue->has($url);
            })
            ->each(function (Url $url) use ($foundOnUrl) {
                $this->crawlQueue->add(
                    CrawlUrl::create($url, $foundOnUrl)
                );
            });
    }

    /**
     * Collect the URLs of all <a> elements on a page.
     *
     * When JavaScript execution is enabled, $html is replaced by the body
     * fetched through Browsershot for $foundOnUrl.
     *
     * @param string $html
     * @param \Spatie\Crawler\Url $foundOnUrl
     *
     * @return \Illuminate\Support\Collection Collection of Url instances.
     */
    protected function extractAllLinks(string $html, Url $foundOnUrl): Collection
    {
        if ($this->executeJavaScript) {
            $html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
        }

        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXpath('//a')->links())
            ->map(function (Link $link) {
                return Url::create($link->getUri());
            });
    }

    /**
     * Strip the fragment so URLs differing only by fragment are queued once.
     */
    protected function normalizeUrl(Url $url): Url
    {
        return $url->removeFragment();
    }

    /**
     * Fetch the body HTML of $foundOnUrl through Browsershot and return it
     * with HTML entities decoded.
     */
    protected function getBodyAfterExecutingJavaScript(Url $foundOnUrl): string
    {
        $browsershot = Browsershot::url((string) $foundOnUrl);

        // Explicit comparison instead of a truthiness test on string|null:
        // only a real, non-empty path overrides Browsershot's Chrome binary.
        if ($this->pathToChromeBinary !== null && $this->pathToChromeBinary !== '') {
            $browsershot->setChromePath($this->pathToChromeBinary);
        }

        $html = $browsershot->bodyHtml();

        return html_entity_decode($html);
    }
}
272