Completed
Pull Request — master (#68)
by Freek
02:41
created

Crawler::create()   A

Complexity

Conditions 2
Paths 1

Size

Total Lines 12
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 12
rs 9.4285
cc 2
eloc 8
nc 1
nop 1
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use GuzzleHttp\Pool;
7
use GuzzleHttp\Client;
8
use GuzzleHttp\Psr7\Request;
9
use GuzzleHttp\RequestOptions;
10
use Illuminate\Support\Collection;
11
use Spatie\Browsershot\Browsershot;
12
use Symfony\Component\DomCrawler\Link;
13
use Psr\Http\Message\ResponseInterface;
14
use GuzzleHttp\Exception\RequestException;
15
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
16
17
class Crawler
18
{
19
    /** @var \GuzzleHttp\Client HTTP client the request pool sends its requests through. */
20
    protected $client;
21
22
    /** @var \Spatie\Crawler\Url URL the current crawl started from; assigned by startCrawling(). */
23
    protected $baseUrl;
24
25
    /** @var \Spatie\Crawler\CrawlObserver Receives the willCrawl/hasBeenCrawled/finishedCrawling notifications. */
26
    protected $crawlObserver;
27
28
    /** @var \Spatie\Crawler\CrawlProfile Decides through shouldCrawl() whether a URL gets crawled. */
29
    protected $crawlProfile;
30
31
    /** @var int Concurrency value handed to the request pool. */
32
    protected $concurrency;
33
34
    /** @var \Spatie\Crawler\CrawlQueue Queue of crawl URLs this crawler adds to and reads pending entries from. */
35
    protected $crawlQueue;
36
37
    /** @var bool Whether page HTML is re-fetched through Browsershot before links are extracted. */
38
    protected $executeJavaScript = false;
39
40
    /**
     * Build a crawler backed by a freshly constructed Guzzle client.
     *
     * With an empty $clientOptions array the client gets the built-in
     * defaults below (cookies on, connect timeout and timeout of 10,
     * redirects not followed). A non-empty array is handed to the client
     * unchanged and replaces those defaults entirely; nothing is merged.
     *
     * @param array $clientOptions
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        if (count($clientOptions) === 0) {
            $clientOptions = [
                RequestOptions::COOKIES => true,
                RequestOptions::CONNECT_TIMEOUT => 10,
                RequestOptions::TIMEOUT => 10,
                RequestOptions::ALLOW_REDIRECTS => false,
            ];
        }

        return new static(new Client($clientOptions));
    }
57
58
    /**
     * Set up a crawler that crawls every URL (CrawlAllUrls profile) using
     * an empty crawl queue.
     *
     * @param \GuzzleHttp\Client $client      Client used for the crawl requests.
     * @param int                $concurrency Concurrency value later passed to the request pool.
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->crawlQueue = new CrawlQueue();
        $this->crawlProfile = new CrawlAllUrls();

        $this->concurrency = $concurrency;
        $this->client = $client;
    }
68
69
    /**
     * Change the concurrency value that is passed to the request pool.
     *
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }
80
81
    /**
     * Turn on JavaScript execution: link extraction will then work on the
     * HTML obtained through Browsershot instead of the raw response body.
     *
     * @return $this
     */
    public function executeJavaScript()
    {
        $this->executeJavaScript = true;

        return $this;
    }
90
91
    /**
     * Turn off JavaScript execution so links are extracted from the raw
     * response body.
     *
     * @return $this
     */
    public function doNotExecuteJavaScript()
    {
        $this->executeJavaScript = false;

        return $this;
    }
100
101
    /**
     * Register the observer that is notified before a URL is crawled,
     * after it has been crawled, and when crawling has finished.
     *
     * @param \Spatie\Crawler\CrawlObserver $crawlObserver
     *
     * @return $this
     */
    public function setCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObserver = $crawlObserver;

        return $this;
    }
112
113
    /**
     * Replace the profile whose shouldCrawl() decides which URLs are crawled.
     *
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }
124
125
    /**
     * Crawl starting at $baseUrl until the queue has no pending URLs left,
     * then tell the observer that crawling has finished.
     *
     * The observer is called unconditionally, so one must have been
     * registered with setCrawlObserver() before calling this method.
     *
     * @param \Spatie\Crawler\Url|string $baseUrl A string is converted with Url::create().
     */
    public function startCrawling($baseUrl)
    {
        $this->baseUrl = $baseUrl instanceof Url ? $baseUrl : Url::create($baseUrl);

        $this->crawlQueue->add(CrawlUrl::create($this->baseUrl));

        $this->startCrawlingQueue();

        $this->crawlObserver->finishedCrawling();
    }
144
145
    /**
     * Work through the crawl queue in batches until nothing is pending.
     *
     * Each pass builds a request pool from getCrawlRequests(), waits for it
     * to settle, and then removes the processed URLs from the pending list.
     * Links discovered while a pass is running are added to the queue by the
     * fulfilled callback.
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => function (ResponseInterface $response, int $index) {
                    $this->handleResponse($response, $index);

                    $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($index);

                    // Pages on another host are reported to the observer,
                    // but their links are not followed.
                    if ($crawlUrl->url->host !== $this->baseUrl->host) {
                        return;
                    }

                    // Reuse the entry fetched above instead of looking the
                    // same index up a second time.
                    $this->addAllLinksToCrawlQueue(
                        (string) $response->getBody(),
                        $crawlUrl->url
                    );
                },
                'rejected' => function (RequestException $exception, int $index) {
                    // The exception's response is forwarded as-is;
                    // handleResponse() accepts null.
                    $this->handleResponse($exception->getResponse(), $index);
                },
            ]);

            $pool->promise()->wait();

            $this->crawlQueue->removeProcessedUrlsFromPending();
        }
    }
176
177
    /**
     * Report a finished request to the observer.
     *
     * The pending queue entry at $index supplies the crawled URL and the URL
     * it was found on.
     *
     * @param ResponseInterface|null $response
     * @param int $index Index of the entry in the pending queue.
     */
    protected function handleResponse($response, int $index)
    {
        $crawled = $this->crawlQueue->getPendingUrlAtIndex($index);

        $this->crawlObserver->hasBeenCrawled(
            $crawled->url,
            $response,
            $crawled->foundOnUrl
        );
    }
187
188
    /**
     * Lazily produce a GET request for every pending URL that the crawl
     * profile accepts and that has not been processed yet.
     *
     * Each request is yielded under its position in the pending queue. The
     * pool callbacks receive that key and use it with
     * getPendingUrlAtIndex(), so the key has to be the queue position and
     * not a count of the requests yielded so far: the two drift apart as
     * soon as one entry is skipped.
     *
     * For every URL that is yielded the observer's willCrawl() is called and
     * the URL is marked as processed before the request is handed out.
     *
     * @return \Generator Requests keyed by pending-queue index.
     */
    protected function getCrawlRequests(): Generator
    {
        // The loop condition is re-evaluated after every resume, so entries
        // appended to the queue while the pool is running are picked up too.
        for ($i = 0; $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($i); $i++) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            $this->crawlObserver->willCrawl($crawlUrl->url);

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $i => new Request('GET', (string) $crawlUrl->url);
        }
    }
210
211
    /**
     * Queue every crawlable link found in $html.
     *
     * Links are kept only when they have a crawlable scheme, are normalized
     * with normalizeUrl(), and must then be accepted by the crawl profile.
     * Links already in the queue are skipped.
     *
     * @param string              $html       Markup to extract links from.
     * @param \Spatie\Crawler\Url $foundOnUrl Page the markup belongs to; recorded on each queued URL.
     */
    protected function addAllLinksToCrawlQueue(string $html, Url $foundOnUrl)
    {
        $this->extractAllLinks($html, $foundOnUrl)
            ->filter(function (Url $url) {
                return $url->hasCrawlableScheme();
            })
            ->map(function (Url $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (Url $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->each(function (Url $url) use ($foundOnUrl) {
                // Checked right before inserting rather than in a separate
                // pass over the whole list, so a link that occurs several
                // times on one page is queued once (assuming add() makes
                // the URL visible to has() - confirm against CrawlQueue).
                if ($this->crawlQueue->has($url)) {
                    return;
                }

                $this->crawlQueue->add(
                    CrawlUrl::create($url, $foundOnUrl)
                );
            });
    }
234
235
    /**
     * Collect the target of every <a> link in a page as Url objects.
     *
     * When JavaScript execution is enabled the passed $html is discarded and
     * replaced by the markup returned from getBodyAfterExecutingJavaScript().
     *
     * @param string              $html       Markup of the page.
     * @param \Spatie\Crawler\Url $foundOnUrl Page URL; passed to the DOM crawler as the document URI.
     *
     * @return \Illuminate\Support\Collection Collection of \Spatie\Crawler\Url.
     */
    protected function extractAllLinks(string $html, Url $foundOnUrl): Collection
    {
        if ($this->executeJavaScript) {
            $html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
        }

        // Cast explicitly, as the other call sites in this class do, rather
        // than relying on implicit object-to-string coercion.
        $domCrawler = new DomCrawler($html, (string) $foundOnUrl);

        return collect($domCrawler->filterXpath('//a')->links())
            ->map(function (Link $link) {
                return Url::create($link->getUri());
            });
    }
248
249
    /**
     * Normalize a URL before it is checked against the profile and queue.
     * Normalization currently consists of calling removeFragment() on it.
     */
    protected function normalizeUrl(Url $url): Url
    {
        $withoutFragment = $url->removeFragment();

        return $withoutFragment;
    }
253
254
    /**
     * Load the page through Browsershot and return its body HTML with HTML
     * entities decoded.
     *
     * @param \Spatie\Crawler\Url $foundOnUrl Page to load.
     *
     * @return string
     */
    protected function getBodyAfterExecutingJavaScript(Url $foundOnUrl): string
    {
        return html_entity_decode(
            Browsershot::url((string) $foundOnUrl)->bodyHtml()
        );
    }
260
}
261