Completed
Push — master ( c83bef...20fddf )
by Freek
09:52
created

Crawler::crawlUrl()   B

Complexity

Conditions 6
Paths 8

Size

Total Lines 30
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 30
rs 8.439
c 0
b 0
f 0
cc 6
eloc 16
nc 8
nop 1
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use GuzzleHttp\Client;
7
use GuzzleHttp\Exception\RequestException;
8
use GuzzleHttp\Psr7\Request;
9
use GuzzleHttp\RequestOptions;
10
use GuzzleHttp\Pool;
11
use Illuminate\Support\Collection;
12
use Psr\Http\Message\ResponseInterface;
13
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
14
15
class Crawler
16
{
17
    /**
18
     * @var \GuzzleHttp\Client
19
     */
20
    protected $client;
21
22
    /**
23
     * @var \Spatie\Crawler\Url
24
     */
25
    protected $baseUrl;
26
27
    /**
28
     * @var \Spatie\Crawler\CrawlObserver
29
     */
30
    protected $crawlObserver;
31
32
    /**
33
     * @var \Spatie\Crawler\CrawlProfile
34
     */
35
    protected $crawlProfile;
36
37
    /**
38
     * @var int
39
     */
40
    protected $concurrency;
41
42
    /**
43
     * @var \Spatie\Crawler\CrawlQueue
44
     */
45
    protected $crawlQueue;
46
47
    /**
     * Build a crawler backed by a freshly created Guzzle client.
     *
     * @param array|null $clientOptions Options handed to the Guzzle client. When
     *                                  null, a default set is used that turns off
     *                                  redirect following and turns on cookies.
     *
     * @return static
     */
    public static function create(array $clientOptions = null)
    {
        if ($clientOptions === null) {
            $clientOptions = [
                RequestOptions::ALLOW_REDIRECTS => false,
                RequestOptions::COOKIES => true,
            ];
        }

        return new static(new Client($clientOptions));
    }
61
62
    /**
     * @param \GuzzleHttp\Client $client      Client used to send the crawl requests.
     * @param int                $concurrency Value passed to the request pool as its 'concurrency' option.
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;
        $this->concurrency = $concurrency;

        // Default profile and an empty queue. No crawl observer is assigned
        // here; it has to be provided through setCrawlObserver().
        $this->crawlProfile = new CrawlAllUrls();
        $this->crawlQueue = new CrawlQueue();
    }
72
73
    /**
     * Set the value that is passed to the request pool as its 'concurrency' option.
     *
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }
84
85
    /**
     * Register the observer that is notified before a url is crawled, after a
     * url has been crawled, and once crawling has finished.
     *
     * @param \Spatie\Crawler\CrawlObserver $crawlObserver
     *
     * @return $this
     */
    public function setCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObserver = $crawlObserver;

        return $this;
    }
98
99
    /**
     * Replace the profile that decides, through shouldCrawl(), which urls are crawled.
     *
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }
112
113
    /**
     * Start the crawling process.
     *
     * Queues the base url, works through the crawl queue and then tells the
     * crawl observer that crawling has finished.
     *
     * @param \Spatie\Crawler\Url|string $baseUrl Anything that is not a Url instance is passed to Url::create().
     */
    public function startCrawling($baseUrl)
    {
        $this->baseUrl = $baseUrl instanceof Url ? $baseUrl : Url::create($baseUrl);

        $this->crawlQueue->add(CrawlUrl::create($this->baseUrl));

        $this->startCrawlingQueue();

        $this->crawlObserver->finishedCrawling();
    }
134
135
    /**
     * Crawl the pending urls, one request pool at a time, until the queue
     * reports no pending urls anymore.
     *
     * Every response is reported through handleResponse(). Links are only
     * collected from fulfilled responses whose url is on the same host as
     * the base url.
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'fulfilled' => function (ResponseInterface $response, int $index) {
                    $this->handleResponse($response, $index);

                    $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($index);

                    // Pages on another host are reported, but their links are not followed.
                    if ($crawlUrl->url->host !== $this->baseUrl->host) {
                        return;
                    }

                    $this->addAllLinksToCrawlQueue(
                        (string) $response->getBody(),
                        $crawlUrl->url
                    );
                },
                'rejected' => function (RequestException $exception, int $index) {
                    $response = $exception->getResponse();

                    // getResponse() can be null when no response was received.
                    // handleResponse() does not accept null, so passing it along
                    // would raise a TypeError inside the pool callback and abort
                    // the whole crawl. Without a response there is nothing to report.
                    if (! $response instanceof ResponseInterface) {
                        return;
                    }

                    $this->handleResponse($response, $index);
                },
            ]);

            $pool->promise()->wait();

            $this->crawlQueue->removeProcessedUrlsFromPending();
        }
    }
168
169
    /**
     * Report a received response to the crawl observer.
     *
     * @param \Psr\Http\Message\ResponseInterface $response
     * @param int                                 $index    Index of the crawled url in the pending queue.
     */
    protected function handleResponse(ResponseInterface $response, int $index)
    {
        $pendingUrl = $this->crawlQueue->getPendingUrlAtIndex($index);

        $this->crawlObserver->hasBeenCrawled(
            $pendingUrl->url,
            $response,
            $pendingUrl->foundOnUrl
        );
    }
175
176
    /**
     * Lazily yield a GET request for every pending url that should be crawled.
     *
     * Urls rejected by the crawl profile and urls that were already processed
     * are skipped. For every other url the observer is notified and the url is
     * marked as processed before its request is yielded.
     *
     * Each request is keyed by the url's index in the pending queue. The pool
     * hands that key to its 'fulfilled' and 'rejected' callbacks, which use it
     * to look the url up again. Relying on the generator's automatic keys would
     * let the two drift apart as soon as one pending url is skipped.
     *
     * @return \Generator
     */
    protected function getCrawlRequests(): Generator
    {
        for ($i = 0; $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($i); $i++) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            $this->crawlObserver->willCrawl($crawlUrl->url);

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $i => new Request('GET', (string) $crawlUrl->url);
        }
    }
198
199
    /**
     * Queue every crawlable link found in the given html.
     *
     * Email, tel and javascript links are dropped, the remaining urls are
     * normalized and checked against the crawl profile. Urls the queue
     * already has are not added again.
     *
     * @param string              $html
     * @param \Spatie\Crawler\Url $foundOnUrl Url of the page the html belongs to.
     */
    protected function addAllLinksToCrawlQueue(string $html, Url $foundOnUrl)
    {
        $this->extractAllLinks($html)
            ->reject(function (Url $link) {
                return $link->isEmailUrl() || $link->isTelUrl() || $link->isJavascript();
            })
            ->map(function (Url $link) {
                return $this->normalizeUrl($link);
            })
            ->filter(function (Url $link) {
                return $this->crawlProfile->shouldCrawl($link);
            })
            ->each(function (Url $link) use ($foundOnUrl) {
                // Checked per link, right before adding, so a url that occurs
                // more than once in the same html is only queued once.
                if ($this->crawlQueue->has($link)) {
                    return;
                }

                $this->crawlQueue->add(CrawlUrl::create($link, $foundOnUrl));
            });
    }
224
225
    /**
     * Collect the href attribute of every <a> element in the html as a Url.
     *
     * @param string $html
     *
     * @return \Illuminate\Support\Collection Collection of the values returned by Url::create().
     */
    protected function extractAllLinks(string $html): Collection
    {
        $hrefs = (new DomCrawler($html))
            ->filterXpath('//a')
            ->extract(['href']);

        return collect($hrefs)->map(function ($href) {
            return Url::create($href);
        });
    }
237
238
    /**
     * Normalize the given url.
     *
     * A relative url takes over the scheme, host and port of the base url, a
     * protocol independent url only its scheme. The fragment is removed in
     * every case.
     *
     * @param \Spatie\Crawler\Url $url
     *
     * @return \Spatie\Crawler\Url The result of removeFragment() on the given url.
     */
    protected function normalizeUrl(Url $url)
    {
        $base = $this->baseUrl;

        if ($url->isRelative()) {
            $url->setScheme($base->scheme)
                ->setHost($base->host)
                ->setPort($base->port);
        }

        if ($url->isProtocolIndependent()) {
            $url->setScheme($base->scheme);
        }

        return $url->removeFragment();
    }
259
}
260