Completed
Pull Request — master (#41)
by Sebastian
03:51 queued 02:03
created

Crawler::addAllLinksToCrawlQueue()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 23
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 23
rs 9.0856
c 0
b 0
f 0
cc 1
eloc 14
nc 1
nop 2
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use GuzzleHttp\Pool;
7
use GuzzleHttp\Client;
8
use GuzzleHttp\Psr7\Request;
9
use GuzzleHttp\RequestOptions;
10
use Illuminate\Support\Collection;
11
use Psr\Http\Message\ResponseInterface;
12
use GuzzleHttp\Exception\RequestException;
13
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
14
15
class Crawler
16
{
17
    /** @var \GuzzleHttp\Client HTTP client handed to the Guzzle request pool. */
18
    protected $client;
19
20
    /** @var \Spatie\Crawler\Url URL passed to startCrawling(); links are normalized against its scheme, host and port. */
21
    protected $baseUrl;
22
23
    /** @var \Spatie\Crawler\CrawlObserver Receives the willCrawl, hasBeenCrawled and finishedCrawling notifications; not assigned by the constructor. */
24
    protected $crawlObserver;
25
26
    /** @var \Spatie\Crawler\CrawlProfile Decides through shouldCrawl() which URLs are crawled; the constructor sets a CrawlAllUrls instance. */
27
    protected $crawlProfile;
28
29
    /** @var int Value given to the request pool as its 'concurrency' option. */
30
    protected $concurrency;
31
32
    /** @var \Spatie\Crawler\CrawlQueue Holds the pending URLs and tracks which ones were already processed. */
33
    protected $crawlQueue;
34
35
    /**
     * Build a crawler around a freshly constructed Guzzle client.
     *
     * When no options are passed, the client is created with redirects
     * disabled and cookies enabled.
     *
     * @param array|null $clientOptions Options forwarded as-is to the Guzzle client constructor.
     *
     * @return static
     */
    public static function create(array $clientOptions = null)
    {
        if ($clientOptions === null) {
            $clientOptions = [
                RequestOptions::ALLOW_REDIRECTS => false,
                RequestOptions::COOKIES => true,
            ];
        }

        return new static(new Client($clientOptions));
    }
49
50
    /**
     * Set up the crawler with its HTTP client and default collaborators.
     *
     * The crawl profile defaults to CrawlAllUrls and the queue starts out as a
     * new CrawlQueue. No crawl observer is assigned here.
     *
     * @param \GuzzleHttp\Client $client Client used to send the crawl requests.
     * @param int $concurrency Value given to the request pool as its 'concurrency' option.
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;
        $this->concurrency = $concurrency;

        $this->crawlProfile = new CrawlAllUrls();
        $this->crawlQueue = new CrawlQueue();
    }
60
61
    /**
     * Change the value given to the request pool as its 'concurrency' option.
     *
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }
72
73
    /**
     * Register the observer that is notified before a URL is crawled, after it
     * has been crawled, and once crawling has finished.
     *
     * @param \Spatie\Crawler\CrawlObserver $crawlObserver
     *
     * @return $this
     */
    public function setCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObserver = $crawlObserver;

        return $this;
    }
84
85
    /**
     * Replace the profile whose shouldCrawl() decides which URLs are crawled.
     *
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }
96
97
    /**
     * Crawl starting at the given URL, then tell the observer crawling is done.
     *
     * A value that is not already a Url instance is converted with Url::create().
     * The resulting URL becomes the base URL and the first entry of the queue.
     *
     * @param \Spatie\Crawler\Url|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        $this->baseUrl = $baseUrl instanceof Url ? $baseUrl : Url::create($baseUrl);

        $this->crawlQueue->add(CrawlUrl::create($this->baseUrl));

        $this->startCrawlingQueue();

        $this->crawlObserver->finishedCrawling();
    }
116
117
    /**
     * Drain the crawl queue by sending its pending URLs through a Guzzle pool.
     *
     * Each pass builds a pool from getCrawlRequests(), waits for it to settle
     * and then removes the processed URLs from the pending list. The loop
     * repeats for as long as the queue reports pending URLs, so links queued
     * during one pass are handled as well.
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'fulfilled' => function (ResponseInterface $response, int $index) {
                    $this->handleResponse($response, $index);

                    $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($index);

                    // Only pages on the base host are scanned for further links.
                    if ($crawlUrl->url->host !== $this->baseUrl->host) {
                        return;
                    }

                    $this->addAllLinksToCrawlQueue(
                        (string) $response->getBody(),
                        $crawlUrl->url
                    );
                },
                'rejected' => function (RequestException $exception, int $index) {
                    $response = $exception->getResponse();

                    // getResponse() returns null when the request failed before a
                    // response was received. handleResponse() only accepts a real
                    // response, so passing null would raise a TypeError and abort
                    // the crawl; such failures are therefore not reported.
                    if ($response instanceof ResponseInterface) {
                        $this->handleResponse($response, $index);
                    }
                },
            ]);

            $pool->promise()->wait();

            $this->crawlQueue->removeProcessedUrlsFromPending();
        }
    }
147
148
    /**
     * Report a received response to the crawl observer.
     *
     * @param \Psr\Http\Message\ResponseInterface $response
     * @param int $index Index used to look the crawled URL up in the queue's pending list.
     */
    protected function handleResponse(ResponseInterface $response, int $index)
    {
        $crawled = $this->crawlQueue->getPendingUrlAtIndex($index);

        $this->crawlObserver->hasBeenCrawled(
            $crawled->url,
            $response,
            $crawled->foundOnUrl
        );
    }
154
155
    /**
     * Lazily yield a GET request for every pending URL that should be crawled.
     *
     * Pending URLs that the crawl profile rejects, or that are already marked
     * as processed, are skipped. For every other URL the observer is notified
     * through willCrawl() and the URL is marked as processed before its request
     * is yielded.
     *
     * Each request is keyed by the URL's index in the pending list, because the
     * pool callbacks look the crawled URL up again through
     * getPendingUrlAtIndex() using the key they are handed.
     *
     * @return \Generator
     */
    protected function getCrawlRequests(): Generator
    {
        for ($i = 0; $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($i); $i++) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            $this->crawlObserver->willCrawl($crawlUrl->url);

            $this->crawlQueue->markAsProcessed($crawlUrl);

            // The explicit key keeps the index seen by the pool callbacks aligned
            // with the pending list even when earlier entries were skipped;
            // auto-generated keys would count only the requests actually yielded.
            yield $i => new Request('GET', (string) $crawlUrl->url);
        }
    }
177
178
    /**
     * Queue every link found in the given HTML that qualifies for crawling.
     *
     * All links are first normalized against the base URL. They are then kept
     * only if they have a crawlable scheme, are accepted by the crawl profile
     * and are not in the queue yet. Each stage runs over the whole list before
     * the next stage starts.
     *
     * @param string $html
     * @param \Spatie\Crawler\Url $foundOnUrl URL recorded on each queued link as the place it was found.
     */
    protected function addAllLinksToCrawlQueue(string $html, Url $foundOnUrl)
    {
        $normalizedLinks = collect($this->extractAllLinks($html))
            ->map(function (Url $link) {
                return $this->normalizeUrl($link);
            });

        $crawlableLinks = $normalizedLinks
            ->filter(function (Url $link) {
                return $link->hasCrawlableScheme();
            })
            ->filter(function (Url $link) {
                return $this->crawlProfile->shouldCrawl($link);
            });

        $unseenLinks = $crawlableLinks->reject(function ($link) {
            return $this->crawlQueue->has($link);
        });

        $unseenLinks->each(function (Url $link) use ($foundOnUrl) {
            $this->crawlQueue->add(CrawlUrl::create($link, $foundOnUrl));
        });
    }
201
202
    /**
     * Turn the href attribute of every <a> element in the HTML into a Url.
     *
     * @param string $html
     *
     * @return \Illuminate\Support\Collection Collection of the values returned by Url::create().
     */
    protected function extractAllLinks(string $html): Collection
    {
        $anchors = (new DomCrawler($html))->filterXpath('//a');

        $hrefs = $anchors->extract(['href']);

        return collect($hrefs)->map(function ($href) {
            return Url::create($href);
        });
    }
211
212
    /**
     * Complete a URL from the base URL and strip its fragment.
     *
     * A relative URL receives the base URL's scheme, host and port. A
     * protocol-independent URL receives the base URL's scheme.
     *
     * @param \Spatie\Crawler\Url $url
     *
     * @return \Spatie\Crawler\Url The result of calling removeFragment() on the URL.
     */
    protected function normalizeUrl(Url $url): Url
    {
        $base = $this->baseUrl;

        if ($url->isRelative()) {
            $url->setScheme($base->scheme)
                ->setHost($base->host)
                ->setPort($base->port);
        }

        if ($url->isProtocolIndependent()) {
            $url->setScheme($base->scheme);
        }

        return $url->removeFragment();
    }
231
}
232