Completed
Pull Request — master (#65)
by
unknown
01:15
created

Crawler   B

Complexity

Total Complexity 25

Size/Duplication

Total Lines 262
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 16

Importance

Changes 10
Bugs 2 Features 1
Metric Value
wmc 25
c 10
b 2
f 1
lcom 1
cbo 16
dl 0
loc 262
rs 8.4614

14 Methods

Rating   Name   Duplication   Size   Complexity  
A create() 0 12 2
A __construct() 0 10 1
A setConcurrency() 0 6 1
A setDepth() 0 6 1
A setCrawlObserver() 0 6 1
A setCrawlProfile() 0 6 1
A startCrawling() 0 18 2
B startCrawlingQueue() 0 31 3
A handleResponse() 0 6 1
B getCrawlRequests() 0 22 4
B addAllLinksToCrawlQueue() 0 30 3
A extractAllLinks() 0 9 1
A normalizeUrl() 0 4 1
A addToLinkTree() 0 11 3
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use GuzzleHttp\Pool;
7
use GuzzleHttp\Client;
8
use GuzzleHttp\Psr7\Request;
9
use GuzzleHttp\RequestOptions;
10
use Illuminate\Support\Collection;
11
use Symfony\Component\DomCrawler\Link;
12
use Psr\Http\Message\ResponseInterface;
13
use GuzzleHttp\Exception\RequestException;
14
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
15
use Tree\Node\Node;
16
17
class Crawler
18
{
19
    /**
     * HTTP client the request pool is built on; its config is also passed
     * to the pool as the per-request options.
     *
     * @var \GuzzleHttp\Client
     */
    protected $client;

    /**
     * URL the crawl was started from. Only pages on this URL's host are
     * scanned for further links.
     *
     * @var \Spatie\Crawler\Url
     */
    protected $baseUrl;

    /**
     * Observer notified before a URL is crawled, after it has been crawled
     * and when crawling has finished. Not set by the constructor; assign it
     * through setCrawlObserver().
     *
     * @var \Spatie\Crawler\CrawlObserver
     */
    protected $crawlObserver;

    /**
     * Profile consulted through shouldCrawl() for every URL. The constructor
     * installs a CrawlAllUrls instance.
     *
     * @var \Spatie\Crawler\CrawlProfile
     */
    protected $crawlProfile;

    /**
     * Value passed to the request pool as its 'concurrency' option.
     *
     * @var int
     */
    protected $concurrency;

    /**
     * Queue holding the URLs that are pending or already processed.
     *
     * @var \Spatie\Crawler\CrawlQueue
     */
    protected $crawlQueue;

    /**
     * Maximum link-tree depth of URLs that get queued; 0 means no limit.
     *
     * @var int
     */
    protected $depth = 0;

    /**
     * Root of the tree of discovered links. Created in startCrawling() with
     * the base URL as its value.
     *
     * @var \Tree\Node\Node
     */
    protected $linkTree;
42
43
    /**
     * Build a crawler around a newly created Guzzle client.
     *
     * When no options are supplied the client is configured with cookies
     * enabled, connect and request timeouts of 10, and redirects disabled.
     *
     * @param array $clientOptions Guzzle client options; an empty array selects the defaults above.
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        if (count($clientOptions) === 0) {
            $clientOptions = [
                RequestOptions::COOKIES => true,
                RequestOptions::CONNECT_TIMEOUT => 10,
                RequestOptions::TIMEOUT => 10,
                RequestOptions::ALLOW_REDIRECTS => false,
            ];
        }

        return new static(new Client($clientOptions));
    }
60
61
    /**
     * @param \GuzzleHttp\Client $client Client used to send the crawl requests.
     * @param int $concurrency Value passed to the request pool as its 'concurrency' option.
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        // Defaults that can be replaced later: a CrawlAllUrls profile and a fresh queue.
        $this->crawlQueue = new CrawlQueue();
        $this->crawlProfile = new CrawlAllUrls();

        $this->concurrency = $concurrency;
        $this->client = $client;
    }
71
72
    /**
     * Set the value handed to the request pool as its 'concurrency' option.
     *
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }
83
84
	/**
85
	 * @param int $depth
86
	 *
87
	 * @return $this
88
	 */
89
	public function setDepth(int $depth)
90
	{
91
		$this->depth = $depth;
92
93
		return $this;
94
	}
95
96
    /**
     * Set the observer that is notified before a URL is crawled, after it
     * has been crawled and when crawling has finished.
     *
     * @param \Spatie\Crawler\CrawlObserver $crawlObserver
     *
     * @return $this
     */
    public function setCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObserver = $crawlObserver;

        return $this;
    }
107
108
    /**
     * Replace the profile whose shouldCrawl() decides which URLs are crawled.
     *
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }
119
120
    /**
     * Crawl starting at the given URL, then tell the observer that crawling
     * has finished.
     *
     * A crawl observer must have been assigned beforehand: the constructor
     * does not create one and this method calls it unconditionally.
     *
     * @param \Spatie\Crawler\Url|string $baseUrl Entry point; anything that is not a Url instance is passed to Url::create().
     */
    public function startCrawling($baseUrl)
    {
        $this->baseUrl = $baseUrl instanceof Url ? $baseUrl : Url::create($baseUrl);

        // The base URL seeds the queue and becomes the root of the link tree.
        $this->crawlQueue->add(CrawlUrl::create($this->baseUrl));
        $this->linkTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        $this->crawlObserver->finishedCrawling();
    }
141
142
    /**
     * Work through the crawl queue in passes until no URL is pending.
     *
     * Each pass sends the requests from getCrawlRequests() through a Guzzle
     * pool. A fulfilled request is reported to the observer and, when the
     * crawled URL is on the same host as the base URL, its body is scanned
     * for further links. A rejected request is reported with the response
     * carried by the exception. After the pool has finished, processed URLs
     * are removed from the pending list.
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => function (ResponseInterface $response, int $index) {
                    $this->handleResponse($response, $index);

                    $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($index);

                    // Links are only harvested from pages on the base URL's host.
                    if ($crawlUrl->url->host !== $this->baseUrl->host) {
                        return;
                    }

                    $this->addAllLinksToCrawlQueue((string) $response->getBody(), $crawlUrl->url);
                },
                'rejected' => function (RequestException $exception, int $index) {
                    // The exception's response is forwarded as-is; handleResponse()
                    // is declared to accept null.
                    $this->handleResponse($exception->getResponse(), $index);
                },
            ]);

            $pool->promise()->wait();

            $this->crawlQueue->removeProcessedUrlsFromPending();
        }
    }
173
174
    /**
     * Report a finished request to the crawl observer.
     *
     * @param ResponseInterface|null $response Response to report, or null when there is none.
     * @param int $index Index of the crawled URL in the queue's pending list.
     */
    protected function handleResponse($response, int $index)
    {
        $pending = $this->crawlQueue->getPendingUrlAtIndex($index);

        $this->crawlObserver->hasBeenCrawled(
            $pending->url,
            $response,
            $pending->foundOnUrl
        );
    }
184
185
    /**
     * Lazily produce a GET request for every pending URL that should be crawled.
     *
     * Pending URLs rejected by the crawl profile, or already processed, are
     * skipped. For every other URL the observer is told it will be crawled
     * and the URL is marked as processed before its request is yielded.
     *
     * Each request is yielded under the pending-list index of its URL. The
     * pool passes that key to its fulfilled/rejected callbacks, which use it
     * to look the URL up again, so the key must stay aligned with the pending
     * list even when earlier entries were skipped.
     *
     * @return \Generator
     */
    protected function getCrawlRequests(): Generator
    {
        for ($i = 0; $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($i); $i++) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            $this->crawlObserver->willCrawl($crawlUrl->url);

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $i => new Request('GET', (string) $crawlUrl->url);
        }
    }
207
208
    /**
     * Queue the links found in a crawled page.
     *
     * Links are extracted from the HTML, limited to crawlable schemes,
     * normalized, filtered through the crawl profile and dropped when the
     * queue already has them. Every remaining link is recorded in the link
     * tree beneath the page it was found on. It is queued when no depth limit
     * is set (depth 0) or when its depth in the tree does not exceed the limit.
     *
     * @param string $html HTML of the crawled page.
     * @param \Spatie\Crawler\Url $foundOnUrl URL of the crawled page.
     */
    protected function addAllLinksToCrawlQueue(string $html, Url $foundOnUrl)
    {
        $this->extractAllLinks($html, $foundOnUrl)
            ->filter(function (Url $url) {
                return $url->hasCrawlableScheme();
            })
            ->map(function (Url $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (Url $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function (Url $url) {
                return $this->crawlQueue->has($url);
            })
            ->each(function (Url $url) use ($foundOnUrl) {
                $newNode = null;

                $this->addToLinkTree($this->linkTree, (string) $url, (string) $foundOnUrl, $newNode);

                // $newNode is still null when the parent URL was not found in
                // the tree. Its depth is unknown then, so under a depth limit
                // the link is not queued instead of dereferencing null.
                $withinDepth = $this->depth === 0
                    || ($newNode !== null && $newNode->getDepth() <= $this->depth);

                if ($withinDepth) {
                    $this->crawlQueue->add(
                        CrawlUrl::create($url, $foundOnUrl)
                    );
                }
            });
    }
238
239
    /**
     * Collect the URI of every <a> element in the given HTML as a Url.
     *
     * @param string $html
     * @param \Spatie\Crawler\Url $foundOnUrl Passed to the DOM crawler as the document's URI.
     *
     * @return \Illuminate\Support\Collection
     */
    protected function extractAllLinks(string $html, Url $foundOnUrl): Collection
    {
        $anchors = (new DomCrawler($html, $foundOnUrl))->filterXpath('//a');

        $toUrl = function (Link $link) {
            return Url::create($link->getUri());
        };

        return collect($anchors->links())->map($toUrl);
    }
248
249
    /**
     * Normalize a URL before it is checked against the crawl profile and the
     * queue. Currently this only calls removeFragment() on it.
     *
     * @param \Spatie\Crawler\Url $url
     *
     * @return \Spatie\Crawler\Url
     */
    protected function normalizeUrl(Url $url): Url
    {
        $withoutFragment = $url->removeFragment();

        return $withoutFragment;
    }
258
259
260
	/**
261
	 * @param $node \Tree\Node\Node
262
	 * @param $url string
263
	 * @param $parentUrl string
264
	 * @param $newNode \Tree\Node\Node
265
	 */
266
	protected function addToLinkTree($node, string $url, string $parentUrl, &$newNode) {
267
268
    	if($node->getValue() == $parentUrl) {
269
    		$newNode = new Node($url);
270
			$node->addChild($newNode);
271
		}
272
273
		foreach($node->getChildren() as $currentNode) {
274
			$this->addToLinkTree($currentNode, $url, $parentUrl, $newNode);
275
		}
276
	}
277
278
}
279