Completed
Pull Request — master (#65)
by
unknown
01:11
created

Crawler::setMaximumDepth()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 6
rs 9.4285
cc 1
eloc 3
nc 1
nop 1
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use GuzzleHttp\Pool;
7
use GuzzleHttp\Client;
8
use GuzzleHttp\Psr7\Request;
9
use GuzzleHttp\RequestOptions;
10
use Illuminate\Support\Collection;
11
use Symfony\Component\DomCrawler\Link;
12
use Psr\Http\Message\ResponseInterface;
13
use GuzzleHttp\Exception\RequestException;
14
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
15
use Tree\Node\Node;
16
17
/**
 * Crawls a website starting from a base URL.
 *
 * Requests are sent concurrently through a Guzzle pool. Every response is
 * reported to the configured CrawlObserver, and links found on pages that
 * share the base URL's host are queued for crawling in turn. A tree of the
 * discovered links is maintained so that crawling can optionally be limited
 * to a maximum link depth.
 *
 * A CrawlObserver must be set via setCrawlObserver() before startCrawling()
 * is called, because the observer is invoked unconditionally.
 */
class Crawler
{
    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Spatie\Crawler\Url */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserver */
    protected $crawlObserver;

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue */
    protected $crawlQueue;

    /** @var int Maximum link depth to crawl; 0 disables the limit. */
    protected $maximumDepth = 0;

    /** @var \Tree\Node\Node Root of the tree of discovered links (the base URL). */
    protected $linkTree;

    /**
     * Create a crawler backed by a new Guzzle client.
     *
     * When no options are given, the client is configured with cookies
     * enabled, 10 second connect/request timeouts and redirects disabled.
     *
     * @param array $clientOptions Guzzle client options; replaces the defaults entirely when non-empty.
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        $defaultOptions = [
            RequestOptions::COOKIES => true,
            RequestOptions::CONNECT_TIMEOUT => 10,
            RequestOptions::TIMEOUT => 10,
            RequestOptions::ALLOW_REDIRECTS => false,
        ];

        $client = new Client(count($clientOptions) > 0 ? $clientOptions : $defaultOptions);

        return new static($client);
    }

    /**
     * @param \GuzzleHttp\Client $client      Client used to send all crawl requests.
     * @param int                $concurrency Number of requests sent concurrently by the pool.
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->client = $client;

        $this->concurrency = $concurrency;

        $this->crawlProfile = new CrawlAllUrls();

        $this->crawlQueue = new CrawlQueue();
    }

    /**
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }

    /**
     * Limit how deep (in links from the base URL) the crawler follows links.
     *
     * @param int $maximumDepth Maximum depth; 0 means unlimited.
     *
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth)
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }

    /**
     * @param \Spatie\Crawler\CrawlObserver $crawlObserver
     *
     * @return $this
     */
    public function setCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObserver = $crawlObserver;

        return $this;
    }

    /**
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    /**
     * Crawl everything reachable from the given base URL, then notify the
     * observer that crawling has finished.
     *
     * @param \Spatie\Crawler\Url|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof Url) {
            $baseUrl = Url::create($baseUrl);
        }

        $this->baseUrl = $baseUrl;

        $this->crawlQueue->add(CrawlUrl::create($baseUrl));

        // The base URL is the root of the link tree (depth 0).
        $this->linkTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        $this->crawlObserver->finishedCrawling();
    }

    /**
     * Repeatedly run a request pool over the pending URLs until the queue
     * reports no more pending URLs.
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => function (ResponseInterface $response, int $index) {
                    $this->handleResponse($response, $index);

                    $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($index);

                    // Only pages on the base URL's host are scanned for further links.
                    if ($crawlUrl->url->host !== $this->baseUrl->host) {
                        return;
                    }

                    $this->addAllLinksToCrawlQueue(
                        (string) $response->getBody(),
                        $crawlUrl->url
                    );
                },
                'rejected' => function (RequestException $exception, int $index) {
                    // The exception's response may be null; handleResponse() accepts that.
                    $this->handleResponse($exception->getResponse(), $index);
                },
            ]);

            $promise = $pool->promise();
            $promise->wait();

            $this->crawlQueue->removeProcessedUrlsFromPending();
        }
    }

    /**
     * Report a crawled URL and its response to the observer.
     *
     * @param ResponseInterface|null $response
     * @param int $index Index of the URL in the pending queue.
     */
    protected function handleResponse($response, int $index)
    {
        $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($index);

        $this->crawlObserver->hasBeenCrawled($crawlUrl->url, $response, $crawlUrl->foundOnUrl);
    }

    /**
     * Lazily produce a request for every pending URL that should be crawled
     * and has not been processed yet.
     *
     * Each request is keyed by the URL's index in the pending queue. The pool
     * hands that key to its callbacks, which use it to look the URL up again,
     * so the key must stay aligned with the queue even when URLs are skipped.
     */
    protected function getCrawlRequests(): Generator
    {
        $i = 0;

        while ($crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($i)) {
            $index = $i++;

            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)
                || $this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)
            ) {
                continue;
            }

            $this->crawlObserver->willCrawl($crawlUrl->url);

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $index => new Request('GET', (string) $crawlUrl->url);
        }
    }

    /**
     * Extract the links from a page and queue those that are crawlable,
     * accepted by the crawl profile, not queued yet and within the maximum
     * depth. Every candidate link is also recorded in the link tree.
     *
     * @param string              $html       Body of the crawled page.
     * @param \Spatie\Crawler\Url $foundOnUrl URL of the crawled page.
     */
    protected function addAllLinksToCrawlQueue(string $html, Url $foundOnUrl)
    {
        $allLinks = $this->extractAllLinks($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (Url $url) {
                return $url->hasCrawlableScheme();
            })
            ->map(function (Url $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (Url $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function ($url) {
                return $this->crawlQueue->has($url);
            })
            ->each(function (Url $url) use ($foundOnUrl) {
                $node = $this->addToLinkTree($this->linkTree, (string) $url, (string) $foundOnUrl);

                if ($this->maximumDepth !== 0) {
                    // A null node means the parent page is not in the tree, so the
                    // depth is unknown; with a depth limit active such links are
                    // skipped rather than crashing on a null dereference.
                    if ($node === null || $node->getDepth() > $this->maximumDepth) {
                        return;
                    }
                }

                $this->crawlQueue->add(
                    CrawlUrl::create($url, $foundOnUrl)
                );
            });
    }

    /**
     * Collect the target URLs of all anchor elements in the given HTML.
     *
     * @param string              $html
     * @param \Spatie\Crawler\Url $foundOnUrl Passed to the DOM crawler as the document URI.
     *
     * @return \Illuminate\Support\Collection Collection of \Spatie\Crawler\Url.
     */
    protected function extractAllLinks(string $html, Url $foundOnUrl): Collection
    {
        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXpath('//a')->links())
            ->map(function (Link $link) {
                return Url::create($link->getUri());
            });
    }

    /**
     * @param \Spatie\Crawler\Url $url
     *
     * @return \Spatie\Crawler\Url
     */
    protected function normalizeUrl(Url $url): Url
    {
        return $url->removeFragment();
    }

    /**
     * Add a URL to the link tree as a child of the first node (searched
     * depth-first from $node) whose value equals the parent URL.
     *
     * @param \Tree\Node\Node $node      Node at which to start the search.
     * @param string          $url       URL to add.
     * @param string          $parentUrl URL of the page on which $url was found.
     *
     * @return \Tree\Node\Node|null The newly created node, or null when no
     *                              node with the parent URL exists below $node.
     */
    protected function addToLinkTree($node, string $url, string $parentUrl)
    {
        // Strict comparison: URLs are strings and must never be compared numerically.
        if ($node->getValue() === $parentUrl) {
            $newNode = new Node($url);
            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToLinkTree($currentNode, $url, $parentUrl);

            if ($returnNode !== null) {
                return $returnNode;
            }
        }

        return null;
    }
}
285