Completed
Push — master ( ea5032...a753e8 )
by Freek
01:13
created

Crawler::addToLinkTree()   A

Complexity

Conditions 4
Paths 4

Size

Total Lines 18
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 18
rs 9.2
cc 4
eloc 11
nc 4
nop 3
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use GuzzleHttp\Pool;
7
use GuzzleHttp\Client;
8
use GuzzleHttp\Psr7\Request;
9
use GuzzleHttp\RequestOptions;
10
use Illuminate\Support\Collection;
11
use Spatie\Browsershot\Browsershot;
12
use Symfony\Component\DomCrawler\Link;
13
use Psr\Http\Message\ResponseInterface;
14
use GuzzleHttp\Exception\RequestException;
15
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
16
use Tree\Node\Node;
17
18
class Crawler
19
{
20
    /** @var \GuzzleHttp\Client HTTP client used for every crawl request. */
    protected $client;

    /** @var \Spatie\Crawler\Url URL the crawl was started from; assigned in startCrawling(). */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserver Observer that is told about crawl events; assigned in setCrawlObserver(). */
    protected $crawlObserver;

    /** @var \Spatie\Crawler\CrawlProfile Profile that is asked whether a URL should be crawled. */
    protected $crawlProfile;

    /** @var int Value passed to the request pool as its 'concurrency' option. */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue Queue holding the pending and processed URLs. */
    protected $crawlQueue;

    /** @var int Deepest link level that is still queued; 0 disables the limit. */
    protected $maximumDepth = 0;

    /** @var \Tree\Node\Node Root of the tree that records under which URL each link was found. */
    protected $depthTree;

    /** @var bool Whether links are extracted from the Browsershot-rendered body instead of the raw response. */
    protected $executeJavaScript = false;

    /** @var string|null Chrome binary path handed to Browsershot; null when none was configured. */
    protected $pathToChromeBinary = null;
49
50
    /**
     * Build a crawler around a newly created Guzzle client.
     *
     * A non-empty $clientOptions array is passed to the client untouched; the
     * defaults below are used only when no options are given at all.
     *
     * @param array $clientOptions
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        $defaultOptions = [
            RequestOptions::COOKIES => true,
            RequestOptions::CONNECT_TIMEOUT => 10,
            RequestOptions::TIMEOUT => 10,
            RequestOptions::ALLOW_REDIRECTS => false,
        ];

        if (count($clientOptions) === 0) {
            $clientOptions = $defaultOptions;
        }

        return new static(new Client($clientOptions));
    }
67
68
    /**
     * Store the client and concurrency, and start with a crawl-everything
     * profile and an empty crawl queue.
     *
     * @param \GuzzleHttp\Client $client
     * @param int $concurrency
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->crawlQueue = new CrawlQueue();

        $this->crawlProfile = new CrawlAllUrls();

        $this->concurrency = $concurrency;

        $this->client = $client;
    }
78
79
    /**
     * Set the value passed to the request pool as its 'concurrency' option.
     *
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }
90
91
	/**
	 * Limit how deep links are followed; a value of 0 removes the limit.
	 *
	 * @param int $maximumDepth
	 *
	 * @return $this
	 */
	public function setMaximumDepth(int $maximumDepth)
	{
		$this->maximumDepth = $maximumDepth;

		return $this;
	}
102
103
    /**
     * Extract links from the Browsershot-rendered body of each page instead
     * of from the raw response body.
     *
     * @param string|null $pathToChromeBinary Chrome binary for Browsershot to use, if any
     *
     * @return $this
     */
    public function executeJavaScript($pathToChromeBinary = null)
    {
        $this->pathToChromeBinary = $pathToChromeBinary;

        $this->executeJavaScript = true;

        return $this;
    }
114
115
    /**
     * Go back to extracting links from the raw response body.
     *
     * The stored Chrome binary path is left untouched.
     *
     * @return $this
     */
    public function doNotExecuteJavaScript()
    {
        $this->executeJavaScript = false;

        return $this;
    }
124
125
    /**
     * Register the observer that is told about crawl events.
     *
     * @param \Spatie\Crawler\CrawlObserver $crawlObserver
     *
     * @return $this
     */
    public function setCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObserver = $crawlObserver;

        return $this;
    }
136
137
    /**
     * Replace the profile that is asked whether a URL should be crawled.
     *
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }
148
149
    /**
     * Crawl starting at the given URL, then notify the observer that the
     * crawl has finished.
     *
     * @param \Spatie\Crawler\Url|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        $this->baseUrl = $baseUrl instanceof Url ? $baseUrl : Url::create($baseUrl);

        $this->crawlQueue->add(CrawlUrl::create($this->baseUrl));

        // The start URL is the root of the depth tree.
        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        $this->crawlObserver->finishedCrawling();
    }
170
171
    /**
     * Work through the crawl queue until it has no pending URLs left.
     *
     * Each pass runs one Guzzle pool over the requests produced by
     * getCrawlRequests() and then removes the processed URLs from the
     * pending list.
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => function (ResponseInterface $response, int $index) {
                    $this->handleResponse($response, $index);

                    // Look the queue entry up once and reuse it below.
                    $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($index);

                    // Pages on another host are reported to the observer,
                    // but their links are not followed.
                    if ($crawlUrl->url->host !== $this->baseUrl->host) {
                        return;
                    }

                    $this->addAllLinksToCrawlQueue((string) $response->getBody(), $crawlUrl->url);
                },
                'rejected' => function (RequestException $exception, int $index) {
                    // The response is passed on as-is; handleResponse() accepts null.
                    $this->handleResponse($exception->getResponse(), $index);
                },
            ]);

            $pool->promise()->wait();

            $this->crawlQueue->removeProcessedUrlsFromPending();
        }
    }
202
203
    /**
     * Report the outcome for the pending URL at $index to the crawl observer.
     *
     * @param ResponseInterface|null $response
     * @param int $index position of the URL in the pending queue
     */
    protected function handleResponse($response, int $index)
    {
        $crawledUrl = $this->crawlQueue->getPendingUrlAtIndex($index);

        $this->crawlObserver->hasBeenCrawled(
            $crawledUrl->url,
            $response,
            $crawledUrl->foundOnUrl
        );
    }
213
214
    /**
     * Lazily produce a GET request for every pending URL that the crawl
     * profile accepts and that has not been processed yet.
     *
     * Each request is yielded under its position in the pending queue. The
     * pool passes that key to its callbacks as $index, and the callbacks use
     * it with getPendingUrlAtIndex(). With implicit generator keys the two
     * would drift apart as soon as one queue entry is skipped.
     *
     * @return \Generator
     */
    protected function getCrawlRequests(): Generator
    {
        for ($i = 0; $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($i); $i++) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            $this->crawlObserver->willCrawl($crawlUrl->url);

            // Marked before the request is sent so the same URL is never yielded twice.
            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $i => new Request('GET', (string) $crawlUrl->url);
        }
    }
237
238
    /**
     * Queue every link found in $html that is crawlable, accepted by the
     * crawl profile, not queued yet and within the configured depth.
     *
     * Each link that passes the filters is also recorded in the depth tree
     * below the page it was found on, whether or not it ends up queued.
     *
     * @param string $html
     * @param \Spatie\Crawler\Url $foundOnUrl page the links were found on
     */
    protected function addAllLinksToCrawlQueue(string $html, Url $foundOnUrl)
    {
        $this->extractAllLinks($html, $foundOnUrl)
            ->filter(function (Url $url) {
                return $url->hasCrawlableScheme();
            })
            ->map(function (Url $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (Url $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function ($url) {
                return $this->crawlQueue->has($url);
            })
            ->each(function (Url $url) use ($foundOnUrl) {
                $node = $this->addtoDepthTree($this->depthTree, (string) $url, (string) $foundOnUrl);

                // addtoDepthTree() returns null when the parent URL is not in
                // the tree. The depth is unknown then, so the link is queued
                // only when no depth limit is configured.
                $withinDepth = $node === null
                    ? $this->maximumDepth === 0
                    : $this->shouldCrawlAtDepth($node->getDepth());

                // No value is returned here: a false return would stop each().
                if ($withinDepth) {
                    $this->crawlQueue->add(
                        CrawlUrl::create($url, $foundOnUrl)
                    );
                }
            });
    }
265
266
    /**
     * Tell whether a link at the given depth may be queued.
     *
     * A maximum depth of 0 means there is no limit.
     *
     * @param int $depth
     *
     * @return bool
     */
    protected function shouldCrawlAtDepth(int $depth): bool
    {
        return $this->maximumDepth === 0 || $depth <= $this->maximumDepth;
    }
274
275
    /**
     * Collect the target of every <a> element as a Url.
     *
     * When JavaScript execution is enabled, the given $html is ignored and
     * the Browsershot-rendered body of $foundOnUrl is parsed instead.
     *
     * @param string $html
     * @param \Spatie\Crawler\Url $foundOnUrl
     *
     * @return \Illuminate\Support\Collection
     */
    protected function extractAllLinks(string $html, Url $foundOnUrl): Collection
    {
        $markup = $this->executeJavaScript
            ? $this->getBodyAfterExecutingJavaScript($foundOnUrl)
            : $html;

        $anchors = (new DomCrawler($markup, $foundOnUrl))
            ->filterXpath('//a')
            ->links();

        return collect($anchors)->map(function (Link $link) {
            return Url::create($link->getUri());
        });
    }
288
289
    /**
     * Normalize a URL before it is checked against the crawl profile and
     * queue; currently this only removes the fragment.
     *
     * @param \Spatie\Crawler\Url $url
     *
     * @return \Spatie\Crawler\Url
     */
    protected function normalizeUrl(Url $url): Url
    {
        $withoutFragment = $url->removeFragment();

        return $withoutFragment;
    }
293
294
	/**
	 * Attach $url as a child of the first node, searching depth-first from
	 * $node, whose value equals $parentUrl.
	 *
	 * @param \Tree\Node\Node $node root of the (sub)tree to search
	 * @param string $url
	 * @param string $parentUrl
	 *
	 * @return \Tree\Node\Node|null the new node, or null when no node holds $parentUrl
	 */
	protected function addtoDepthTree(Node $node, string $url, string $parentUrl)
	{
		if ($node->getValue() === $parentUrl) {
			$child = new Node($url);

			$node->addChild($child);

			return $child;
		}

		foreach ($node->getChildren() as $subtree) {
			$added = $this->addtoDepthTree($subtree, $url, $parentUrl);

			// Stop at the first subtree that accepted the new node.
			if ($added !== null) {
				return $added;
			}
		}

		return null;
	}
316
317
    /**
     * Load the page through Browsershot and return its body HTML with HTML
     * entities decoded.
     *
     * @param \Spatie\Crawler\Url $foundOnUrl page to load
     *
     * @return string
     */
    protected function getBodyAfterExecutingJavaScript(Url $foundOnUrl): string
    {
        $browsershot = Browsershot::url((string) $foundOnUrl);

        // Explicit check instead of a truthiness test on a string|null value:
        // only a non-empty string counts as a configured binary path.
        if (is_string($this->pathToChromeBinary) && $this->pathToChromeBinary !== '') {
            $browsershot->setChromePath($this->pathToChromeBinary);
        }

        return html_entity_decode($browsershot->bodyHtml());
    }
329
}
330