Completed
Pull Request — master (#65)
by
unknown
01:20
created

Crawler::setCrawlProfile()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 3
nc 1
nop 1
dl 0
loc 6
rs 9.4285
c 0
b 0
f 0
1
<?php
2
3
namespace Spatie\Crawler;
4
5
use Generator;
6
use GuzzleHttp\Pool;
7
use GuzzleHttp\Client;
8
use GuzzleHttp\Psr7\Request;
9
use GuzzleHttp\RequestOptions;
10
use Illuminate\Support\Collection;
11
use Spatie\Browsershot\Browsershot;
12
use Symfony\Component\DomCrawler\Link;
13
use Psr\Http\Message\ResponseInterface;
14
use GuzzleHttp\Exception\RequestException;
15
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
16
use Tree\Node\Node;
17
18
class Crawler
19
{
20
    /**
     * HTTP client used to send every crawl request.
     *
     * @var \GuzzleHttp\Client
     */
21
    protected $client;
22
23
    /**
     * URL the current crawl started from; assigned by startCrawling().
     *
     * @var \Spatie\Crawler\Url
     */
24
    protected $baseUrl;
25
26
    /**
     * Receives willCrawl(), hasBeenCrawled() and finishedCrawling() notifications.
     * Not initialised by the constructor; assigned through setCrawlObserver().
     *
     * @var \Spatie\Crawler\CrawlObserver
     */
27
    protected $crawlObserver;
28
29
    /**
     * Decides, via shouldCrawl(), which URLs are requested. The constructor sets a CrawlAllUrls instance.
     *
     * @var \Spatie\Crawler\CrawlProfile
     */
30
    protected $crawlProfile;
31
32
    /**
     * Value passed as the 'concurrency' option of the request pool.
     *
     * @var int
     */
33
    protected $concurrency;
34
35
    /**
     * Queue of URLs that are pending or have already been processed.
     *
     * @var \Spatie\Crawler\CrawlQueue
     */
36
    protected $crawlQueue;
37
38
	/**
	 * Deepest link-tree depth that is still queued; 0 disables the depth check.
	 *
	 * @var int
	 */
39
	protected $maximumDepth = 0;
40
41
	/**
	 * Tree of discovered URLs, rooted at the base URL string; used to derive the depth of a link.
	 *
	 * @var \Tree\Node\Node
	 */
42
	protected $linkTree;
43
44
    /**
     * Whether page bodies are fetched through Browsershot before links are extracted.
     * Set to true by executeJavaScript() and back to false by doNotExecuteJavaScript().
     *
     * @var bool
     */
45
    protected $executeJavaScript = false;
46
47
    /**
     * Chrome binary path handed to Browsershot when set; stored by executeJavaScript().
     *
     * @var string|null
     */
48
    protected $pathToChromeBinary = null;
49
50
    /**
     * Build a crawler around a freshly constructed Guzzle client.
     *
     * When $clientOptions is empty the client is configured with cookies enabled, connect and total
     * timeouts of 10, and redirects disabled. A non-empty array replaces those defaults entirely;
     * the two are not merged.
     *
     * @param array $clientOptions Options handed to the Guzzle client constructor.
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
    {
        $defaultOptions = [
            RequestOptions::COOKIES => true,
            RequestOptions::CONNECT_TIMEOUT => 10,
            RequestOptions::TIMEOUT => 10,
            RequestOptions::ALLOW_REDIRECTS => false,
        ];

        if (count($clientOptions) === 0) {
            $clientOptions = $defaultOptions;
        }

        return new static(new Client($clientOptions));
    }
67
68
    /**
     * Set up the crawler with an empty queue and a profile that crawls every URL.
     *
     * @param \GuzzleHttp\Client $client      Client used to send every crawl request.
     * @param int                $concurrency Value passed as the request pool's 'concurrency' option.
     */
    public function __construct(Client $client, int $concurrency = 10)
    {
        $this->crawlQueue = new CrawlQueue();
        $this->crawlProfile = new CrawlAllUrls();

        $this->concurrency = $concurrency;
        $this->client = $client;
    }
78
79
    /**
     * Change the 'concurrency' value given to the request pool.
     *
     * @param int $concurrency New concurrency value.
     *
     * @return $this The crawler itself, so calls can be chained.
     */
    public function setConcurrency(int $concurrency)
    {
        $this->concurrency = $concurrency;

        return $this;
    }
90
91
	/**
	 * Limit how deep in the link tree a discovered URL may sit and still be queued.
	 *
	 * @param int $maximumDepth Deepest allowed depth; 0 turns the depth check off.
	 *
	 * @return $this The crawler itself, so calls can be chained.
	 */
	public function setMaximumDepth(int $maximumDepth)
	{
		$this->maximumDepth = $maximumDepth;

		return $this;
	}
102
103
    /**
     * Turn on JavaScript execution: link extraction will read the page body through Browsershot
     * instead of using the raw response body.
     *
     * @param string|null $pathToChromeBinary Chrome binary path to hand to Browsershot; when left
     *                                        null no path is configured on Browsershot.
     *
     * @return $this The crawler itself, so calls can be chained.
     */
106
    public function executeJavaScript($pathToChromeBinary = null)
107
    {
108
        $this->executeJavaScript = true;
0 ignored issues
show
Documentation Bug introduced by
The property $executeJavaScript was declared of type false, but true is of type boolean. Maybe add a type cast?

This check looks for assignments to scalar types that may be of the wrong type.

To ensure the code behaves as expected, it may be a good idea to add an explicit type cast.

$answer = 42;

$correct = false;

$correct = (bool) $answer;
Loading history...
109
110
        $this->pathToChromeBinary = $pathToChromeBinary;
111
112
        return $this;
113
    }
114
115
    /**
     * Turn JavaScript execution off again; links are then extracted from the raw response body.
     * A previously stored Chrome binary path is left untouched.
     *
     * @return $this The crawler itself, so calls can be chained.
     */
    public function doNotExecuteJavaScript()
    {
        $this->executeJavaScript = false;

        return $this;
    }
124
125
    /**
     * Register the observer that is notified before and after each request and once crawling ends.
     *
     * @param \Spatie\Crawler\CrawlObserver $crawlObserver Observer to notify.
     *
     * @return $this The crawler itself, so calls can be chained.
     */
    public function setCrawlObserver(CrawlObserver $crawlObserver)
    {
        $this->crawlObserver = $crawlObserver;

        return $this;
    }
136
137
    /**
     * Replace the profile whose shouldCrawl() decides which URLs get requested.
     *
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile Profile to consult for every URL.
     *
     * @return $this The crawler itself, so calls can be chained.
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile)
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }
148
149
    /**
     * Crawl from the given URL until the queue has no pending URLs left, then tell the observer
     * that crawling has finished.
     *
     * @param \Spatie\Crawler\Url|string $baseUrl Starting point; anything that is not a Url instance
     *                                            is passed through Url::create().
     *
     * @throws \LogicException When no crawl observer has been set.
     */
    public function startCrawling($baseUrl)
    {
        // The constructor does not provide an observer, yet every stage of a crawl reports to it.
        // Fail up front with an actionable message instead of a null-method-call error later,
        // and before any crawler state has been modified.
        if ($this->crawlObserver === null) {
            throw new \LogicException(
                'A crawl observer must be set with setCrawlObserver() before calling startCrawling().'
            );
        }

        if (! $baseUrl instanceof Url) {
            $baseUrl = Url::create($baseUrl);
        }

        $this->baseUrl = $baseUrl;

        $this->crawlQueue->add(CrawlUrl::create($baseUrl));

        // Root of the tree that addAllLinksToCrawlQueue() uses to work out how deep a link sits.
        $this->linkTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        $this->crawlObserver->finishedCrawling();
    }
170
171
    /**
     * Drain the crawl queue by running request pools until no pending URLs remain.
     *
     * Each pass sends the requests produced by getCrawlRequests() through a Guzzle pool, waits for
     * the pool to settle, and then removes the processed URLs from the pending list. Links found
     * during a pass are added to the queue and keep the outer loop going.
     */
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => function (ResponseInterface $response, int $index) {
                    $this->handleResponse($response, $index);

                    $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($index);

                    // Pages on another host are reported to the observer, but their links are not followed.
                    if ($crawlUrl->url->host !== $this->baseUrl->host) {
                        return;
                    }

                    // Reuse the entry fetched above rather than looking it up a second time.
                    $this->addAllLinksToCrawlQueue((string) $response->getBody(), $crawlUrl->url);
                },
                'rejected' => function (RequestException $exception, int $index) {
                    // The exception's response is forwarded as-is, so the observer may receive null.
                    $this->handleResponse($exception->getResponse(), $index);
                },
            ]);

            $pool->promise()->wait();

            $this->crawlQueue->removeProcessedUrlsFromPending();
        }
    }
202
203
    /**
     * Report a finished request to the crawl observer.
     *
     * @param ResponseInterface|null $response Response to report; the rejected-request handler passes
     *                                         along whatever the exception carries, which may be null.
     * @param int                    $index    Index used to look the crawled URL up in the pending list.
     */
    protected function handleResponse($response, int $index)
    {
        $crawled = $this->crawlQueue->getPendingUrlAtIndex($index);

        $this->crawlObserver->hasBeenCrawled($crawled->url, $response, $crawled->foundOnUrl);
    }
213
214
    /**
     * Lazily produce a GET request for every pending URL that should be crawled.
     *
     * Walks the pending list by index until getPendingUrlAtIndex() returns a falsy value. Entries
     * rejected by the crawl profile or already processed are skipped. For every other entry the
     * observer is told via willCrawl() and the entry is marked as processed before its request is
     * yielded.
     *
     * @return Generator Requests keyed by the pending-list index of the URL they belong to.
     */
    protected function getCrawlRequests(): Generator
    {
        for ($i = 0; $crawlUrl = $this->crawlQueue->getPendingUrlAtIndex($i); $i++) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            $this->crawlObserver->willCrawl($crawlUrl->url);

            $this->crawlQueue->markAsProcessed($crawlUrl);

            // Guzzle's pool hands the iterator key to the fulfilled/rejected callbacks as $index,
            // and those callbacks use it to look the URL up in the pending list. A bare `yield`
            // numbers requests 0, 1, 2, ... and drifts away from $i as soon as an entry is
            // skipped, so the key is set to the queue index explicitly.
            yield $i => new Request('GET', (string) $crawlUrl->url);
        }
    }
236
237
    /**
     * Queue every crawlable, not-yet-known link found in a page.
     *
     * Extracted links are kept only if they have a crawlable scheme, are accepted by the crawl
     * profile after normalisation, and are not already in the queue. Each surviving link is
     * recorded in the link tree under $foundOnUrl, and is queued when no maximum depth is set
     * (0) or its tree depth does not exceed the maximum.
     *
     * @param string $html       Body of the page the links are extracted from.
     * @param Url    $foundOnUrl URL of that page; becomes the parent of the new tree nodes.
     */
    protected function addAllLinksToCrawlQueue(string $html, Url $foundOnUrl)
    {
        $allLinks = $this->extractAllLinks($html, $foundOnUrl);

        collect($allLinks)
            ->filter(function (Url $url) {
                return $url->hasCrawlableScheme();
            })
            ->map(function (Url $url) {
                return $this->normalizeUrl($url);
            })
            ->filter(function (Url $url) {
                return $this->crawlProfile->shouldCrawl($url);
            })
            ->reject(function ($url) {
                return $this->crawlQueue->has($url);
            })
            ->each(function (Url $url) use ($foundOnUrl) {
                // addToLinkTree() takes string URLs; cast explicitly instead of relying on implicit coercion.
                // NOTE(review): addToLinkTree() returns null when $foundOnUrl is not in the tree, and
                // getDepth() would then fail whenever a maximum depth is set — confirm that cannot happen.
                $node = $this->addToLinkTree($this->linkTree, (string) $url, (string) $foundOnUrl);

                if ($this->maximumDepth === 0 || $node->getDepth() <= $this->maximumDepth) {
                    $this->crawlQueue->add(CrawlUrl::create($url, $foundOnUrl));
                }
            });
    }
264
265
    /**
     * Collect the target of every <a> element in a page as a Url.
     *
     * When JavaScript execution is enabled, the supplied HTML is ignored and the body is fetched
     * again through getBodyAfterExecutingJavaScript().
     *
     * @param string $html       Page body to parse.
     * @param Url    $foundOnUrl URL of the page; handed to the DOM crawler alongside the HTML.
     *
     * @return Collection Url instances, one per link.
     */
    protected function extractAllLinks(string $html, Url $foundOnUrl): Collection
    {
        if ($this->executeJavaScript) {
            $html = $this->getBodyAfterExecutingJavaScript($foundOnUrl);
        }

        $anchors = (new DomCrawler($html, $foundOnUrl))->filterXpath('//a')->links();

        return collect($anchors)->map(function (Link $anchor) {
            return Url::create($anchor->getUri());
        });
    }
278
279
    /**
     * Normalise a URL before it is checked against the profile and the queue.
     * Delegates to Url::removeFragment().
     *
     * @param Url $url URL to normalise.
     *
     * @return Url Result of Url::removeFragment().
     */
    protected function normalizeUrl(Url $url): Url
    {
        $withoutFragment = $url->removeFragment();

        return $withoutFragment;
    }
283
284
	/**
	 * Attach $url as a new child of the first node whose value equals $parentUrl.
	 *
	 * The search starts at $node itself and then recurses into its children in order, stopping
	 * at the first match.
	 *
	 * @param \Tree\Node\Node $node      Root of the (sub)tree to search.
	 * @param string          $url       Value of the node to create.
	 * @param string          $parentUrl Value identifying the node to attach the new node to.
	 *
	 * @return \Tree\Node\Node|null The newly created node, or null when no node holds $parentUrl.
	 */
	protected function addToLinkTree(Node $node, string $url, string $parentUrl)
	{
		// Strict comparison: with ==, two numeric-looking strings would be compared as numbers.
		if ($node->getValue() === $parentUrl) {
			$newNode = new Node($url);
			$node->addChild($newNode);

			return $newNode;
		}

		foreach ($node->getChildren() as $childNode) {
			$addedNode = $this->addToLinkTree($childNode, $url, $parentUrl);

			if ($addedNode !== null) {
				return $addedNode;
			}
		}

		return null;
	}
307
308
    /**
     * Fetch a page through Browsershot and return its body HTML with HTML entities decoded.
     *
     * @param Url $foundOnUrl URL of the page to load.
     *
     * @return string Entity-decoded body HTML as reported by Browsershot.
     */
    protected function getBodyAfterExecutingJavaScript(Url $foundOnUrl): string
    {
        $browsershot = Browsershot::url((string) $foundOnUrl);

        // The path is string|null. A bare truthiness test is ambiguous for strings: it treats ''
        // and '0' alike as "not set". Spell the intent out: null and the empty string mean
        // "no custom binary", every other string is a path.
        if ($this->pathToChromeBinary !== null && $this->pathToChromeBinary !== '') {
            $browsershot->setChromePath($this->pathToChromeBinary);
        }

        return html_entity_decode($browsershot->bodyHtml());
    }
320
321
}
322