This project does not seem to handle request data directly; as such, no vulnerable execution paths were found. Further execution paths might still be added dynamically, for example through include statements or via PHP's auto-loading mechanism. The analyzed class, Spatie\Crawler\Crawler, is reproduced below.
<?php

namespace Spatie\Crawler;

use Generator;
use GuzzleHttp\Client;
use GuzzleHttp\Pool;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\RequestOptions;
use Psr\Http\Message\UriInterface;
use Spatie\Browsershot\Browsershot;
use Spatie\Crawler\CrawlQueue\ArrayCrawlQueue;
use Spatie\Crawler\CrawlQueue\CrawlQueue;
use Spatie\Crawler\Exception\InvalidCrawlRequestHandler;
use Spatie\Crawler\Handlers\CrawlRequestFailed;
use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
use Spatie\Robots\RobotsTxt;
use Tree\Node\Node;

class Crawler
{
    public const DEFAULT_USER_AGENT = '*';

    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserverCollection */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int */
    protected $crawledUrlCount = 0;

    /** @var int|null */
    protected $maximumCrawlCount = null;

    /** @var int */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null */
    protected $maximumDepth = null;

    /** @var bool */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var Browsershot */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt */
    protected $robotsTxt = null;

    /** @var string */
    protected $crawlRequestFulfilledClass;

    /** @var string */
    protected $crawlRequestFailedClass;

    /** @var int */
    protected $delayBetweenRequests = 0;

    /** @var array */
    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
        RequestOptions::HEADERS => [
            'User-Agent' => self::DEFAULT_USER_AGENT,
        ],
    ];

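    // Note: a non-empty $clientOptions array replaces the defaults above
    // entirely; the two option sets are not merged.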
    public static function create(array $clientOptions = []): Crawler
    {
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : static::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }

||
102 | { |
||
103 | $this->client = $client; |
||
104 | |||
105 | $this->concurrency = $concurrency; |
||
106 | |||
107 | $this->crawlProfile = new CrawlAllUrls(); |
||
108 | |||
109 | $this->crawlQueue = new ArrayCrawlQueue(); |
||
110 | |||
111 | $this->crawlObservers = new CrawlObserverCollection(); |
||
112 | |||
113 | $this->crawlRequestFulfilledClass = CrawlRequestFulfilled::class; |
||
114 | |||
115 | $this->crawlRequestFailedClass = CrawlRequestFailed::class; |
||
116 | } |
||
117 | |||
118 | public function setConcurrency(int $concurrency): Crawler |
||
119 | { |
||
120 | $this->concurrency = $concurrency; |
||
121 | |||
122 | return $this; |
||
123 | } |
||
124 | |||
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes): Crawler
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }

    public function getMaximumResponseSize(): ?int
    {
        return $this->maximumResponseSize;
    }

    public function setMaximumCrawlCount(int $maximumCrawlCount): Crawler
    {
        $this->maximumCrawlCount = $maximumCrawlCount;

        return $this;
    }

    public function getMaximumCrawlCount(): ?int
    {
        return $this->maximumCrawlCount;
    }

    public function getCrawlerUrlCount(): int
    {
        return $this->crawledUrlCount;
    }

    public function setMaximumDepth(int $maximumDepth): Crawler
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }

    public function getMaximumDepth(): ?int
    {
        return $this->maximumDepth;
    }

    /**
     * @param int $delay The delay in milliseconds.
     *
     * @return Crawler
     */
    public function setDelayBetweenRequests(int $delay): Crawler
    {
        // The delay is stored in microseconds (milliseconds * 1000).
        $this->delayBetweenRequests = ($delay * 1000);

        return $this;
    }

    /**
     * @return int The delay in microseconds.
     */
    public function getDelayBetweenRequests(): int
    {
        return $this->delayBetweenRequests;
    }

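    // robots.txt handling: the file is fetched once per crawl (see
    // startCrawling()), and ignoreRobots() lets disallowed URLs be queued.
    // Note that getRobotsTxt() is only meaningful once a crawl has started.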
    public function ignoreRobots(): Crawler
    {
        $this->respectRobots = false;

        return $this;
    }

    public function respectRobots(): Crawler
    {
        $this->respectRobots = true;

        return $this;
    }

    public function mustRespectRobots(): bool
    {
        return $this->respectRobots;
    }

    public function getRobotsTxt(): RobotsTxt
    {
        return $this->robotsTxt;
    }

    public function setCrawlQueue(CrawlQueue $crawlQueue): Crawler
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }

    public function getCrawlQueue(): CrawlQueue
    {
        return $this->crawlQueue;
    }

    public function executeJavaScript(): Crawler
    {
        $this->executeJavaScript = true;

        return $this;
    }

    public function doNotExecuteJavaScript(): Crawler
    {
        $this->executeJavaScript = false;

        return $this;
    }

    public function mayExecuteJavascript(): bool
    {
        return $this->executeJavaScript;
    }

    /**
     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers): Crawler
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }

    public function setCrawlObservers(array $crawlObservers): Crawler
    {
        $this->crawlObservers = new CrawlObserverCollection($crawlObservers);

        return $this;
    }

    public function addCrawlObserver(CrawlObserver $crawlObserver): Crawler
    {
        $this->crawlObservers->addObserver($crawlObserver);

        return $this;
    }

    public function getCrawlObservers(): CrawlObserverCollection
    {
        return $this->crawlObservers;
    }

    public function setCrawlProfile(CrawlProfile $crawlProfile): Crawler
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    public function getCrawlProfile(): CrawlProfile
    {
        return $this->crawlProfile;
    }

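    // Custom request handlers must extend the shipped base handlers;
    // anything else is rejected with an InvalidCrawlRequestHandler exception.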
    public function setCrawlFulfilledHandlerClass(string $crawlRequestFulfilledClass): Crawler
    {
        $baseClass = CrawlRequestFulfilled::class;

        if (! is_subclass_of($crawlRequestFulfilledClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFulfilledClass, $baseClass);
        }

        $this->crawlRequestFulfilledClass = $crawlRequestFulfilledClass;

        return $this;
    }

    public function setCrawlFailedHandlerClass(string $crawlRequestFailedClass): Crawler
    {
        $baseClass = CrawlRequestFailed::class;

        if (! is_subclass_of($crawlRequestFailedClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFailedClass, $baseClass);
        }

        $this->crawlRequestFailedClass = $crawlRequestFailedClass;

        return $this;
    }

    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }

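    // Guzzle client configuration is immutable once the client is built, so
    // changing the User-Agent means constructing a fresh client with the
    // amended headers.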
    public function setUserAgent(string $userAgent): Crawler
    {
        $clientOptions = $this->client->getConfig();

        $headers = array_change_key_case($clientOptions['headers']);
        $headers['user-agent'] = $userAgent;

        $clientOptions['headers'] = $headers;

        $this->client = new Client($clientOptions);

        return $this;
    }

    public function getUserAgent(): string
    {
        $headers = $this->client->getConfig('headers');

        foreach (array_keys($headers) as $name) {
            if (strtolower($name) === 'user-agent') {
                return (string) $headers[$name];
            }
        }

        return static::DEFAULT_USER_AGENT;
    }

    public function getBrowsershot(): Browsershot
    {
        if (! $this->browsershot) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }

    public function getBaseUrl(): UriInterface
    {
        return $this->baseUrl;
    }

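    // startCrawling() normalizes the base URL (defaulting to the http scheme
    // and a "/" path), fetches robots.txt, seeds the queue when the URL is
    // allowed (or robots are ignored), drains the queue, and finally notifies
    // every observer that the crawl has finished.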
    /**
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme('http');
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);

        if ($this->robotsTxt->allows((string) $crawlUrl->url, $this->getUserAgent()) ||
            ! $this->respectRobots
        ) {
            $this->addToCrawlQueue($crawlUrl);
        }

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }
    }

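    // Depth tracking: recursively locate $parentUrl in the tree and attach
    // $url beneath it. Returns the new node, or null when the parent is not
    // (yet) part of the tree. Without a maximum depth, bookkeeping is skipped
    // and a detached node is returned instead.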
    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node
    {
        if (is_null($this->maximumDepth)) {
            return new Node((string) $url);
        }

        $node = $node ?? $this->depthTree;

        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }

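    // The queue is drained in batches: each Pool run streams pending URLs
    // from the generator below, keeping up to $concurrency requests in
    // flight, and the outer loop re-checks the queue for URLs discovered
    // while the pool was running.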
    protected function startCrawlingQueue()
    {
        while ($this->crawlQueue->hasPendingUrls()) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => new $this->crawlRequestFulfilledClass($this),
                'rejected' => new $this->crawlRequestFailedClass($this),
            ]);

            $promise = $pool->promise();

            $promise->wait();
        }
    }

    /**
     * @deprecated This function will be removed in the next major version
     */
    public function endsWith($haystack, $needle)
    {
        return strrpos($haystack, $needle) + strlen($needle) ===
            strlen($haystack);
    }

    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

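    // Lazily yields one GET request per pending URL. URLs rejected by the
    // crawl profile or already processed are skipped; observers are notified
    // via willCrawl() just before each request is handed to the pool.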
    protected function getCrawlRequests(): Generator
    {
        while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) {
            if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) {
                $this->crawlQueue->markAsProcessed($crawlUrl);
                continue;
            }

            if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) {
                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url);
            }

            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }

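    // Note: $crawledUrlCount counts URLs accepted into the queue rather than
    // responses received, so maximumCrawlCount effectively caps queued URLs.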
    public function addToCrawlQueue(CrawlUrl $crawlUrl): Crawler
    {
        if (! $this->getCrawlProfile()->shouldCrawl($crawlUrl->url)) {
            return $this;
        }

        if ($this->getCrawlQueue()->has($crawlUrl->url)) {
            return $this;
        }

        $this->crawledUrlCount++;

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    public function maximumCrawlCountReached(): bool
    {
        $maximumCrawlCount = $this->getMaximumCrawlCount();

        if (is_null($maximumCrawlCount)) {
            return false;
        }

        return $this->getCrawlerUrlCount() >= $maximumCrawlCount;
    }
}
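Taken together, the fluent setters above compose into the class's typical entry point. The sketch below is a minimal, illustrative usage example, not part of the analyzed file: the LogCrawlObserver class and its echo output are hypothetical, and the observer method signatures follow the package's CrawlObserver base class, which is not shown here and should be checked against the installed version.

<?php

use GuzzleHttp\Exception\RequestException;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlObserver;

// Hypothetical observer that logs crawl progress to stdout.
class LogCrawlObserver extends CrawlObserver
{
    public function crawled(
        UriInterface $url,
        ResponseInterface $response,
        ?UriInterface $foundOnUrl = null
    ) {
        echo "Crawled: {$url}" . PHP_EOL;
    }

    public function crawlFailed(
        UriInterface $url,
        RequestException $requestException,
        ?UriInterface $foundOnUrl = null
    ) {
        echo "Failed: {$url}" . PHP_EOL;
    }

    public function finishedCrawling()
    {
        echo 'Crawl finished.' . PHP_EOL;
    }
}

Crawler::create()                   // default Guzzle options from above
    ->setConcurrency(5)             // at most 5 requests in flight
    ->setMaximumDepth(2)            // follow links two levels deep
    ->setDelayBetweenRequests(150)  // 150 ms pause between requests
    ->setCrawlObserver(new LogCrawlObserver())
    ->startCrawling('https://example.com');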