spatie/crawler
This project does not seem to handle request data directly; as such, no vulnerable execution paths were found. Note that data may still enter the application indirectly — for example via an include, or via PHP's auto-loading mechanism.
These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more
| 1 | <?php |
||
| 2 | |||
| 3 | namespace Spatie\Crawler; |
||
| 4 | |||
| 5 | use Generator; |
||
| 6 | use GuzzleHttp\Client; |
||
| 7 | use GuzzleHttp\Pool; |
||
| 8 | use GuzzleHttp\Psr7\Request; |
||
| 9 | use GuzzleHttp\Psr7\Uri; |
||
| 10 | use GuzzleHttp\RequestOptions; |
||
| 11 | use Psr\Http\Message\UriInterface; |
||
| 12 | use Spatie\Browsershot\Browsershot; |
||
| 13 | use Spatie\Crawler\CrawlQueue\ArrayCrawlQueue; |
||
| 14 | use Spatie\Crawler\CrawlQueue\CrawlQueue; |
||
| 15 | use Spatie\Crawler\Exception\InvalidCrawlRequestHandler; |
||
| 16 | use Spatie\Crawler\Handlers\CrawlRequestFailed; |
||
| 17 | use Spatie\Crawler\Handlers\CrawlRequestFulfilled; |
||
| 18 | use Spatie\Robots\RobotsTxt; |
||
| 19 | use Tree\Node\Node; |
||
| 20 | |||
| 21 | class Crawler |
||
| 22 | { |
||
| 23 | public const DEFAULT_USER_AGENT = '*'; |
||
| 24 | |||
| 25 | /** @var \GuzzleHttp\Client */ |
||
| 26 | protected $client; |
||
| 27 | |||
| 28 | /** @var \Psr\Http\Message\UriInterface */ |
||
| 29 | protected $baseUrl; |
||
| 30 | |||
| 31 | /** @var \Spatie\Crawler\CrawlObserverCollection */ |
||
| 32 | protected $crawlObservers; |
||
| 33 | |||
| 34 | /** @var \Spatie\Crawler\CrawlProfile */ |
||
| 35 | protected $crawlProfile; |
||
| 36 | |||
| 37 | /** @var int */ |
||
| 38 | protected $concurrency; |
||
| 39 | |||
| 40 | /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */ |
||
| 41 | protected $crawlQueue; |
||
| 42 | |||
| 43 | /** @var int */ |
||
| 44 | protected $crawledUrlCount = 0; |
||
| 45 | |||
| 46 | /** @var int|null */ |
||
| 47 | protected $maximumCrawlCount = null; |
||
| 48 | |||
| 49 | /** @var int */ |
||
| 50 | protected $maximumResponseSize = 1024 * 1024 * 2; |
||
| 51 | |||
| 52 | /** @var int|null */ |
||
| 53 | protected $maximumDepth = null; |
||
| 54 | |||
| 55 | /** @var bool */ |
||
| 56 | protected $respectRobots = true; |
||
| 57 | |||
| 58 | /** @var \Tree\Node\Node */ |
||
| 59 | protected $depthTree; |
||
| 60 | |||
| 61 | /** @var bool */ |
||
| 62 | protected $executeJavaScript = false; |
||
| 63 | |||
| 64 | /** @var Browsershot */ |
||
| 65 | protected $browsershot = null; |
||
| 66 | |||
| 67 | /** @var \Spatie\Robots\RobotsTxt */ |
||
| 68 | protected $robotsTxt = null; |
||
| 69 | |||
| 70 | /** @var string */ |
||
| 71 | protected $crawlRequestFulfilledClass; |
||
| 72 | |||
| 73 | /** @var string */ |
||
| 74 | protected $crawlRequestFailedClass; |
||
| 75 | |||
| 76 | /** @var int */ |
||
| 77 | protected $delayBetweenRequests = 0; |
||
| 78 | |||
| 79 | /** @var */ |
||
| 80 | protected static $defaultClientOptions = [ |
||
| 81 | RequestOptions::COOKIES => true, |
||
| 82 | RequestOptions::CONNECT_TIMEOUT => 10, |
||
| 83 | RequestOptions::TIMEOUT => 10, |
||
| 84 | RequestOptions::ALLOW_REDIRECTS => false, |
||
| 85 | RequestOptions::HEADERS => [ |
||
| 86 | 'User-Agent' => self::DEFAULT_USER_AGENT, |
||
| 87 | ], |
||
| 88 | ]; |
||
| 89 | |||
| 90 | public static function create(array $clientOptions = []): Crawler |
||
| 91 | { |
||
| 92 | $clientOptions = (count($clientOptions)) |
||
| 93 | ? $clientOptions |
||
| 94 | : static::$defaultClientOptions; |
||
| 95 | |||
| 96 | $client = new Client($clientOptions); |
||
| 97 | |||
| 98 | return new static($client); |
||
| 99 | } |
||
| 100 | |||
| 101 | public function __construct(Client $client, int $concurrency = 10) |
||
| 102 | { |
||
| 103 | $this->client = $client; |
||
| 104 | |||
| 105 | $this->concurrency = $concurrency; |
||
| 106 | |||
| 107 | $this->crawlProfile = new CrawlAllUrls(); |
||
| 108 | |||
| 109 | $this->crawlQueue = new ArrayCrawlQueue(); |
||
| 110 | |||
| 111 | $this->crawlObservers = new CrawlObserverCollection(); |
||
| 112 | |||
| 113 | $this->crawlRequestFulfilledClass = CrawlRequestFulfilled::class; |
||
| 114 | |||
| 115 | $this->crawlRequestFailedClass = CrawlRequestFailed::class; |
||
| 116 | } |
||
| 117 | |||
| 118 | public function setConcurrency(int $concurrency): Crawler |
||
| 119 | { |
||
| 120 | $this->concurrency = $concurrency; |
||
| 121 | |||
| 122 | return $this; |
||
| 123 | } |
||
| 124 | |||
| 125 | public function setMaximumResponseSize(int $maximumResponseSizeInBytes): Crawler |
||
| 126 | { |
||
| 127 | $this->maximumResponseSize = $maximumResponseSizeInBytes; |
||
| 128 | |||
| 129 | return $this; |
||
| 130 | } |
||
| 131 | |||
| 132 | public function getMaximumResponseSize(): ?int |
||
| 133 | { |
||
| 134 | return $this->maximumResponseSize; |
||
| 135 | } |
||
| 136 | |||
| 137 | public function setMaximumCrawlCount(int $maximumCrawlCount): Crawler |
||
| 138 | { |
||
| 139 | $this->maximumCrawlCount = $maximumCrawlCount; |
||
| 140 | |||
| 141 | return $this; |
||
| 142 | } |
||
| 143 | |||
| 144 | public function getMaximumCrawlCount(): ?int |
||
| 145 | { |
||
| 146 | return $this->maximumCrawlCount; |
||
| 147 | } |
||
| 148 | |||
| 149 | public function getCrawlerUrlCount(): int |
||
| 150 | { |
||
| 151 | return $this->crawledUrlCount; |
||
| 152 | } |
||
| 153 | |||
| 154 | public function setMaximumDepth(int $maximumDepth): Crawler |
||
| 155 | { |
||
| 156 | $this->maximumDepth = $maximumDepth; |
||
| 157 | |||
| 158 | return $this; |
||
| 159 | } |
||
| 160 | |||
| 161 | public function getMaximumDepth(): ?int |
||
| 162 | { |
||
| 163 | return $this->maximumDepth; |
||
| 164 | } |
||
| 165 | |||
| 166 | /** |
||
| 167 | * @param int $delay The delay in milliseconds. |
||
| 168 | * |
||
| 169 | * @return Crawler |
||
| 170 | */ |
||
| 171 | public function setDelayBetweenRequests(int $delay): Crawler |
||
| 172 | { |
||
| 173 | $this->delayBetweenRequests = ($delay * 1000); |
||
| 174 | |||
| 175 | return $this; |
||
| 176 | } |
||
| 177 | |||
| 178 | /** |
||
| 179 | * @return int The delay in milliseconds. |
||
| 180 | */ |
||
| 181 | public function getDelayBetweenRequests(): int |
||
| 182 | { |
||
| 183 | return $this->delayBetweenRequests; |
||
| 184 | } |
||
| 185 | |||
| 186 | public function ignoreRobots(): Crawler |
||
| 187 | { |
||
| 188 | $this->respectRobots = false; |
||
| 189 | |||
| 190 | return $this; |
||
| 191 | } |
||
| 192 | |||
| 193 | public function respectRobots(): Crawler |
||
| 194 | { |
||
| 195 | $this->respectRobots = true; |
||
| 196 | |||
| 197 | return $this; |
||
| 198 | } |
||
| 199 | |||
| 200 | public function mustRespectRobots(): bool |
||
| 201 | { |
||
| 202 | return $this->respectRobots; |
||
| 203 | } |
||
| 204 | |||
| 205 | public function getRobotsTxt(): RobotsTxt |
||
| 206 | { |
||
| 207 | return $this->robotsTxt; |
||
| 208 | } |
||
| 209 | |||
| 210 | public function setCrawlQueue(CrawlQueue $crawlQueue): Crawler |
||
| 211 | { |
||
| 212 | $this->crawlQueue = $crawlQueue; |
||
| 213 | |||
| 214 | return $this; |
||
| 215 | } |
||
| 216 | |||
| 217 | public function getCrawlQueue(): CrawlQueue |
||
| 218 | { |
||
| 219 | return $this->crawlQueue; |
||
| 220 | } |
||
| 221 | |||
| 222 | public function executeJavaScript(): Crawler |
||
| 223 | { |
||
| 224 | $this->executeJavaScript = true; |
||
| 225 | |||
| 226 | return $this; |
||
| 227 | } |
||
| 228 | |||
| 229 | public function doNotExecuteJavaScript(): Crawler |
||
| 230 | { |
||
| 231 | $this->executeJavaScript = false; |
||
| 232 | |||
| 233 | return $this; |
||
| 234 | } |
||
| 235 | |||
| 236 | public function mayExecuteJavascript(): bool |
||
| 237 | { |
||
| 238 | return $this->executeJavaScript; |
||
| 239 | } |
||
| 240 | |||
| 241 | /** |
||
| 242 | * @param \Spatie\Crawler\CrawlObserver|array[\Spatie\Crawler\CrawlObserver] $crawlObservers |
||
|
0 ignored issues
–
show
|
|||
| 243 | * |
||
| 244 | * @return $this |
||
| 245 | */ |
||
| 246 | public function setCrawlObserver($crawlObservers): Crawler |
||
| 247 | { |
||
| 248 | if (! is_array($crawlObservers)) { |
||
| 249 | $crawlObservers = [$crawlObservers]; |
||
| 250 | } |
||
| 251 | |||
| 252 | return $this->setCrawlObservers($crawlObservers); |
||
| 253 | } |
||
| 254 | |||
| 255 | public function setCrawlObservers(array $crawlObservers): Crawler |
||
| 256 | { |
||
| 257 | $this->crawlObservers = new CrawlObserverCollection($crawlObservers); |
||
| 258 | |||
| 259 | return $this; |
||
| 260 | } |
||
| 261 | |||
| 262 | public function addCrawlObserver(CrawlObserver $crawlObserver): Crawler |
||
| 263 | { |
||
| 264 | $this->crawlObservers->addObserver($crawlObserver); |
||
| 265 | |||
| 266 | return $this; |
||
| 267 | } |
||
| 268 | |||
| 269 | public function getCrawlObservers(): CrawlObserverCollection |
||
| 270 | { |
||
| 271 | return $this->crawlObservers; |
||
| 272 | } |
||
| 273 | |||
| 274 | public function setCrawlProfile(CrawlProfile $crawlProfile): Crawler |
||
| 275 | { |
||
| 276 | $this->crawlProfile = $crawlProfile; |
||
| 277 | |||
| 278 | return $this; |
||
| 279 | } |
||
| 280 | |||
| 281 | public function getCrawlProfile(): CrawlProfile |
||
| 282 | { |
||
| 283 | return $this->crawlProfile; |
||
| 284 | } |
||
| 285 | |||
| 286 | public function setCrawlFulfilledHandlerClass(string $crawlRequestFulfilledClass): Crawler |
||
| 287 | { |
||
| 288 | $baseClass = CrawlRequestFulfilled::class; |
||
| 289 | |||
| 290 | if (! is_subclass_of($crawlRequestFulfilledClass, $baseClass)) { |
||
| 291 | throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFulfilledClass, $baseClass); |
||
| 292 | } |
||
| 293 | |||
| 294 | $this->crawlRequestFulfilledClass = $crawlRequestFulfilledClass; |
||
| 295 | |||
| 296 | return $this; |
||
| 297 | } |
||
| 298 | |||
| 299 | public function setCrawlFailedHandlerClass(string $crawlRequestFailedClass): Crawler |
||
| 300 | { |
||
| 301 | $baseClass = CrawlRequestFailed::class; |
||
| 302 | |||
| 303 | if (! is_subclass_of($crawlRequestFailedClass, $baseClass)) { |
||
| 304 | throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFailedClass, $baseClass); |
||
| 305 | } |
||
| 306 | |||
| 307 | $this->crawlRequestFailedClass = $crawlRequestFailedClass; |
||
| 308 | |||
| 309 | return $this; |
||
| 310 | } |
||
| 311 | |||
| 312 | public function setBrowsershot(Browsershot $browsershot) |
||
| 313 | { |
||
| 314 | $this->browsershot = $browsershot; |
||
| 315 | |||
| 316 | return $this; |
||
| 317 | } |
||
| 318 | |||
| 319 | public function setUserAgent(string $userAgent): Crawler |
||
| 320 | { |
||
| 321 | $clientOptions = $this->client->getConfig(); |
||
| 322 | |||
| 323 | $headers = array_change_key_case($clientOptions['headers']); |
||
| 324 | $headers['user-agent'] = $userAgent; |
||
| 325 | |||
| 326 | $clientOptions['headers'] = $headers; |
||
| 327 | |||
| 328 | $this->client = new Client($clientOptions); |
||
| 329 | |||
| 330 | return $this; |
||
| 331 | } |
||
| 332 | |||
| 333 | public function getUserAgent(): string |
||
| 334 | { |
||
| 335 | $headers = $this->client->getConfig('headers'); |
||
| 336 | |||
| 337 | foreach (array_keys($headers) as $name) { |
||
| 338 | if (strtolower($name) === 'user-agent') { |
||
| 339 | return (string) $headers[$name]; |
||
| 340 | } |
||
| 341 | } |
||
| 342 | |||
| 343 | return static::DEFAULT_USER_AGENT; |
||
| 344 | } |
||
| 345 | |||
| 346 | public function getBrowsershot(): Browsershot |
||
| 347 | { |
||
| 348 | if (! $this->browsershot) { |
||
| 349 | $this->browsershot = new Browsershot(); |
||
| 350 | } |
||
| 351 | |||
| 352 | return $this->browsershot; |
||
| 353 | } |
||
| 354 | |||
| 355 | public function getBaseUrl(): UriInterface |
||
| 356 | { |
||
| 357 | return $this->baseUrl; |
||
| 358 | } |
||
| 359 | |||
| 360 | /** |
||
| 361 | * @param \Psr\Http\Message\UriInterface|string $baseUrl |
||
| 362 | */ |
||
| 363 | public function startCrawling($baseUrl) |
||
| 364 | { |
||
| 365 | if (! $baseUrl instanceof UriInterface) { |
||
| 366 | $baseUrl = new Uri($baseUrl); |
||
| 367 | } |
||
| 368 | |||
| 369 | if ($baseUrl->getScheme() === '') { |
||
| 370 | $baseUrl = $baseUrl->withScheme('http'); |
||
| 371 | } |
||
| 372 | |||
| 373 | if ($baseUrl->getPath() === '') { |
||
| 374 | $baseUrl = $baseUrl->withPath('/'); |
||
| 375 | } |
||
| 376 | |||
| 377 | $this->baseUrl = $baseUrl; |
||
| 378 | |||
| 379 | $crawlUrl = CrawlUrl::create($this->baseUrl); |
||
| 380 | |||
| 381 | $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url); |
||
| 382 | |||
| 383 | if ($this->robotsTxt->allows((string) $crawlUrl->url, $this->getUserAgent()) || |
||
| 384 | ! $this->respectRobots |
||
| 385 | ) { |
||
| 386 | $this->addToCrawlQueue($crawlUrl); |
||
| 387 | } |
||
| 388 | |||
| 389 | $this->depthTree = new Node((string) $this->baseUrl); |
||
| 390 | |||
| 391 | $this->startCrawlingQueue(); |
||
| 392 | |||
| 393 | foreach ($this->crawlObservers as $crawlObserver) { |
||
| 394 | $crawlObserver->finishedCrawling(); |
||
| 395 | } |
||
| 396 | } |
||
| 397 | |||
| 398 | public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node |
||
| 399 | { |
||
| 400 | if (is_null($this->maximumDepth)) { |
||
| 401 | return new Node((string) $url); |
||
| 402 | } |
||
| 403 | |||
| 404 | $node = $node ?? $this->depthTree; |
||
| 405 | |||
| 406 | $returnNode = null; |
||
| 407 | |||
| 408 | if ($node->getValue() === (string) $parentUrl) { |
||
| 409 | $newNode = new Node((string) $url); |
||
| 410 | |||
| 411 | $node->addChild($newNode); |
||
| 412 | |||
| 413 | return $newNode; |
||
| 414 | } |
||
| 415 | |||
| 416 | foreach ($node->getChildren() as $currentNode) { |
||
| 417 | $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode); |
||
| 418 | |||
| 419 | if (! is_null($returnNode)) { |
||
| 420 | break; |
||
| 421 | } |
||
| 422 | } |
||
| 423 | |||
| 424 | return $returnNode; |
||
| 425 | } |
||
| 426 | |||
| 427 | protected function startCrawlingQueue() |
||
| 428 | { |
||
| 429 | while ($this->crawlQueue->hasPendingUrls()) { |
||
| 430 | $pool = new Pool($this->client, $this->getCrawlRequests(), [ |
||
| 431 | 'concurrency' => $this->concurrency, |
||
| 432 | 'options' => $this->client->getConfig(), |
||
| 433 | 'fulfilled' => new $this->crawlRequestFulfilledClass($this), |
||
| 434 | 'rejected' => new $this->crawlRequestFailedClass($this), |
||
| 435 | ]); |
||
| 436 | |||
| 437 | $promise = $pool->promise(); |
||
| 438 | |||
| 439 | $promise->wait(); |
||
| 440 | } |
||
| 441 | } |
||
| 442 | |||
| 443 | /** |
||
| 444 | * @deprecated This function will be removed in the next major version |
||
| 445 | */ |
||
| 446 | public function endsWith($haystack, $needle) |
||
| 447 | { |
||
| 448 | return strrpos($haystack, $needle) + strlen($needle) === |
||
| 449 | strlen($haystack); |
||
| 450 | } |
||
| 451 | |||
| 452 | protected function createRobotsTxt(UriInterface $uri): RobotsTxt |
||
| 453 | { |
||
| 454 | return RobotsTxt::create($uri->withPath('/robots.txt')); |
||
| 455 | } |
||
| 456 | |||
| 457 | protected function getCrawlRequests(): Generator |
||
| 458 | { |
||
| 459 | while ($crawlUrl = $this->crawlQueue->getFirstPendingUrl()) { |
||
| 460 | if (! $this->crawlProfile->shouldCrawl($crawlUrl->url)) { |
||
| 461 | $this->crawlQueue->markAsProcessed($crawlUrl); |
||
| 462 | continue; |
||
| 463 | } |
||
| 464 | |||
| 465 | if ($this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)) { |
||
| 466 | continue; |
||
| 467 | } |
||
| 468 | |||
| 469 | foreach ($this->crawlObservers as $crawlObserver) { |
||
| 470 | $crawlObserver->willCrawl($crawlUrl->url); |
||
| 471 | } |
||
| 472 | |||
| 473 | $this->crawlQueue->markAsProcessed($crawlUrl); |
||
| 474 | |||
| 475 | yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url); |
||
| 476 | } |
||
| 477 | } |
||
| 478 | |||
| 479 | public function addToCrawlQueue(CrawlUrl $crawlUrl): Crawler |
||
| 480 | { |
||
| 481 | if (! $this->getCrawlProfile()->shouldCrawl($crawlUrl->url)) { |
||
| 482 | return $this; |
||
| 483 | } |
||
| 484 | |||
| 485 | if ($this->getCrawlQueue()->has($crawlUrl->url)) { |
||
| 486 | return $this; |
||
| 487 | } |
||
| 488 | |||
| 489 | $this->crawledUrlCount++; |
||
| 490 | |||
| 491 | $this->crawlQueue->add($crawlUrl); |
||
| 492 | |||
| 493 | return $this; |
||
| 494 | } |
||
| 495 | |||
| 496 | public function maximumCrawlCountReached(): bool |
||
| 497 | { |
||
| 498 | $maximumCrawlCount = $this->getMaximumCrawlCount(); |
||
| 499 | |||
| 500 | if (is_null($maximumCrawlCount)) { |
||
| 501 | return false; |
||
| 502 | } |
||
| 503 | |||
| 504 | return $this->getCrawlerUrlCount() >= $maximumCrawlCount; |
||
| 505 | } |
||
| 506 | } |
||
| 507 |
This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.