This project does not seem to handle request data directly; as such, no vulnerable execution paths were found.
include
, or for example
via PHP's auto-loading mechanism.
These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more
1 | <?php |
||
2 | |||
3 | namespace MediaMonks\Crawler; |
||
4 | |||
5 | use MediaMonks\Crawler\Client\CrawlerClientInterface; |
||
6 | use MediaMonks\Crawler\Client\GoutteClient; |
||
7 | use MediaMonks\Crawler\Exception\RequestException; |
||
8 | use MediaMonks\Crawler\Exception\UnsupportedUrlException; |
||
9 | use MediaMonks\Crawler\Url\Matcher\UrlMatcherInterface; |
||
10 | use MediaMonks\Crawler\Url\Normalizer\UrlNormalizerInterface; |
||
11 | use MediaMonks\Crawler\Url\UrlCollection; |
||
12 | use Symfony\Component\BrowserKit\Client; |
||
13 | use Symfony\Component\DomCrawler\Crawler as DomCrawler; |
||
14 | use Psr\Log\LoggerAwareInterface; |
||
15 | use Psr\Log\LoggerInterface; |
||
16 | use Psr\Log\NullLogger; |
||
17 | |||
/**
 * Crawls pages starting at a base URL.
 *
 * Starting from the URL passed to crawl(), every page is requested through
 * the configured client, the URIs of its `a` elements are extracted and
 * queued when they start with the base URL, and a Page object is yielded for
 * every crawled URL that passes the whitelist/blacklist matchers.
 */
class Crawler implements LoggerAwareInterface
{
    /**
     * Client used to perform the requests.
     *
     * @var CrawlerClientInterface
     */
    private $client;

    /**
     * Maximum number of returned pages; 0 (or any empty value) means no limit.
     *
     * @var int
     */
    private $limit = 0;

    /**
     * When true, crawl() ends silently on the first request error.
     *
     * @var bool
     */
    private $stopOnError = false;

    /**
     * When true (and stopOnError is false), a request error is rethrown as a
     * RequestException.
     *
     * @var bool
     */
    private $exceptionOnError = false;

    /**
     * @var UrlMatcherInterface[]
     */
    private $whitelistUrlMatchers = [];

    /**
     * @var UrlMatcherInterface[]
     */
    private $blacklistUrlMatchers = [];

    /**
     * @var UrlNormalizerInterface[]
     */
    private $urlNormalizers = [];

    /**
     * URL the current crawl was started with (after normalization).
     *
     * @var Url
     */
    private $baseUrl;

    /**
     * @var UrlCollection
     */
    private $urlsCrawled;

    /**
     * @var UrlCollection
     */
    private $urlsQueued;

    /**
     * @var UrlCollection
     */
    private $urlsReturned;

    /**
     * Rejected URLs as strings, keyed by the URL itself so each is kept once.
     *
     * @var array
     */
    private $urlsRejected = [];

    /**
     * Lazily replaced by a NullLogger in getLogger() when none was injected.
     *
     * @var LoggerInterface|null
     */
    private $logger = null;

    /**
     * @param CrawlerClientInterface|null $client Client to use; a GoutteClient
     *                                            is created when omitted.
     */
    public function __construct(CrawlerClientInterface $client = null)
    {
        if (null === $client) {
            $client = new GoutteClient();
        }

        $this->setClient($client);

        $this->urlsCrawled = new UrlCollection();
        $this->urlsQueued = new UrlCollection();
        $this->urlsReturned = new UrlCollection();
    }

    /**
     * @param CrawlerClientInterface $client
     * @return $this
     */
    public function setClient(CrawlerClientInterface $client)
    {
        $this->client = $client;

        return $this;
    }

    /**
     * @return CrawlerClientInterface
     */
    public function getClient()
    {
        return $this->client;
    }

    /**
     * @return int
     */
    public function getLimit()
    {
        return $this->limit;
    }

    /**
     * @param int $limit Maximum number of returned pages, 0 for no limit.
     * @return $this
     */
    public function setLimit($limit)
    {
        $this->limit = $limit;

        return $this;
    }

    /**
     * @return bool
     */
    public function getStopOnError()
    {
        return $this->stopOnError;
    }

    /**
     * @param bool $stopOnError
     * @return $this
     */
    public function setStopOnError($stopOnError)
    {
        $this->stopOnError = $stopOnError;

        return $this;
    }

    /**
     * @return bool
     */
    public function getExceptionOnError()
    {
        return $this->exceptionOnError;
    }

    /**
     * @param bool $exceptionOnError
     * @return $this
     */
    public function setExceptionOnError($exceptionOnError)
    {
        $this->exceptionOnError = $exceptionOnError;

        return $this;
    }

    /**
     * @return array
     */
    public function getUrlsCrawled()
    {
        return $this->urlsCrawled->toArray();
    }

    /**
     * @return array
     */
    public function getUrlsQueued()
    {
        return $this->urlsQueued->toArray();
    }

    /**
     * @return array
     */
    public function getUrlsReturned()
    {
        return $this->urlsReturned->toArray();
    }

    /**
     * @return array Rejected URL strings, keyed by the URL string.
     */
    public function getUrlsRejected()
    {
        return $this->urlsRejected;
    }

    /**
     * Replaces all whitelist matchers with the given ones.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setWhitelistUrlMatchers(array $urlMatchers)
    {
        $this->clearWhitelistUrlMatchers();
        foreach ($urlMatchers as $matcher) {
            $this->addWhitelistUrlMatcher($matcher);
        }

        return $this;
    }

    /**
     * @return UrlMatcherInterface[]
     */
    public function getWhitelistUrlMatchers()
    {
        return $this->whitelistUrlMatchers;
    }

    /**
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addWhitelistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        $this->whitelistUrlMatchers[] = $urlMatcher;

        return $this;
    }

    /**
     * @return $this
     */
    public function clearWhitelistUrlMatchers()
    {
        $this->whitelistUrlMatchers = [];

        return $this;
    }

    /**
     * Replaces all blacklist matchers with the given ones.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setBlacklistUrlMatchers(array $urlMatchers)
    {
        $this->clearBlacklistUrlMatchers();
        foreach ($urlMatchers as $matcher) {
            $this->addBlacklistUrlMatcher($matcher);
        }

        return $this;
    }

    /**
     * @return UrlMatcherInterface[]
     */
    public function getBlacklistUrlMatchers()
    {
        return $this->blacklistUrlMatchers;
    }

    /**
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addBlacklistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        $this->blacklistUrlMatchers[] = $urlMatcher;

        return $this;
    }

    /**
     * @return $this
     */
    public function clearBlacklistUrlMatchers()
    {
        $this->blacklistUrlMatchers = [];

        return $this;
    }

    /**
     * Replaces all URL normalizers with the given ones.
     *
     * @param UrlNormalizerInterface[] $normalizers
     * @return $this
     */
    public function setUrlNormalizers(array $normalizers)
    {
        $this->clearUrlNormalizers();

        foreach ($normalizers as $normalizer) {
            $this->addUrlNormalizer($normalizer);
        }

        return $this;
    }

    /**
     * @return UrlNormalizerInterface[]
     */
    public function getUrlNormalizers()
    {
        return $this->urlNormalizers;
    }

    /**
     * @param UrlNormalizerInterface $normalizer
     * @return $this
     */
    public function addUrlNormalizer(UrlNormalizerInterface $normalizer)
    {
        $this->urlNormalizers[] = $normalizer;

        return $this;
    }

    /**
     * @return $this
     */
    public function clearUrlNormalizers()
    {
        $this->urlNormalizers = [];

        return $this;
    }

    /**
     * Returns the injected logger, falling back to a NullLogger.
     *
     * @return LoggerInterface
     */
    public function getLogger()
    {
        if (null === $this->logger) {
            $this->logger = new NullLogger();
        }

        return $this->logger;
    }

    /**
     * @param LoggerInterface $logger
     * @return $this
     */
    public function setLogger(LoggerInterface $logger)
    {
        $this->logger = $logger;

        return $this;
    }

    /**
     * Converts a URL string into a normalized Url object.
     *
     * @param string $url
     * @return Url
     * @throws UnsupportedUrlException When the string cannot be converted or
     *                                 normalized; the cause is logged as a warning.
     */
    protected function createHttpUrlString($url)
    {
        try {
            return $this->normalizeUrl(Url::createFromString($url));
        } catch (\Exception $e) {
            $this->getLogger()->warning(
                sprintf('Url %s could not be converted to an object: %s', $url, $e->getMessage())
            );

            throw new UnsupportedUrlException($url);
        }
    }

    /**
     * Clears the state of a previous crawl and queues the new base URL.
     *
     * @param Url $url
     */
    protected function reset(Url $url)
    {
        $this->baseUrl = $url;

        $this->urlsCrawled->reset();
        $this->urlsQueued->reset();
        $this->urlsReturned->reset();
        $this->urlsRejected = [];

        $this->urlsQueued->push($url);
    }

    /**
     * Crawls from the given URL and yields a Page for every returned URL.
     *
     * On a request error the error is logged, then: the crawl ends when
     * stopOnError is set, otherwise a RequestException is thrown when
     * exceptionOnError is set, otherwise the URL is skipped.
     *
     * @param string $url
     * @return \Generator|Page[]
     * @throws RequestException
     * @throws UnsupportedUrlException When the start URL cannot be converted.
     */
    public function crawl($url)
    {
        $this->reset($this->createHttpUrlString($url));

        while ($url = $this->urlsQueued->pop()) {
            try {
                $crawler = $this->requestPage($url);
                // NOTE(review): only the resolved URL is recorded as crawled
                // below; if the client followed a redirect, the originally
                // queued URL is not, so it may be queued again when another
                // page links to it - confirm against the client/UrlCollection.
                $url = $this->updateResolvedUrl($url);
            } catch (\Exception $e) {
                $this->getLogger()->error(sprintf('Error requesting page %s: %s', $url, $e->getMessage()));

                if ($this->getStopOnError()) {
                    return;
                }
                if ($this->getExceptionOnError()) {
                    throw new RequestException($e->getMessage(), $e->getCode(), $e);
                }

                continue;
            }

            $this->urlsCrawled->push($url);
            $this->updateQueue($crawler);

            if ($this->shouldReturnUrl($url)) {
                $this->getLogger()->debug(sprintf('Return url "%s"', $url));
                $this->urlsReturned->push($url);

                yield new Page($url, $crawler, $this->client->getResponse());
            }

            if ($this->isLimitReached()) {
                $this->getLogger()->info(sprintf('Crawl limit of %d was reached', $this->limit));

                return;
            }
        }
    }

    /**
     * Replaces the URL by the URI of the client's current request, if any.
     *
     * @param Url $url
     * @return Url
     * @throws UnsupportedUrlException When the request URI cannot be converted.
     */
    protected function updateResolvedUrl(Url $url)
    {
        $request = $this->client->getRequest();
        if (!empty($request)) {
            $url = $this->createHttpUrlString($request->getUri());
        }

        return $url;
    }

    /**
     * Queues every crawlable URL found in the page; URLs that cannot be
     * processed are recorded as rejected.
     *
     * @param DomCrawler $crawler
     */
    protected function updateQueue(DomCrawler $crawler)
    {
        foreach ($this->extractUrlsFromCrawler($crawler) as $url) {
            $this->getLogger()->debug(sprintf('Found url %s in page', $url));
            try {
                $url = $this->createHttpUrlString($url);

                if ($this->shouldCrawlUrl($url)) {
                    $this->urlsQueued->push($url);
                }
            } catch (\Exception $e) {
                // $url is still the raw string when the conversion failed.
                $this->addRejectedUrl($url);
            }
        }
    }

    /**
     * Runs the URL through all registered normalizers, in registration order.
     *
     * @param Url $url
     * @return Url
     */
    protected function normalizeUrl(Url $url)
    {
        foreach ($this->urlNormalizers as $normalizer) {
            $url = $normalizer->normalize($url);
        }

        return $url;
    }

    /**
     * A URL is returned when it matches the whitelist (if one is configured)
     * and does not match the blacklist.
     *
     * @param Url $url
     * @return bool
     */
    protected function shouldReturnUrl(Url $url)
    {
        if (!empty($this->whitelistUrlMatchers) && !$this->isUrlWhitelisted($url)) {
            $this->getLogger()->info(sprintf('Skipping "%s" because it is not whitelisted', $url));

            return false;
        }

        if ($this->isUrlBlacklisted($url)) {
            $this->getLogger()->info(sprintf('Skipping "%s" because it is blacklisted', $url));

            return false;
        }

        return true;
    }

    /**
     * @param Url $url
     * @return bool True when at least one whitelist matcher matches.
     */
    protected function isUrlWhitelisted(Url $url)
    {
        foreach ($this->whitelistUrlMatchers as $matcher) {
            if ($matcher->matches($url)) {
                return true;
            }
        }

        return false;
    }

    /**
     * @param Url $url
     * @return bool True when at least one blacklist matcher matches.
     */
    protected function isUrlBlacklisted(Url $url)
    {
        foreach ($this->blacklistUrlMatchers as $matcher) {
            if ($matcher->matches($url)) {
                return true;
            }
        }

        return false;
    }

    /**
     * A URL is crawled when it is neither crawled nor queued yet and belongs
     * to the base URL; URLs outside the base URL are recorded as rejected.
     *
     * @param Url $url
     * @return bool
     */
    protected function shouldCrawlUrl(Url $url)
    {
        if ($this->urlsCrawled->contains($url) || $this->urlsQueued->contains($url)) {
            return false;
        }

        if (!$this->isUrlPartOfBaseUrl($url)) {
            $this->addRejectedUrl($url);

            return false;
        }

        return true;
    }

    /**
     * @param string|Url $url
     * @throws \InvalidArgumentException When $url is neither a string nor a Url.
     */
    protected function addRejectedUrl($url)
    {
        if ($url instanceof Url) {
            $url = $url->__toString();
        }
        if (!is_string($url)) {
            throw new \InvalidArgumentException('Url should be a string or an instance of '.Url::class);
        }

        $this->urlsRejected[$url] = $url;
    }

    /**
     * Checks whether the URL starts with the base URL.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlPartOfBaseUrl(Url $url)
    {
        $baseUrlString = (string)$this->baseUrl;
        $this->getLogger()->debug($baseUrlString.' - '.$url);

        // Must be a prefix match: a foreign URL that merely embeds the base
        // URL (for example in its query string) is not part of the base URL.
        return strpos((string)$url, $baseUrlString) === 0;
    }

    /**
     * @return bool True when a positive limit is set and at least that many
     *              URLs have been returned.
     */
    protected function isLimitReached()
    {
        // Cast so a numeric string passed to setLimit() still works, and use
        // >= so the crawl stops even if the count ever overshoots the limit.
        $limit = (int)$this->limit;

        return $limit > 0 && count($this->urlsReturned) >= $limit;
    }

    /**
     * @param DomCrawler $crawler
     * @return array URIs of all `a` elements in the page.
     */
    protected function extractUrlsFromCrawler(DomCrawler $crawler)
    {
        return $crawler->filter('a')->each(
            function (DomCrawler $node) {
                return $node->link()->getUri();
            }
        );
    }

    /**
     * Performs a GET request for the URL and logs before and after.
     *
     * @param Url $url
     * @return DomCrawler
     */
    protected function requestPage(Url $url)
    {
        $this->getLogger()->info(sprintf('Crawling page %s', $url));
        $crawler = $this->client->request('GET', (string)$url);
        $this->getLogger()->info(sprintf('Crawled page %s', $url));

        return $crawler;
    }
}
||
622 |
Our type inference engine has found an assignment to a property that is incompatible with the declared type of that property.
Either this assignment is in error or the assigned type should be added to the documentation/type hint for that property.