Completed
Push — master ( d70065...3138ff )
by
unknown
03:52
created

Crawler::isUrlCrawled()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 4
ccs 2
cts 2
cp 1
rs 10
c 0
b 0
f 0
cc 1
eloc 2
nc 1
nop 1
crap 1
1
<?php
2
3
namespace MediaMonks\Crawler;
4
5
use MediaMonks\Crawler\Client\CrawlerClientInterface;
6
use MediaMonks\Crawler\Client\GoutteClient;
7
use MediaMonks\Crawler\Exception\RequestException;
8
use MediaMonks\Crawler\Exception\UnsupportedUrlException;
9
use MediaMonks\Crawler\Url\Matcher\UrlMatcherInterface;
10
use MediaMonks\Crawler\Url\Normalizer\UrlNormalizerInterface;
11
use MediaMonks\Crawler\Url\UrlCollection;
12
use Symfony\Component\BrowserKit\Client;
13
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
14
use Psr\Log\LoggerAwareInterface;
15
use Psr\Log\LoggerInterface;
16
use Psr\Log\NullLogger;
17
18
/**
 * Crawls pages reachable from a starting URL and yields them as Page objects.
 *
 * Links found on each requested page are queued when they start with the
 * starting (base) URL and have not been queued or crawled before. Whitelist
 * and blacklist matchers only decide which crawled pages are *returned*;
 * they do not stop a page from being requested or its links from being queued.
 */
class Crawler implements LoggerAwareInterface
{
    /**
     * Client used to request pages.
     *
     * @var CrawlerClientInterface
     */
    private $client;

    /**
     * Maximum number of pages to return per crawl; 0 disables the limit.
     *
     * @var int
     */
    private $limit = 0;

    /**
     * When true, crawl() ends quietly on the first request error.
     *
     * @var bool
     */
    private $stopOnError = false;

    /**
     * When true (and $stopOnError is false), crawl() throws a RequestException
     * on a request error instead of skipping the page.
     *
     * @var bool
     */
    private $exceptionOnError = false;

    /**
     * When non-empty, only URLs matching at least one matcher are returned.
     *
     * @var UrlMatcherInterface[]
     */
    private $whitelistUrlMatchers = [];

    /**
     * URLs matching any of these matchers are never returned.
     *
     * @var UrlMatcherInterface[]
     */
    private $blacklistUrlMatchers = [];

    /**
     * Normalizers applied, in order, to every URL the crawler creates.
     *
     * @var UrlNormalizerInterface[]
     */
    private $urlNormalizers = [];

    /**
     * URL the current crawl started from.
     *
     * @var Url
     */
    private $baseUrl;

    /**
     * @var UrlCollection
     */
    private $urlsCrawled;

    /**
     * @var UrlCollection
     */
    private $urlsQueued;

    /**
     * @var UrlCollection
     */
    private $urlsReturned;

    /**
     * Rejected URLs as strings, keyed by the URL itself to avoid duplicates.
     *
     * @var array
     */
    private $urlsRejected = [];

    /**
     * Lazily replaced by a NullLogger in getLogger() when none was injected.
     *
     * @var LoggerInterface|null
     */
    private $logger = null;

    /**
     * @param CrawlerClientInterface|null $client Client to use; a GoutteClient is created when omitted.
     */
    public function __construct(CrawlerClientInterface $client = null)
    {
        if ($client === null) {
            $client = new GoutteClient();
        }

        $this->setClient($client);

        $this->urlsCrawled = new UrlCollection();
        $this->urlsQueued = new UrlCollection();
        $this->urlsReturned = new UrlCollection();
    }

    /**
     * @param CrawlerClientInterface $client
     * @return $this
     */
    public function setClient(CrawlerClientInterface $client)
    {
        $this->client = $client;

        return $this;
    }

    /**
     * @return CrawlerClientInterface
     */
    public function getClient()
    {
        return $this->client;
    }

    /**
     * @return int
     */
    public function getLimit()
    {
        return $this->limit;
    }

    /**
     * @param int $limit Maximum number of pages to return; 0 means unlimited.
     * @return $this
     */
    public function setLimit($limit)
    {
        $this->limit = $limit;

        return $this;
    }

    /**
     * @return bool
     */
    public function getStopOnError()
    {
        return $this->stopOnError;
    }

    /**
     * @param bool $stopOnError
     * @return $this
     */
    public function setStopOnError($stopOnError)
    {
        $this->stopOnError = $stopOnError;

        return $this;
    }

    /**
     * @return bool
     */
    public function getExceptionOnError()
    {
        return $this->exceptionOnError;
    }

    /**
     * @param bool $exceptionOnError
     * @return $this
     */
    public function setExceptionOnError($exceptionOnError)
    {
        $this->exceptionOnError = $exceptionOnError;

        return $this;
    }

    /**
     * @return array
     */
    public function getUrlsCrawled()
    {
        return $this->urlsCrawled->toArray();
    }

    /**
     * @return array
     */
    public function getUrlsQueued()
    {
        return $this->urlsQueued->toArray();
    }

    /**
     * @return array
     */
    public function getUrlsReturned()
    {
        return $this->urlsReturned->toArray();
    }

    /**
     * @return array Rejected URL strings, keyed by the same string.
     */
    public function getUrlsRejected()
    {
        return $this->urlsRejected;
    }

    /**
     * Replaces all whitelist matchers with the given ones.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setWhitelistUrlMatchers(array $urlMatchers)
    {
        $this->clearWhitelistUrlMatchers();
        foreach ($urlMatchers as $matcher) {
            $this->addWhitelistUrlMatcher($matcher);
        }

        return $this;
    }

    /**
     * @return UrlMatcherInterface[]
     */
    public function getWhitelistUrlMatchers()
    {
        return $this->whitelistUrlMatchers;
    }

    /**
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addWhitelistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        $this->whitelistUrlMatchers[] = $urlMatcher;

        return $this;
    }

    /**
     * @return $this
     */
    public function clearWhitelistUrlMatchers()
    {
        $this->whitelistUrlMatchers = [];

        return $this;
    }

    /**
     * Replaces all blacklist matchers with the given ones.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setBlacklistUrlMatchers(array $urlMatchers)
    {
        $this->clearBlacklistUrlMatchers();
        foreach ($urlMatchers as $matcher) {
            $this->addBlacklistUrlMatcher($matcher);
        }

        return $this;
    }

    /**
     * @return UrlMatcherInterface[]
     */
    public function getBlacklistUrlMatchers()
    {
        return $this->blacklistUrlMatchers;
    }

    /**
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addBlacklistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        $this->blacklistUrlMatchers[] = $urlMatcher;

        return $this;
    }

    /**
     * @return $this
     */
    public function clearBlacklistUrlMatchers()
    {
        $this->blacklistUrlMatchers = [];

        return $this;
    }

    /**
     * Replaces all URL normalizers with the given ones.
     *
     * @param UrlNormalizerInterface[] $normalizers
     * @return $this
     */
    public function setUrlNormalizers(array $normalizers)
    {
        $this->clearUrlNormalizers();

        foreach ($normalizers as $normalizer) {
            $this->addUrlNormalizer($normalizer);
        }

        return $this;
    }

    /**
     * @return UrlNormalizerInterface[]
     */
    public function getUrlNormalizers()
    {
        return $this->urlNormalizers;
    }

    /**
     * @param UrlNormalizerInterface $normalizer
     * @return $this
     */
    public function addUrlNormalizer(UrlNormalizerInterface $normalizer)
    {
        $this->urlNormalizers[] = $normalizer;

        return $this;
    }

    /**
     * @return $this
     */
    public function clearUrlNormalizers()
    {
        $this->urlNormalizers = [];

        return $this;
    }

    /**
     * Returns the injected logger, falling back to a NullLogger.
     *
     * @return LoggerInterface
     */
    public function getLogger()
    {
        if ($this->logger === null) {
            $this->logger = new NullLogger();
        }

        return $this->logger;
    }

    /**
     * @param LoggerInterface $logger
     * @return $this
     */
    public function setLogger(LoggerInterface $logger)
    {
        $this->logger = $logger;

        return $this;
    }

    /**
     * Converts a URL string into a normalized Url object.
     *
     * @param string $url
     * @return Url
     * @throws UnsupportedUrlException When the string cannot be converted or normalized.
     */
    protected function createHttpUrlString($url)
    {
        try {
            return $this->normalizeUrl(Url::createFromString($url));
        } catch (\Exception $e) {
            $this->getLogger()->warning(
                sprintf('Url %s could not be converted to an object: %s', $url, $e->getMessage())
            );

            throw new UnsupportedUrlException($url);
        }
    }

    /**
     * Clears all state of a previous crawl and queues the starting URL.
     *
     * @param Url $url Starting URL; it also becomes the base URL.
     */
    protected function reset(Url $url)
    {
        $this->baseUrl = $url;

        $this->urlsCrawled->reset();
        $this->urlsQueued->reset();
        $this->urlsReturned->reset();
        $this->urlsRejected = [];

        $this->urlsQueued->push($url);
    }

    /**
     * Crawls from the given URL and lazily yields every page that should be returned.
     *
     * On a request error the page is logged and then, depending on the settings,
     * the crawl ends (stopOnError), a RequestException is thrown
     * (exceptionOnError) or the page is skipped.
     *
     * @param string $url
     * @return \Generator|Page[]
     * @throws UnsupportedUrlException When the starting URL cannot be converted.
     * @throws RequestException
     */
    public function crawl($url)
    {
        $this->reset($this->createHttpUrlString($url));

        while (count($this->urlsQueued) > 0) {
            try {
                $url = $this->urlsQueued->pop();
                if (!$url instanceof Url) {
                    // pop() may yield null; without a URL there is nothing left to request.
                    break;
                }

                $crawler = $this->requestPage($url);
                if (!$crawler instanceof DomCrawler) {
                    // Route a missing crawler through the regular error handling below
                    // instead of failing on the type hints of updateQueue() / Page.
                    throw new \UnexpectedValueException('Client did not return a crawler for the requested page');
                }

                // The client may have ended up on another URL than the one requested.
                $url = $this->updateResolvedUrl($url);
            } catch (\Exception $e) {
                $this->getLogger()->error(sprintf('Error requesting page %s: %s', $url, $e->getMessage()));

                if ($this->getStopOnError()) {
                    return;
                }
                if ($this->getExceptionOnError()) {
                    throw new RequestException($e->getMessage(), $e->getCode(), $e);
                }

                continue;
            }

            $this->urlsCrawled->push($url);
            $this->updateQueue($crawler);

            if ($this->shouldReturnUrl($url)) {
                $this->getLogger()->debug(sprintf('Return url "%s"', $url));
                $this->urlsReturned->push($url);

                yield new Page($url, $crawler, $this->client->getResponse());
            }

            if ($this->isLimitReached()) {
                $this->getLogger()->info(sprintf('Crawl limit of %d was reached', $this->limit));

                return;
            }
        }
    }

    /**
     * Replaces the URL by the URI of the client's last request, when there is one.
     *
     * @param Url $url
     * @return Url
     */
    protected function updateResolvedUrl(Url $url)
    {
        $request = $this->client->getRequest();
        if (!empty($request)) {
            $url = $this->createHttpUrlString($request->getUri());
        }

        return $url;
    }

    /**
     * Queues every link on the page that should be crawled; links that cannot
     * be converted to a Url are recorded as rejected.
     *
     * @param DomCrawler $crawler
     */
    protected function updateQueue(DomCrawler $crawler)
    {
        foreach ($this->extractUrlsFromCrawler($crawler) as $url) {
            $this->getLogger()->debug(sprintf('Found url %s in page', $url));

            try {
                $url = $this->createHttpUrlString($url);

                if ($this->shouldCrawlUrl($url)) {
                    $this->urlsQueued->push($url);
                }
            } catch (\Exception $e) {
                $this->addRejectedUrl($url);
            }
        }
    }

    /**
     * Runs the URL through all configured normalizers, in order.
     *
     * @param Url $url
     * @return Url
     */
    protected function normalizeUrl(Url $url)
    {
        foreach ($this->urlNormalizers as $normalizer) {
            $url = $normalizer->normalize($url);
        }

        return $url;
    }

    /**
     * Decides whether a crawled URL should be yielded to the caller.
     *
     * @param Url $url
     * @return bool
     */
    protected function shouldReturnUrl(Url $url)
    {
        // The whitelist only applies when at least one whitelist matcher is configured.
        if (!empty($this->whitelistUrlMatchers) && !$this->isUrlWhitelisted($url)) {
            $this->getLogger()->info(sprintf('Skipping "%s" because it is not whitelisted', $url));

            return false;
        }

        if ($this->isUrlBlacklisted($url)) {
            $this->getLogger()->info(sprintf('Skipping "%s" because it is blacklisted', $url));

            return false;
        }

        return true;
    }

    /**
     * @param Url $url
     * @return bool True when at least one whitelist matcher matches.
     */
    protected function isUrlWhitelisted(Url $url)
    {
        foreach ($this->whitelistUrlMatchers as $matcher) {
            if ($matcher->matches($url)) {
                return true;
            }
        }

        return false;
    }

    /**
     * @param Url $url
     * @return bool True when at least one blacklist matcher matches.
     */
    protected function isUrlBlacklisted(Url $url)
    {
        foreach ($this->blacklistUrlMatchers as $matcher) {
            if ($matcher->matches($url)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Decides whether a found URL should be queued; URLs outside the base URL
     * are recorded as rejected.
     *
     * @param Url $url
     * @return bool
     */
    protected function shouldCrawlUrl(Url $url)
    {
        if ($this->urlsCrawled->contains($url) || $this->urlsQueued->contains($url)) {
            return false;
        }

        if (!$this->isUrlPartOfBaseUrl($url)) {
            $this->addRejectedUrl($url);

            return false;
        }

        return true;
    }

    /**
     * Records a URL as rejected.
     *
     * @param Url|string $url
     * @throws \InvalidArgumentException When $url is neither a string nor a Url.
     */
    protected function addRejectedUrl($url)
    {
        if ($url instanceof Url) {
            $url = $url->__toString();
        }
        if (!is_string($url)) {
            throw new \InvalidArgumentException('Url should be a string or an instance of '.Url::class);
        }

        $this->urlsRejected[$url] = $url;
    }

    /**
     * Checks whether the URL starts with the base URL of the current crawl.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlPartOfBaseUrl(Url $url)
    {
        $baseUrlString = (string)$this->baseUrl;
        $this->getLogger()->debug($baseUrlString.' - '.$url);

        // Require a prefix match: a plain "contains" check would also accept a
        // foreign URL that merely embeds the base URL, e.g. in its query string.
        return strpos((string)$url, $baseUrlString) === 0;
    }

    /**
     * @return bool True when a positive limit is set and that many pages were returned.
     */
    protected function isLimitReached()
    {
        // Cast so a numeric string limit is honoured; ">=" guards against overshooting.
        $limit = (int)$this->limit;

        return $limit > 0 && count($this->urlsReturned) >= $limit;
    }

    /**
     * Collects the URI of every "a" element on the page.
     *
     * @param DomCrawler $crawler
     * @return array
     */
    protected function extractUrlsFromCrawler(DomCrawler $crawler)
    {
        return $crawler->filter('a')->each(
            function (DomCrawler $node) {
                return $node->link()->getUri();
            }
        );
    }

    /**
     * Requests the URL with a GET request and returns what the client returns.
     *
     * @param Url $url
     * @return DomCrawler
     */
    protected function requestPage(Url $url)
    {
        $this->getLogger()->info(sprintf('Crawling page %s', $url));
        $crawler = $this->client->request('GET', (string)$url);
        $this->getLogger()->info(sprintf('Crawled page %s', $url));

        return $crawler;
    }
}
623