Completed
Push — master ( ecad61...e21061 )
by
unknown
02:06
created

Crawler   D

Complexity

Total Complexity 75

Size/Duplication

Total Lines 623
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 12

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 75
lcom 1
cbo 12
dl 0
loc 623
ccs 203
cts 203
cp 1
rs 4.8992
c 0
b 0
f 0

45 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 10 2
A setClient() 0 4 1
A getClient() 0 4 1
A getLimit() 0 4 1
A setLimit() 0 6 1
A getStopOnError() 0 4 1
A setStopOnError() 0 6 1
A getExceptionOnError() 0 4 1
A setExceptionOnError() 0 6 1
A getUrlsCrawled() 0 4 1
A getUrlsQueued() 0 4 1
A getUrlsRejected() 0 4 1
A getUrlsReturned() 0 4 1
A setWhitelistUrlMatchers() 0 9 2
A getWhitelistUrlMatchers() 0 4 1
A addWhitelistUrlMatcher() 0 6 1
A clearWhitelistUrlMatchers() 0 6 1
A setBlacklistUrlMatchers() 0 9 2
A getBlacklistUrlMatchers() 0 4 1
A addBlacklistUrlMatcher() 0 6 1
A clearBlacklistUrlMatchers() 0 6 1
A setUrlNormalizers() 0 10 2
A getUrlNormalizers() 0 4 1
A addUrlNormalizer() 0 6 1
A clearUrlNormalizers() 0 6 1
A getLogger() 0 8 2
A setLogger() 0 6 1
A addUrlToQueue() 0 4 1
A createHttpUrlString() 0 13 2
A reset() 0 8 1
C crawl() 0 42 7
A updateUrl() 0 9 2
A updateQueue() 0 15 4
A normalizeUrl() 0 8 2
A shouldReturnUrl() 0 18 4
A isUrlWhitelisted() 0 10 3
A isUrlBlacklisted() 0 10 3
B shouldCrawlUrl() 0 17 5
A isUrlRejected() 0 4 1
A isUrlCrawled() 0 4 1
A isUrlQueued() 0 4 1
A isUrlPartOfBaseUrl() 0 10 2
A isLimitReached() 0 4 2
A extractUrlsFromCrawler() 0 8 1
A requestPage() 0 8 1

How to fix   Complexity   

Complex Class

Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields or methods that share the same prefixes or suffixes. You can also look at the cohesion graph to spot any unconnected or weakly connected components.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use Crawler, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
namespace MediaMonks\Crawler;
4
5
use MediaMonks\Crawler\Exception\RequestException;
6
use MediaMonks\Crawler\Exception\UnsupportedUrlException;
7
use MediaMonks\Crawler\Url\Matcher\UrlMatcherInterface;
8
use MediaMonks\Crawler\Url\Normalizer\UrlNormalizerInterface;
9
use Symfony\Component\BrowserKit\Client;
10
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
11
use Psr\Log\LoggerAwareInterface;
12
use Psr\Log\LoggerInterface;
13
use Psr\Log\NullLogger;
14
15
class Crawler implements LoggerAwareInterface
16
{
17
    /**
     * BrowserKit client that performs the HTTP requests.
     *
     * @var Client
     */
    private $client;

    /**
     * Maximum number of pages to return; 0 (the default) disables the limit.
     *
     * @var int
     */
    private $limit = 0;

    /**
     * When true, crawl() ends at the first failed request.
     *
     * @var bool
     */
    private $stopOnError = false;

    /**
     * When true (and stopOnError is false), a failed request is rethrown
     * from crawl() as a RequestException.
     *
     * @var bool
     */
    private $exceptionOnError = false;

    /**
     * When non-empty, a page is only returned if at least one of these matches.
     *
     * @var UrlMatcherInterface[]
     */
    private $whitelistUrlMatchers = [];

    /**
     * A page is not returned if any of these matches.
     *
     * @var UrlMatcherInterface[]
     */
    private $blacklistUrlMatchers = [];

    /**
     * Normalizers applied, in order, to every URL found in a page.
     *
     * @var UrlNormalizerInterface[]
     */
    private $urlNormalizers = [];

    /**
     * Start URL of the current crawl; found URLs are compared against it.
     *
     * @var Url
     */
    private $baseUrl;

    /**
     * URL strings of the pages that were requested successfully.
     *
     * @var array
     */
    private $urlsCrawled = [];

    /**
     * Url objects waiting to be requested, keyed by their string form.
     *
     * @var array
     */
    private $urlsQueued = [];

    /**
     * URLs that were found but will not be crawled.
     *
     * @var array
     */
    private $urlsRejected = [];

    /**
     * URL strings of the pages that were yielded by crawl().
     *
     * @var array
     */
    private $urlsReturned = [];

    /**
     * Logger in use; stays null until getLogger() or setLogger() assigns one.
     *
     * @var LoggerInterface|null
     */
    private $logger;
81
82
    /**
     * Create a crawler, falling back to a default Goutte client when none is given.
     *
     * @param Client|null $client BrowserKit client used to perform the requests
     */
    public function __construct(Client $client = null)
    {
        // A strict null check states the intent; objects are always truthy,
        // so empty() only ever detected the null default.
        if ($client === null) {
            $client = new \Goutte\Client();
        }

        // No return statement: the value returned by a constructor is
        // discarded by `new`.
        $this->setClient($client);
    }
95
96
    /**
     * Replace the client that performs the HTTP requests.
     *
     * Returns the crawler itself so the call can be chained like the other
     * setters of this class.
     *
     * @param Client $client
     * @return $this
     */
    public function setClient(Client $client)
    {
        $this->client = $client;

        return $this;
    }
103
104
    /**
     * Get the client that performs the HTTP requests.
     *
     * @return Client
     */
    public function getClient()
    {
        return $this->client;
    }
111
112
    /**
     * Get the maximum number of pages to return (0 means no limit).
     *
     * @return int
     */
    public function getLimit()
    {
        return $this->limit;
    }
119
120
    /**
     * Set the maximum number of pages to return (0 means no limit).
     *
     * The value is cast to int so a numeric string such as "10" (e.g. read
     * from configuration) still works with the integer comparison used to
     * detect that the limit was reached.
     *
     * @param int $limit
     * @return $this
     */
    public function setLimit($limit)
    {
        $this->limit = (int)$limit;

        return $this;
    }
130
131
    /**
     * Tell whether crawl() stops at the first failed request.
     *
     * @return bool
     */
    public function getStopOnError()
    {
        return $this->stopOnError;
    }
138
139
    /**
     * Choose whether crawl() stops at the first failed request.
     *
     * @param bool $stopOnError
     * @return $this
     */
    public function setStopOnError($stopOnError)
    {
        $this->stopOnError = $stopOnError;

        return $this;
    }
149
150
    /**
     * Tell whether a failed request is rethrown from crawl().
     *
     * @return bool
     */
    public function getExceptionOnError()
    {
        return $this->exceptionOnError;
    }
157
158
    /**
     * Choose whether a failed request is rethrown from crawl().
     *
     * @param bool $exceptionOnError
     * @return $this
     */
    public function setExceptionOnError($exceptionOnError)
    {
        $this->exceptionOnError = $exceptionOnError;

        return $this;
    }
168
169
    /**
     * Get the URL strings of the pages that were requested successfully.
     *
     * @return array
     */
    public function getUrlsCrawled()
    {
        return $this->urlsCrawled;
    }
176
177
    /**
     * Get the Url objects still waiting to be requested, keyed by their string form.
     *
     * @return array
     */
    public function getUrlsQueued()
    {
        return $this->urlsQueued;
    }
184
185
    /**
     * Get the URLs that were found but will not be crawled.
     *
     * @return array
     */
    public function getUrlsRejected()
    {
        return $this->urlsRejected;
    }
192
193
    /**
     * Get the URL strings of the pages that were yielded by crawl().
     *
     * @return array
     */
    public function getUrlsReturned()
    {
        return $this->urlsReturned;
    }
200
201
    /**
     * Replace all whitelist matchers with the given ones.
     *
     * Each entry goes through addWhitelistUrlMatcher(), so its type hint
     * applies to every element.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setWhitelistUrlMatchers(array $urlMatchers)
    {
        $this->clearWhitelistUrlMatchers();

        foreach ($urlMatchers as $urlMatcher) {
            $this->addWhitelistUrlMatcher($urlMatcher);
        }

        return $this;
    }
214
215
    /**
     * Get the registered whitelist matchers.
     *
     * @return UrlMatcherInterface[]
     */
    public function getWhitelistUrlMatchers()
    {
        return $this->whitelistUrlMatchers;
    }
222
223
    /**
     * Append a matcher to the whitelist.
     *
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addWhitelistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        $this->whitelistUrlMatchers[] = $urlMatcher;

        return $this;
    }
233
234
    /**
     * Remove every whitelist matcher.
     *
     * @return $this
     */
    public function clearWhitelistUrlMatchers()
    {
        $this->whitelistUrlMatchers = [];

        return $this;
    }
243
244
    /**
     * Replace all blacklist matchers with the given ones.
     *
     * Each entry goes through addBlacklistUrlMatcher(), so its type hint
     * applies to every element.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setBlacklistUrlMatchers(array $urlMatchers)
    {
        $this->clearBlacklistUrlMatchers();

        foreach ($urlMatchers as $urlMatcher) {
            $this->addBlacklistUrlMatcher($urlMatcher);
        }

        return $this;
    }
257
258
    /**
     * Get the registered blacklist matchers.
     *
     * @return UrlMatcherInterface[]
     */
    public function getBlacklistUrlMatchers()
    {
        return $this->blacklistUrlMatchers;
    }
265
266
    /**
     * Append a matcher to the blacklist.
     *
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addBlacklistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        $this->blacklistUrlMatchers[] = $urlMatcher;

        return $this;
    }
276
277
    /**
     * Remove every blacklist matcher.
     *
     * @return $this
     */
    public function clearBlacklistUrlMatchers()
    {
        $this->blacklistUrlMatchers = [];

        return $this;
    }
286
287
    /**
     * Replace all URL normalizers with the given ones, keeping their order.
     *
     * Each entry goes through addUrlNormalizer(), so its type hint applies
     * to every element.
     *
     * @param UrlNormalizerInterface[] $normalizers
     * @return $this
     */
    public function setUrlNormalizers(array $normalizers)
    {
        $this->clearUrlNormalizers();

        foreach ($normalizers as $urlNormalizer) {
            $this->addUrlNormalizer($urlNormalizer);
        }

        return $this;
    }
301
302
    /**
     * Get the registered URL normalizers in the order they are applied.
     *
     * @return UrlNormalizerInterface[]
     */
    public function getUrlNormalizers()
    {
        return $this->urlNormalizers;
    }
309
310
    /**
     * Append a URL normalizer; it runs after the ones already registered.
     *
     * @param UrlNormalizerInterface $normalizer
     * @return $this
     */
    public function addUrlNormalizer(UrlNormalizerInterface $normalizer)
    {
        $this->urlNormalizers[] = $normalizer;

        return $this;
    }
320
321
    /**
     * Remove every URL normalizer.
     *
     * @return $this
     */
    public function clearUrlNormalizers()
    {
        $this->urlNormalizers = [];

        return $this;
    }
330
331
    /**
     * Get the logger, creating a NullLogger on first use when none was set.
     *
     * @return LoggerInterface
     */
    public function getLogger()
    {
        // Lazy default so callers never have to null-check the logger.
        if ($this->logger === null) {
            $this->logger = new NullLogger();
        }

        return $this->logger;
    }
342
343
    /**
     * Set the logger that receives the crawler's messages.
     *
     * @param LoggerInterface $logger
     * @return $this
     */
    public function setLogger(LoggerInterface $logger)
    {
        $this->logger = $logger;

        return $this;
    }
353
354
    /**
     * Put a URL on the queue, keyed by its string form so that the same URL
     * occupies a single slot.
     *
     * @param Url $url
     */
    protected function addUrlToQueue(Url $url)
    {
        $queueKey = (string)$url;

        $this->urlsQueued[$queueKey] = $url;
    }
361
362
    /**
     * Convert a URL string into a Url object.
     *
     * Any exception raised by Url::createFromString() is logged as a warning
     * and replaced by an UnsupportedUrlException carrying the offending URL.
     *
     * @param string $url
     * @return Url
     * @throws UnsupportedUrlException when the string cannot be converted
     */
    protected function createHttpUrlString($url)
    {
        try {
            $urlObject = Url::createFromString($url);
        } catch (\Exception $exception) {
            $warning = sprintf(
                'Url %s could not be converted to an object: %s',
                $url,
                $exception->getMessage()
            );
            $this->getLogger()->warning($warning);

            throw new UnsupportedUrlException($url);
        }

        return $urlObject;
    }
380
381
    /**
     * Prepare the crawler for a new crawl starting at the given URL.
     *
     * All per-crawl bookkeeping is cleared, including the rejected and
     * returned lists. Left in place, they would make a second crawl() on the
     * same instance inherit stale rejections, and the returned-page count
     * used for the limit check would start above zero.
     *
     * @param Url $url Start URL; it becomes the base URL and the first queue entry
     */
    protected function reset(Url $url)
    {
        $this->baseUrl = $url;
        $this->urlsCrawled = [];
        $this->urlsQueued = [];
        $this->urlsRejected = [];
        $this->urlsReturned = [];

        $this->addUrlToQueue($url);
    }
392
393
    /**
     * Crawl starting at the given URL and lazily yield every page that
     * passes the whitelist/blacklist filters.
     *
     * The queue is processed first-in, first-out. Each successfully requested
     * page has its links added to the queue, whether or not the page itself
     * is returned. A failed request is logged and then, in this order of
     * precedence: ends the crawl (stopOnError), is rethrown as a
     * RequestException (exceptionOnError), or is skipped.
     *
     * @param string $url Start URL
     * @return \Generator|Page[]
     * @throws RequestException when a request fails and exceptionOnError is enabled
     * @throws UnsupportedUrlException when the start URL cannot be converted to a Url
     */
    public function crawl($url)
    {
        $this->reset($this->createHttpUrlString($url));

        while (count($this->urlsQueued) > 0) {
            $url = array_shift($this->urlsQueued);

            try {
                $crawler = $this->requestPage((string)$url);

                // The client may hand back null instead of a DOM crawler.
                // Route that through the regular error handling below rather
                // than letting it hit the typed parameters of updateQueue()
                // and Page outside of this try block.
                if (!$crawler instanceof DomCrawler) {
                    throw new \UnexpectedValueException('The client did not return a DOM crawler');
                }

                $url = $this->updateUrl($url);
            } catch (\Exception $e) {
                $this->getLogger()->error(sprintf('Error requesting page %s: %s', $url, $e->getMessage()));

                if ($this->getStopOnError()) {
                    return;
                }
                if ($this->getExceptionOnError()) {
                    throw new RequestException($e->getMessage(), $e->getCode(), $e);
                }

                continue;
            }

            $this->urlsCrawled[] = (string)$url;
            $this->updateQueue($crawler);

            if ($this->shouldReturnUrl($url)) {
                $this->getLogger()->debug(sprintf('Return url "%s"', $url));

                $this->urlsReturned[] = (string)$url;

                yield new Page($url, $crawler, $this->client->getResponse());
            }

            if ($this->isLimitReached()) {
                $this->getLogger()->info(sprintf('Crawl limit of %d was reached', $this->limit));

                return;
            }
        }
    }
440
441
    /**
     * Replace the URL by the URI of the client's internal request, if there is one.
     *
     * NOTE(review): presumably this picks up the final URI after redirects —
     * confirm against the client in use.
     *
     * @param Url $url URL that was requested
     * @return Url The URL built from the internal request URI, or $url unchanged
     * @throws UnsupportedUrlException when the internal request URI cannot be converted
     */
    protected function updateUrl(Url $url)
    {
        $internalRequest = $this->client->getInternalRequest();
        if (empty($internalRequest)) {
            return $url;
        }

        // Reuse the request fetched above instead of asking the client again.
        return $this->createHttpUrlString($internalRequest->getUri());
    }
454
455
    /**
     * Queue every crawlable link found in the given page.
     *
     * Each link is converted to a Url, normalized and then checked with
     * shouldCrawlUrl(). A link whose processing throws is recorded as
     * rejected instead of aborting the crawl.
     *
     * @param DomCrawler $crawler DOM of the page that was just requested
     */
    protected function updateQueue(DomCrawler $crawler)
    {
        foreach ($this->extractUrlsFromCrawler($crawler) as $foundUrl) {
            $this->getLogger()->debug(sprintf('Found url %s in page', $foundUrl));

            try {
                $url = $this->normalizeUrl($this->createHttpUrlString($foundUrl));

                if ($this->shouldCrawlUrl($url)) {
                    $this->addUrlToQueue($url);
                }
            } catch (\Exception $e) {
                // Always store the string form so the rejected list stays
                // homogeneous, and store it only once: the same unsupported
                // link typically appears on many pages.
                $rejectedUrl = (string)$foundUrl;
                if (!in_array($rejectedUrl, $this->urlsRejected, true)) {
                    $this->urlsRejected[] = $rejectedUrl;
                }
            }
        }
    }
473
474
    /**
     * Run the URL through every registered normalizer, in registration order.
     *
     * Each normalizer receives the result of the previous one.
     *
     * @param Url $url
     * @return Url
     */
    protected function normalizeUrl(Url $url)
    {
        return array_reduce(
            $this->urlNormalizers,
            function ($normalized, $normalizer) {
                return $normalizer->normalize($normalized);
            },
            $url
        );
    }
486
487
    /**
     * Decide whether a crawled page is yielded to the caller.
     *
     * A page is skipped when a whitelist is configured and nothing on it
     * matches, or when any blacklist matcher matches. Each skip is logged
     * at info level.
     *
     * @param Url $url
     * @return bool
     */
    protected function shouldReturnUrl(Url $url)
    {
        $hasWhitelist = !empty($this->whitelistUrlMatchers);

        if ($hasWhitelist && !$this->isUrlWhitelisted($url)) {
            $this->getLogger()->info(sprintf('Skipped "%s" because it is not whitelisted', $url));

            return false;
        }

        if ($this->isUrlBlacklisted($url)) {
            $this->getLogger()->info(sprintf('Skipped "%s" because it is blacklisted', $url));

            return false;
        }

        return true;
    }
509
510
    /**
     * Tell whether at least one whitelist matcher matches the URL.
     *
     * @param Url $url
     * @return bool False when no matcher matches, including when none are registered
     */
    protected function isUrlWhitelisted(Url $url)
    {
        $whitelisted = false;

        foreach ($this->whitelistUrlMatchers as $whitelistMatcher) {
            if ($whitelistMatcher->matches($url)) {
                // The first hit decides; later matchers are not consulted.
                $whitelisted = true;
                break;
            }
        }

        return $whitelisted;
    }
524
525
    /**
     * Tell whether at least one blacklist matcher matches the URL.
     *
     * @param Url $url
     * @return bool False when no matcher matches, including when none are registered
     */
    protected function isUrlBlacklisted(Url $url)
    {
        $blacklisted = false;

        foreach ($this->blacklistUrlMatchers as $blacklistMatcher) {
            if ($blacklistMatcher->matches($url)) {
                // The first hit decides; later matchers are not consulted.
                $blacklisted = true;
                break;
            }
        }

        return $blacklisted;
    }
539
540
    /**
     * Decide whether a found URL should be added to the queue.
     *
     * URLs that are already rejected, crawled or queued are ignored. A new
     * URL that is not part of the base URL is recorded as rejected.
     *
     * @param Url $url
     * @return bool
     */
    protected function shouldCrawlUrl(Url $url)
    {
        $alreadyKnown = $this->isUrlRejected($url)
            || $this->isUrlCrawled($url)
            || $this->isUrlQueued($url);

        if ($alreadyKnown) {
            return false;
        }

        if ($this->isUrlPartOfBaseUrl($url)) {
            return true;
        }

        $this->urlsRejected[] = (string)$url;

        return false;
    }
561
562
    /**
     * Tell whether the URL is already in the rejected list.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlRejected(Url $url)
    {
        $needle = (string)$url;

        // Loose comparison on purpose: it matches what in_array() does
        // without its strict flag.
        return array_search($needle, $this->urlsRejected) !== false;
    }
570
571
    /**
     * Tell whether the URL was already requested successfully during this crawl.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlCrawled(Url $url)
    {
        // The crawled list only ever receives strings, so a strict
        // comparison is safe and avoids PHP's loose string juggling.
        return in_array((string)$url, $this->urlsCrawled, true);
    }
579
580
    /**
     * Tell whether the URL is currently waiting in the queue.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlQueued(Url $url)
    {
        // The queue is keyed by the URL's string form.
        $queueKey = (string)$url;

        return isset($this->urlsQueued[$queueKey]);
    }
588
589
    /**
     * Tell whether the URL lies below the base URL of the current crawl.
     *
     * The base URL must be a prefix of the URL. Merely containing it is not
     * enough: a foreign URL such as http://other.test/?ref=<base url> would
     * otherwise be treated as part of the site and crawled.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlPartOfBaseUrl(Url $url)
    {
        $baseUrlString = (string)$this->baseUrl;
        $this->getLogger()->debug($baseUrlString.' - '.$url);

        // Offset 0 means "starts with"; false or any other offset does not qualify.
        return strpos((string)$url, $baseUrlString) === 0;
    }
603
604
    /**
     * Tell whether the configured number of pages has been returned.
     *
     * A limit of 0 (or anything that is not a positive number) means "no
     * limit". The comparison is >= on an integer so the limit still triggers
     * when it was supplied as a numeric string or when the returned count has
     * already passed it.
     *
     * @return bool
     */
    protected function isLimitReached()
    {
        $limit = (int)$this->limit;

        return $limit > 0 && count($this->urlsReturned) >= $limit;
    }
611
612
    /**
     * Collect the URI of every anchor element in the page.
     *
     * @param DomCrawler $crawler DOM of the page
     * @return array One URI per <a> element, as returned by the node's link
     */
    protected function extractUrlsFromCrawler(DomCrawler $crawler)
    {
        $anchors = $crawler->filter('a');

        return $anchors->each(function (DomCrawler $anchor) {
            $link = $anchor->link();

            return $link->getUri();
        });
    }
624
625
    /**
     * Request a page with GET through the client, logging before and after.
     *
     * The second log line is only written when the request did not throw.
     *
     * @param string $url
     * @return DomCrawler
     */
    protected function requestPage($url)
    {
        $this->getLogger()->info(sprintf('Crawling page %s', $url));

        $domCrawler = $this->client->request('GET', $url);

        $this->getLogger()->info(sprintf('Crawled page %s', $url));

        return $domCrawler;
    }
637
}
638