Completed
Push — master ( ba3171...d0dfba )
by
unknown
03:31
created

Crawler   C

Complexity

Total Complexity 72

Size/Duplication

Total Lines 598
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 11

Test Coverage

Coverage 98.96%

Importance

Changes 0
Metric Value
wmc 72
lcom 1
cbo 11
dl 0
loc 598
ccs 191
cts 193
cp 0.9896
rs 5.5667
c 0
b 0
f 0

44 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 10 2
A setClient() 0 4 1
A getClient() 0 4 1
A getLimit() 0 4 1
A setLimit() 0 6 1
A getStopOnError() 0 4 1
A setStopOnError() 0 6 1
A getExceptionOnError() 0 4 1
A setExceptionOnError() 0 6 1
A getUrlsCrawled() 0 4 1
A getUrlsQueued() 0 4 1
A getUrlsRejected() 0 4 1
A getUrlsReturned() 0 4 1
A setWhitelistUrlMatchers() 0 9 2
A getWhitelistUrlMatchers() 0 4 1
A addWhitelistUrlMatcher() 0 6 1
A clearWhitelistUrlMatchers() 0 6 1
A setBlacklistUrlMatchers() 0 9 2
A getBlacklistUrlMatchers() 0 4 1
A addBlacklistUrlMatcher() 0 6 1
A clearBlacklistUrlMatchers() 0 6 1
A setUrlNormalizers() 0 10 2
A getUrlNormalizers() 0 4 1
A addUrlNormalizer() 0 6 1
A clearUrlNormalizers() 0 6 1
A getLogger() 0 8 2
A setLogger() 0 6 1
A addUrlToQueue() 0 4 1
A createHttpUrlString() 0 4 1
A reset() 0 8 1
C crawl() 0 41 7
A updateQueue() 0 18 4
A normalizeUrl() 0 8 2
A shouldReturnUrl() 0 18 4
A isUrlWhitelisted() 0 10 3
A isUrlBlacklisted() 0 10 3
B shouldCrawlUrl() 0 14 5
A isUrlRejected() 0 4 1
A isUrlCrawled() 0 4 1
A isUrlQueued() 0 4 1
A isUrlPartOfBaseUrl() 0 10 2
A isLimitReached() 0 4 2
A extractUrlsFromCrawler() 0 8 1
A requestPage() 0 8 1

How to fix   Complexity   

Complex Class

Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use Crawler, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
namespace MediaMonks\Crawler;
4
5
use MediaMonks\Crawler\Exception\RequestException;
6
use MediaMonks\Crawler\Url\Matcher\UrlMatcherInterface;
7
use MediaMonks\Crawler\Url\Normalizer\UrlNormalizerInterface;
8
use Symfony\Component\BrowserKit\Client;
9
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
10
use Psr\Log\LoggerAwareInterface;
11
use Psr\Log\LoggerInterface;
12
use Psr\Log\NullLogger;
13
14
class Crawler implements LoggerAwareInterface
15
{
16
    /**
     * Client used to perform the page requests.
     *
     * @var Client
     */
    private $client;

    /**
     * Maximum number of returned URLs; the default of 0 means no limit.
     *
     * @var int
     */
    private $limit = 0;

    /**
     * Whether crawling ends silently after the first failed request.
     *
     * @var bool
     */
    private $stopOnError = false;

    /**
     * Whether a failed request is re-thrown as a RequestException.
     *
     * @var bool
     */
    private $exceptionOnError = false;

    /**
     * Matchers of which at least one must match for a page to be returned
     * (only enforced when the list is not empty).
     *
     * @var UrlMatcherInterface[]
     */
    private $whitelistUrlMatchers = [];

    /**
     * Matchers that prevent a page from being returned when any of them matches.
     *
     * @var UrlMatcherInterface[]
     */
    private $blacklistUrlMatchers = [];

    /**
     * Normalizers applied, in order, to every URL found in a page.
     *
     * @var UrlNormalizerInterface[]
     */
    private $urlNormalizers = [];

    /**
     * URL the current crawl was started from.
     *
     * @var Url
     */
    private $baseUrl;

    /**
     * String form of every URL that was requested successfully.
     *
     * @var array
     */
    private $urlsCrawled = [];

    /**
     * Pending Url objects, keyed by their string form.
     *
     * @var array
     */
    private $urlsQueued = [];

    /**
     * URLs that were found but will not be crawled.
     *
     * @var array
     */
    private $urlsRejected = [];

    /**
     * String form of every URL that was yielded to the caller.
     *
     * @var array
     */
    private $urlsReturned = [];

    /**
     * Logger; lazily replaced by a NullLogger in getLogger() when unset.
     *
     * @var LoggerInterface
     */
    private $logger = null;
80
81
    /**
     * Creates a crawler around the given BrowserKit client.
     *
     * When no client is supplied, a default \Goutte\Client is instantiated.
     *
     * @param Client|null $client client used to request pages
     */
    public function __construct(Client $client = null)
    {
        // An object is never "empty", so null is the only case to cover.
        if ($client === null) {
            $client = new \Goutte\Client();
        }

        $this->setClient($client);
    }
94
95
    /**
     * Replaces the client used to request pages.
     *
     * @param Client $client
     * @return $this
     */
    public function setClient(Client $client)
    {
        $this->client = $client;

        // Fluent return, in line with the other setters of this class.
        return $this;
    }
102
103
    /**
     * Returns the client that performs the page requests.
     *
     * @return Client
     */
    public function getClient()
    {
        return $this->client;
    }
110
111
    /**
     * Returns the maximum number of URLs to return (0 = unlimited).
     *
     * @return int
     */
    public function getLimit()
    {
        return $this->limit;
    }
118
119
    /**
     * Sets the maximum number of URLs to return; an empty value such as 0
     * disables the limit.
     *
     * @param int $limit
     * @return $this
     */
    public function setLimit($limit)
    {
        $this->limit = $limit;

        return $this;
    }
129
130
    /**
     * Tells whether crawling ends after the first failed page request.
     *
     * @return boolean
     */
    public function getStopOnError()
    {
        return $this->stopOnError;
    }
137
138
    /**
     * Configures whether crawling ends after the first failed page request.
     *
     * @param boolean $stopOnError
     * @return $this
     */
    public function setStopOnError($stopOnError)
    {
        $this->stopOnError = $stopOnError;

        return $this;
    }
148
149
    /**
     * Tells whether a failed page request is re-thrown as a RequestException.
     *
     * @return boolean
     */
    public function getExceptionOnError()
    {
        return $this->exceptionOnError;
    }
156
157
    /**
     * Configures whether a failed page request is re-thrown as a
     * RequestException.
     *
     * @param boolean $exceptionOnError
     * @return $this
     */
    public function setExceptionOnError($exceptionOnError)
    {
        $this->exceptionOnError = $exceptionOnError;

        return $this;
    }
167
168
    /**
     * Returns the URLs (as strings) that were requested successfully.
     *
     * @return array
     */
    public function getUrlsCrawled()
    {
        return $this->urlsCrawled;
    }
175
176
    /**
     * Returns the URLs still waiting to be crawled, keyed by their string form.
     *
     * @return array
     */
    public function getUrlsQueued()
    {
        return $this->urlsQueued;
    }
183
184
    /**
     * Returns the URLs that were found but rejected for crawling.
     *
     * @return array
     */
    public function getUrlsRejected()
    {
        return $this->urlsRejected;
    }
191
192
    /**
     * Returns the URLs (as strings) that were yielded to the caller.
     *
     * @return array
     */
    public function getUrlsReturned()
    {
        return $this->urlsReturned;
    }
199
200
    /**
     * Replaces all whitelist matchers with the given ones.
     *
     * Each entry is passed through addWhitelistUrlMatcher(), so its type
     * hint applies to every element.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setWhitelistUrlMatchers(array $urlMatchers)
    {
        $this->clearWhitelistUrlMatchers();

        foreach ($urlMatchers as $urlMatcher) {
            $this->addWhitelistUrlMatcher($urlMatcher);
        }

        return $this;
    }
213
214
    /**
     * Returns the registered whitelist matchers.
     *
     * @return UrlMatcherInterface[]
     */
    public function getWhitelistUrlMatchers()
    {
        return $this->whitelistUrlMatchers;
    }
221
222
    /**
     * Appends a matcher to the whitelist.
     *
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addWhitelistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        $this->whitelistUrlMatchers[] = $urlMatcher;

        return $this;
    }
232
233
    /**
     * Removes every whitelist matcher.
     *
     * @return $this
     */
    public function clearWhitelistUrlMatchers()
    {
        $this->whitelistUrlMatchers = [];

        return $this;
    }
242
243
    /**
     * Replaces all blacklist matchers with the given ones.
     *
     * Each entry is passed through addBlacklistUrlMatcher(), so its type
     * hint applies to every element.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setBlacklistUrlMatchers(array $urlMatchers)
    {
        $this->clearBlacklistUrlMatchers();

        foreach ($urlMatchers as $urlMatcher) {
            $this->addBlacklistUrlMatcher($urlMatcher);
        }

        return $this;
    }
256
257
    /**
     * Returns the registered blacklist matchers.
     *
     * @return UrlMatcherInterface[]
     */
    public function getBlacklistUrlMatchers()
    {
        return $this->blacklistUrlMatchers;
    }
264
265
    /**
     * Appends a matcher to the blacklist.
     *
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addBlacklistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        $this->blacklistUrlMatchers[] = $urlMatcher;

        return $this;
    }
275
276
    /**
     * Removes every blacklist matcher.
     *
     * @return $this
     */
    public function clearBlacklistUrlMatchers()
    {
        $this->blacklistUrlMatchers = [];

        return $this;
    }
285
286
    /**
     * Replaces all URL normalizers with the given ones.
     *
     * Each entry is passed through addUrlNormalizer(), so its type hint
     * applies to every element.
     *
     * @param UrlNormalizerInterface[] $normalizers
     * @return $this
     */
    public function setUrlNormalizers(array $normalizers)
    {
        $this->clearUrlNormalizers();

        foreach ($normalizers as $urlNormalizer) {
            $this->addUrlNormalizer($urlNormalizer);
        }

        return $this;
    }
300
301
    /**
     * Returns the registered URL normalizers in the order they are applied.
     *
     * @return UrlNormalizerInterface[]
     */
    public function getUrlNormalizers()
    {
        return $this->urlNormalizers;
    }
308
309
    /**
     * Appends a URL normalizer; normalizers run in the order they were added.
     *
     * @param UrlNormalizerInterface $normalizer
     * @return $this
     */
    public function addUrlNormalizer(UrlNormalizerInterface $normalizer)
    {
        $this->urlNormalizers[] = $normalizer;

        return $this;
    }
319
320
    /**
     * Removes every URL normalizer.
     *
     * @return $this
     */
    public function clearUrlNormalizers()
    {
        $this->urlNormalizers = [];

        return $this;
    }
329
330
    /**
     * Returns the logger, creating a NullLogger on first use when none was set.
     *
     * @return LoggerInterface
     */
    public function getLogger()
    {
        if (null === $this->logger) {
            // Lazy default so callers can always log without a null check.
            $this->logger = new NullLogger();
        }

        return $this->logger;
    }
341
342
    /**
     * Sets the logger that receives the crawl messages.
     *
     * @param LoggerInterface $logger
     * @return $this
     */
    public function setLogger(LoggerInterface $logger)
    {
        $this->logger = $logger;

        return $this;
    }
352
353
    /**
     * Puts a URL on the crawl queue.
     *
     * The queue is keyed by the URL's string form, so queueing the same
     * URL twice overwrites the existing entry instead of duplicating it.
     *
     * @param Url $url
     */
    protected function addUrlToQueue(Url $url)
    {
        $queueKey = (string)$url;
        $this->urlsQueued[$queueKey] = $url;
    }
360
361
    /**
     * Converts a URL string into a Url object via Url::createFromString().
     *
     * @param string $url
     * @return Url
     */
    protected function createHttpUrlString($url)
    {
        $urlObject = Url::createFromString($url);

        return $urlObject;
    }
369
370
    /**
     * Prepares the crawler for a fresh crawl starting at the given URL.
     *
     * All per-crawl bookkeeping is cleared, the URL becomes the new base
     * URL and it is placed on the queue as the first page to request.
     *
     * @param Url $url
     */
    protected function reset(Url $url)
    {
        $this->baseUrl = $url;
        $this->urlsCrawled = [];
        $this->urlsQueued = [];
        // Also drop the rejected/returned lists: the limit check counts
        // urlsReturned, so leftovers from a previous crawl would skew it.
        $this->urlsRejected = [];
        $this->urlsReturned = [];

        $this->addUrlToQueue($url);
    }
381
382
    /**
     * Crawls pages starting at the given URL and yields a Page for every
     * successfully requested URL that passes the whitelist/blacklist check.
     *
     * This is a generator, so nothing is requested until it is iterated.
     * When a request fails the error is logged and then:
     *  - crawling ends silently if stop-on-error is enabled;
     *  - otherwise a RequestException wrapping the original exception is
     *    thrown if exception-on-error is enabled;
     *  - otherwise the URL is skipped and crawling continues.
     *
     * @param string $url
     * @return \Generator
     * @throws RequestException
     */
    public function crawl($url)
    {
        $this->reset($this->createHttpUrlString($url));

        while (count($this->urlsQueued) > 0) {
            // From here on $url is the queued Url object, not the start string.
            $url = array_shift($this->urlsQueued);

            try {
                $crawler = $this->requestPage((string)$url);
            } catch (\Exception $e) {
                $this->getLogger()->error(sprintf('Error requesting page %s: %s', $url, $e->getMessage()));

                // Stop-on-error takes precedence over exception-on-error.
                if ($this->getStopOnError()) {
                    return;
                }
                if ($this->getExceptionOnError()) {
                    throw new RequestException($e->getMessage(), $e->getCode(), $e);
                }

                continue;
            }

            $this->urlsCrawled[] = (string)$url;
            $this->updateQueue($crawler);

            if ($this->shouldReturnUrl($url)) {
                $this->getLogger()->debug(sprintf('Return url "%s"', $url));

                $this->urlsReturned[] = (string)$url;

                yield new Page($url, $crawler);
            }

            if ($this->isLimitReached()) {
                $this->getLogger()->info(sprintf('Crawl limit of %d was reached', $this->limit));

                return;
            }
        }
    }
428
429
    /**
     * Queues every crawlable link found in the given page.
     *
     * Each extracted URL is converted to a Url object, normalized and, when
     * shouldCrawlUrl() accepts it, added to the queue. If any of these
     * steps throws, a warning is logged and the URL is recorded as rejected.
     *
     * @param DomCrawler $crawler
     */
    protected function updateQueue(DomCrawler $crawler)
    {
        foreach ($this->extractUrlsFromCrawler($crawler) as $url) {
            $this->getLogger()->debug(sprintf('Found url %s in page', $url));
            try {
                $url = $this->normalizeUrl($this->createHttpUrlString($url));

                if ($this->shouldCrawlUrl($url)) {
                    $this->addUrlToQueue($url);
                }
            } catch (\Exception $e) {
                $this->getLogger()->warning(
                    sprintf('Url %s could not be converted to an object: %s', $url, $e->getMessage())
                );
                // $url is the raw string or an already converted Url object,
                // depending on where the exception was thrown; always store
                // the string form so the rejected list stays homogeneous.
                $this->urlsRejected[] = (string)$url;
            }
        }
    }
450
451
    /**
     * Runs the URL through every registered normalizer, in registration
     * order, feeding the result of each one into the next.
     *
     * @param Url $url
     * @return Url
     */
    protected function normalizeUrl(Url $url)
    {
        return array_reduce(
            $this->urlNormalizers,
            function ($normalized, $normalizer) {
                return $normalizer->normalize($normalized);
            },
            $url
        );
    }
463
464
    /**
     * Decides whether a crawled URL is handed back to the caller.
     *
     * A URL is skipped when a whitelist is configured and none of its
     * matchers match, or when any blacklist matcher matches.
     *
     * @param Url $url
     * @return bool
     */
    protected function shouldReturnUrl(Url $url)
    {
        $hasWhitelist = !empty($this->whitelistUrlMatchers);

        if ($hasWhitelist && !$this->isUrlWhitelisted($url)) {
            $this->getLogger()->info(sprintf('Skipped "%s" because it is not whitelisted', $url));

            return false;
        }

        if ($this->isUrlBlacklisted($url)) {
            $this->getLogger()->info(sprintf('Skipped "%s" because it is blacklisted', $url));

            return false;
        }

        return true;
    }
486
487
    /**
     * Tells whether at least one whitelist matcher matches the URL.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlWhitelisted(Url $url)
    {
        $whitelisted = false;

        foreach ($this->whitelistUrlMatchers as $urlMatcher) {
            if ($urlMatcher->matches($url)) {
                // First hit decides; the remaining matchers are not consulted.
                $whitelisted = true;
                break;
            }
        }

        return $whitelisted;
    }
501
502
    /**
     * Tells whether at least one blacklist matcher matches the URL.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlBlacklisted(Url $url)
    {
        $blacklisted = false;

        foreach ($this->blacklistUrlMatchers as $urlMatcher) {
            if ($urlMatcher->matches($url)) {
                // First hit decides; the remaining matchers are not consulted.
                $blacklisted = true;
                break;
            }
        }

        return $blacklisted;
    }
516
517
    /**
     * Decides whether a newly found URL should be added to the queue.
     *
     * URLs that were already rejected, crawled or queued are ignored. A URL
     * that is not part of the base URL is recorded as rejected.
     *
     * @param Url $url
     * @return bool
     */
    protected function shouldCrawlUrl(Url $url)
    {
        // Already seen in one form or another: nothing to do.
        if ($this->isUrlRejected($url)) {
            return false;
        }
        if ($this->isUrlCrawled($url)) {
            return false;
        }
        if ($this->isUrlQueued($url)) {
            return false;
        }

        if ($this->isUrlPartOfBaseUrl($url)) {
            return true;
        }

        $this->urlsRejected[] = (string)$url;

        return false;
    }
535
536
    /**
     * Tells whether the URL is already on the rejected list.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlRejected(Url $url)
    {
        $urlString = (string)$url;

        // Loose comparison is kept on purpose: the rejected list is filled
        // from more than one place and its entries are not all cast to string.
        return in_array($urlString, $this->urlsRejected);
    }
544
545
    /**
     * Tells whether the URL has already been crawled.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlCrawled(Url $url)
    {
        // The crawled list only holds strings, so compare strictly and avoid
        // PHP's loose numeric-string juggling.
        return in_array((string)$url, $this->urlsCrawled, true);
    }
553
554
    /**
     * Tells whether the URL is currently waiting in the queue.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlQueued(Url $url)
    {
        // The queue is keyed by the URL's string form.
        $queueKey = (string)$url;

        return isset($this->urlsQueued[$queueKey]);
    }
562
563
    /**
     * Tells whether the URL lives underneath the base URL of this crawl,
     * i.e. whether its string form starts with the base URL string.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlPartOfBaseUrl(Url $url)
    {
        $baseUrlString = (string)$this->baseUrl;
        $this->getLogger()->debug($baseUrlString.' - '.$url);

        // Must be a prefix match: a plain "contains" check would also accept
        // foreign URLs that merely carry the base URL in their query string.
        return strpos((string)$url, $baseUrlString) === 0;
    }
577
578
    /**
     * Tells whether the number of returned URLs has reached the limit.
     *
     * A limit that is empty or not positive means "no limit".
     *
     * @return bool
     */
    protected function isLimitReached()
    {
        // setLimit() accepts any value, so coerce before comparing; ">="
        // keeps the check effective even if the count ever passes the limit.
        $limit = (int)$this->limit;

        return $limit > 0 && count($this->urlsReturned) >= $limit;
    }
585
586
    /**
     * Collects the URI of every anchor ("a") element in the page.
     *
     * @param DomCrawler $crawler
     * @return array
     */
    protected function extractUrlsFromCrawler(DomCrawler $crawler)
    {
        $toUri = function (DomCrawler $node) {
            return $node->link()->getUri();
        };

        $anchors = $crawler->filter('a');

        return $anchors->each($toUri);
    }
598
599
    /**
     * Requests the page with a GET request through the client and returns
     * the resulting DOM crawler, logging before and after the request.
     *
     * @param string $url
     * @return DomCrawler
     */
    protected function requestPage($url)
    {
        $logger = $this->getLogger();

        $logger->info(sprintf('Crawling page %s', $url));
        $page = $this->client->request('GET', $url);
        // getLogger() is called again, as the original did, in case the
        // logger instance was swapped while the request was running.
        $this->getLogger()->info(sprintf('Crawled page %s', $url));

        return $page;
    }
611
}
612