Completed
Push — master ( 5ec984...fdde14 )
by
unknown
04:22
created

Crawler   C

Complexity

Total Complexity 76

Size/Duplication

Total Lines 597
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 10

Test Coverage

Coverage 99.03%

Importance

Changes 0
Metric Value
wmc 76
lcom 1
cbo 10
dl 0
loc 597
ccs 205
cts 207
cp 0.9903
rs 5.488
c 0
b 0
f 0

43 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 11 2
A setClient() 0 4 1
A getClient() 0 4 1
B setOptions() 0 21 7
A getLimit() 0 4 1
A setLimit() 0 6 1
A getStopOnError() 0 4 1
A setStopOnError() 0 6 1
A getUrlsCrawled() 0 4 1
A getUrlsQueued() 0 4 1
A getUrlsRejected() 0 4 1
A getUrlsReturned() 0 4 1
A setWhitelistUrlMatchers() 0 9 2
A getWhitelistUrlMatchers() 0 4 1
A addWhitelistUrlMatcher() 0 6 1
A clearWhitelistUrlMatchers() 0 6 1
A setBlacklistUrlMatchers() 0 9 2
A getBlacklistUrlMatchers() 0 4 1
A addBlacklistUrlMatcher() 0 6 1
A clearBlacklistUrlMatchers() 0 6 1
A setUrlNormalizers() 0 10 2
A getUrlNormalizers() 0 4 1
A addUrlNormalizer() 0 6 1
A clearUrlNormalizers() 0 6 1
A getLogger() 0 8 2
A setLogger() 0 6 1
A addUrlToQueue() 0 4 1
A createHttpUrlString() 0 4 1
A reset() 0 8 1
B crawl() 0 38 6
A updateQueue() 0 18 4
A normalizeUrl() 0 8 2
A shouldReturnUrl() 0 18 4
A isUrlWhitelisted() 0 10 3
A isUrlBlacklisted() 0 10 3
B shouldCrawlUrl() 0 14 5
A isUrlRejected() 0 4 1
A isUrlCrawled() 0 4 1
A isUrlQueued() 0 4 1
A isUrlPartOfBaseUrl() 0 10 2
A isLimitReached() 0 4 2
A extractUrlsFromCrawler() 0 8 1
A requestPage() 0 8 1

How to fix   Complexity   

Complex Class

Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use Crawler, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
namespace MediaMonks\Crawler;
4
5
use MediaMonks\Crawler\Url\Matcher\UrlMatcherInterface;
6
use MediaMonks\Crawler\Url\Normalizer\UrlNormalizerInterface;
7
use Symfony\Component\BrowserKit\Client;
8
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
9
use Psr\Log\LoggerAwareInterface;
10
use Psr\Log\LoggerInterface;
11
use Psr\Log\NullLogger;
12
13
class Crawler implements LoggerAwareInterface
14
{
15
    /**
     * HTTP client used to request pages.
     *
     * @var Client
     */
    private $client;

    /**
     * Maximum number of pages to return; an empty value (0) disables the limit.
     *
     * @var int
     */
    private $limit = 0;

    /**
     * Whether crawling stops as soon as a page request throws.
     *
     * @var bool
     */
    private $stopOnError = false;

    /**
     * When non-empty, only URLs matching one of these matchers are returned.
     *
     * @var UrlMatcherInterface[]
     */
    private $whitelistUrlMatchers = [];

    /**
     * URLs matching one of these matchers are never returned.
     *
     * @var UrlMatcherInterface[]
     */
    private $blacklistUrlMatchers = [];

    /**
     * Normalizers applied, in order, to every URL found in a page.
     *
     * @var UrlNormalizerInterface[]
     */
    private $urlNormalizers = [];

    /**
     * URL the current crawl started from.
     *
     * @var Url
     */
    private $baseUrl;

    /**
     * URL strings of pages that were requested successfully.
     *
     * @var array
     */
    private $urlsCrawled = [];

    /**
     * Url objects waiting to be requested, keyed by their string form.
     *
     * @var array
     */
    private $urlsQueued = [];

    /**
     * URLs that were found but will not be crawled.
     *
     * @var array
     */
    private $urlsRejected = [];

    /**
     * URL strings of pages that were yielded to the caller.
     *
     * @var array
     */
    private $urlsReturned = [];

    /**
     * Logger; created lazily by getLogger() when none was set.
     *
     * @var LoggerInterface
     */
    private $logger;
74
75
    /**
     * Creates a crawler, falling back to a Goutte client when none is given.
     *
     * @param Client|null $client  client used to request pages
     * @param array       $options options forwarded to setOptions()
     */
    public function __construct(Client $client = null, array $options = [])
    {
        // Explicit null check: the parameter is either a Client or null.
        if ($client === null) {
            $client = new \Goutte\Client();
        }

        $this->setClient($client);
        $this->setOptions($options);

        // No return statement: a value returned from a constructor is discarded by `new`.
    }
90
91
    /**
     * Replaces the client used to request pages.
     *
     * @param Client $client
     */
    public function setClient(Client $client)
    {
        $this->client = $client;
    }
98
99
    /**
     * Returns the client used to request pages.
     *
     * @return Client
     */
    public function getClient()
    {
        return $this->client;
    }
106
107
    /**
     * Applies the recognised entries of an options array through their setters.
     *
     * Recognised keys: limit, stop_on_error, logger, whitelist_url_matchers,
     * blacklist_url_matchers, url_normalizers. Keys that are absent (or null)
     * are skipped; unknown keys are ignored.
     *
     * @param array $options
     */
    public function setOptions(array $options)
    {
        // Option key => setter name, applied in this order.
        $setters = [
            'limit'                  => 'setLimit',
            'stop_on_error'          => 'setStopOnError',
            'logger'                 => 'setLogger',
            'whitelist_url_matchers' => 'setWhitelistUrlMatchers',
            'blacklist_url_matchers' => 'setBlacklistUrlMatchers',
            'url_normalizers'        => 'setUrlNormalizers',
        ];

        foreach ($setters as $key => $setter) {
            if (isset($options[$key])) {
                $this->$setter($options[$key]);
            }
        }
    }
131
132
    /**
     * Returns the maximum number of pages to return (0 means no limit).
     *
     * @return int
     */
    public function getLimit()
    {
        return $this->limit;
    }
139
140
    /**
     * Sets the maximum number of pages to return.
     *
     * The value is cast to int so that a limit supplied as a numeric string
     * or float (for example from a configuration array) is stored in the
     * documented type and compares correctly against the returned-page count.
     *
     * @param int $limit maximum number of returned pages; 0 disables the limit
     * @return $this
     */
    public function setLimit($limit)
    {
        $this->limit = (int)$limit;

        return $this;
    }
150
151
    /**
     * Tells whether crawling stops when a page request throws.
     *
     * @return bool
     */
    public function getStopOnError()
    {
        return $this->stopOnError;
    }
158
159
    /**
     * Chooses whether crawling stops when a page request throws.
     *
     * @param bool $stopOnError
     * @return $this
     */
    public function setStopOnError($stopOnError)
    {
        $this->stopOnError = $stopOnError;

        return $this;
    }
169
170
    /**
     * Returns the URL strings of the pages that were requested successfully.
     *
     * @return array
     */
    public function getUrlsCrawled()
    {
        return $this->urlsCrawled;
    }
177
178
    /**
     * Returns the Url objects still waiting to be requested, keyed by URL string.
     *
     * @return array
     */
    public function getUrlsQueued()
    {
        return $this->urlsQueued;
    }
185
186
    /**
     * Returns the URLs that were found but rejected for crawling.
     *
     * @return array
     */
    public function getUrlsRejected()
    {
        return $this->urlsRejected;
    }
193
194
    /**
     * Returns the URL strings of the pages that were yielded by crawl().
     *
     * @return array
     */
    public function getUrlsReturned()
    {
        return $this->urlsReturned;
    }
201
202
    /**
     * Replaces all whitelist matchers with the given ones.
     *
     * Each entry is added through addWhitelistUrlMatcher(), so it must be a
     * UrlMatcherInterface.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setWhitelistUrlMatchers(array $urlMatchers)
    {
        $this->clearWhitelistUrlMatchers();

        foreach ($urlMatchers as $urlMatcher) {
            $this->addWhitelistUrlMatcher($urlMatcher);
        }

        return $this;
    }
215
216
    /**
     * Returns the registered whitelist matchers.
     *
     * @return UrlMatcherInterface[]
     */
    public function getWhitelistUrlMatchers()
    {
        return $this->whitelistUrlMatchers;
    }
223
224
    /**
     * Appends a matcher to the whitelist.
     *
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addWhitelistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        array_push($this->whitelistUrlMatchers, $urlMatcher);

        return $this;
    }
234
235
    /**
     * Removes every whitelist matcher.
     *
     * @return $this
     */
    public function clearWhitelistUrlMatchers()
    {
        $this->whitelistUrlMatchers = [];

        return $this;
    }
244
245
    /**
     * Replaces all blacklist matchers with the given ones.
     *
     * Each entry is added through addBlacklistUrlMatcher(), so it must be a
     * UrlMatcherInterface.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setBlacklistUrlMatchers(array $urlMatchers)
    {
        $this->clearBlacklistUrlMatchers();

        foreach ($urlMatchers as $urlMatcher) {
            $this->addBlacklistUrlMatcher($urlMatcher);
        }

        return $this;
    }
258
259
    /**
     * Returns the registered blacklist matchers.
     *
     * @return UrlMatcherInterface[]
     */
    public function getBlacklistUrlMatchers()
    {
        return $this->blacklistUrlMatchers;
    }
266
267
    /**
     * Appends a matcher to the blacklist.
     *
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addBlacklistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        array_push($this->blacklistUrlMatchers, $urlMatcher);

        return $this;
    }
277
278
    /**
     * Removes every blacklist matcher.
     *
     * @return $this
     */
    public function clearBlacklistUrlMatchers()
    {
        $this->blacklistUrlMatchers = [];

        return $this;
    }
287
288
    /**
     * Replaces all URL normalizers with the given ones.
     *
     * Each entry is added through addUrlNormalizer(), so it must be a
     * UrlNormalizerInterface.
     *
     * @param UrlNormalizerInterface[] $normalizers
     * @return $this
     */
    public function setUrlNormalizers(array $normalizers)
    {
        $this->clearUrlNormalizers();

        foreach ($normalizers as $urlNormalizer) {
            $this->addUrlNormalizer($urlNormalizer);
        }

        return $this;
    }
302
303
    /**
     * Returns the registered URL normalizers in application order.
     *
     * @return UrlNormalizerInterface[]
     */
    public function getUrlNormalizers()
    {
        return $this->urlNormalizers;
    }
310
311
    /**
     * Appends a URL normalizer; normalizers run in the order they were added.
     *
     * @param UrlNormalizerInterface $normalizer
     * @return $this
     */
    public function addUrlNormalizer(UrlNormalizerInterface $normalizer)
    {
        array_push($this->urlNormalizers, $normalizer);

        return $this;
    }
321
322
    /**
     * Removes every URL normalizer.
     *
     * @return $this
     */
    public function clearUrlNormalizers()
    {
        $this->urlNormalizers = [];

        return $this;
    }
331
332
    /**
     * Returns the logger, creating a NullLogger on first use when none was set.
     *
     * @return LoggerInterface
     */
    public function getLogger()
    {
        if ($this->logger !== null) {
            return $this->logger;
        }

        $this->logger = new NullLogger();

        return $this->logger;
    }
343
344
    /**
     * Sets the logger that receives crawl messages.
     *
     * @param LoggerInterface $logger
     * @return $this
     */
    public function setLogger(LoggerInterface $logger)
    {
        $this->logger = $logger;

        return $this;
    }
354
355
    /**
     * Queues a URL for crawling, keyed by its string form so that the same
     * URL occupies a single queue slot.
     *
     * @param Url $url
     */
    protected function addUrlToQueue(Url $url)
    {
        $key = (string)$url;

        $this->urlsQueued[$key] = $url;
    }
362
363
    /**
     * Builds a Url object from its string form via Url::createFromString().
     *
     * @param string $url
     * @return Url
     */
    protected function createHttpUrlString($url)
    {
        $urlObject = Url::createFromString($url);

        return $urlObject;
    }
371
372
    /**
     * Clears all per-crawl state and seeds the queue with the start URL.
     *
     * The rejected and returned lists are cleared as well as the crawled and
     * queued ones: the limit check counts the returned URLs, so leftovers from
     * a previous crawl() on the same instance would distort it, and stale
     * rejections would suppress URLs that are valid for the new base URL.
     *
     * @param Url $url start URL; becomes the base URL of the crawl
     */
    protected function reset(Url $url)
    {
        $this->baseUrl = $url;
        $this->urlsCrawled = [];
        $this->urlsQueued = [];
        $this->urlsRejected = [];
        $this->urlsReturned = [];

        $this->addUrlToQueue($url);
    }
383
384
    /**
     * Crawls from the given URL and lazily yields a Page for every URL that
     * passes the whitelist/blacklist checks.
     *
     * Crawling ends when the queue is empty, when the limit is reached, or
     * when a request throws while stop-on-error is enabled.
     *
     * @param string $url start URL
     * @return \Generator yields Page objects
     */
    public function crawl($url)
    {
        $this->reset($this->createHttpUrlString($url));

        while (count($this->urlsQueued) > 0) {
            // Separate name so the $url parameter is not overwritten by the loop.
            $currentUrl = array_shift($this->urlsQueued);

            try {
                $crawler = $this->requestPage((string)$currentUrl);
            } catch (\Exception $e) {
                $this->getLogger()->error(
                    sprintf('Error requesting page %s: %s', $currentUrl, $e->getMessage())
                );

                if ($this->getStopOnError()) {
                    return;
                }

                // The failed URL is recorded neither as crawled nor as rejected.
                continue;
            }

            $this->urlsCrawled[] = (string)$currentUrl;
            $this->updateQueue($crawler);

            if ($this->shouldReturnUrl($currentUrl)) {
                $this->getLogger()->debug(sprintf('Return url "%s"', $currentUrl));

                $this->urlsReturned[] = (string)$currentUrl;

                yield new Page($currentUrl, $crawler);
            }

            if ($this->isLimitReached()) {
                $this->getLogger()->info(sprintf('Crawl limit of %d was reached', $this->limit));

                return;
            }
        }
    }
426
427
    /**
     * Queues every crawlable link found in a fetched page.
     *
     * Each link is converted to a Url, normalized, and queued when
     * shouldCrawlUrl() accepts it. A link whose processing throws is logged
     * and recorded as rejected.
     *
     * @param DomCrawler $crawler DOM of the fetched page
     */
    protected function updateQueue(DomCrawler $crawler)
    {
        foreach ($this->extractUrlsFromCrawler($crawler) as $url) {
            $this->getLogger()->debug(sprintf('Found url %s in page', $url));

            try {
                $url = $this->normalizeUrl($this->createHttpUrlString($url));

                if ($this->shouldCrawlUrl($url)) {
                    $this->addUrlToQueue($url);
                }
            } catch (\Exception $e) {
                $this->getLogger()->warning(
                    sprintf('Url %s could not be converted to an object: %s', $url, $e->getMessage())
                );

                // $url is a Url object if the exception came after normalization;
                // store the string form, as shouldCrawlUrl() does.
                $this->urlsRejected[] = (string)$url;
            }
        }
    }
448
449
    /**
     * Runs the URL through every registered normalizer, feeding each
     * normalizer the result of the previous one.
     *
     * @param Url $url
     * @return Url the URL unchanged when no normalizers are registered
     */
    protected function normalizeUrl(Url $url)
    {
        return array_reduce(
            $this->urlNormalizers,
            function ($normalized, $normalizer) {
                return $normalizer->normalize($normalized);
            },
            $url
        );
    }
461
462
    /**
     * Decides whether a crawled URL is yielded to the caller.
     *
     * A URL is skipped when a whitelist exists and the URL matches none of
     * its matchers, or when the URL matches a blacklist matcher.
     *
     * @param Url $url
     * @return bool
     */
    protected function shouldReturnUrl(Url $url)
    {
        // The whitelist only applies when at least one matcher is registered.
        if (!empty($this->whitelistUrlMatchers) && !$this->isUrlWhitelisted($url)) {
            $this->getLogger()->info(sprintf('Skipped "%s" because it is not whitelisted', $url));

            return false;
        }

        if ($this->isUrlBlacklisted($url)) {
            $this->getLogger()->info(sprintf('Skipped "%s" because it is blacklisted', $url));

            return false;
        }

        return true;
    }
484
485
    /**
     * Tells whether at least one whitelist matcher matches the URL.
     *
     * @param Url $url
     * @return bool false when no whitelist matcher matches (or none exist)
     */
    protected function isUrlWhitelisted(Url $url)
    {
        $whitelisted = false;

        foreach ($this->whitelistUrlMatchers as $urlMatcher) {
            if ($urlMatcher->matches($url)) {
                // The first match decides; later matchers are not consulted.
                $whitelisted = true;
                break;
            }
        }

        return $whitelisted;
    }
499
500
    /**
     * Tells whether at least one blacklist matcher matches the URL.
     *
     * @param Url $url
     * @return bool false when no blacklist matcher matches (or none exist)
     */
    protected function isUrlBlacklisted(Url $url)
    {
        $blacklisted = false;

        foreach ($this->blacklistUrlMatchers as $urlMatcher) {
            if ($urlMatcher->matches($url)) {
                // The first match decides; later matchers are not consulted.
                $blacklisted = true;
                break;
            }
        }

        return $blacklisted;
    }
514
515
    /**
     * Decides whether a discovered URL is added to the queue.
     *
     * URLs already rejected, crawled or queued are skipped. A URL outside the
     * base URL is recorded as rejected and skipped.
     *
     * @param Url $url
     * @return bool
     */
    protected function shouldCrawlUrl(Url $url)
    {
        // Already-known URLs: checked in the order rejected, crawled, queued.
        if ($this->isUrlRejected($url)) {
            return false;
        }

        if ($this->isUrlCrawled($url)) {
            return false;
        }

        if ($this->isUrlQueued($url)) {
            return false;
        }

        if ($this->isUrlPartOfBaseUrl($url)) {
            return true;
        }

        $this->urlsRejected[] = (string)$url;

        return false;
    }
533
534
    /**
     * Tells whether the URL's string form is in the rejected list.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlRejected(Url $url)
    {
        $urlString = (string)$url;

        return in_array($urlString, $this->urlsRejected);
    }
542
543
    /**
     * Tells whether the URL's string form is in the crawled list.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlCrawled(Url $url)
    {
        $urlString = (string)$url;

        return in_array($urlString, $this->urlsCrawled);
    }
551
552
    /**
     * Tells whether the URL is waiting in the queue, which is keyed by URL string.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlQueued(Url $url)
    {
        $key = (string)$url;

        return isset($this->urlsQueued[$key]);
    }
560
561
    /**
     * Tells whether the URL lies under the base URL of the current crawl.
     *
     * The URL must start with the base URL string. Finding the base URL
     * anywhere in the string is not enough: a foreign URL that merely embeds
     * the base URL (for example in a query string) would otherwise be crawled.
     *
     * NOTE(review): this is a plain string-prefix test, so a base URL without
     * a trailing slash also accepts hosts that merely begin with the same
     * characters; confirm whether a host-aware comparison is wanted.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlPartOfBaseUrl(Url $url)
    {
        $baseUrlString = (string)$this->baseUrl;
        $this->getLogger()->debug($baseUrlString.' - '.$url);

        // strpos() returns 0 only when the base URL is a prefix of the URL.
        return strpos((string)$url, $baseUrlString) === 0;
    }
575
576
    /**
     * Tells whether the number of returned pages has reached the limit.
     *
     * The limit is compared as an integer with >= so that a limit stored as a
     * numeric string or float still triggers, and so that a returned count
     * already above the limit also stops the crawl.
     *
     * @return bool always false when no (positive) limit is set
     */
    protected function isLimitReached()
    {
        $limit = (int)$this->limit;

        return $limit > 0 && count($this->urlsReturned) >= $limit;
    }
583
584
    /**
     * Collects the URI of every <a> element in the page.
     *
     * @param DomCrawler $crawler DOM of the fetched page
     * @return array URIs as returned by each link's getUri()
     */
    protected function extractUrlsFromCrawler(DomCrawler $crawler)
    {
        $toUri = function (DomCrawler $anchor) {
            return $anchor->link()->getUri();
        };

        $anchors = $crawler->filter('a');

        return $anchors->each($toUri);
    }
596
597
    /**
     * Requests a page with GET through the client, logging before and after.
     *
     * @param string $url
     * @return DomCrawler result of the client's request() call
     */
    protected function requestPage($url)
    {
        $this->getLogger()->info(sprintf('Crawling page %s', $url));

        $page = $this->client->request('GET', $url);

        $this->getLogger()->info(sprintf('Crawled page %s', $url));

        return $page;
    }
609
}
610