Completed
Push — master ( 1b8c9c...ab459d )
by
unknown
02:55
created

Crawler::isUrlRejected()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 4
ccs 2
cts 2
cp 1
rs 10
c 0
b 0
f 0
cc 1
eloc 2
nc 1
nop 1
crap 1
1
<?php
2
3
namespace MediaMonks\Crawler;
4
5
use MediaMonks\Crawler\Url\Matcher\UrlMatcherInterface;
6
use MediaMonks\Crawler\Url\Normalizer\UrlNormalizerInterface;
7
use Symfony\Component\BrowserKit\Client;
8
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
9
use Psr\Log\LoggerAwareInterface;
10
use Psr\Log\LoggerInterface;
11
use Psr\Log\NullLogger;
12
13
/**
 * Crawls a website starting from a given URL and yields the pages it finds.
 *
 * Links found on each requested page are normalized, filtered against the
 * starting (base) URL and queued. Whitelist/blacklist matchers decide which
 * crawled pages are actually returned to the caller.
 */
class Crawler implements LoggerAwareInterface
{
    /**
     * Client used to request pages.
     *
     * @var Client
     */
    private $client;

    /**
     * Maximum number of pages to return; an empty value (0) disables the limit.
     *
     * @var int
     */
    private $limit = 0;

    /**
     * Whether the crawl stops at the first page that fails to load.
     *
     * @var bool
     */
    private $stopOnError = false;

    /**
     * When non-empty, a page is only returned if at least one matcher matches.
     *
     * @var UrlMatcherInterface[]
     */
    private $whitelistUrlMatchers = [];

    /**
     * A page is not returned when any of these matchers matches.
     *
     * @var UrlMatcherInterface[]
     */
    private $blacklistUrlMatchers = [];

    /**
     * Normalizers applied, in order, to every URL found in a page.
     *
     * @var UrlNormalizerInterface[]
     */
    private $urlNormalizers = [];

    /**
     * URL the current crawl started from.
     *
     * @var Url
     */
    private $baseUrl;

    /**
     * URLs (as strings) that were requested successfully.
     *
     * @var array
     */
    private $urlsCrawled = [];

    /**
     * Pending Url objects, keyed by their string representation.
     *
     * @var array
     */
    private $urlsQueued = [];

    /**
     * URLs (as strings) that were refused for crawling.
     *
     * @var array
     */
    private $urlsRejected = [];

    /**
     * URLs (as strings) of the pages yielded to the caller.
     *
     * @var array
     */
    private $urlsReturned = [];

    /**
     * Lazily replaced by a NullLogger in getLogger() when none was set.
     *
     * @var LoggerInterface
     */
    private $logger = null;

    /**
     * @param Client $client  client to request pages with; a Goutte client is created when omitted
     * @param array  $options see setOptions() for the supported keys
     */
    public function __construct(Client $client = null, array $options = [])
    {
        if ($client === null) {
            $client = new \Goutte\Client();
        }

        $this->setClient($client);
        $this->setOptions($options);
    }

    /**
     * @param Client $client
     * @return $this
     */
    public function setClient(Client $client)
    {
        $this->client = $client;

        return $this;
    }

    /**
     * @return Client
     */
    public function getClient()
    {
        return $this->client;
    }

    /**
     * Applies every recognised option through its dedicated setter.
     *
     * Supported keys: limit, stop_on_error, logger, whitelist_url_matchers,
     * blacklist_url_matchers, url_normalizers. Keys that are missing or null
     * are skipped; unknown keys are ignored.
     *
     * @param array $options
     * @return $this
     */
    public function setOptions(array $options)
    {
        // Option key => setter; the order matches the order options are applied in.
        $setters = [
            'limit'                  => 'setLimit',
            'stop_on_error'          => 'setStopOnError',
            'logger'                 => 'setLogger',
            'whitelist_url_matchers' => 'setWhitelistUrlMatchers',
            'blacklist_url_matchers' => 'setBlacklistUrlMatchers',
            'url_normalizers'        => 'setUrlNormalizers',
        ];

        foreach ($setters as $option => $setter) {
            if (isset($options[$option])) {
                $this->{$setter}($options[$option]);
            }
        }

        return $this;
    }

    /**
     * @return int
     */
    public function getLimit()
    {
        return $this->limit;
    }

    /**
     * @param int $limit maximum number of pages to return, 0 for no limit
     * @return $this
     */
    public function setLimit($limit)
    {
        $this->limit = $limit;

        return $this;
    }

    /**
     * @return bool
     */
    public function getStopOnError()
    {
        return $this->stopOnError;
    }

    /**
     * @param bool $stopOnError
     * @return $this
     */
    public function setStopOnError($stopOnError)
    {
        $this->stopOnError = $stopOnError;

        return $this;
    }

    /**
     * @return array URLs (strings) requested successfully during the current crawl
     */
    public function getUrlsCrawled()
    {
        return $this->urlsCrawled;
    }

    /**
     * @return array pending Url objects keyed by their string representation
     */
    public function getUrlsQueued()
    {
        return $this->urlsQueued;
    }

    /**
     * @return array URLs (strings) refused during the current crawl
     */
    public function getUrlsRejected()
    {
        return $this->urlsRejected;
    }

    /**
     * @return array URLs (strings) of the pages yielded during the current crawl
     */
    public function getUrlsReturned()
    {
        return $this->urlsReturned;
    }

    /**
     * Replaces all whitelist matchers.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setWhitelistUrlMatchers(array $urlMatchers)
    {
        $this->clearWhitelistUrlMatchers();
        foreach ($urlMatchers as $matcher) {
            $this->addWhitelistUrlMatcher($matcher);
        }

        return $this;
    }

    /**
     * @return UrlMatcherInterface[]
     */
    public function getWhitelistUrlMatchers()
    {
        return $this->whitelistUrlMatchers;
    }

    /**
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addWhitelistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        $this->whitelistUrlMatchers[] = $urlMatcher;

        return $this;
    }

    /**
     * @return $this
     */
    public function clearWhitelistUrlMatchers()
    {
        $this->whitelistUrlMatchers = [];

        return $this;
    }

    /**
     * Replaces all blacklist matchers.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setBlacklistUrlMatchers(array $urlMatchers)
    {
        $this->clearBlacklistUrlMatchers();
        foreach ($urlMatchers as $matcher) {
            $this->addBlacklistUrlMatcher($matcher);
        }

        return $this;
    }

    /**
     * @return UrlMatcherInterface[]
     */
    public function getBlacklistUrlMatchers()
    {
        return $this->blacklistUrlMatchers;
    }

    /**
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addBlacklistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        $this->blacklistUrlMatchers[] = $urlMatcher;

        return $this;
    }

    /**
     * @return $this
     */
    public function clearBlacklistUrlMatchers()
    {
        $this->blacklistUrlMatchers = [];

        return $this;
    }

    /**
     * Replaces all URL normalizers.
     *
     * @param UrlNormalizerInterface[] $normalizers
     * @return $this
     */
    public function setUrlNormalizers(array $normalizers)
    {
        $this->clearUrlNormalizers();
        foreach ($normalizers as $normalizer) {
            $this->addUrlNormalizer($normalizer);
        }

        return $this;
    }

    /**
     * @return UrlNormalizerInterface[]
     */
    public function getUrlNormalizers()
    {
        return $this->urlNormalizers;
    }

    /**
     * @param UrlNormalizerInterface $normalizer
     * @return $this
     */
    public function addUrlNormalizer(UrlNormalizerInterface $normalizer)
    {
        $this->urlNormalizers[] = $normalizer;

        return $this;
    }

    /**
     * @return $this
     */
    public function clearUrlNormalizers()
    {
        $this->urlNormalizers = [];

        return $this;
    }

    /**
     * Returns the configured logger, creating a NullLogger on first use when
     * none was set.
     *
     * @return LoggerInterface
     */
    public function getLogger()
    {
        if ($this->logger === null) {
            $this->logger = new NullLogger();
        }

        return $this->logger;
    }

    /**
     * @param LoggerInterface $logger
     * @return $this
     */
    public function setLogger(LoggerInterface $logger)
    {
        $this->logger = $logger;

        return $this;
    }

    /**
     * Queues a URL, keyed by its string form so the same URL is queued only once.
     *
     * @param Url $url
     */
    protected function addUrlToQueue(Url $url)
    {
        $this->urlsQueued[(string)$url] = $url;
    }

    /**
     * Builds a Url object from a string.
     *
     * @param string $url
     * @return Url
     */
    protected function createHttpUrlString($url)
    {
        return Url::createFromString($url);
    }

    /**
     * Prepares the crawler for a new crawl starting at the given URL.
     *
     * All per-crawl bookkeeping is cleared, including the rejected and returned
     * lists: rejections depend on the base URL, and the limit is checked against
     * the number of returned pages, so neither may leak in from an earlier crawl.
     *
     * @param Url $url
     */
    protected function reset(Url $url)
    {
        $this->baseUrl = $url;
        $this->urlsCrawled = [];
        $this->urlsQueued = [];
        $this->urlsRejected = [];
        $this->urlsReturned = [];

        $this->addUrlToQueue($url);
    }

    /**
     * Crawls from the given URL and yields a Page for every crawled URL that
     * passes the whitelist/blacklist matchers.
     *
     * The crawl ends when the queue is empty, when the limit is reached, or,
     * if stop-on-error is enabled, when a page request throws.
     *
     * @param string $url
     * @return \Generator|Page[]
     */
    public function crawl($url)
    {
        $this->reset($this->createHttpUrlString($url));

        while (count($this->urlsQueued) > 0) {
            $currentUrl = array_shift($this->urlsQueued);

            try {
                $crawler = $this->requestPage((string)$currentUrl);
            } catch (\Exception $e) {
                $this->getLogger()->error(
                    sprintf('Error requesting page %s: %s', $currentUrl, $e->getMessage())
                );

                if ($this->getStopOnError()) {
                    return;
                }

                continue;
            }

            $this->urlsCrawled[] = (string)$currentUrl;
            // Links are queued even when this page itself is not returned.
            $this->updateQueue($crawler);

            if ($this->shouldReturnUrl($currentUrl)) {
                $this->getLogger()->debug(sprintf('Return url "%s"', $currentUrl));

                $this->urlsReturned[] = (string)$currentUrl;

                yield new Page($currentUrl, $crawler);
            }

            if ($this->isLimitReached()) {
                $this->getLogger()->info(sprintf('Crawl limit of %d was reached', $this->limit));

                return;
            }
        }
    }

    /**
     * Queues every link of the given page that should be crawled.
     *
     * A link that raises an exception while being converted, normalized or
     * checked is logged and recorded as rejected.
     *
     * @param DomCrawler $crawler
     */
    protected function updateQueue(DomCrawler $crawler)
    {
        foreach ($this->extractUrlsFromCrawler($crawler) as $url) {
            $this->getLogger()->debug(sprintf('Found url %s in page', $url));

            try {
                $url = $this->normalizeUrl($this->createHttpUrlString($url));

                if ($this->shouldCrawlUrl($url)) {
                    $this->addUrlToQueue($url);
                }
            } catch (\Exception $e) {
                $this->getLogger()->warning(
                    sprintf('Url %s could not be converted to an object: %s', $url, $e->getMessage())
                );
                // $url may already be a Url object here; the rejected list holds strings only.
                $this->urlsRejected[] = (string)$url;
            }
        }
    }

    /**
     * Runs the URL through every registered normalizer, in registration order.
     *
     * @param Url $url
     * @return Url
     */
    protected function normalizeUrl(Url $url)
    {
        foreach ($this->urlNormalizers as $normalizer) {
            $url = $normalizer->normalize($url);
        }

        return $url;
    }

    /**
     * Decides whether a crawled page is yielded to the caller.
     *
     * @param Url $url
     * @return bool false when a whitelist exists and the URL is not on it, or when it is blacklisted
     */
    protected function shouldReturnUrl(Url $url)
    {
        if (!empty($this->whitelistUrlMatchers) && !$this->isUrlWhitelisted($url)) {
            $this->getLogger()->info(sprintf('Skipped "%s" because it is not whitelisted', $url));

            return false;
        }

        if ($this->isUrlBlacklisted($url)) {
            $this->getLogger()->info(sprintf('Skipped "%s" because it is blacklisted', $url));

            return false;
        }

        return true;
    }

    /**
     * @param Url $url
     * @return bool true when at least one whitelist matcher matches
     */
    protected function isUrlWhitelisted(Url $url)
    {
        foreach ($this->whitelistUrlMatchers as $matcher) {
            if ($matcher->matches($url)) {
                return true;
            }
        }

        return false;
    }

    /**
     * @param Url $url
     * @return bool true when at least one blacklist matcher matches
     */
    protected function isUrlBlacklisted(Url $url)
    {
        foreach ($this->blacklistUrlMatchers as $matcher) {
            if ($matcher->matches($url)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Decides whether a URL found in a page should be queued.
     *
     * URLs outside the base URL are recorded as rejected so later occurrences
     * are dismissed by the first check.
     *
     * @param Url $url
     * @return bool
     */
    protected function shouldCrawlUrl(Url $url)
    {
        if ($this->isUrlRejected($url) || $this->isUrlCrawled($url) || $this->isUrlQueued($url)) {
            return false;
        }

        if (!$this->isUrlPartOfBaseUrl($url)) {
            $this->urlsRejected[] = (string)$url;

            return false;
        }

        return true;
    }

    /**
     * @param Url $url
     * @return bool
     */
    protected function isUrlRejected(Url $url)
    {
        return in_array((string)$url, $this->urlsRejected, true);
    }

    /**
     * @param Url $url
     * @return bool
     */
    protected function isUrlCrawled(Url $url)
    {
        return in_array((string)$url, $this->urlsCrawled, true);
    }

    /**
     * @param Url $url
     * @return bool
     */
    protected function isUrlQueued(Url $url)
    {
        return isset($this->urlsQueued[(string)$url]);
    }

    /**
     * Checks that the URL starts with the base URL of the current crawl.
     *
     * The comparison is a string-prefix check: a URL that merely contains the
     * base URL somewhere else (for example in a query string) is not part of it.
     * NOTE(review): a prefix check still accepts hosts that extend the base
     * string when the base has no trailing slash; comparing hosts would be
     * stricter but depends on the Url API, which is not visible here.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlPartOfBaseUrl(Url $url)
    {
        $baseUrlString = (string)$this->baseUrl;
        $this->getLogger()->debug($baseUrlString.' - '.$url);

        return strpos((string)$url, $baseUrlString) === 0;
    }

    /**
     * Tells whether the configured number of pages has been returned.
     *
     * Uses >= rather than strict equality so a limit given as a numeric string,
     * or lowered while crawling, still stops the crawl.
     *
     * @return bool always false when no limit is set
     */
    protected function isLimitReached()
    {
        return !empty($this->limit) && count($this->urlsReturned) >= $this->limit;
    }

    /**
     * Collects the URI of every <a> element in the page.
     *
     * @param DomCrawler $crawler
     * @return array list of URI strings
     */
    protected function extractUrlsFromCrawler(DomCrawler $crawler)
    {
        return $crawler->filter('a')->each(
            function (DomCrawler $node) {
                return $node->link()->getUri();
            }
        );
    }

    /**
     * Requests a page with GET through the client, logging before and after.
     *
     * @param string $url
     * @return DomCrawler
     */
    protected function requestPage($url)
    {
        $this->getLogger()->info(sprintf('Crawling page %s', $url));
        $crawler = $this->client->request('GET', $url);
        $this->getLogger()->info(sprintf('Crawled page %s', $url));

        return $crawler;
    }
}
611