Completed
Push — master ( fdde14...ba3171 )
by
unknown
03:04
created

Crawler::setExceptionOnError()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 6
ccs 3
cts 3
cp 1
rs 9.4285
c 0
b 0
f 0
cc 1
eloc 3
nc 1
nop 1
crap 1
1
<?php
2
3
namespace MediaMonks\Crawler;
4
5
use MediaMonks\Crawler\Exception\RequestException;
6
use MediaMonks\Crawler\Url\Matcher\UrlMatcherInterface;
7
use MediaMonks\Crawler\Url\Normalizer\UrlNormalizerInterface;
8
use Symfony\Component\BrowserKit\Client;
9
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
10
use Psr\Log\LoggerAwareInterface;
11
use Psr\Log\LoggerInterface;
12
use Psr\Log\NullLogger;
13
14
class Crawler implements LoggerAwareInterface
{
    /**
     * Client used to request pages.
     *
     * @var Client
     */
    private $client;

    /**
     * Maximum number of pages to return per crawl; an empty value (0) disables the limit.
     *
     * @var int
     */
    private $limit = 0;

    /**
     * When true, the crawl ends silently on the first failed request.
     *
     * @var bool
     */
    private $stopOnError = false;

    /**
     * When true, a failed request is re-thrown as a RequestException.
     *
     * @var bool
     */
    private $exceptionOnError = false;

    /**
     * If non-empty, only pages matching at least one of these matchers are returned.
     *
     * @var UrlMatcherInterface[]
     */
    private $whitelistUrlMatchers = [];

    /**
     * Pages matching any of these matchers are never returned.
     *
     * @var UrlMatcherInterface[]
     */
    private $blacklistUrlMatchers = [];

    /**
     * Normalizers applied, in order, to every url found in a page.
     *
     * @var UrlNormalizerInterface[]
     */
    private $urlNormalizers = [];

    /**
     * Url the current crawl was started from.
     *
     * @var Url
     */
    private $baseUrl;

    /**
     * Urls (as strings) that were requested successfully during the current crawl.
     *
     * @var string[]
     */
    private $urlsCrawled = [];

    /**
     * Urls still waiting to be requested, keyed by their string form.
     *
     * @var Url[]
     */
    private $urlsQueued = [];

    /**
     * Urls (as strings) that were found but will not be crawled.
     *
     * @var string[]
     */
    private $urlsRejected = [];

    /**
     * Urls (as strings) of the pages that were yielded to the caller.
     *
     * @var string[]
     */
    private $urlsReturned = [];

    /**
     * Lazily replaced by a NullLogger in getLogger() when none was set.
     *
     * @var LoggerInterface|null
     */
    private $logger = null;

    /**
     * @param Client|null $client  Client to use; a Goutte client is created when omitted.
     * @param array       $options See setOptions() for the supported keys.
     */
    public function __construct(Client $client = null, array $options = [])
    {
        if ($client === null) {
            $client = new \Goutte\Client();
        }

        $this->setClient($client);
        $this->setOptions($options);
    }

    /**
     * @param Client $client
     * @return $this
     */
    public function setClient(Client $client)
    {
        $this->client = $client;

        return $this;
    }

    /**
     * @return Client
     */
    public function getClient()
    {
        return $this->client;
    }

    /**
     * Applies every recognised option through its dedicated setter.
     *
     * Supported keys: limit, stop_on_error, exception_on_error, logger,
     * whitelist_url_matchers, blacklist_url_matchers, url_normalizers.
     * Keys that are absent or null are ignored; unknown keys are ignored too.
     *
     * @param array $options
     * @return $this
     */
    public function setOptions(array $options)
    {
        if (isset($options['limit'])) {
            $this->setLimit($options['limit']);
        }
        if (isset($options['stop_on_error'])) {
            $this->setStopOnError($options['stop_on_error']);
        }
        if (isset($options['exception_on_error'])) {
            $this->setExceptionOnError($options['exception_on_error']);
        }
        if (isset($options['logger'])) {
            $this->setLogger($options['logger']);
        }
        if (isset($options['whitelist_url_matchers'])) {
            $this->setWhitelistUrlMatchers($options['whitelist_url_matchers']);
        }
        if (isset($options['blacklist_url_matchers'])) {
            $this->setBlacklistUrlMatchers($options['blacklist_url_matchers']);
        }
        if (isset($options['url_normalizers'])) {
            $this->setUrlNormalizers($options['url_normalizers']);
        }

        return $this;
    }

    /**
     * @return int
     */
    public function getLimit()
    {
        return $this->limit;
    }

    /**
     * @param int $limit Maximum number of returned pages; 0 disables the limit.
     * @return $this
     */
    public function setLimit($limit)
    {
        $this->limit = $limit;

        return $this;
    }

    /**
     * @return bool
     */
    public function getStopOnError()
    {
        return $this->stopOnError;
    }

    /**
     * @param bool $stopOnError
     * @return $this
     */
    public function setStopOnError($stopOnError)
    {
        $this->stopOnError = $stopOnError;

        return $this;
    }

    /**
     * @return bool
     */
    public function getExceptionOnError()
    {
        return $this->exceptionOnError;
    }

    /**
     * @param bool $exceptionOnError
     * @return $this
     */
    public function setExceptionOnError($exceptionOnError)
    {
        $this->exceptionOnError = $exceptionOnError;

        return $this;
    }

    /**
     * @return string[]
     */
    public function getUrlsCrawled()
    {
        return $this->urlsCrawled;
    }

    /**
     * @return Url[] Queued urls keyed by their string form.
     */
    public function getUrlsQueued()
    {
        return $this->urlsQueued;
    }

    /**
     * @return string[]
     */
    public function getUrlsRejected()
    {
        return $this->urlsRejected;
    }

    /**
     * @return string[]
     */
    public function getUrlsReturned()
    {
        return $this->urlsReturned;
    }

    /**
     * Replaces all whitelist matchers with the given ones.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setWhitelistUrlMatchers(array $urlMatchers)
    {
        $this->clearWhitelistUrlMatchers();
        foreach ($urlMatchers as $matcher) {
            $this->addWhitelistUrlMatcher($matcher);
        }

        return $this;
    }

    /**
     * @return UrlMatcherInterface[]
     */
    public function getWhitelistUrlMatchers()
    {
        return $this->whitelistUrlMatchers;
    }

    /**
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addWhitelistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        $this->whitelistUrlMatchers[] = $urlMatcher;

        return $this;
    }

    /**
     * @return $this
     */
    public function clearWhitelistUrlMatchers()
    {
        $this->whitelistUrlMatchers = [];

        return $this;
    }

    /**
     * Replaces all blacklist matchers with the given ones.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setBlacklistUrlMatchers(array $urlMatchers)
    {
        $this->clearBlacklistUrlMatchers();
        foreach ($urlMatchers as $matcher) {
            $this->addBlacklistUrlMatcher($matcher);
        }

        return $this;
    }

    /**
     * @return UrlMatcherInterface[]
     */
    public function getBlacklistUrlMatchers()
    {
        return $this->blacklistUrlMatchers;
    }

    /**
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addBlacklistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        $this->blacklistUrlMatchers[] = $urlMatcher;

        return $this;
    }

    /**
     * @return $this
     */
    public function clearBlacklistUrlMatchers()
    {
        $this->blacklistUrlMatchers = [];

        return $this;
    }

    /**
     * Replaces all url normalizers with the given ones.
     *
     * @param UrlNormalizerInterface[] $normalizers
     * @return $this
     */
    public function setUrlNormalizers(array $normalizers)
    {
        $this->clearUrlNormalizers();
        foreach ($normalizers as $normalizer) {
            $this->addUrlNormalizer($normalizer);
        }

        return $this;
    }

    /**
     * @return UrlNormalizerInterface[]
     */
    public function getUrlNormalizers()
    {
        return $this->urlNormalizers;
    }

    /**
     * @param UrlNormalizerInterface $normalizer
     * @return $this
     */
    public function addUrlNormalizer(UrlNormalizerInterface $normalizer)
    {
        $this->urlNormalizers[] = $normalizer;

        return $this;
    }

    /**
     * @return $this
     */
    public function clearUrlNormalizers()
    {
        $this->urlNormalizers = [];

        return $this;
    }

    /**
     * Returns the configured logger, creating a NullLogger on first use when none was set.
     *
     * @return LoggerInterface
     */
    public function getLogger()
    {
        if ($this->logger === null) {
            $this->logger = new NullLogger();
        }

        return $this->logger;
    }

    /**
     * @param LoggerInterface $logger
     * @return $this
     */
    public function setLogger(LoggerInterface $logger)
    {
        $this->logger = $logger;

        return $this;
    }

    /**
     * Queues a url; the string form is the key so the same url is never queued twice.
     *
     * @param Url $url
     */
    protected function addUrlToQueue(Url $url)
    {
        $this->urlsQueued[(string)$url] = $url;
    }

    /**
     * @param string $url
     * @return Url
     */
    protected function createHttpUrlString($url)
    {
        return Url::createFromString($url);
    }

    /**
     * Prepares the instance for a new crawl starting at the given url.
     *
     * All per-crawl bookkeeping is cleared, including the rejected and returned
     * lists: leftovers from a previous crawl would otherwise block urls that are
     * valid for the new base url and distort the limit check.
     *
     * @param Url $url
     */
    protected function reset(Url $url)
    {
        $this->baseUrl = $url;
        $this->urlsCrawled = [];
        $this->urlsQueued = [];
        $this->urlsRejected = [];
        $this->urlsReturned = [];

        $this->addUrlToQueue($url);
    }

    /**
     * Crawls from the given url, following queued links until the queue is empty,
     * the limit is reached, or an error stops the crawl.
     *
     * This is a generator: nothing is requested until iteration starts.
     *
     * @param string $url
     * @return \Generator Yields a Page for every crawled url that passes the white/blacklist.
     * @throws RequestException When a request fails and exception-on-error is enabled
     *                          (stop-on-error takes precedence and ends the crawl silently).
     */
    public function crawl($url)
    {
        $this->reset($this->createHttpUrlString($url));

        while (count($this->urlsQueued) > 0) {
            $currentUrl = array_shift($this->urlsQueued);

            try {
                $crawler = $this->requestPage((string)$currentUrl);
            } catch (\Exception $e) {
                $this->getLogger()->error(
                    sprintf('Error requesting page %s: %s', $currentUrl, $e->getMessage())
                );

                if ($this->getStopOnError()) {
                    return;
                }
                if ($this->getExceptionOnError()) {
                    throw new RequestException($e->getMessage(), $e->getCode(), $e);
                }

                // Neither flag set: skip this page and carry on with the queue.
                continue;
            }

            $this->urlsCrawled[] = (string)$currentUrl;
            $this->updateQueue($crawler);

            if ($this->shouldReturnUrl($currentUrl)) {
                $this->getLogger()->debug(sprintf('Return url "%s"', $currentUrl));

                $this->urlsReturned[] = (string)$currentUrl;

                yield new Page($currentUrl, $crawler);
            }

            if ($this->isLimitReached()) {
                $this->getLogger()->info(sprintf('Crawl limit of %d was reached', $this->limit));

                return;
            }
        }
    }

    /**
     * Queues every crawlable url found in the page; urls that cannot be turned
     * into a Url object are logged and recorded as rejected.
     *
     * @param DomCrawler $crawler
     */
    protected function updateQueue(DomCrawler $crawler)
    {
        foreach ($this->extractUrlsFromCrawler($crawler) as $foundUrl) {
            $this->getLogger()->debug(sprintf('Found url %s in page', $foundUrl));

            try {
                $url = $this->normalizeUrl($this->createHttpUrlString($foundUrl));

                if ($this->shouldCrawlUrl($url)) {
                    $this->addUrlToQueue($url);
                }
            } catch (\Exception $e) {
                $this->getLogger()->warning(
                    sprintf('Url %s could not be converted to an object: %s', $foundUrl, $e->getMessage())
                );
                // Always store the plain string so the rejected list stays homogeneous.
                $this->urlsRejected[] = (string)$foundUrl;
            }
        }
    }

    /**
     * Runs the url through every registered normalizer, in registration order.
     *
     * @param Url $url
     * @return Url
     */
    protected function normalizeUrl(Url $url)
    {
        foreach ($this->urlNormalizers as $normalizer) {
            $url = $normalizer->normalize($url);
        }

        return $url;
    }

    /**
     * Decides whether a crawled page is yielded to the caller.
     *
     * A whitelist, when configured, must match; the blacklist must not match.
     *
     * @param Url $url
     * @return bool
     */
    protected function shouldReturnUrl(Url $url)
    {
        if (!empty($this->whitelistUrlMatchers) && !$this->isUrlWhitelisted($url)) {
            $this->getLogger()->info(sprintf('Skipped "%s" because it is not whitelisted', $url));

            return false;
        }

        if ($this->isUrlBlacklisted($url)) {
            $this->getLogger()->info(sprintf('Skipped "%s" because it is blacklisted', $url));

            return false;
        }

        return true;
    }

    /**
     * @param Url $url
     * @return bool True when at least one whitelist matcher matches.
     */
    protected function isUrlWhitelisted(Url $url)
    {
        foreach ($this->whitelistUrlMatchers as $matcher) {
            if ($matcher->matches($url)) {
                return true;
            }
        }

        return false;
    }

    /**
     * @param Url $url
     * @return bool True when at least one blacklist matcher matches.
     */
    protected function isUrlBlacklisted(Url $url)
    {
        foreach ($this->blacklistUrlMatchers as $matcher) {
            if ($matcher->matches($url)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Decides whether a found url should be queued.
     *
     * Urls already rejected, crawled or queued are skipped; urls outside the
     * base url are recorded as rejected.
     *
     * @param Url $url
     * @return bool
     */
    protected function shouldCrawlUrl(Url $url)
    {
        if ($this->isUrlRejected($url) || $this->isUrlCrawled($url) || $this->isUrlQueued($url)) {
            return false;
        }

        if (!$this->isUrlPartOfBaseUrl($url)) {
            $this->urlsRejected[] = (string)$url;

            return false;
        }

        return true;
    }

    /**
     * @param Url $url
     * @return bool
     */
    protected function isUrlRejected(Url $url)
    {
        return in_array((string)$url, $this->urlsRejected, true);
    }

    /**
     * @param Url $url
     * @return bool
     */
    protected function isUrlCrawled(Url $url)
    {
        return in_array((string)$url, $this->urlsCrawled, true);
    }

    /**
     * @param Url $url
     * @return bool
     */
    protected function isUrlQueued(Url $url)
    {
        return isset($this->urlsQueued[(string)$url]);
    }

    /**
     * Checks that the url starts with the base url.
     *
     * The base url must be a prefix; merely containing it (for example inside
     * the query string of a foreign host) does not count.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlPartOfBaseUrl(Url $url)
    {
        $baseUrlString = (string)$this->baseUrl;
        $this->getLogger()->debug($baseUrlString.' - '.$url);

        return strpos((string)$url, $baseUrlString) === 0;
    }

    /**
     * Tells whether the configured number of returned pages has been reached.
     *
     * The limit is cast to int so numeric strings coming from an options array
     * work, and ">=" is used so the check cannot be stepped over.
     *
     * @return bool
     */
    protected function isLimitReached()
    {
        $limit = (int)$this->limit;

        return $limit > 0 && count($this->urlsReturned) >= $limit;
    }

    /**
     * Collects the uri of every anchor element in the page.
     *
     * @param DomCrawler $crawler
     * @return string[]
     */
    protected function extractUrlsFromCrawler(DomCrawler $crawler)
    {
        return $crawler->filter('a')->each(
            function (DomCrawler $node) {
                return $node->link()->getUri();
            }
        );
    }

    /**
     * Performs a GET request for the url through the client, logging before and after.
     *
     * @param string $url
     * @return DomCrawler
     */
    protected function requestPage($url)
    {
        $this->getLogger()->info(sprintf('Crawling page %s', $url));
        $crawler = $this->client->request('GET', $url);
        $this->getLogger()->info(sprintf('Crawled page %s', $url));

        return $crawler;
    }
}
642