Completed
Push — master ( 0b45f4...ecad61 )
by
unknown
02:17
created

Crawler::updateUrl()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 9
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 2

Importance

Changes 0
Metric Value
dl 0
loc 9
ccs 6
cts 6
cp 1
rs 9.6666
c 0
b 0
f 0
cc 2
eloc 5
nc 2
nop 1
crap 2
1
<?php
2
3
namespace MediaMonks\Crawler;
4
5
use MediaMonks\Crawler\Exception\RequestException;
6
use MediaMonks\Crawler\Url\Matcher\UrlMatcherInterface;
7
use MediaMonks\Crawler\Url\Normalizer\UrlNormalizerInterface;
8
use Symfony\Component\BrowserKit\Client;
9
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
10
use Psr\Log\LoggerAwareInterface;
11
use Psr\Log\LoggerInterface;
12
use Psr\Log\NullLogger;
13
14
/**
 * Crawls a website starting from a base url, following the links found in
 * each page and yielding a Page object for every url that passes the
 * configured whitelist/blacklist matchers.
 */
class Crawler implements LoggerAwareInterface
{
    /**
     * Client used to perform the HTTP requests.
     *
     * @var Client
     */
    private $client;

    /**
     * Maximum number of pages to return per crawl; 0 means no limit.
     *
     * @var int
     */
    private $limit = 0;

    /**
     * Whether the crawl ends silently on the first request error.
     *
     * @var bool
     */
    private $stopOnError = false;

    /**
     * Whether a request error is rethrown as a RequestException.
     *
     * @var bool
     */
    private $exceptionOnError = false;

    /**
     * When not empty, only urls matched by at least one of these are returned.
     *
     * @var UrlMatcherInterface[]
     */
    private $whitelistUrlMatchers = [];

    /**
     * Urls matched by any of these are never returned.
     *
     * @var UrlMatcherInterface[]
     */
    private $blacklistUrlMatchers = [];

    /**
     * Normalizers applied, in order, to every url found in a page.
     *
     * @var UrlNormalizerInterface[]
     */
    private $urlNormalizers = [];

    /**
     * Url the current crawl started from.
     *
     * @var Url
     */
    private $baseUrl;

    /**
     * Urls (as strings) which were requested successfully.
     *
     * @var array
     */
    private $urlsCrawled = [];

    /**
     * Url objects waiting to be requested, indexed by their string form.
     *
     * @var array
     */
    private $urlsQueued = [];

    /**
     * Urls (as strings) which were found but will not be crawled.
     *
     * @var array
     */
    private $urlsRejected = [];

    /**
     * Urls (as strings) for which a Page was yielded.
     *
     * @var array
     */
    private $urlsReturned = [];

    /**
     * Created lazily by getLogger() when none was injected.
     *
     * @var LoggerInterface|null
     */
    private $logger = null;

    /**
     * @param Client|null $client Client to use, a Goutte client is created when omitted
     */
    public function __construct(Client $client = null)
    {
        if (null === $client) {
            $client = new \Goutte\Client();
        }

        $this->setClient($client);
    }

    /**
     * @param Client $client
     * @return $this
     */
    public function setClient(Client $client)
    {
        $this->client = $client;

        return $this;
    }

    /**
     * @return Client
     */
    public function getClient()
    {
        return $this->client;
    }

    /**
     * @return int
     */
    public function getLimit()
    {
        return $this->limit;
    }

    /**
     * @param int $limit Maximum number of pages to return, 0 for no limit
     * @return $this
     */
    public function setLimit($limit)
    {
        $this->limit = $limit;

        return $this;
    }

    /**
     * @return boolean
     */
    public function getStopOnError()
    {
        return $this->stopOnError;
    }

    /**
     * @param boolean $stopOnError
     * @return $this
     */
    public function setStopOnError($stopOnError)
    {
        $this->stopOnError = $stopOnError;

        return $this;
    }

    /**
     * @return boolean
     */
    public function getExceptionOnError()
    {
        return $this->exceptionOnError;
    }

    /**
     * @param boolean $exceptionOnError
     * @return $this
     */
    public function setExceptionOnError($exceptionOnError)
    {
        $this->exceptionOnError = $exceptionOnError;

        return $this;
    }

    /**
     * @return array
     */
    public function getUrlsCrawled()
    {
        return $this->urlsCrawled;
    }

    /**
     * @return array
     */
    public function getUrlsQueued()
    {
        return $this->urlsQueued;
    }

    /**
     * @return array
     */
    public function getUrlsRejected()
    {
        return $this->urlsRejected;
    }

    /**
     * @return array
     */
    public function getUrlsReturned()
    {
        return $this->urlsReturned;
    }

    /**
     * Replaces all whitelist matchers with the given ones.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setWhitelistUrlMatchers(array $urlMatchers)
    {
        $this->clearWhitelistUrlMatchers();
        foreach ($urlMatchers as $matcher) {
            $this->addWhitelistUrlMatcher($matcher);
        }

        return $this;
    }

    /**
     * @return UrlMatcherInterface[]
     */
    public function getWhitelistUrlMatchers()
    {
        return $this->whitelistUrlMatchers;
    }

    /**
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addWhitelistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        $this->whitelistUrlMatchers[] = $urlMatcher;

        return $this;
    }

    /**
     * @return $this
     */
    public function clearWhitelistUrlMatchers()
    {
        $this->whitelistUrlMatchers = [];

        return $this;
    }

    /**
     * Replaces all blacklist matchers with the given ones.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setBlacklistUrlMatchers(array $urlMatchers)
    {
        $this->clearBlacklistUrlMatchers();
        foreach ($urlMatchers as $matcher) {
            $this->addBlacklistUrlMatcher($matcher);
        }

        return $this;
    }

    /**
     * @return UrlMatcherInterface[]
     */
    public function getBlacklistUrlMatchers()
    {
        return $this->blacklistUrlMatchers;
    }

    /**
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addBlacklistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        $this->blacklistUrlMatchers[] = $urlMatcher;

        return $this;
    }

    /**
     * @return $this
     */
    public function clearBlacklistUrlMatchers()
    {
        $this->blacklistUrlMatchers = [];

        return $this;
    }

    /**
     * Replaces all url normalizers with the given ones.
     *
     * @param UrlNormalizerInterface[] $normalizers
     * @return $this
     */
    public function setUrlNormalizers(array $normalizers)
    {
        $this->clearUrlNormalizers();

        foreach ($normalizers as $normalizer) {
            $this->addUrlNormalizer($normalizer);
        }

        return $this;
    }

    /**
     * @return UrlNormalizerInterface[]
     */
    public function getUrlNormalizers()
    {
        return $this->urlNormalizers;
    }

    /**
     * @param UrlNormalizerInterface $normalizer
     * @return $this
     */
    public function addUrlNormalizer(UrlNormalizerInterface $normalizer)
    {
        $this->urlNormalizers[] = $normalizer;

        return $this;
    }

    /**
     * @return $this
     */
    public function clearUrlNormalizers()
    {
        $this->urlNormalizers = [];

        return $this;
    }

    /**
     * Returns the injected logger, falling back to a NullLogger so callers
     * never have to check for null.
     *
     * @return LoggerInterface
     */
    public function getLogger()
    {
        if (is_null($this->logger)) {
            $this->logger = new NullLogger();
        }

        return $this->logger;
    }

    /**
     * @param LoggerInterface $logger
     * @return $this
     */
    public function setLogger(LoggerInterface $logger)
    {
        $this->logger = $logger;

        return $this;
    }

    /**
     * Queues a url, indexed by its string form so the same url is queued only once.
     *
     * @param Url $url
     */
    protected function addUrlToQueue(Url $url)
    {
        $this->urlsQueued[(string)$url] = $url;
    }

    /**
     * @param string $url
     * @return Url
     */
    protected function createHttpUrlString($url)
    {
        return Url::createFromString($url);
    }

    /**
     * Clears all state of a previous crawl and queues the given url as the
     * starting point of a new one.
     *
     * @param Url $url
     */
    protected function reset(Url $url)
    {
        $this->baseUrl = $url;
        $this->urlsCrawled = [];
        $this->urlsQueued = [];
        // Rejected and returned urls are per-crawl state as well; leaving them
        // in place would make a second crawl inherit stale rejections and
        // would break the limit check, which counts the returned urls.
        $this->urlsRejected = [];
        $this->urlsReturned = [];

        $this->addUrlToQueue($url);
    }

    /**
     * Crawls the site starting at the given url and lazily yields a Page for
     * every crawled url which passes the whitelist/blacklist matchers.
     *
     * @param string $url
     * @return \Generator|Page[]
     * @throws RequestException When a request fails and exceptionOnError is enabled
     */
    public function crawl($url)
    {
        $this->reset($this->createHttpUrlString($url));

        while (count($this->urlsQueued) > 0) {
            $url = array_shift($this->urlsQueued);

            try {
                $crawler = $this->requestPage((string)$url);

                // The client may return null instead of a crawler while both
                // updateQueue() and Page require a real one; treat that as a
                // failed request so it goes through the regular error handling.
                if (!$crawler instanceof DomCrawler) {
                    throw new \RuntimeException('No crawler was returned for the response');
                }

                $url = $this->updateUrl($url);
            } catch (\Exception $e) {
                $this->getLogger()->error(sprintf('Error requesting page %s: %s', $url, $e->getMessage()));

                if ($this->getStopOnError()) {
                    return;
                }
                if ($this->getExceptionOnError()) {
                    throw new RequestException($e->getMessage(), $e->getCode(), $e);
                }

                continue;
            }

            // updateUrl() may have replaced the queued url with the one the
            // client actually ended up at. If that url was handled before,
            // processing it again would return the same page twice and could
            // keep re-queueing the redirecting url forever.
            if ($this->isUrlCrawled($url)) {
                $this->getLogger()->debug(sprintf('Skipped "%s" because it was already crawled', $url));

                continue;
            }

            $this->urlsCrawled[] = (string)$url;
            $this->updateQueue($crawler);

            if ($this->shouldReturnUrl($url)) {
                $this->getLogger()->debug(sprintf('Return url "%s"', $url));

                $this->urlsReturned[] = (string)$url;

                yield new Page($url, $crawler, $this->client->getResponse());
            }

            if ($this->isLimitReached()) {
                $this->getLogger()->info(sprintf('Crawl limit of %d was reach', $this->limit));

                return;
            }
        }
    }

    /**
     * Replaces the url with the uri of the client's last internal request,
     * when the client has one.
     *
     * @param Url $url
     * @return Url
     */
    protected function updateUrl(Url $url)
    {
        $internalRequest = $this->client->getInternalRequest();
        if (!empty($internalRequest)) {
            $url = $this->createHttpUrlString($internalRequest->getUri());
        }

        return $url;
    }

    /**
     * Queues every url found in the page which should be crawled; urls which
     * can not be processed are logged and rejected.
     *
     * @param DomCrawler $crawler
     */
    protected function updateQueue(DomCrawler $crawler)
    {
        foreach ($this->extractUrlsFromCrawler($crawler) as $foundUrl) {
            $this->getLogger()->debug(sprintf('Found url %s in page', $foundUrl));

            try {
                $url = $this->normalizeUrl($this->createHttpUrlString($foundUrl));

                if ($this->shouldCrawlUrl($url)) {
                    $this->addUrlToQueue($url);
                }
            } catch (\Exception $e) {
                $this->getLogger()->warning(
                    sprintf('Url %s could not be converted to an object: %s', $foundUrl, $e->getMessage())
                );
                // Always store the raw string: the rejected list is compared
                // against url strings and must never contain Url objects.
                $this->urlsRejected[] = (string)$foundUrl;
            }
        }
    }

    /**
     * Runs the url through all registered normalizers, in registration order.
     *
     * @param Url $url
     * @return Url
     */
    protected function normalizeUrl(Url $url)
    {
        foreach ($this->urlNormalizers as $normalizer) {
            $url = $normalizer->normalize($url);
        }

        return $url;
    }

    /**
     * Decides if a crawled url is yielded: it must be whitelisted (only when
     * whitelist matchers are configured) and must not be blacklisted.
     *
     * @param Url $url
     * @return bool
     */
    protected function shouldReturnUrl(Url $url)
    {
        if (!empty($this->whitelistUrlMatchers)) {
            if (!$this->isUrlWhitelisted($url)) {
                $this->getLogger()->info(sprintf('Skipped "%s" because it is not whitelisted', $url));

                return false;
            }
        }

        if ($this->isUrlBlacklisted($url)) {
            $this->getLogger()->info(sprintf('Skipped "%s" because it is blacklisted', $url));

            return false;
        }

        return true;
    }

    /**
     * @param Url $url
     * @return bool True when at least one whitelist matcher matches the url
     */
    protected function isUrlWhitelisted(Url $url)
    {
        foreach ($this->whitelistUrlMatchers as $matcher) {
            if ($matcher->matches($url)) {
                return true;
            }
        }

        return false;
    }

    /**
     * @param Url $url
     * @return bool True when at least one blacklist matcher matches the url
     */
    protected function isUrlBlacklisted(Url $url)
    {
        foreach ($this->blacklistUrlMatchers as $matcher) {
            if ($matcher->matches($url)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Decides if a found url is queued: it must not be known already
     * (rejected, crawled or queued) and must be part of the base url. Urls
     * outside of the base url are remembered as rejected.
     *
     * @param Url $url
     * @return bool
     */
    protected function shouldCrawlUrl(Url $url)
    {
        if ($this->isUrlRejected($url)
            || $this->isUrlCrawled($url)
            || $this->isUrlQueued($url)
        ) {
            return false;
        }

        if (!$this->isUrlPartOfBaseUrl($url)) {
            $this->urlsRejected[] = (string)$url;

            return false;
        }

        return true;
    }

    /**
     * @param Url $url
     * @return bool
     */
    protected function isUrlRejected(Url $url)
    {
        return in_array((string)$url, $this->urlsRejected, true);
    }

    /**
     * @param Url $url
     * @return bool
     */
    protected function isUrlCrawled(Url $url)
    {
        return in_array((string)$url, $this->urlsCrawled, true);
    }

    /**
     * @param Url $url
     * @return bool
     */
    protected function isUrlQueued(Url $url)
    {
        return isset($this->urlsQueued[(string)$url]);
    }

    /**
     * Checks if the url starts with the base url of the current crawl.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlPartOfBaseUrl(Url $url)
    {
        $baseUrlString = (string)$this->baseUrl;
        $this->getLogger()->debug($baseUrlString.' - '.$url);

        // The base url must be a prefix. Accepting it at any position would
        // also let through foreign urls which merely mention the base url,
        // for example in a query string.
        return strpos((string)$url, $baseUrlString) === 0;
    }

    /**
     * Checks if the configured amount of pages was returned. A limit which
     * is not a positive number disables the check.
     *
     * @return bool
     */
    protected function isLimitReached()
    {
        // setLimit() accepts any value, so compare as integers and use >=
        // instead of strict equality with the raw value.
        $limit = (int)$this->limit;

        return $limit > 0 && count($this->urlsReturned) >= $limit;
    }

    /**
     * Collects the uri of every anchor element in the page.
     *
     * @param DomCrawler $crawler
     * @return array
     */
    protected function extractUrlsFromCrawler(DomCrawler $crawler)
    {
        return $crawler->filter('a')->each(
            function (DomCrawler $node) {
                return $node->link()->getUri();
            }
        );
    }

    /**
     * Requests the url with a GET request and returns whatever the client
     * produced for the response.
     *
     * @param string $url
     * @return DomCrawler|null
     */
    protected function requestPage($url)
    {
        $this->getLogger()->info(sprintf('Crawling page %s', $url));
        $crawler = $this->client->request('GET', $url);
        $this->getLogger()->info(sprintf('Crawled page %s', $url));

        return $crawler;
    }
}
630