Completed
Push — master ( e93528...0b45f4 )
by
unknown
05:04
created

Crawler::updateQueue()   A

Complexity

Conditions 4
Paths 6

Size

Total Lines 18
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 20

Importance

Changes 0
Metric Value
dl 0
loc 18
ccs 0
cts 14
cp 0
rs 9.2
c 0
b 0
f 0
cc 4
eloc 11
nc 6
nop 1
crap 20
1
<?php
2
3
namespace MediaMonks\Crawler;
4
5
use MediaMonks\Crawler\Exception\RequestException;
6
use MediaMonks\Crawler\Url\Matcher\UrlMatcherInterface;
7
use MediaMonks\Crawler\Url\Normalizer\UrlNormalizerInterface;
8
use Symfony\Component\BrowserKit\Client;
9
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
10
use Psr\Log\LoggerAwareInterface;
11
use Psr\Log\LoggerInterface;
12
use Psr\Log\NullLogger;
13
14
class Crawler implements LoggerAwareInterface
15
{
16
    /**
17
     * @var Client
18
     */
19
    private $client;
20
21
    /**
22
     * @var int
23
     */
24
    private $limit = 0;
25
26
    /**
27
     * @var bool
28
     */
29
    private $stopOnError = false;
30
31
    /**
32
     * @var bool
33
     */
34
    private $exceptionOnError = false;
35
36
    /**
37
     * @var UrlMatcherInterface[]
38
     */
39
    private $whitelistUrlMatchers = [];
40
41
    /**
42
     * @var UrlMatcherInterface[]
43
     */
44
    private $blacklistUrlMatchers = [];
45
46
    /**
47
     * @var UrlNormalizerInterface[]
48
     */
49
    private $urlNormalizers = [];
50
51
    /**
52
     * @var Url
53
     */
54
    private $baseUrl;
55
56
    /**
57
     * @var array
58
     */
59
    private $urlsCrawled = [];
60
61
    /**
62
     * @var array
63
     */
64
    private $urlsQueued = [];
65
66
    /**
67
     * @var array
68
     */
69
    private $urlsRejected = [];
70
71
    /**
72
     * @var array
73
     */
74
    private $urlsReturned = [];
75
76
    /**
77
     * @var LoggerInterface
78
     */
79
    private $logger = null;
80
81
    /**
     * Creates a crawler, falling back to a default Goutte client when none is supplied.
     *
     * @param Client|null $client client used to request pages; a new \Goutte\Client is created when null
     */
    public function __construct(Client $client = null)
    {
        // The parameter is either a Client or null, so an explicit null check
        // states the intent more precisely than empty().
        if (null === $client) {
            $client = new \Goutte\Client();
        }

        // No value is returned: the result of a constructor is discarded by `new`.
        $this->setClient($client);
    }
94
95
    /**
     * Sets the client that is used to request pages.
     *
     * @param Client $client
     * @return $this
     */
    public function setClient(Client $client)
    {
        $this->client = $client;

        // Returned so this setter can be chained like the other setters of this class.
        return $this;
    }
102
103
    /**
     * Returns the client that is used to request pages.
     *
     * @return Client
     */
    public function getClient()
    {
        return $this->client;
    }
110
111
    /**
     * Returns the maximum number of pages a crawl returns; an empty value (0) means no limit.
     *
     * @return int
     */
    public function getLimit()
    {
        return $this->limit;
    }
118
119
    /**
     * Sets the maximum number of pages a crawl returns.
     *
     * The value is cast to an integer because the limit check compares it with
     * the integer result of count(); a numeric string such as "5" would otherwise
     * never compare as identical and the limit would silently be ignored.
     *
     * @param int $limit maximum number of returned pages, 0 for no limit
     * @return $this
     */
    public function setLimit($limit)
    {
        $this->limit = (int)$limit;

        return $this;
    }
129
130
    /**
     * Tells whether crawl() ends silently as soon as requesting a page fails.
     *
     * @return boolean
     */
    public function getStopOnError()
    {
        return $this->stopOnError;
    }
137
138
    /**
     * Configures whether crawl() ends silently as soon as requesting a page fails.
     *
     * @param boolean $stopOnError
     * @return $this
     */
    public function setStopOnError($stopOnError)
    {
        $this->stopOnError = $stopOnError;

        return $this;
    }
148
149
    /**
     * Tells whether crawl() throws a RequestException when requesting a page fails.
     *
     * @return boolean
     */
    public function getExceptionOnError()
    {
        return $this->exceptionOnError;
    }
156
157
    /**
     * Configures whether crawl() throws a RequestException when requesting a page fails.
     *
     * @param boolean $exceptionOnError
     * @return $this
     */
    public function setExceptionOnError($exceptionOnError)
    {
        $this->exceptionOnError = $exceptionOnError;

        return $this;
    }
167
168
    /**
     * Returns the urls, as strings, of the pages that were requested successfully.
     *
     * @return array
     */
    public function getUrlsCrawled()
    {
        return $this->urlsCrawled;
    }
175
176
    /**
     * Returns the urls that are still waiting to be crawled, keyed by their string form.
     *
     * @return array
     */
    public function getUrlsQueued()
    {
        return $this->urlsQueued;
    }
183
184
    /**
     * Returns the urls that were found but refused for crawling.
     *
     * @return array
     */
    public function getUrlsRejected()
    {
        return $this->urlsRejected;
    }
191
192
    /**
     * Returns the urls, as strings, of the pages that crawl() yielded to the caller.
     *
     * @return array
     */
    public function getUrlsReturned()
    {
        return $this->urlsReturned;
    }
199
200
    /**
     * Replaces all whitelist matchers with the given ones.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setWhitelistUrlMatchers(array $urlMatchers)
    {
        $this->clearWhitelistUrlMatchers();

        // Every entry goes through the typed adder so invalid entries are refused.
        foreach ($urlMatchers as $urlMatcher) {
            $this->addWhitelistUrlMatcher($urlMatcher);
        }

        return $this;
    }
213
214
    /**
     * Returns the matchers a url has to satisfy (at least one of them) to be returned.
     *
     * @return UrlMatcherInterface[]
     */
    public function getWhitelistUrlMatchers()
    {
        return $this->whitelistUrlMatchers;
    }
221
222
    /**
     * Appends a single matcher to the whitelist.
     *
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addWhitelistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        $this->whitelistUrlMatchers[] = $urlMatcher;

        return $this;
    }
232
233
    /**
     * Removes every whitelist matcher.
     *
     * @return $this
     */
    public function clearWhitelistUrlMatchers()
    {
        $this->whitelistUrlMatchers = [];

        return $this;
    }
242
243
    /**
     * Replaces all blacklist matchers with the given ones.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setBlacklistUrlMatchers(array $urlMatchers)
    {
        $this->clearBlacklistUrlMatchers();

        // Every entry goes through the typed adder so invalid entries are refused.
        foreach ($urlMatchers as $urlMatcher) {
            $this->addBlacklistUrlMatcher($urlMatcher);
        }

        return $this;
    }
256
257
    /**
     * Returns the matchers that prevent a matching url from being returned.
     *
     * @return UrlMatcherInterface[]
     */
    public function getBlacklistUrlMatchers()
    {
        return $this->blacklistUrlMatchers;
    }
264
265
    /**
     * Appends a single matcher to the blacklist.
     *
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addBlacklistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        $this->blacklistUrlMatchers[] = $urlMatcher;

        return $this;
    }
275
276
    /**
     * Removes every blacklist matcher.
     *
     * @return $this
     */
    public function clearBlacklistUrlMatchers()
    {
        $this->blacklistUrlMatchers = [];

        return $this;
    }
285
286
    /**
     * Replaces all url normalizers with the given ones, keeping their order.
     *
     * @param UrlNormalizerInterface[] $normalizers
     * @return $this
     */
    public function setUrlNormalizers(array $normalizers)
    {
        $this->clearUrlNormalizers();

        // Every entry goes through the typed adder so invalid entries are refused.
        foreach ($normalizers as $urlNormalizer) {
            $this->addUrlNormalizer($urlNormalizer);
        }

        return $this;
    }
300
301
    /**
     * Returns the normalizers that are applied to every url found in a page.
     *
     * @return UrlNormalizerInterface[]
     */
    public function getUrlNormalizers()
    {
        return $this->urlNormalizers;
    }
308
309
    /**
     * Appends a single url normalizer; normalizers run in the order they were added.
     *
     * @param UrlNormalizerInterface $normalizer
     * @return $this
     */
    public function addUrlNormalizer(UrlNormalizerInterface $normalizer)
    {
        $this->urlNormalizers[] = $normalizer;

        return $this;
    }
319
320
    /**
     * Removes every url normalizer.
     *
     * @return $this
     */
    public function clearUrlNormalizers()
    {
        $this->urlNormalizers = [];

        return $this;
    }
329
330
    /**
     * Returns the logger, lazily installing a NullLogger when none was set.
     *
     * @return LoggerInterface
     */
    public function getLogger()
    {
        if (null === $this->logger) {
            // Callers can always log without checking whether a logger exists.
            $this->logger = new NullLogger();
        }

        return $this->logger;
    }
341
342
    /**
     * Sets the logger that receives the crawl messages.
     *
     * @param LoggerInterface $logger
     * @return $this
     */
    public function setLogger(LoggerInterface $logger)
    {
        $this->logger = $logger;

        return $this;
    }
352
353
    /**
     * Queues a url for crawling; the string form is the key, so a url is queued only once.
     *
     * @param Url $url
     */
    protected function addUrlToQueue(Url $url)
    {
        $queueKey = (string)$url;
        $this->urlsQueued[$queueKey] = $url;
    }
360
361
    /**
     * Builds a Url object from its string representation.
     *
     * @param string $url
     * @return Url
     */
    protected function createHttpUrlString($url)
    {
        $urlObject = Url::createFromString($url);

        return $urlObject;
    }
369
370
    /**
     * Prepares the crawler for a new crawl that starts at the given url.
     *
     * All per-crawl bookkeeping is cleared, including the rejected and returned
     * lists: leaving them filled would make a second crawl() on the same instance
     * skip urls rejected by the previous crawl and count previously returned
     * pages towards the limit.
     *
     * @param Url $url url the crawl starts at; it also becomes the base url
     */
    protected function reset(Url $url)
    {
        $this->baseUrl = $url;

        $this->urlsCrawled = [];
        $this->urlsQueued = [];
        $this->urlsRejected = [];
        $this->urlsReturned = [];

        // The start url is the first (and initially only) entry of the queue.
        $this->addUrlToQueue($url);
    }
381
382
    /**
     * Crawls a site starting at the given url and lazily yields the pages that should be returned.
     *
     * Urls are taken from the queue one by one. Every successfully requested page
     * is scanned for new urls, and is yielded as a Page when it passes the
     * whitelist/blacklist check. The crawl ends when the queue is empty, when the
     * limit is reached, or when a request fails and stop-on-error is enabled.
     *
     * @param string $url url to start crawling at
     * @return \Generator yields Page objects
     * @throws RequestException when a request fails and exception-on-error is enabled
     */
    public function crawl($url)
    {
        $this->reset($this->createHttpUrlString($url));

        while (count($this->urlsQueued) > 0) {
            $url = array_shift($this->urlsQueued);

            try {
                $crawler = $this->requestPage((string)$url);

                // The client may hand back null instead of a crawler; treat that as a
                // failed request so it goes through the regular error handling below
                // instead of breaking updateQueue() and Page, which both need a crawler.
                if (!$crawler instanceof DomCrawler) {
                    throw new \UnexpectedValueException('The client did not return a crawler for the requested page');
                }

                // Continue with the url the client actually ended up requesting.
                // NOTE(review): the originally queued url is not recorded as crawled, so
                // it can be queued again when this differs from it (e.g. after a
                // redirect) - confirm whether that is intended.
                $url = $this->createHttpUrlString($this->client->getInternalRequest()->getUri());
            } catch (\Exception $e) {
                $this->getLogger()->error(sprintf('Error requesting page %s: %s', $url, $e->getMessage()));

                // Stopping silently takes precedence over throwing.
                if ($this->getStopOnError()) {
                    return;
                }
                if ($this->getExceptionOnError()) {
                    throw new RequestException($e->getMessage(), $e->getCode(), $e);
                }

                continue;
            }

            $this->urlsCrawled[] = (string)$url;
            $this->updateQueue($crawler);

            if ($this->shouldReturnUrl($url)) {
                $this->getLogger()->debug(sprintf('Return url "%s"', $url));

                $this->urlsReturned[] = (string)$url;

                yield new Page($url, $crawler, $this->client->getResponse());
            }

            if ($this->isLimitReached()) {
                $this->getLogger()->info(sprintf('Crawl limit of %d was reached', $this->limit));

                return;
            }
        }
    }
429
430
    /**
     * Queues every crawlable url that is linked from the given page.
     *
     * Each found url is converted to a Url object, normalized and queued when it
     * should be crawled. A url that fails during this processing is logged and
     * remembered as rejected.
     *
     * @param DomCrawler $crawler crawler of the page whose links are inspected
     */
    protected function updateQueue(DomCrawler $crawler)
    {
        foreach ($this->extractUrlsFromCrawler($crawler) as $foundUrl) {
            $this->getLogger()->debug(sprintf('Found url %s in page', $foundUrl));

            try {
                // The raw string is kept in its own variable so the error handling
                // below always reports and stores what was found in the page, never
                // a half-processed Url object.
                $url = $this->normalizeUrl($this->createHttpUrlString($foundUrl));

                if ($this->shouldCrawlUrl($url)) {
                    $this->addUrlToQueue($url);
                }
            } catch (\Exception $e) {
                $this->getLogger()->warning(
                    sprintf('Url %s could not be converted to an object: %s', $foundUrl, $e->getMessage())
                );

                // Store the url as a string and only once, so a broken link that is
                // repeated on many pages does not bloat the rejected list.
                $rejectedUrl = (string)$foundUrl;
                if (!in_array($rejectedUrl, $this->urlsRejected, true)) {
                    $this->urlsRejected[] = $rejectedUrl;
                }
            }
        }
    }
451
452
    /**
     * Runs the url through all registered normalizers, in the order they were added.
     *
     * @param Url $url
     * @return Url the result of the last normalizer, or the given url when there are none
     */
    protected function normalizeUrl(Url $url)
    {
        // Each normalizer receives the output of the previous one.
        return array_reduce(
            $this->urlNormalizers,
            function ($normalized, $normalizer) {
                return $normalizer->normalize($normalized);
            },
            $url
        );
    }
464
465
    /**
     * Decides whether the page of the given url is yielded to the caller of crawl().
     *
     * The whitelist is only enforced when it contains matchers; the blacklist is
     * checked afterwards. The reason for skipping a url is logged.
     *
     * @param Url $url
     * @return bool
     */
    protected function shouldReturnUrl(Url $url)
    {
        $whitelistActive = !empty($this->whitelistUrlMatchers);

        if ($whitelistActive && !$this->isUrlWhitelisted($url)) {
            $this->getLogger()->info(sprintf('Skipped "%s" because it is not whitelisted', $url));

            return false;
        }

        if ($this->isUrlBlacklisted($url)) {
            $this->getLogger()->info(sprintf('Skipped "%s" because it is blacklisted', $url));

            return false;
        }

        return true;
    }
487
488
    /**
     * Tells whether at least one whitelist matcher matches the url.
     *
     * @param Url $url
     * @return bool false when no matcher matches, which includes an empty whitelist
     */
    protected function isUrlWhitelisted(Url $url)
    {
        $whitelisted = false;

        foreach ($this->whitelistUrlMatchers as $urlMatcher) {
            if ($urlMatcher->matches($url)) {
                // The first match settles it; remaining matchers are not consulted.
                $whitelisted = true;
                break;
            }
        }

        return $whitelisted;
    }
502
503
    /**
     * Tells whether at least one blacklist matcher matches the url.
     *
     * @param Url $url
     * @return bool false when no matcher matches, which includes an empty blacklist
     */
    protected function isUrlBlacklisted(Url $url)
    {
        $blacklisted = false;

        foreach ($this->blacklistUrlMatchers as $urlMatcher) {
            if ($urlMatcher->matches($url)) {
                // The first match settles it; remaining matchers are not consulted.
                $blacklisted = true;
                break;
            }
        }

        return $blacklisted;
    }
517
518
    /**
     * Decides whether a url found in a page has to be added to the queue.
     *
     * Urls that are already rejected, crawled or queued are skipped. A new url
     * that does not belong to the base url is remembered as rejected.
     *
     * @param Url $url
     * @return bool
     */
    protected function shouldCrawlUrl(Url $url)
    {
        // Already known urls are never queued again.
        if ($this->isUrlRejected($url)) {
            return false;
        }
        if ($this->isUrlCrawled($url)) {
            return false;
        }
        if ($this->isUrlQueued($url)) {
            return false;
        }

        if ($this->isUrlPartOfBaseUrl($url)) {
            return true;
        }

        // Remember the url so it is not evaluated again.
        $this->urlsRejected[] = (string)$url;

        return false;
    }
539
540
    /**
     * Tells whether the url is present in the list of rejected urls.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlRejected(Url $url)
    {
        $urlString = (string)$url;

        // Loose comparison is kept on purpose: the rejected list is not guaranteed
        // to hold strings only.
        return in_array($urlString, $this->urlsRejected);
    }
548
549
    /**
     * Tells whether the url is present in the list of crawled urls.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlCrawled(Url $url)
    {
        // The crawled list only ever receives strings, so a strict comparison is
        // safe and avoids the surprises of loose string comparison.
        return in_array((string)$url, $this->urlsCrawled, true);
    }
557
558
    /**
     * Tells whether the url is currently waiting in the queue.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlQueued(Url $url)
    {
        // The queue is keyed by the string form of the url.
        $queueKey = (string)$url;

        return isset($this->urlsQueued[$queueKey]);
    }
566
567
    /**
     * Tells whether the url is located below the base url of the current crawl.
     *
     * The url has to start with the base url. Merely containing it is not enough:
     * a url of another host that carries the base url somewhere else, for example
     * in a query parameter, must not be treated as part of the crawled site.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlPartOfBaseUrl(Url $url)
    {
        $baseUrlString = (string)$this->baseUrl;
        $this->getLogger()->debug($baseUrlString.' - '.$url);

        // Position 0 means the base url is a prefix of the url.
        return strpos((string)$url, $baseUrlString) === 0;
    }
581
582
    /**
     * Tells whether the number of returned pages has reached the configured limit.
     *
     * The limit is normalised to an integer and compared with ">=" so the check
     * keeps working when the limit was stored as a numeric string or when the
     * number of returned pages is already beyond it.
     *
     * @return bool always false when no positive limit is configured
     */
    protected function isLimitReached()
    {
        $limit = (int)$this->limit;

        return $limit > 0 && count($this->urlsReturned) >= $limit;
    }
589
590
    /**
     * Collects the uri of every anchor element in the page.
     *
     * @param DomCrawler $crawler
     * @return array one uri per anchor, as produced by the link of that anchor
     */
    protected function extractUrlsFromCrawler(DomCrawler $crawler)
    {
        $anchors = $crawler->filter('a');

        $toUri = function (DomCrawler $anchor) {
            return $anchor->link()->getUri();
        };

        return $anchors->each($toUri);
    }
602
603
    /**
     * Requests a page with a GET request and logs the start and the end of the request.
     *
     * @param string $url
     * @return DomCrawler whatever the client returns for the request
     */
    protected function requestPage($url)
    {
        $logger = $this->getLogger();
        $logger->info(sprintf('Crawling page %s', $url));

        $page = $this->client->request('GET', $url);

        // Fetch the logger again after the request, in the same order as the calls
        // are made around the request itself.
        $this->getLogger()->info(sprintf('Crawled page %s', $url));

        return $page;
    }
615
}
616