Completed
Push — master ( 77ed0b...238530 )
by
unknown
05:24
created

Crawler::getUrlsRejected()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 4
ccs 2
cts 2
cp 1
rs 10
c 0
b 0
f 0
cc 1
eloc 2
nc 1
nop 0
crap 1
1
<?php
2
3
namespace MediaMonks\Crawler;
4
5
use MediaMonks\Crawler\Url\Matcher\UrlMatcherInterface;
6
use MediaMonks\Crawler\Url\Normalizer\UrlNormalizerInterface;
7
use Symfony\Component\BrowserKit\Client;
8
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
9
use Psr\Log\LoggerAwareInterface;
10
use Psr\Log\LoggerInterface;
11
use Psr\Log\NullLogger;
12
13
/**
 * Crawls a website starting from a base URL and lazily yields the pages it visits.
 *
 * Links found on each requested page are normalized, filtered and queued; pages
 * that pass the whitelist/blacklist matchers are yielded as Page objects.
 */
class Crawler implements LoggerAwareInterface
{
    /**
     * Client used to perform the page requests.
     *
     * @var Client
     */
    private $client;

    /**
     * Maximum number of pages to return; 0 (the default) disables the limit.
     *
     * @var int
     */
    private $limit = 0;

    /**
     * Whether crawl() aborts as soon as a page request fails.
     *
     * @var bool
     */
    private $stopOnError = false;

    /**
     * When non-empty, only urls matched by one of these matchers are returned.
     *
     * @var UrlMatcherInterface[]
     */
    private $whitelistUrlMatchers = [];

    /**
     * Urls matched by one of these matchers are not returned
     * (only consulted when no whitelist matchers are configured).
     *
     * @var UrlMatcherInterface[]
     */
    private $blacklistUrlMatchers = [];

    /**
     * Normalizers applied, in order, to every url found in a page.
     *
     * @var UrlNormalizerInterface[]
     */
    private $urlNormalizers = [];

    /**
     * Url the current crawl was started from.
     *
     * @var Url
     */
    private $baseUrl;

    /**
     * Url strings of the pages that were requested successfully.
     *
     * @var array
     */
    private $urlsCrawled = [];

    /**
     * Url objects waiting to be requested, keyed by their string representation.
     *
     * @var array
     */
    private $urlsQueued = [];

    /**
     * Url strings that were found but will not be crawled.
     *
     * @var array
     */
    private $urlsRejected = [];

    /**
     * Url strings of the pages that were yielded by crawl().
     *
     * @var array
     */
    private $urlsReturned = [];

    /**
     * Lazily replaced by a NullLogger in getLogger() when none was set.
     *
     * @var LoggerInterface
     */
    private $logger = null;

    /**
     * @param Client $client  client to use for requests; a \Goutte\Client is created when omitted
     * @param array  $options see setOptions() for the recognised keys
     */
    public function __construct(Client $client = null, array $options = [])
    {
        if ($client === null) {
            $client = new \Goutte\Client();
        }

        $this->setClient($client);
        $this->setOptions($options);
    }

    /**
     * @param Client $client
     * @return $this
     */
    public function setClient(Client $client)
    {
        $this->client = $client;

        return $this;
    }

    /**
     * @return Client
     */
    public function getClient()
    {
        return $this->client;
    }

    /**
     * Applies the given options through the matching setters.
     *
     * Recognised keys: "limit", "stop_on_error", "logger",
     * "whitelist_url_matchers", "blacklist_url_matchers" and "url_normalizers".
     * Keys that are missing or null are left untouched.
     *
     * @param array $options
     * @return $this
     */
    public function setOptions(array $options)
    {
        if (isset($options['limit'])) {
            $this->setLimit($options['limit']);
        }
        if (isset($options['stop_on_error'])) {
            $this->setStopOnError($options['stop_on_error']);
        }
        if (isset($options['logger'])) {
            $this->setLogger($options['logger']);
        }
        if (isset($options['whitelist_url_matchers'])) {
            $this->setWhitelistUrlMatchers($options['whitelist_url_matchers']);
        }
        if (isset($options['blacklist_url_matchers'])) {
            $this->setBlacklistUrlMatchers($options['blacklist_url_matchers']);
        }
        if (isset($options['url_normalizers'])) {
            $this->setUrlNormalizers($options['url_normalizers']);
        }

        return $this;
    }

    /**
     * @return int
     */
    public function getLimit()
    {
        return $this->limit;
    }

    /**
     * @param int $limit maximum number of pages to return, 0 for no limit
     * @return $this
     */
    public function setLimit($limit)
    {
        $this->limit = $limit;

        return $this;
    }

    /**
     * @return bool
     */
    public function getStopOnError()
    {
        return $this->stopOnError;
    }

    /**
     * @param bool $stopOnError
     * @return $this
     */
    public function setStopOnError($stopOnError)
    {
        $this->stopOnError = $stopOnError;

        return $this;
    }

    /**
     * @return array url strings of the pages requested successfully
     */
    public function getUrlsCrawled()
    {
        return $this->urlsCrawled;
    }

    /**
     * @return array Url objects still waiting to be requested, keyed by url string
     */
    public function getUrlsQueued()
    {
        return $this->urlsQueued;
    }

    /**
     * @return array url strings that were found but rejected
     */
    public function getUrlsRejected()
    {
        return $this->urlsRejected;
    }

    /**
     * @return array url strings of the pages yielded by crawl()
     */
    public function getUrlsReturned()
    {
        return $this->urlsReturned;
    }

    /**
     * Replaces all whitelist matchers with the given ones.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setWhitelistUrlMatchers(array $urlMatchers)
    {
        $this->clearWhitelistUrlMatchers();
        foreach ($urlMatchers as $matcher) {
            $this->addWhitelistUrlMatcher($matcher);
        }

        return $this;
    }

    /**
     * @return UrlMatcherInterface[]
     */
    public function getWhitelistUrlMatchers()
    {
        return $this->whitelistUrlMatchers;
    }

    /**
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addWhitelistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        $this->whitelistUrlMatchers[] = $urlMatcher;

        return $this;
    }

    /**
     * @return $this
     */
    public function clearWhitelistUrlMatchers()
    {
        $this->whitelistUrlMatchers = [];

        return $this;
    }

    /**
     * Replaces all blacklist matchers with the given ones.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setBlacklistUrlMatchers(array $urlMatchers)
    {
        $this->clearBlacklistUrlMatchers();
        foreach ($urlMatchers as $matcher) {
            $this->addBlacklistUrlMatcher($matcher);
        }

        return $this;
    }

    /**
     * @return UrlMatcherInterface[]
     */
    public function getBlacklistUrlMatchers()
    {
        return $this->blacklistUrlMatchers;
    }

    /**
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addBlacklistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        $this->blacklistUrlMatchers[] = $urlMatcher;

        return $this;
    }

    /**
     * @return $this
     */
    public function clearBlacklistUrlMatchers()
    {
        $this->blacklistUrlMatchers = [];

        return $this;
    }

    /**
     * Replaces all url normalizers with the given ones.
     *
     * @param UrlNormalizerInterface[] $normalizers
     * @return $this
     */
    public function setUrlNormalizers(array $normalizers)
    {
        $this->clearUrlNormalizers();

        foreach ($normalizers as $normalizer) {
            $this->addUrlNormalizer($normalizer);
        }

        return $this;
    }

    /**
     * @param UrlNormalizerInterface $normalizer
     * @return $this
     */
    public function addUrlNormalizer(UrlNormalizerInterface $normalizer)
    {
        $this->urlNormalizers[] = $normalizer;

        return $this;
    }

    /**
     * @return $this
     */
    public function clearUrlNormalizers()
    {
        $this->urlNormalizers = [];

        return $this;
    }

    /**
     * Returns the configured logger, falling back to a NullLogger.
     *
     * @return LoggerInterface
     */
    public function getLogger()
    {
        if ($this->logger === null) {
            $this->logger = new NullLogger();
        }

        return $this->logger;
    }

    /**
     * @param LoggerInterface $logger
     * @return $this
     */
    public function setLogger(LoggerInterface $logger)
    {
        $this->logger = $logger;

        return $this;
    }

    /**
     * Queues a url; keying by the url string keeps the queue free of duplicates.
     *
     * @param Url $url
     */
    protected function addUrlToQueue(Url $url)
    {
        $this->urlsQueued[(string)$url] = $url;
    }

    /**
     * @param string $url
     * @return Url
     */
    protected function createHttpUrlString($url)
    {
        return Url::createFromString($url);
    }

    /**
     * Clears the state of a previous crawl and queues the base url.
     *
     * All four url lists are cleared so that the limit check and the
     * getUrls*() accessors only reflect the crawl that is being started.
     *
     * @param Url $url base url of the new crawl
     */
    protected function reset(Url $url)
    {
        $this->baseUrl = $url;
        $this->urlsCrawled = [];
        $this->urlsQueued = [];
        $this->urlsRejected = [];
        $this->urlsReturned = [];

        $this->addUrlToQueue($url);
    }

    /**
     * Crawls all pages reachable from the given url and yields a Page for every
     * visited url accepted by shouldReturnUrl().
     *
     * Failed requests are logged and skipped, or end the crawl when
     * stop-on-error is enabled. The crawl also ends once the limit is reached.
     *
     * @param string $url url to start crawling from
     * @return \Generator
     */
    public function crawl($url)
    {
        $this->reset($this->createHttpUrlString($url));

        while (count($this->urlsQueued) > 0) {
            $url = array_shift($this->urlsQueued);

            try {
                $crawler = $this->requestPage((string)$url);

                // The client may hand back null instead of a crawler; route that
                // through the regular failure handling instead of passing null on.
                if (!$crawler instanceof DomCrawler) {
                    throw new \RuntimeException('the client did not return a crawler');
                }
            } catch (\Exception $e) {
                $this->getLogger()->error(sprintf('Error requesting page %s: %s', $url, $e->getMessage()));

                if ($this->getStopOnError()) {
                    return;
                }

                continue;
            }

            $this->urlsCrawled[] = (string)$url;
            $this->updateQueue($crawler);

            if ($this->shouldReturnUrl($url)) {
                $this->getLogger()->debug(sprintf('Return url "%s"', $url));

                $this->urlsReturned[] = (string)$url;

                yield new Page($url, $crawler);
            }

            if ($this->isLimitReached()) {
                $this->getLogger()->info(sprintf('Crawl limit of %d was reach', $this->limit));

                return;
            }
        }
    }

    /**
     * Queues every crawlable link found in the given page.
     *
     * Links that cannot be turned into a Url object are logged and remembered as
     * rejected so they are skipped the next time they are found.
     *
     * @param DomCrawler $crawler
     */
    protected function updateQueue(DomCrawler $crawler)
    {
        foreach ($this->extractUrlsFromCrawler($crawler) as $foundUrl) {
            if (in_array($foundUrl, $this->urlsRejected, true)) {
                continue;
            }

            $this->getLogger()->debug(sprintf('Found url %s in page', $foundUrl));

            try {
                // Keep the raw string in $foundUrl so the rejected list only ever
                // receives strings, whichever call below throws.
                $url = $this->normalizeUrl($this->createHttpUrlString($foundUrl));

                if ($this->shouldCrawlUrl($url)) {
                    $this->addUrlToQueue($url);
                }
            } catch (\Exception $e) {
                $this->getLogger()->warning(
                    sprintf('Url %s could not be converted to an object: %s', $foundUrl, $e->getMessage())
                );
                $this->urlsRejected[] = $foundUrl;
            }
        }
    }

    /**
     * Runs the url through all configured normalizers, in order.
     *
     * @param Url $url
     * @return Url
     */
    protected function normalizeUrl(Url $url)
    {
        foreach ($this->urlNormalizers as $normalizer) {
            $url = $normalizer->normalize($url);
        }

        return $url;
    }

    /**
     * Decides whether a crawled url should be yielded to the caller.
     *
     * With whitelist matchers configured, a url is returned only when one of
     * them matches and the blacklist is not consulted. Otherwise a url is
     * returned unless a blacklist matcher matches it.
     *
     * @param Url $url
     * @return bool
     */
    protected function shouldReturnUrl(Url $url)
    {
        if (!empty($this->whitelistUrlMatchers)) {
            foreach ($this->whitelistUrlMatchers as $matcher) {
                if ($matcher->matches($url)) {
                    return true;
                }
            }
            $this->getLogger()->info(sprintf('Skipped "%s" because it is not whitelisted', $url));

            return false;
        }

        foreach ($this->blacklistUrlMatchers as $matcher) {
            if ($matcher->matches($url)) {
                $this->getLogger()->info(sprintf('Skipped "%s" because it is blacklisted', $url));

                return false;
            }
        }

        return true;
    }

    /**
     * Decides whether a found url should be added to the queue.
     *
     * Urls that were already rejected, crawled or queued are skipped; urls that
     * do not belong to the base url are rejected.
     *
     * @param Url $url
     * @return bool
     */
    protected function shouldCrawlUrl(Url $url)
    {
        $urlString = (string)$url;

        if (in_array($urlString, $this->urlsRejected, true)) {
            return false;
        }
        if (in_array($urlString, $this->urlsCrawled, true)) {
            return false;
        }
        if (isset($this->urlsQueued[$urlString])) {
            return false;
        }

        if (!$this->isUrlPartOfBaseUrl($url)) {
            $this->urlsRejected[] = $urlString;

            return false;
        }

        return true;
    }

    /**
     * Checks whether the url starts with the base url of the current crawl.
     *
     * A prefix match is required: a url that merely contains the base url
     * somewhere (e.g. inside a query string on another host) is not part of it.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlPartOfBaseUrl(Url $url)
    {
        $baseUrlString = (string)$this->baseUrl;
        $this->getLogger()->debug($baseUrlString.' - '.$url);

        return strpos((string)$url, $baseUrlString) === 0;
    }

    /**
     * Tells whether the configured number of pages has been returned.
     *
     * The limit is compared as an integer with ">=" so that a limit supplied as a
     * numeric string still ends the crawl. Non-positive limits mean "no limit".
     *
     * @return bool
     */
    private function isLimitReached()
    {
        $limit = (int)$this->limit;

        return $limit > 0 && count($this->urlsReturned) >= $limit;
    }

    /**
     * Collects the uri of every anchor element in the page.
     *
     * @param DomCrawler $crawler
     * @return array
     */
    protected function extractUrlsFromCrawler(DomCrawler $crawler)
    {
        return $crawler->filter('a')->each(
            function (DomCrawler $node) {
                return $node->link()->getUri();
            }
        );
    }

    /**
     * Requests the page with a GET request, logging before and after.
     *
     * @param string $url
     * @return DomCrawler|null whatever the client returns for the request
     */
    protected function requestPage($url)
    {
        $this->getLogger()->info(sprintf('Crawling page %s', $url));
        $crawler = $this->client->request('GET', $url);
        $this->getLogger()->info(sprintf('Crawled page %s', $url));

        return $crawler;
    }
}
556