Completed
Push — master ( 238530...f01411 )
by
unknown
02:58
created

Crawler::reset()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 8
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 8
ccs 6
cts 6
cp 1
rs 9.4285
c 0
b 0
f 0
cc 1
eloc 5
nc 1
nop 1
crap 1
1
<?php
2
3
namespace MediaMonks\Crawler;
4
5
use MediaMonks\Crawler\Url\Matcher\UrlMatcherInterface;
6
use MediaMonks\Crawler\Url\Normalizer\UrlNormalizerInterface;
7
use Symfony\Component\BrowserKit\Client;
8
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
9
use Psr\Log\LoggerAwareInterface;
10
use Psr\Log\LoggerInterface;
11
use Psr\Log\NullLogger;
12
13
/**
 * Crawls a site starting from a base URL, following the links found in each
 * fetched page and yielding every page that passes the whitelist/blacklist
 * matchers.
 */
class Crawler implements LoggerAwareInterface
{
    /**
     * HTTP client used to request pages.
     *
     * @var Client
     */
    private $client;

    /**
     * Maximum number of pages to return from a crawl; 0 disables the limit.
     *
     * @var int
     */
    private $limit = 0;

    /**
     * Whether the crawl is aborted as soon as a page request throws.
     *
     * @var bool
     */
    private $stopOnError = false;

    /**
     * When non-empty, only URLs matched by at least one of these are returned.
     *
     * @var UrlMatcherInterface[]
     */
    private $whitelistUrlMatchers = [];

    /**
     * URLs matched by any of these are not returned (only consulted when no
     * whitelist matchers are configured).
     *
     * @var UrlMatcherInterface[]
     */
    private $blacklistUrlMatchers = [];

    /**
     * Normalizers applied, in order, to every URL found in a page.
     *
     * @var UrlNormalizerInterface[]
     */
    private $urlNormalizers = [];

    /**
     * URL the current crawl started from.
     *
     * @var Url
     */
    private $baseUrl;

    /**
     * String form of every URL that was requested successfully.
     *
     * @var array
     */
    private $urlsCrawled = [];

    /**
     * Pending Url objects, keyed by their string form.
     *
     * @var array
     */
    private $urlsQueued = [];

    /**
     * String form of every URL that was rejected (unparsable or outside the base URL).
     *
     * @var array
     */
    private $urlsRejected = [];

    /**
     * String form of every URL whose page was yielded to the caller.
     *
     * @var array
     */
    private $urlsReturned = [];

    /**
     * Lazily defaults to a NullLogger, see getLogger().
     *
     * @var LoggerInterface
     */
    private $logger = null;

    /**
     * @param Client $client  HTTP client; a \Goutte\Client is created when omitted
     * @param array  $options see setOptions() for the supported keys
     */
    public function __construct(Client $client = null, array $options = [])
    {
        if (null === $client) {
            $client = new \Goutte\Client();
        }

        $this->setClient($client);
        $this->setOptions($options);
    }

    /**
     * @param Client $client
     */
    public function setClient(Client $client)
    {
        $this->client = $client;
    }

    /**
     * @return Client
     */
    public function getClient()
    {
        return $this->client;
    }

    /**
     * Applies the options that are present in the given array.
     *
     * Supported keys: "limit", "stop_on_error", "logger",
     * "whitelist_url_matchers", "blacklist_url_matchers" and "url_normalizers".
     * Keys that are absent (or null) leave the current value untouched.
     *
     * @param array $options
     */
    public function setOptions(array $options)
    {
        if (isset($options['limit'])) {
            $this->setLimit($options['limit']);
        }
        if (isset($options['stop_on_error'])) {
            $this->setStopOnError($options['stop_on_error']);
        }
        if (isset($options['logger'])) {
            $this->setLogger($options['logger']);
        }
        if (isset($options['whitelist_url_matchers'])) {
            $this->setWhitelistUrlMatchers($options['whitelist_url_matchers']);
        }
        if (isset($options['blacklist_url_matchers'])) {
            $this->setBlacklistUrlMatchers($options['blacklist_url_matchers']);
        }
        if (isset($options['url_normalizers'])) {
            $this->setUrlNormalizers($options['url_normalizers']);
        }
    }

    /**
     * @return int
     */
    public function getLimit()
    {
        return $this->limit;
    }

    /**
     * @param int $limit maximum number of pages to return; 0 disables the limit
     * @return $this
     */
    public function setLimit($limit)
    {
        // Cast so that a numeric string coming from an options array still
        // compares correctly against count() in isLimitReached().
        $this->limit = (int)$limit;

        return $this;
    }

    /**
     * @return boolean
     */
    public function getStopOnError()
    {
        return $this->stopOnError;
    }

    /**
     * @param boolean $stopOnError
     * @return $this
     */
    public function setStopOnError($stopOnError)
    {
        $this->stopOnError = $stopOnError;

        return $this;
    }

    /**
     * @return array string form of the URLs requested successfully
     */
    public function getUrlsCrawled()
    {
        return $this->urlsCrawled;
    }

    /**
     * @return array pending Url objects keyed by their string form
     */
    public function getUrlsQueued()
    {
        return $this->urlsQueued;
    }

    /**
     * @return array URLs that were rejected
     */
    public function getUrlsRejected()
    {
        return $this->urlsRejected;
    }

    /**
     * @return array string form of the URLs whose page was yielded
     */
    public function getUrlsReturned()
    {
        return $this->urlsReturned;
    }

    /**
     * Replaces all whitelist matchers with the given ones.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setWhitelistUrlMatchers(array $urlMatchers)
    {
        $this->clearWhitelistUrlMatchers();
        foreach ($urlMatchers as $matcher) {
            $this->addWhitelistUrlMatcher($matcher);
        }

        return $this;
    }

    /**
     * @return UrlMatcherInterface[]
     */
    public function getWhitelistUrlMatchers()
    {
        return $this->whitelistUrlMatchers;
    }

    /**
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addWhitelistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        $this->whitelistUrlMatchers[] = $urlMatcher;

        return $this;
    }

    /**
     * @return $this
     */
    public function clearWhitelistUrlMatchers()
    {
        $this->whitelistUrlMatchers = [];

        return $this;
    }

    /**
     * Replaces all blacklist matchers with the given ones.
     *
     * @param UrlMatcherInterface[] $urlMatchers
     * @return $this
     */
    public function setBlacklistUrlMatchers(array $urlMatchers)
    {
        $this->clearBlacklistUrlMatchers();
        foreach ($urlMatchers as $matcher) {
            $this->addBlacklistUrlMatcher($matcher);
        }

        return $this;
    }

    /**
     * @return UrlMatcherInterface[]
     */
    public function getBlacklistUrlMatchers()
    {
        return $this->blacklistUrlMatchers;
    }

    /**
     * @param UrlMatcherInterface $urlMatcher
     * @return $this
     */
    public function addBlacklistUrlMatcher(UrlMatcherInterface $urlMatcher)
    {
        $this->blacklistUrlMatchers[] = $urlMatcher;

        return $this;
    }

    /**
     * @return $this
     */
    public function clearBlacklistUrlMatchers()
    {
        $this->blacklistUrlMatchers = [];

        return $this;
    }

    /**
     * Replaces all URL normalizers with the given ones.
     *
     * @param UrlNormalizerInterface[] $normalizers
     * @return $this
     */
    public function setUrlNormalizers(array $normalizers)
    {
        $this->clearUrlNormalizers();

        foreach ($normalizers as $normalizer) {
            $this->addUrlNormalizer($normalizer);
        }

        return $this;
    }

    /**
     * @return UrlNormalizerInterface[]
     */
    public function getUrlNormalizers()
    {
        return $this->urlNormalizers;
    }

    /**
     * @param UrlNormalizerInterface $normalizer
     * @return $this
     */
    public function addUrlNormalizer(UrlNormalizerInterface $normalizer)
    {
        $this->urlNormalizers[] = $normalizer;

        return $this;
    }

    /**
     * @return $this
     */
    public function clearUrlNormalizers()
    {
        $this->urlNormalizers = [];

        return $this;
    }

    /**
     * Returns the configured logger, falling back to a NullLogger.
     *
     * @return LoggerInterface
     */
    public function getLogger()
    {
        if (null === $this->logger) {
            $this->logger = new NullLogger();
        }

        return $this->logger;
    }

    /**
     * @param LoggerInterface $logger
     * @return $this
     */
    public function setLogger(LoggerInterface $logger)
    {
        $this->logger = $logger;

        return $this;
    }

    /**
     * Queues a URL, keyed by its string form so duplicates collapse.
     *
     * @param Url $url
     */
    protected function addUrlToQueue(Url $url)
    {
        $this->urlsQueued[(string)$url] = $url;
    }

    /**
     * Builds a Url object from its string form via Url::createFromString().
     *
     * @param string $url
     * @return Url
     */
    protected function createHttpUrlString($url)
    {
        return Url::createFromString($url);
    }

    /**
     * Prepares the crawler for a fresh crawl starting at the given URL.
     *
     * All bookkeeping from a previous crawl is discarded; otherwise URLs
     * returned earlier would count towards the limit of the new crawl.
     *
     * @param Url $url new base URL, also the first URL queued
     */
    protected function reset(Url $url)
    {
        $this->baseUrl = $url;
        $this->urlsCrawled = [];
        $this->urlsQueued = [];
        $this->urlsRejected = [];
        $this->urlsReturned = [];

        $this->addUrlToQueue($url);
    }

    /**
     * Crawls pages starting at the given URL and lazily yields a Page for
     * every crawled URL accepted by shouldReturnUrl().
     *
     * The crawl ends when the queue is empty, when the limit is reached, or
     * on the first request error if stop-on-error is enabled.
     *
     * @param string $url URL to start crawling from
     * @return \Generator
     */
    public function crawl($url)
    {
        $url = $this->createHttpUrlString($url);
        $this->reset($url);

        while (count($this->urlsQueued) > 0) {
            $url = array_shift($this->urlsQueued);

            try {
                $crawler = $this->requestPage((string)$url);
            } catch (\Exception $e) {
                $this->getLogger()->error(sprintf('Error requesting page %s: %s', $url, $e->getMessage()));

                if ($this->getStopOnError()) {
                    return;
                }

                continue;
            }

            // Mark as crawled before collecting links so that a page linking
            // to itself is not queued again.
            $this->urlsCrawled[] = (string)$url;
            $this->updateQueue($crawler);

            if ($this->shouldReturnUrl($url)) {
                $this->getLogger()->debug(sprintf('Return url "%s"', $url));

                $this->urlsReturned[] = (string)$url;

                yield new Page($url, $crawler);
            }

            if ($this->isLimitReached()) {
                $this->getLogger()->info(sprintf('Crawl limit of %d was reached', $this->limit));

                return;
            }
        }
    }

    /**
     * Queues every link found in the given page that should be crawled.
     *
     * Links that cannot be turned into a Url object are logged and remembered
     * as rejected so they are skipped when seen again.
     *
     * @param DomCrawler $crawler
     */
    protected function updateQueue(DomCrawler $crawler)
    {
        foreach ($this->extractUrlsFromCrawler($crawler) as $rawUrl) {
            if (in_array($rawUrl, $this->urlsRejected, true)) {
                continue;
            }

            $this->getLogger()->debug(sprintf('Found url %s in page', $rawUrl));

            try {
                $url = $this->normalizeUrl($this->createHttpUrlString($rawUrl));

                if ($this->shouldCrawlUrl($url)) {
                    $this->addUrlToQueue($url);
                }
            } catch (\Exception $e) {
                $this->getLogger()->warning(
                    sprintf('Url %s could not be converted to an object: %s', $rawUrl, $e->getMessage())
                );
                // Always store the raw string so the rejected list stays
                // comparable with the strings extracted from later pages.
                $this->urlsRejected[] = $rawUrl;
            }
        }
    }

    /**
     * Runs the URL through every registered normalizer, in registration order.
     *
     * @param Url $url
     * @return Url
     */
    protected function normalizeUrl(Url $url)
    {
        foreach ($this->urlNormalizers as $normalizer) {
            $url = $normalizer->normalize($url);
        }

        return $url;
    }

    /**
     * Decides whether the page for a crawled URL is yielded to the caller.
     *
     * With whitelist matchers configured, the URL must match at least one of
     * them and the blacklist is not consulted. Without a whitelist, the URL
     * is returned unless a blacklist matcher matches it.
     *
     * @param Url $url
     * @return bool
     */
    protected function shouldReturnUrl(Url $url)
    {
        if (!empty($this->whitelistUrlMatchers)) {
            foreach ($this->whitelistUrlMatchers as $matcher) {
                if ($matcher->matches($url)) {
                    return true;
                }
            }
            $this->getLogger()->info(sprintf('Skipped "%s" because it is not whitelisted', $url));

            return false;
        }

        foreach ($this->blacklistUrlMatchers as $matcher) {
            if ($matcher->matches($url)) {
                $this->getLogger()->info(sprintf('Skipped "%s" because it is blacklisted', $url));

                return false;
            }
        }

        return true;
    }

    /**
     * Decides whether a URL found in a page should be added to the queue.
     *
     * A URL is skipped when it was already rejected, crawled or queued, or
     * when it does not belong to the base URL (in which case it is also
     * recorded as rejected).
     *
     * @param Url $url
     * @return bool
     */
    protected function shouldCrawlUrl(Url $url)
    {
        $urlString = (string)$url;

        if (in_array($urlString, $this->urlsRejected, true)) {
            return false;
        }
        if (in_array($urlString, $this->urlsCrawled, true)) {
            return false;
        }
        if (isset($this->urlsQueued[$urlString])) {
            return false;
        }

        if (!$this->isUrlPartOfBaseUrl($url)) {
            $this->urlsRejected[] = $urlString;

            return false;
        }

        return true;
    }

    /**
     * Checks whether the URL starts with the base URL of the current crawl.
     *
     * The base URL must be a prefix; a URL that merely contains it somewhere
     * else (for example inside a query string) is not part of it.
     *
     * @param Url $url
     * @return bool
     */
    protected function isUrlPartOfBaseUrl(Url $url)
    {
        $baseUrlString = (string)$this->baseUrl;
        $this->getLogger()->debug($baseUrlString.' - '.$url);

        return strpos((string)$url, $baseUrlString) === 0;
    }

    /**
     * Tells whether the number of returned pages has hit the configured limit.
     *
     * @return bool always false when no (positive) limit is set
     */
    private function isLimitReached()
    {
        $limit = (int)$this->limit;

        return $limit > 0 && count($this->urlsReturned) >= $limit;
    }

    /**
     * Collects the URI of every <a> element in the page.
     *
     * @param DomCrawler $crawler
     * @return array list of URI strings as reported by the link objects
     */
    protected function extractUrlsFromCrawler(DomCrawler $crawler)
    {
        return $crawler->filter('a')->each(
            function (DomCrawler $node) {
                return $node->link()->getUri();
            }
        );
    }

    /**
     * Performs a GET request for the URL and returns the resulting DOM crawler.
     *
     * @param string $url
     * @return DomCrawler
     */
    protected function requestPage($url)
    {
        $this->getLogger()->info(sprintf('Crawling page %s', $url));
        $crawler = $this->client->request('GET', $url);
        $this->getLogger()->info(sprintf('Crawled page %s', $url));

        return $crawler;
    }
}
556