Test Setup Failed
Pull Request — master (#4)
by Radosław
03:05
created

Crawler::getClient()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 7
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 6

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 2
eloc 3
c 1
b 0
f 0
nc 2
nop 0
dl 0
loc 7
ccs 0
cts 4
cp 0
crap 6
rs 10
1
<?php
2
3
declare(strict_types=1);
4
5
namespace Radowoj\Crawla;
6
7
use Radowoj\Crawla\Link\Collection as LinkCollection;
8
use Radowoj\Crawla\Link\CollectionInterface;
9
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
10
use Symfony\Contracts\HttpClient\HttpClientInterface;
11
12
/**
 * Crawler that starts at a base URL, follows the links found on each fetched
 * page and keeps track of which links were visited, are still queued, or were
 * rejected for exceeding the configured depth.
 */
final class Crawler implements CrawlerInterface
{
    /**
     * Fetch only the base URL (which is queued at depth 0); links discovered
     * on it are queued at depth 1 and end up in the "too deep" collection.
     */
    public const DEPTH_ONLY_TARGET = 0;

    /** Depth limit applied by crawl() when the caller does not pass one. */
    public const DEPTH_DEFAULT = 2;

    /** Sentinel value that switches the depth check off entirely. */
    public const DEPTH_INFINITE = -100;

    /** Selector handed to DomCrawler::filter() to locate link elements. */
    private string $linkSelector = 'a';

    /** Optional predicate deciding whether a discovered URL may be queued. */
    private ?\Closure $urlValidatorCallback = null;

    /** Optional hook called with the DomCrawler of every fetched page. */
    private ?\Closure $pageVisitedCallback = null;

    /** Depth limit of the current crawl; overwritten on every crawl() call. */
    private int $maxDepth = self::DEPTH_INFINITE;

    /**
     * @param string               $baseUrl      URL the crawl starts from; also the prefix used by the default URL filter
     * @param HttpClientInterface  $client       HTTP client used to fetch every page
     * @param CollectionInterface  $linksVisited links that were fetched with a 200 response
     * @param CollectionInterface  $linksQueued  links waiting to be fetched
     * @param CollectionInterface  $linksTooDeep links skipped because their depth exceeded the limit
     */
    public function __construct(
        private readonly string $baseUrl,
        private readonly HttpClientInterface $client,
        private CollectionInterface $linksVisited = new LinkCollection(),
        private CollectionInterface $linksQueued = new LinkCollection(),
        private CollectionInterface $linksTooDeep = new LinkCollection(),
    ) {
    }

    /**
     * Returns the selector currently used to find links on a page.
     */
    public function getLinkSelector(): string
    {
        return $this->linkSelector;
    }

    /**
     * Replaces the selector used to find links on a page.
     *
     * @return CrawlerInterface this instance, for chaining
     */
    public function setLinkSelector(string $linkSelector): CrawlerInterface
    {
        $this->linkSelector = $linkSelector;

        return $this;
    }

    /**
     * Replaces the collection of visited links.
     *
     * @return CrawlerInterface this instance, for chaining
     */
    public function setVisited(CollectionInterface $linksVisited): CrawlerInterface
    {
        $this->linksVisited = $linksVisited;

        return $this;
    }

    /**
     * Replaces the collection of queued links.
     *
     * @return CrawlerInterface this instance, for chaining
     */
    public function setQueued(CollectionInterface $linksQueued): CrawlerInterface
    {
        $this->linksQueued = $linksQueued;

        return $this;
    }

    /**
     * Returns the collection of links fetched with a 200 response.
     */
    public function getVisited(): CollectionInterface
    {
        return $this->linksVisited;
    }

    /**
     * Returns the collection of links still waiting to be fetched.
     */
    public function getQueued(): CollectionInterface
    {
        return $this->linksQueued;
    }

    /**
     * Returns the collection of links skipped for exceeding the depth limit.
     */
    public function getTooDeep(): CollectionInterface
    {
        return $this->linksTooDeep;
    }

    /**
     * Sets the predicate used to decide which discovered URLs get queued.
     *
     * The callback is passed to array_filter(), so it receives one URL string
     * and its result is evaluated for truthiness. When no callback is set,
     * only URLs starting with the base URL are kept.
     *
     * @return CrawlerInterface this instance, for chaining
     */
    public function setUrlValidatorCallback(callable $urlValidatorCallback): CrawlerInterface
    {
        // Stored as a Closure so the property can carry a native type.
        $this->urlValidatorCallback = \Closure::fromCallable($urlValidatorCallback);

        return $this;
    }

    /**
     * Sets the hook invoked once for every successfully fetched page.
     *
     * The callback receives the DomCrawler built from the page content.
     *
     * @return CrawlerInterface this instance, for chaining
     */
    public function setPageVisitedCallback(callable $pageVisitedCallback): CrawlerInterface
    {
        // Stored as a Closure so the property can carry a native type.
        $this->pageVisitedCallback = \Closure::fromCallable($pageVisitedCallback);

        return $this;
    }

    /**
     * Queues the base URL at depth 0 and processes the queue until it is empty.
     *
     * @param int $maxDepth deepest link depth that is still fetched, or
     *                      self::DEPTH_INFINITE to disable the limit
     */
    public function crawl(int $maxDepth = self::DEPTH_DEFAULT): void
    {
        $this->maxDepth = $maxDepth;
        $this->getQueued()->appendUrlsAtDepth([$this->baseUrl], 0);
        $this->crawlPages();
    }

    /**
     * Drains the queue: fetches each link, records it, notifies the optional
     * page-visited callback and queues the links found on the page.
     *
     * Links deeper than the limit go to the "too deep" collection without
     * being fetched. Responses with a status other than 200 are skipped and
     * the link is not recorded as visited.
     */
    private function crawlPages(): void
    {
        while ($link = $this->getQueued()->shift()) {
            if (self::DEPTH_INFINITE !== $this->maxDepth && $link->getDepth() > $this->maxDepth) {
                $this->getTooDeep()->push($link);
                continue;
            }

            $response = $this->client->request('GET', $link->getUrl());
            if (200 !== $response->getStatusCode()) {
                continue;
            }

            $this->getVisited()->push($link);

            // The page URL is passed as the second argument alongside the content.
            $domCrawler = new DomCrawler(
                (string) $response->getContent(),
                $link->getUrl(),
            );

            // Invoke the hook exactly once per page; the null check doubles as
            // the type guard static analysis asks for.
            if (null !== $this->pageVisitedCallback) {
                ($this->pageVisitedCallback)($domCrawler);
            }

            $urls = $this->filterUrls($this->getUrls($domCrawler));
            $this->queueUrls($link->getDepth() + 1, $urls);
        }
    }

    /**
     * Default URL filter: tells whether the URL starts with the base URL.
     *
     * Referenced through first-class callable syntax in filterUrls(), so
     * "unused private method" reports from static analysis are false positives.
     */
    private function isWithinBaseUrl(string $url): bool
    {
        return str_starts_with($url, $this->baseUrl);
    }

    /**
     * Collects the URIs of all links matching the link selector, with the
     * fragment part ("#...") removed and duplicates dropped.
     *
     * @return array<int, string> unique URLs; keys are preserved by array_unique(), so they may have gaps
     */
    private function getUrls(DomCrawler $domCrawler): array
    {
        $links = $domCrawler->filter($this->linkSelector)->links();

        $urls = array_map(
            // Keep only the part before the first "#".
            static fn ($link): string => explode('#', $link->getUri(), 2)[0],
            $links,
        );

        return array_unique($urls);
    }

    /**
     * Keeps only the URLs accepted by the configured validator callback, or by
     * the base-URL prefix check when no validator has been set.
     *
     * @param array<int, string> $urls
     *
     * @return array<int, string> accepted URLs; keys are preserved by array_filter()
     */
    private function filterUrls(array $urls): array
    {
        $urlConstraintCallback = $this->urlValidatorCallback ?? $this->isWithinBaseUrl(...);

        return array_filter($urls, $urlConstraintCallback);
    }

    /**
     * Appends to the queue, at the given depth, those URLs that are neither
     * queued nor visited already.
     *
     * @param array<int, string> $urls
     */
    private function queueUrls(int $depth, array $urls): void
    {
        $newUrls = array_diff(
            $urls,
            $this->getQueued()->all(),
            $this->getVisited()->all(),
        );

        $this->getQueued()->appendUrlsAtDepth($newUrls, $depth);
    }
}
166