<?php

declare(strict_types=1);

namespace Radowoj\Crawla;

use Radowoj\Crawla\Link\Collection as LinkCollection;
use Radowoj\Crawla\Link\CollectionInterface;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
use Symfony\Contracts\HttpClient\HttpClientInterface;

/**
 * Crawls pages starting from a base URL.
 *
 * Links are taken one at a time from the "queued" collection, fetched with the
 * injected HTTP client, and (on a 200 response) recorded in the "visited"
 * collection. Links found on each fetched page are filtered and queued at the
 * next depth. Links whose depth exceeds the configured maximum are moved to
 * the "too deep" collection without being fetched.
 */
final class Crawler implements CrawlerInterface
{
    /** With this maximum depth only the base URL (queued at depth 0) is fetched. */
    public const DEPTH_ONLY_TARGET = 0;

    /** Maximum depth used by crawl() when the caller does not pass one. */
    public const DEPTH_DEFAULT = 2;

    /** Sentinel value: when used as the maximum depth, the depth check is skipped. */
    public const DEPTH_INFINITE = -100;

    /** Selector handed to DomCrawler::filter() to locate link elements on a page. */
    private string $linkSelector = 'a';

    /**
     * Optional URL filter, invoked by array_filter() with each discovered URL;
     * a truthy return value keeps the URL. When unset, isWithinBaseUrl() is used.
     *
     * @var callable|null
     */
    private $urlValidatorCallback = null;

    /**
     * Optional hook invoked with the page's DomCrawler after a page has been
     * fetched and recorded as visited.
     *
     * @var callable|null
     */
    private $pageVisitedCallback = null;

    /** Depth limit for the current crawl; overwritten on every crawl() call. */
    private int $maxDepth = self::DEPTH_INFINITE;

    /**
     * @param string              $baseUrl      URL queued at depth 0; also the prefix used by the default URL filter
     * @param HttpClientInterface $client       HTTP client used to fetch every page
     * @param CollectionInterface $linksVisited Collection receiving links that returned HTTP 200
     * @param CollectionInterface $linksQueued  Collection holding links waiting to be fetched
     * @param CollectionInterface $linksTooDeep Collection receiving links beyond the maximum depth
     */
    public function __construct(
        private string $baseUrl,
        private HttpClientInterface $client,
        private CollectionInterface $linksVisited = new LinkCollection(),
        private CollectionInterface $linksQueued = new LinkCollection(),
        private CollectionInterface $linksTooDeep = new LinkCollection(),
    ) {
    }

    /**
     * Returns the selector currently used to find links on a page.
     */
    public function getLinkSelector(): string
    {
        return $this->linkSelector;
    }

    /**
     * Sets the selector used to find links on a page.
     *
     * @return CrawlerInterface This instance, for chaining
     */
    public function setLinkSelector(string $linkSelector): CrawlerInterface
    {
        $this->linkSelector = $linkSelector;

        return $this;
    }

    /**
     * Replaces the collection of visited links.
     *
     * @return CrawlerInterface This instance, for chaining
     */
    public function setVisited(CollectionInterface $linksVisited): CrawlerInterface
    {
        $this->linksVisited = $linksVisited;

        return $this;
    }

    /**
     * Replaces the collection of queued links.
     *
     * @return CrawlerInterface This instance, for chaining
     */
    public function setQueued(CollectionInterface $linksQueued): CrawlerInterface
    {
        $this->linksQueued = $linksQueued;

        return $this;
    }

    /**
     * Returns the collection of links that were fetched with an HTTP 200 response.
     */
    public function getVisited(): CollectionInterface
    {
        return $this->linksVisited;
    }

    /**
     * Returns the collection of links still waiting to be fetched.
     */
    public function getQueued(): CollectionInterface
    {
        return $this->linksQueued;
    }

    /**
     * Returns the collection of links skipped because they exceeded the maximum depth.
     */
    public function getTooDeep(): CollectionInterface
    {
        return $this->linksTooDeep;
    }

    /**
     * Sets the callback that decides which discovered URLs get queued.
     *
     * @param callable $urlValidatorCallback Called with one URL; a truthy return keeps it
     *
     * @return CrawlerInterface This instance, for chaining
     */
    public function setUrlValidatorCallback(callable $urlValidatorCallback): CrawlerInterface
    {
        $this->urlValidatorCallback = $urlValidatorCallback;

        return $this;
    }

    /**
     * Sets the callback invoked for every successfully fetched page.
     *
     * @param callable $pageVisitedCallback Called with the page's DomCrawler instance
     *
     * @return CrawlerInterface This instance, for chaining
     */
    public function setPageVisitedCallback(callable $pageVisitedCallback): CrawlerInterface
    {
        $this->pageVisitedCallback = $pageVisitedCallback;

        return $this;
    }

    /**
     * Queues the base URL at depth 0 and processes the queue until it is empty.
     *
     * @param int $maxDepth Deepest link depth that is still fetched; pass
     *                      self::DEPTH_INFINITE to disable the limit
     */
    public function crawl(int $maxDepth = self::DEPTH_DEFAULT): void
    {
        $this->maxDepth = $maxDepth;
        $this->getQueued()->appendUrlsAtDepth([$this->baseUrl], 0);
        $this->crawlPages();
    }

    /**
     * Drains the queue: fetches each link, records it, and queues the links it contains.
     */
    private function crawlPages(): void
    {
        while ($link = $this->getQueued()->shift()) {
            // The depth limit only applies when a finite maximum was requested.
            if (self::DEPTH_INFINITE !== $this->maxDepth && $link->getDepth() > $this->maxDepth) {
                $this->getTooDeep()->push($link);

                continue;
            }

            $response = $this->client->request('GET', $link->getUrl());
            // Anything other than a plain 200 is skipped and never marked as visited.
            if (200 !== $response->getStatusCode()) {
                continue;
            }

            $this->getVisited()->push($link);

            // The page URL is passed as the document URI for the DomCrawler.
            $domCrawler = new DomCrawler(
                (string) $response->getContent(),
                $link->getUrl(),
            );

            if (\is_callable($this->pageVisitedCallback)) {
                // Direct invocation instead of call_user_func(); guarded by the check above.
                ($this->pageVisitedCallback)($domCrawler);
            }

            $urls = $this->filterUrls($this->getUrls($domCrawler));
            $this->queueUrls($link->getDepth() + 1, $urls);
        }
    }

    /**
     * Default URL filter: accepts only URLs that start with the base URL.
     */
    private function isWithinBaseUrl(string $url): bool
    {
        return str_starts_with($url, $this->baseUrl);
    }

    /**
     * Collects the distinct link URIs found on a page, with any "#fragment" removed.
     *
     * @return array<int, string> Unique URLs; keys are preserved by array_unique() and may be sparse
     */
    private function getUrls(DomCrawler $domCrawler): array
    {
        $links = $domCrawler->filter($this->linkSelector)->links();

        $urls = array_map(
            // Keep only the part before the first '#'.
            static fn ($link): string => explode('#', $link->getUri(), 2)[0],
            $links,
        );

        return array_unique($urls);
    }

    /**
     * Keeps only the URLs accepted by the configured validator, or by
     * isWithinBaseUrl() when no validator callback has been set.
     *
     * @param array<int, string> $urls
     *
     * @return array<int, string> Accepted URLs; keys are preserved by array_filter()
     */
    private function filterUrls(array $urls): array
    {
        $urlConstraintCallback = \is_callable($this->urlValidatorCallback)
            ? $this->urlValidatorCallback
            : $this->isWithinBaseUrl(...);

        return array_filter($urls, $urlConstraintCallback);
    }

    /**
     * Queues the given URLs at the given depth, leaving out any value already
     * returned by the queued or visited collections.
     *
     * @param array<int, string> $urls
     */
    private function queueUrls(int $depth, array $urls): void
    {
        $newUrls = array_diff(
            $urls,
            $this->getQueued()->all(),
            $this->getVisited()->all(),
        );

        $this->getQueued()->appendUrlsAtDepth($newUrls, $depth);
    }
}