Passed
Push — master ( 1b4681...5e494c )
by Radosław
02:17
created

Crawler::getClient()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 7
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 6

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 2
eloc 3
c 1
b 0
f 0
nc 2
nop 0
dl 0
loc 7
ccs 0
cts 4
cp 0
crap 6
rs 10
1
<?php
2
3
declare(strict_types=1);
4
5
namespace Radowoj\Crawla;
6
7
use Radowoj\Crawla\Link\Collection as LinkCollection;
8
use Radowoj\Crawla\Link\CollectionInterface;
9
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
10
use Symfony\Contracts\HttpClient\HttpClientInterface;
11
12
/**
 * Breadth-by-queue web crawler.
 *
 * Starting from a base URL, pages are fetched with the injected HTTP client,
 * links are extracted from every successfully fetched page, filtered, and
 * queued for a later visit one depth level below the page they were found on.
 */
final class Crawler implements CrawlerInterface
{
    /** Fetch only the base URL; links accepted from it end up in the too-deep collection. */
    public const DEPTH_ONLY_TARGET = 0;
    /** Depth limit used by crawl() when the caller does not pass one. */
    public const DEPTH_DEFAULT = 2;
    /** Sentinel value that switches the depth check off entirely. */
    public const DEPTH_INFINITE = -100;

    /** Selector handed to DomCrawler::filter() to locate link elements. */
    private string $linkSelector = 'a';
    /** @var callable|null Custom URL filter; when not callable, isWithinBaseUrl() is used instead. */
    private $urlValidatorCallback = null;
    /** @var callable|null Invoked with the page's DomCrawler after each successful fetch. */
    private $pageVisitedCallback = null;
    /** Depth limit of the current crawl; overwritten on every crawl() call. */
    private int $maxDepth = self::DEPTH_INFINITE;

    /**
     * @param string              $baseUrl      URL the crawl starts from (queued at depth 0)
     * @param HttpClientInterface $client       client used to issue the GET requests
     * @param CollectionInterface $linksVisited links whose page answered with HTTP 200
     * @param CollectionInterface $linksQueued  links waiting to be fetched
     * @param CollectionInterface $linksTooDeep links skipped because they exceed the depth limit
     */
    public function __construct(
        private string $baseUrl,
        private HttpClientInterface $client,
        private CollectionInterface $linksVisited = new LinkCollection(),
        private CollectionInterface $linksQueued = new LinkCollection(),
        private CollectionInterface $linksTooDeep = new LinkCollection(),
    ) {
    }

    /**
     * Returns the selector currently used to find links on a page.
     */
    public function getLinkSelector(): string
    {
        return $this->linkSelector;
    }

    /**
     * Sets the selector used to find links on a page.
     *
     * @return CrawlerInterface this instance, for chaining
     */
    public function setLinkSelector(string $linkSelector): CrawlerInterface
    {
        $this->linkSelector = $linkSelector;

        return $this;
    }

    /**
     * Replaces the collection that records visited links.
     *
     * @return CrawlerInterface this instance, for chaining
     */
    public function setVisited(CollectionInterface $linksVisited): CrawlerInterface
    {
        $this->linksVisited = $linksVisited;

        return $this;
    }

    /**
     * Replaces the collection that holds links waiting to be fetched.
     *
     * @return CrawlerInterface this instance, for chaining
     */
    public function setQueued(CollectionInterface $linksQueued): CrawlerInterface
    {
        $this->linksQueued = $linksQueued;

        return $this;
    }

    /**
     * Returns the collection of links that were fetched with an HTTP 200 response.
     */
    public function getVisited(): CollectionInterface
    {
        return $this->linksVisited;
    }

    /**
     * Returns the collection of links still waiting to be fetched.
     */
    public function getQueued(): CollectionInterface
    {
        return $this->linksQueued;
    }

    /**
     * Returns the collection of links that were skipped for exceeding the depth limit.
     */
    public function getTooDeep(): CollectionInterface
    {
        return $this->linksTooDeep;
    }

    /**
     * Sets the callback deciding which discovered URLs may be queued.
     *
     * The callback is used as the array_filter() predicate: it receives one URL
     * and the URL is kept when the result is truthy.
     *
     * @return CrawlerInterface this instance, for chaining
     */
    public function setUrlValidatorCallback(callable $urlValidatorCallback): CrawlerInterface
    {
        $this->urlValidatorCallback = $urlValidatorCallback;

        return $this;
    }

    /**
     * Sets the callback invoked with the DomCrawler of every successfully fetched page.
     *
     * @return CrawlerInterface this instance, for chaining
     */
    public function setPageVisitedCallback(callable $pageVisitedCallback): CrawlerInterface
    {
        $this->pageVisitedCallback = $pageVisitedCallback;

        return $this;
    }

    /**
     * Queues the base URL at depth 0 and processes the queue until it is empty.
     *
     * @param int $maxDepth deepest level that is still fetched; pass
     *                      self::DEPTH_INFINITE to disable the limit
     */
    public function crawl(int $maxDepth = self::DEPTH_DEFAULT): void
    {
        $this->maxDepth = $maxDepth;
        $this->getQueued()->appendUrlsAtDepth([$this->baseUrl], 0);
        $this->crawlPages();
    }

    /**
     * Drains the queue: fetches each link, records it, and queues the links found on it.
     */
    private function crawlPages(): void
    {
        // The loop ends as soon as shift() yields a falsy value.
        while ($link = $this->getQueued()->shift()) {
            if (self::DEPTH_INFINITE !== $this->maxDepth && $link->getDepth() > $this->maxDepth) {
                $this->getTooDeep()->push($link);
                continue;
            }

            $response = $this->client->request('GET', $link->getUrl());
            if (200 !== $response->getStatusCode()) {
                // Non-200 pages are skipped and are not recorded as visited.
                continue;
            }

            $this->getVisited()->push($link);

            $domCrawler = new DomCrawler(
                (string) $response->getContent(),
                $link->getUrl()
            );

            // Copy to a local so the callable check narrows the type for the
            // call below (the property itself is declared callable|null).
            $pageVisitedCallback = $this->pageVisitedCallback;
            if (\is_callable($pageVisitedCallback)) {
                $pageVisitedCallback($domCrawler);
            }

            $urls = $this->getUrls($domCrawler);
            $urls = $this->filterUrls($urls);
            $this->queueUrls($link->getDepth() + 1, $urls);
        }
    }

    /**
     * Default URL filter: accepts only URLs that begin with the base URL.
     *
     * Not dead code - filterUrls() references it through first-class callable
     * syntax, which some static analysers fail to detect as a usage.
     */
    private function isWithinBaseUrl(string $url): bool
    {
        return str_starts_with($url, $this->baseUrl);
    }

    /**
     * Extracts the URIs of all links matching the link selector on the given page.
     *
     * Everything from the first '#' onwards is dropped and duplicates are
     * removed (array_unique() keeps the key of the first occurrence).
     *
     * @return array<int|string, string>
     */
    private function getUrls(DomCrawler $domCrawler): array
    {
        $links = $domCrawler->filter($this->linkSelector)->links();

        $urls = array_map(
            static fn ($link) => explode('#', $link->getUri())[0],
            $links
        );

        return array_unique($urls);
    }

    /**
     * Keeps only the URLs accepted by the configured validator callback,
     * falling back to isWithinBaseUrl() when no callable validator is set.
     *
     * Keys of the input array are preserved by array_filter().
     *
     * @param array<int|string, string> $urls
     *
     * @return array<int|string, string>
     */
    private function filterUrls(array $urls): array
    {
        $urlConstraintCallback = \is_callable($this->urlValidatorCallback)
            ? $this->urlValidatorCallback
            : $this->isWithinBaseUrl(...);

        return array_filter($urls, $urlConstraintCallback);
    }

    /**
     * Appends the given URLs to the queue at the given depth, leaving out any
     * value already reported by the queued or visited collections.
     *
     * NOTE(review): relies on CollectionInterface::all() returning values that
     * array_diff() can compare with URL strings - confirm against the collection.
     *
     * @param array<int|string, string> $urls
     */
    private function queueUrls(int $depth, array $urls): void
    {
        $this->getQueued()->appendUrlsAtDepth(
            array_diff(
                $urls,
                $this->getQueued()->all(),
                $this->getVisited()->all()
            ),
            $depth
        );
    }
}
166