Issues (6)

src/Crawler.php (1 issue)

1
<?php
2
3
declare(strict_types=1);
4
5
namespace Radowoj\Crawla;
6
7
use Radowoj\Crawla\Link\Collection as LinkCollection;
8
use Radowoj\Crawla\Link\CollectionInterface;
9
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
10
use Symfony\Contracts\HttpClient\HttpClientInterface;
11
12
/**
 * Queue-driven web crawler.
 *
 * Starting from a base URL, the crawler repeatedly takes a link off the queue,
 * fetches it over HTTP, extracts further links from the response body and
 * queues the ones accepted by the URL validator, one depth level below the
 * page they were found on.
 */
final class Crawler implements CrawlerInterface
{
    /** Maximum depth at which only the base URL itself (depth 0) is fetched. */
    public const DEPTH_ONLY_TARGET = 0;

    /** Depth limit used by crawl() when the caller does not pass one. */
    public const DEPTH_DEFAULT = 2;

    /** Sentinel value that switches the depth check off entirely. */
    public const DEPTH_INFINITE = -100;

    /** CSS selector used to locate link elements on a fetched page. */
    private string $linkSelector = 'a';

    /**
     * Custom URL filter; when null, URLs are restricted to those starting with the base URL.
     * Stored as a Closure so it can be invoked directly without a callable re-check.
     */
    private ?\Closure $urlValidatorCallback = null;

    /** Optional hook invoked with the DomCrawler of every successfully fetched page. */
    private ?\Closure $pageVisitedCallback = null;

    /** Depth limit of the current crawl; overwritten on every crawl() call. */
    private int $maxDepth = self::DEPTH_INFINITE;

    /**
     * @param string              $baseUrl      URL the crawl starts from; also the prefix used by the default URL filter
     * @param HttpClientInterface $client       HTTP client used to fetch pages
     * @param CollectionInterface $linksVisited links that were fetched with a 200 response
     * @param CollectionInterface $linksQueued  links waiting to be fetched
     * @param CollectionInterface $linksTooDeep links skipped because their depth exceeded the limit
     */
    public function __construct(
        private string $baseUrl,
        private HttpClientInterface $client,
        private CollectionInterface $linksVisited = new LinkCollection(),
        private CollectionInterface $linksQueued = new LinkCollection(),
        private CollectionInterface $linksTooDeep = new LinkCollection(),
    ) {
    }

    /**
     * Returns the CSS selector used to find links on a page.
     */
    public function getLinkSelector(): string
    {
        return $this->linkSelector;
    }

    /**
     * Sets the CSS selector used to find links on a page.
     */
    public function setLinkSelector(string $linkSelector): CrawlerInterface
    {
        $this->linkSelector = $linkSelector;

        return $this;
    }

    /**
     * Replaces the collection of visited links.
     */
    public function setVisited(CollectionInterface $linksVisited): CrawlerInterface
    {
        $this->linksVisited = $linksVisited;

        return $this;
    }

    /**
     * Replaces the collection of queued links.
     */
    public function setQueued(CollectionInterface $linksQueued): CrawlerInterface
    {
        $this->linksQueued = $linksQueued;

        return $this;
    }

    /**
     * Returns the links that were fetched with a 200 response.
     */
    public function getVisited(): CollectionInterface
    {
        return $this->linksVisited;
    }

    /**
     * Returns the links still waiting to be fetched.
     */
    public function getQueued(): CollectionInterface
    {
        return $this->linksQueued;
    }

    /**
     * Returns the links that were skipped for exceeding the depth limit.
     */
    public function getTooDeep(): CollectionInterface
    {
        return $this->linksTooDeep;
    }

    /**
     * Sets the predicate deciding which discovered URLs get queued.
     *
     * The callback is passed to array_filter(), so it receives one URL and
     * its return value is interpreted as a boolean.
     */
    public function setUrlValidatorCallback(callable $urlValidatorCallback): CrawlerInterface
    {
        $this->urlValidatorCallback = \Closure::fromCallable($urlValidatorCallback);

        return $this;
    }

    /**
     * Sets the hook called with the DomCrawler of each successfully fetched page.
     */
    public function setPageVisitedCallback(callable $pageVisitedCallback): CrawlerInterface
    {
        $this->pageVisitedCallback = \Closure::fromCallable($pageVisitedCallback);

        return $this;
    }

    /**
     * Queues the base URL at depth 0 and processes the queue until it is empty.
     *
     * @param int $maxDepth deepest level to fetch; self::DEPTH_INFINITE disables the limit
     */
    public function crawl(int $maxDepth = self::DEPTH_DEFAULT): void
    {
        $this->maxDepth = $maxDepth;
        $this->linksQueued->appendUrlsAtDepth([$this->baseUrl], 0);
        $this->crawlPages();
    }

    /**
     * Drains the queue: fetches each link, reports it, and queues the links found on it.
     */
    private function crawlPages(): void
    {
        // NOTE(review): relies on shift() returning a falsy value once the queue is empty — confirm against CollectionInterface.
        while ($link = $this->linksQueued->shift()) {
            $depthIsLimited = self::DEPTH_INFINITE !== $this->maxDepth;
            if ($depthIsLimited && $link->getDepth() > $this->maxDepth) {
                $this->linksTooDeep->push($link);
                continue;
            }

            $response = $this->client->request('GET', $link->getUrl());
            if (200 !== $response->getStatusCode()) {
                // Anything but 200 is skipped and is NOT recorded as visited.
                continue;
            }

            $this->linksVisited->push($link);

            $domCrawler = new DomCrawler($response->getContent(), $link->getUrl());

            // The property is a nullable Closure, so a null check is all that is needed before invoking it.
            if (null !== $this->pageVisitedCallback) {
                ($this->pageVisitedCallback)($domCrawler);
            }

            $urls = $this->filterUrls($this->getUrls($domCrawler));
            $this->queueUrls($link->getDepth() + 1, $urls);
        }
    }

    /**
     * Default URL filter: accepts only URLs that start with the base URL.
     */
    private function isWithinBaseUrl(string $url): bool
    {
        return str_starts_with($url, $this->baseUrl);
    }

    /**
     * Extracts the distinct link URIs from a page, with any "#fragment" removed.
     *
     * @return array<int, string> keys of the underlying link list are preserved by array_unique()
     */
    private function getUrls(DomCrawler $domCrawler): array
    {
        $links = $domCrawler->filter($this->linkSelector)->links();

        $urls = array_map(
            // Keep only the part before the first '#', so anchors within one page collapse to one URL.
            static fn (\Symfony\Component\DomCrawler\Link $link): string => explode('#', $link->getUri(), 2)[0],
            $links,
        );

        return array_unique($urls);
    }

    /**
     * Keeps only the URLs accepted by the custom validator, or by the base-URL check when none is set.
     *
     * @param array<int, string> $urls
     *
     * @return array<int, string>
     */
    private function filterUrls(array $urls): array
    {
        $urlConstraintCallback = $this->urlValidatorCallback ?? $this->isWithinBaseUrl(...);

        return array_filter($urls, $urlConstraintCallback);
    }

    /**
     * Appends to the queue, at the given depth, the URLs not already returned
     * by all() of the queued and visited collections.
     *
     * NOTE(review): presumably all() returns plain URL strings, since its result
     * is compared against $urls by array_diff() — confirm against CollectionInterface.
     * URLs previously skipped (non-200 or too deep) are not excluded here.
     *
     * @param array<int, string> $urls
     */
    private function queueUrls(int $depth, array $urls): void
    {
        $newUrls = array_diff(
            $urls,
            $this->linksQueued->all(),
            $this->linksVisited->all(),
        );

        $this->linksQueued->appendUrlsAtDepth($newUrls, $depth);
    }
}
166