1 | <?php |
||
18 | class Crawler |
||
19 | { |
||
20 | /** @var \GuzzleHttp\Client */ |
||
21 | protected $client; |
||
22 | |||
23 | /** @var \Spatie\Crawler\Url */ |
||
24 | protected $baseUrl; |
||
25 | |||
26 | /** @var \Spatie\Crawler\CrawlObserver */ |
||
27 | protected $crawlObserver; |
||
28 | |||
29 | /** @var \Spatie\Crawler\CrawlProfile */ |
||
30 | protected $crawlProfile; |
||
31 | |||
32 | /** @var int */ |
||
33 | protected $concurrency; |
||
34 | |||
35 | /** @var \Spatie\Crawler\CrawlQueue */ |
||
36 | protected $crawlQueue; |
||
37 | |||
38 | /** @var int */ |
||
39 | protected $crawledUrlCount = 0; |
||
40 | |||
41 | /** @var int|null */ |
||
42 | protected $maximumCrawlCount = null; |
||
43 | |||
44 | /** @var int|null */ |
||
45 | protected $maximumDepth = null; |
||
46 | |||
47 | /** @var \Tree\Node\Node */ |
||
48 | protected $depthTree; |
||
49 | |||
50 | /** @var bool */ |
||
51 | protected $executeJavaScript = false; |
||
52 | |||
53 | /** @var string|null */ |
||
54 | protected $pathToChromeBinary = null; |
||
55 | |||
56 | protected static $defaultClientOptions = [ |
||
57 | RequestOptions::COOKIES => true, |
||
58 | RequestOptions::CONNECT_TIMEOUT => 10, |
||
59 | RequestOptions::TIMEOUT => 10, |
||
60 | RequestOptions::ALLOW_REDIRECTS => false, |
||
61 | ]; |
||
62 | |||
63 | /** |
||
64 | * @param array $clientOptions |
||
65 | * |
||
66 | * @return static |
||
67 | */ |
||
68 | public static function create(array $clientOptions = []) |
||
78 | |||
79 | public function __construct(Client $client, int $concurrency = 10) |
||
89 | |||
90 | /** |
||
91 | * @param int $concurrency |
||
92 | * |
||
93 | * @return $this |
||
94 | */ |
||
95 | public function setConcurrency(int $concurrency) |
||
101 | |||
102 | /** |
||
103 | * @param int $maximumCrawlCount |
||
104 | * |
||
105 | * @return $this |
||
106 | */ |
||
107 | public function setMaximumCrawlCount(int $maximumCrawlCount) |
||
113 | |||
114 | /** |
||
115 | * @param int $maximumDepth |
||
116 | * |
||
117 | * @return $this |
||
118 | */ |
||
119 | public function setMaximumDepth(int $maximumDepth) |
||
125 | |||
126 | /** |
||
127 | * @return $this |
||
128 | */ |
||
129 | public function executeJavaScript($pathToChromeBinary = null) |
||
137 | |||
138 | /** |
||
139 | * @return $this |
||
140 | */ |
||
141 | public function doNotExecuteJavaScript() |
||
147 | |||
148 | /** |
||
149 | * @param \Spatie\Crawler\CrawlObserver $crawlObserver |
||
150 | * |
||
151 | * @return $this |
||
152 | */ |
||
153 | public function setCrawlObserver(CrawlObserver $crawlObserver) |
||
159 | |||
160 | /** |
||
161 | * @param \Spatie\Crawler\CrawlProfile $crawlProfile |
||
162 | * |
||
163 | * @return $this |
||
164 | */ |
||
165 | public function setCrawlProfile(CrawlProfile $crawlProfile) |
||
171 | |||
172 | /** |
||
173 | * @param \Spatie\Crawler\Url|string $baseUrl |
||
174 | */ |
||
175 | public function startCrawling($baseUrl) |
||
193 | |||
194 | protected function startCrawlingQueue() |
||
227 | |||
228 | /** |
||
229 | * @param ResponseInterface|null $response |
||
230 | * @param int $index |
||
231 | */ |
||
232 | protected function handleResponse($response, int $index) |
||
238 | |||
239 | protected function getCrawlRequests(): Generator |
||
262 | |||
263 | protected function addAllLinksToCrawlQueue(string $html, Url $foundOnUrl) |
||
296 | |||
/**
 * Decide whether a node of the depth tree is still within the crawl depth limit.
 *
 * When no maximum depth has been configured (the property is null) every
 * node qualifies. Otherwise the node qualifies only while the depth it
 * reports via getDepth() does not exceed the configured maximum.
 *
 * @param Node $node
 *
 * @return bool
 */
protected function shouldCrawl(Node $node): bool
{
    $depthLimit = $this->maximumDepth;

    // Short-circuit: getDepth() is only consulted when a limit is actually set.
    return $depthLimit === null || $node->getDepth() <= $depthLimit;
}
||
305 | |||
306 | protected function extractAllLinks(string $html, Url $foundOnUrl): Collection |
||
319 | |||
320 | protected function normalizeUrl(Url $url): Url |
||
324 | |||
325 | protected function addtoDepthTree(Node $node, string $url, string $parentUrl) |
||
347 | |||
348 | protected function getBodyAfterExecutingJavaScript(Url $foundOnUrl): string |
||
360 | |||
361 | protected function addToCrawlQueue(CrawlUrl $crawlUrl) |
||
369 | |||
370 | protected function maximumCrawlCountReached(): bool |
||
378 | } |
||
379 |
This check looks for assignments to scalar types that may be of the wrong type.
To ensure the code behaves as expected, it may be a good idea to add an explicit type cast.