1 | <?php |
||
18 | class Crawler |
||
19 | { |
||
20 | /** @var \GuzzleHttp\Client */ |
||
21 | protected $client; |
||
22 | |||
23 | /** @var \Spatie\Crawler\Url */ |
||
24 | protected $baseUrl; |
||
25 | |||
26 | /** @var \Spatie\Crawler\CrawlObserver */ |
||
27 | protected $crawlObserver; |
||
28 | |||
29 | /** @var \Spatie\Crawler\CrawlProfile */ |
||
30 | protected $crawlProfile; |
||
31 | |||
32 | /** @var int */ |
||
33 | protected $concurrency; |
||
34 | |||
35 | /** @var \Spatie\Crawler\CrawlQueue */ |
||
36 | protected $crawlQueue; |
||
37 | |||
38 | /** @var int */ |
||
39 | protected $crawledUrlCount = 0; |
||
40 | |||
41 | /** @var int|null */ |
||
42 | protected $maximumCrawlCount = null; |
||
43 | |||
44 | /** @var int|null */ |
||
45 | protected $maximumDepth = null; |
||
46 | |||
47 | /** @var \Tree\Node\Node */ |
||
48 | protected $depthTree; |
||
49 | |||
50 | /** @var bool */ |
||
51 | protected $executeJavaScript = false; |
||
52 | |||
53 | /** @var string|null */ |
||
54 | protected $pathToChromeBinary = null; |
||
55 | |||
56 | protected static $defaultClientOptions = [ |
||
57 | RequestOptions::COOKIES => true, |
||
58 | RequestOptions::CONNECT_TIMEOUT => 10, |
||
59 | RequestOptions::TIMEOUT => 10, |
||
60 | RequestOptions::ALLOW_REDIRECTS => false, |
||
61 | ]; |
||
62 | |||
63 | /** |
||
64 | * @param array $clientOptions |
||
65 | * |
||
66 | * @return static |
||
67 | */ |
||
68 | public static function create(array $clientOptions = []) |
||
78 | |||
79 | public function __construct(Client $client, int $concurrency = 10) |
||
89 | |||
90 | /** |
||
91 | * @param int $concurrency |
||
92 | * |
||
93 | * @return $this |
||
94 | */ |
||
95 | public function setConcurrency(int $concurrency) |
||
101 | |||
102 | /** |
||
103 | * @param int $maximumCrawlCount |
||
104 | * |
||
105 | * @return $this |
||
106 | */ |
||
107 | public function setMaximumCrawlCount(int $maximumCrawlCount) |
||
113 | |||
114 | /** |
||
115 | * @param int $maximumDepth |
||
116 | * |
||
117 | * @return $this |
||
118 | */ |
||
119 | public function setMaximumDepth(int $maximumDepth) |
||
125 | |||
126 | /** |
||
127 | * @param CrawlQueue $crawlQueue |
||
128 | * @return $this |
||
129 | */ |
||
130 | public function setCrawlQueue(CrawlQueue $crawlQueue) |
||
136 | |||
137 | /** |
||
138 | * @return $this |
||
139 | */ |
||
140 | public function executeJavaScript($pathToChromeBinary = null) |
||
148 | |||
149 | /** |
||
150 | * @return $this |
||
151 | */ |
||
152 | public function doNotExecuteJavaScript() |
||
158 | |||
159 | /** |
||
160 | * @param \Spatie\Crawler\CrawlObserver $crawlObserver |
||
161 | * |
||
162 | * @return $this |
||
163 | */ |
||
164 | public function setCrawlObserver(CrawlObserver $crawlObserver) |
||
170 | |||
171 | /** |
||
172 | * @param \Spatie\Crawler\CrawlProfile $crawlProfile |
||
173 | * |
||
174 | * @return $this |
||
175 | */ |
||
176 | public function setCrawlProfile(CrawlProfile $crawlProfile) |
||
182 | |||
183 | /** |
||
184 | * @param \Spatie\Crawler\Url|string $baseUrl |
||
185 | */ |
||
186 | public function startCrawling($baseUrl) |
||
204 | |||
205 | protected function startCrawlingQueue() |
||
238 | |||
239 | /** |
||
240 | * @param ResponseInterface|null $response |
||
241 | * @param CrawlUrl $crawlUrl |
||
242 | */ |
||
243 | protected function handleResponse($response, CrawlUrl $crawlUrl) |
||
247 | |||
248 | protected function getCrawlRequests(): Generator |
||
266 | |||
267 | protected function addAllLinksToCrawlQueue(string $html, Url $foundOnUrl) |
||
300 | |||
/**
 * Decide whether the given depth-tree node is within the configured depth limit.
 *
 * When no maximum depth has been set, every node qualifies. Otherwise the
 * node qualifies only if its depth does not exceed the maximum depth.
 *
 * @param Node $node
 *
 * @return bool
 */
protected function shouldCrawl(Node $node): bool
{
    // A null maximum depth means "no limit"; the depth comparison is skipped entirely.
    return $this->maximumDepth === null
        || $node->getDepth() <= $this->maximumDepth;
}
||
309 | |||
310 | protected function extractAllLinks(string $html, Url $foundOnUrl): Collection |
||
323 | |||
324 | protected function normalizeUrl(Url $url): Url |
||
328 | |||
329 | protected function addtoDepthTree(Node $node, string $url, string $parentUrl) |
||
351 | |||
352 | protected function getBodyAfterExecutingJavaScript(Url $foundOnUrl): string |
||
364 | |||
365 | protected function addToCrawlQueue(CrawlUrl $crawlUrl) |
||
373 | |||
374 | protected function maximumCrawlCountReached(): bool |
||
382 | } |
||
383 |
This check looks for assignments to scalar types that may be of the wrong type.
To ensure the code behaves as expected, it may be a good idea to add an explicit type cast.