1 | <?php |
||
17 | class Crawler |
||
18 | { |
||
19 | /** @var \GuzzleHttp\Client */ |
||
20 | protected $client; |
||
21 | |||
22 | /** @var \Spatie\Crawler\Url */ |
||
23 | protected $baseUrl; |
||
24 | |||
25 | /** @var \Spatie\Crawler\CrawlObserver */ |
||
26 | protected $crawlObserver; |
||
27 | |||
28 | /** @var \Spatie\Crawler\CrawlProfile */ |
||
29 | protected $crawlProfile; |
||
30 | |||
31 | /** @var int */ |
||
32 | protected $concurrency; |
||
33 | |||
34 | /** @var \Spatie\Crawler\CrawlQueue */ |
||
35 | protected $crawlQueue; |
||
36 | |||
37 | /** @var int */ |
||
38 | protected $depth = 0; |
||
39 | |||
40 | /** @var \Tree\Node\Node */ |
||
41 | protected $linkTree; |
||
42 | |||
43 | /** |
||
44 | * @param array $clientOptions |
||
45 | * |
||
46 | * @return static |
||
47 | */ |
||
48 | public static function create(array $clientOptions = []) |
||
60 | |||
61 | public function __construct(Client $client, int $concurrency = 10) |
||
71 | |||
72 | /** |
||
73 | * @param int $concurrency |
||
74 | * |
||
75 | * @return $this |
||
76 | */ |
||
77 | public function setConcurrency(int $concurrency) |
||
83 | |||
84 | /** |
||
85 | * @param int $depth |
||
86 | * |
||
87 | * @return $this |
||
88 | */ |
||
89 | public function setDepth(int $depth) |
||
95 | |||
96 | /** |
||
97 | * @param \Spatie\Crawler\CrawlObserver $crawlObserver |
||
98 | * |
||
99 | * @return $this |
||
100 | */ |
||
101 | public function setCrawlObserver(CrawlObserver $crawlObserver) |
||
107 | |||
108 | /** |
||
109 | * @param \Spatie\Crawler\CrawlProfile $crawlProfile |
||
110 | * |
||
111 | * @return $this |
||
112 | */ |
||
113 | public function setCrawlProfile(CrawlProfile $crawlProfile) |
||
119 | |||
120 | /** |
||
121 | * @param \Spatie\Crawler\Url|string $baseUrl |
||
122 | */ |
||
123 | public function startCrawling($baseUrl) |
||
141 | |||
142 | protected function startCrawlingQueue() |
||
173 | |||
174 | /** |
||
175 | * @param ResponseInterface|null $response |
||
176 | * @param int $index |
||
177 | */ |
||
178 | protected function handleResponse($response, int $index) |
||
184 | |||
185 | protected function getCrawlRequests(): Generator |
||
207 | |||
208 | protected function addAllLinksToCrawlQueue(string $html, Url $foundOnUrl) |
||
238 | |||
239 | protected function extractAllLinks(string $html, Url $foundOnUrl): Collection |
||
248 | |||
249 | /** |
||
250 | * @param \Spatie\Crawler\Url $url |
||
251 | * |
||
252 | * @return \Spatie\Crawler\Url |
||
253 | */ |
||
254 | protected function normalizeUrl(Url $url): Url |
||
258 | |||
259 | |||
260 | /** |
||
261 | * @param \Tree\Node\Node $node |
||
262 | * @param string $url |
||
263 | * @param string $parentUrl |
||
264 | * @param \Tree\Node\Node $newNode |
||
265 | */ |
||
266 | protected function addToLinkTree($node, string $url, string $parentUrl, &$newNode) { |
||
277 | |||
278 | } |
||
279 |