1 | <?php |
||
17 | class Crawler |
||
18 | { |
||
19 | /** @var \GuzzleHttp\Client */ |
||
20 | protected $client; |
||
21 | |||
22 | /** @var \Spatie\Crawler\Url */ |
||
23 | protected $baseUrl; |
||
24 | |||
25 | /** @var \Spatie\Crawler\CrawlObserver */ |
||
26 | protected $crawlObserver; |
||
27 | |||
28 | /** @var \Spatie\Crawler\CrawlProfile */ |
||
29 | protected $crawlProfile; |
||
30 | |||
31 | /** @var int */ |
||
32 | protected $concurrency; |
||
33 | |||
34 | /** @var \Spatie\Crawler\CrawlQueue */ |
||
35 | protected $crawlQueue; |
||
36 | |||
37 | /** @var int */ |
||
38 | protected $maximumDepth = 0; |
||
39 | |||
40 | /** @var \Tree\Node\Node */ |
||
41 | protected $linkTree; |
||
42 | |||
43 | /** |
||
44 | * @param array $clientOptions |
||
45 | * |
||
46 | * @return static |
||
47 | */ |
||
48 | public static function create(array $clientOptions = []) |
||
60 | |||
61 | public function __construct(Client $client, int $concurrency = 10) |
||
71 | |||
72 | /** |
||
73 | * @param int $concurrency |
||
74 | * |
||
75 | * @return $this |
||
76 | */ |
||
77 | public function setConcurrency(int $concurrency) |
||
83 | |||
/**
 * Store the given depth on the crawler and hand the crawler back so that
 * further setter calls can be chained.
 *
 * @param int $maximumDepth Value written to the $maximumDepth property
 *                          (that property is initialised to 0).
 *
 * @return $this
 */
public function setMaximumDepth(int $maximumDepth)
{
    $this->maximumDepth = $maximumDepth;
    return $this;
}
||
95 | |||
96 | /** |
||
97 | * @param \Spatie\Crawler\CrawlObserver $crawlObserver |
||
98 | * |
||
99 | * @return $this |
||
100 | */ |
||
101 | public function setCrawlObserver(CrawlObserver $crawlObserver) |
||
107 | |||
108 | /** |
||
109 | * @param \Spatie\Crawler\CrawlProfile $crawlProfile |
||
110 | * |
||
111 | * @return $this |
||
112 | */ |
||
113 | public function setCrawlProfile(CrawlProfile $crawlProfile) |
||
119 | |||
120 | /** |
||
121 | * @param \Spatie\Crawler\Url|string $baseUrl |
||
122 | */ |
||
123 | public function startCrawling($baseUrl) |
||
141 | |||
142 | protected function startCrawlingQueue() |
||
173 | |||
174 | /** |
||
175 | * @param ResponseInterface|null $response |
||
176 | * @param int $index |
||
177 | */ |
||
178 | protected function handleResponse($response, int $index) |
||
184 | |||
185 | protected function getCrawlRequests(): Generator |
||
207 | |||
208 | protected function addAllLinksToCrawlQueue(string $html, Url $foundOnUrl) |
||
236 | |||
237 | protected function extractAllLinks(string $html, Url $foundOnUrl): Collection |
||
246 | |||
247 | /** |
||
248 | * @param \Spatie\Crawler\Url $url |
||
249 | * |
||
250 | * @return \Spatie\Crawler\Url |
||
251 | */ |
||
252 | protected function normalizeUrl(Url $url): Url |
||
256 | |||
257 | |||
258 | /** |
||
259 | * @param \Tree\Node\Node $node |
||
260 | * @param string $url |
||
261 | * @param string $parentUrl |
||
262 | */ |
||
263 | protected function addToLinkTree($node, string $url, string $parentUrl) { |
||
283 | |||
284 | } |
||
285 |