1 | <?php |
||
11 | class Crawler |
||
12 | { |
||
13 | /** |
||
14 | * @var \GuzzleHttp\Client |
||
15 | */ |
||
16 | protected $client; |
||
17 | |||
18 | /** |
||
19 | * @var \Spatie\Crawler\Url |
||
20 | */ |
||
21 | protected $baseUrl; |
||
22 | |||
23 | /** |
||
24 | * @var \Illuminate\Support\Collection |
||
25 | */ |
||
26 | protected $crawledUrls; |
||
27 | |||
28 | /** |
||
29 | * @var \Spatie\Crawler\CrawlObserver |
||
30 | */ |
||
31 | protected $crawlObserver; |
||
32 | |||
33 | /** |
||
34 | * @var \Spatie\Crawler\CrawlProfile |
||
35 | */ |
||
36 | protected $crawlProfile; |
||
37 | |||
38 | /** |
||
39 | * @return static |
||
40 | */ |
||
41 | public static function create() |
||
50 | |||
/**
 * Build a crawler around the given HTTP client.
 *
 * Besides storing the client, this sets the crawl profile to a new
 * CrawlAllUrls instance and initialises the crawled-url list with collect().
 *
 * @param \GuzzleHttp\Client $client
 */
public function __construct(Client $client)
{
    $this->client = $client;

    // Bookkeeping of urls that have been crawled.
    $this->crawledUrls = collect();

    // Default profile; the class also declares setCrawlProfile() to change it.
    $this->crawlProfile = new CrawlAllUrls();
}
||
62 | |||
/**
 * Store the crawl observer on this crawler.
 *
 * @param \Spatie\Crawler\CrawlObserver $crawlObserver
 *
 * @return $this
 */
public function setCrawlObserver(CrawlObserver $crawlObserver)
{
    $this->crawlObserver = $crawlObserver;

    // Hand back the crawler itself so setter calls can be chained.
    return $this;
}
||
76 | |||
77 | /** |
||
78 | * Set the crawl profile. |
||
79 | * |
||
80 | * @param \Spatie\Crawler\CrawlProfile $crawlProfile |
||
81 | * |
||
82 | * @return $this |
||
83 | */ |
||
84 | public function setCrawlProfile(CrawlProfile $crawlProfile) |
||
90 | |||
91 | /** |
||
92 | * Start the crawling process. |
||
93 | * |
||
94 | * @param \Spatie\Crawler\Url|string $baseUrl |
||
95 | * |
||
96 | * @throws \Spatie\Crawler\Exceptions\InvalidBaseUrl |
||
97 | */ |
||
98 | public function startCrawling($baseUrl) |
||
114 | |||
115 | /** |
||
116 | * Crawl the given url. |
||
117 | * |
||
118 | * @param \Spatie\Crawler\Url $url |
||
119 | */ |
||
120 | protected function crawlUrl(Url $url) |
||
150 | |||
151 | /** |
||
152 | * Crawl all links in the given html. |
||
153 | * |
||
154 | * @param string $html |
||
155 | */ |
||
156 | protected function crawlAllLinks($html) |
||
178 | |||
179 | /** |
||
180 | * Get all links in the given html. |
||
181 | * |
||
182 | * @param string $html |
||
183 | * |
||
184 | * @return \Spatie\Crawler\Url[] |
||
185 | */ |
||
186 | protected function getAllLinks($html) |
||
196 | |||
197 | /** |
||
198 | * Determine if the crawler has already crawled the given url. |
||
199 | * |
||
200 | * @param \Spatie\Crawler\Url $url |
||
201 | * |
||
202 | * @return bool |
||
203 | */ |
||
204 | protected function hasAlreadyCrawled(Url $url) |
||
214 | |||
215 | /** |
||
216 | * Normalize the given url. |
||
217 | * |
||
218 | * @param \Spatie\Crawler\Url $url |
||
219 | * |
||
220 | * @return $this |
||
221 | */ |
||
222 | protected function normalizeUrl(Url $url) |
||
236 | } |
||
237 |