Total Complexity | 53
Total Lines      | 319
Duplicated Lines | 0 %
Changes          | 1
Bugs             | 0
Features         | 1
Complex classes like CrawlerConfig often do a lot of different things. To break such a class down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.
Once you have determined which fields belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use CrawlerConfig and, based on these observations, apply Extract Interface as well.
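In CrawlerConfig, for instance, the `$index` property and the `*FromPreviousCrawl` methods in the listing below share a suffix and only need the data folder and the base URL. A minimal sketch of what Extract Class could look like for that group follows; the class name `PreviousCrawlData` and its constructor are hypothetical and not part of the current code.

<?php

// Hypothetical sketch only: illustrates Extract Class, not existing code.
// It groups the "previous crawl" readers behind one small collaborator.
class PreviousCrawlData
{
    /** @var string path to the crawl's data folder */
    protected $dataFolder;

    /** @var string scheme and host of the start URL, e.g. https://domain.tld */
    protected $base;

    public function __construct(string $dataFolder, string $base)
    {
        $this->dataFolder = $dataFolder;
        $this->base = $base;
    }

    // getDataFromPreviousCrawl(), getIndexFromPreviousCrawl() and
    // getUrlFromId() would move here unchanged, reading
    // $this->dataFolder.'/data.csv' and $this->dataFolder.'/index.csv'
    // instead of $this->getDataFolder().
}

CrawlerConfig would then keep a single accessor (for example a lazily built `previousCrawl()` getter), and callers of the moved methods would depend on the smaller class instead of the whole config object.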
<?php

class CrawlerConfig
{
    /**
     * @var string user agent used during the crawl
     */
    protected $userAgent;

    /**
     * @var string crawl id
     */
    protected $id;

    /**
     * @var string pages to ignore during the crawl
     */
    protected $ignore;

    /**
     * @var int maximum depth to crawl
     */
    protected $limit;

    /**
     * @var string https://domain.tld part of the start URL
     */
    protected $base;

    /**
     * @var int
     */
    protected $wait;

    /**
     * @var int
     */
    protected $cacheMethod;

    /**
     * @var string
     */
    protected $dataDirectory;

    /**
     * @var string
     */
    protected $startUrl;

    protected $request;
    protected $robotsTxt;

    /** @var Recorder */
    protected $recorder;

    public function __construct(
        // … constructor parameters and body collapsed in the report
    }
    /**
     * @return string id
     */
    public static function getLastCrawl(string $dataDirectory): string
    {
        $dir = scandir($dataDirectory);
        $lastCrawl = null;
        $lastRunAt = null;

        foreach ($dir as $file) {
            if ('.' != $file && '..' != $file
                && is_dir($dataDirectory.'/'.$file)
                && filemtime($dataDirectory.'/'.$file) > $lastRunAt) {
                $lastCrawl = $file;
                $lastRunAt = filemtime($dataDirectory.'/'.$file);
            }
        }

        if (null === $lastCrawl) {
            throw new \Exception('No crawl previously run');
        }

        return $lastCrawl;
    }
    public static function loadFrom(string $crawlId, ?string $dataDirectory = null): self
    {
        if ('last' === $crawlId) {
            $crawlId = self::getLastCrawl(rtrim(self::getDataFolderFrom('', $dataDirectory), '/'));
        }

        $configFilePath = self::getDataFolderFrom($crawlId, $dataDirectory).'/config.json';
        if (!file_exists($configFilePath)) {
            throw new \Exception('Crawl `'.$crawlId.'` not found.');
        }
        $config = json_decode(file_get_contents($configFilePath), true);

        return (new self(
            $config['base'].$config['startUrl'],
            $config['ignore'],
            intval($config['limit']),
            (string) $config['userAgent'],
            intval($config['cacheMethod']),
            intval($config['wait']),
            $dataDirectory
        ))->setId($crawlId);
    }
    public function recordConfig()
    {
        $this->getRecorder(); // calling the recorder ensures the data folder exists
        file_put_contents($this->getDataFolder().'/config.json', json_encode([
            'startUrl' => $this->startUrl,
            'base' => $this->base,
            'ignore' => $this->ignore,
            'limit' => $this->limit,
            'userAgent' => $this->userAgent,
            'cacheMethod' => $this->cacheMethod,
            'wait' => $this->wait,
        ]));
    }
    /**
     * Split the start URL into its base (scheme + host) and its path.
     */
    protected function setBaseAndStartUrl(string $url)
    {
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            throw new \Exception('start is not a valid URL `'.$url.'`');
        }

        $this->base = preg_match('@^(http://|https://)?[^/\?#]+@', $url, $match) ? $match[0] : $url;

        $url = substr($url, strlen($this->base));

        $this->startUrl = (!isset($url[0]) || '/' != $url[0] ? '/' : '').$url;
    }
    public static function getDataFolderFrom(string $id, ?string $path)
    {
        return ($path ?? __DIR__.'/../data').'/'.$id;
    }

    public function getDataFolder()
    {
        return $this->dataDirectory.'/'.$this->id;
    }

    public function getId()
    {
        return $this->id;
    }

    public function getBase()
    {
        return $this->base;
    }

    public function getStartUrl()
    {
        // … collapsed in the report
    }

    public function getWait()
    {
        return $this->wait;
    }

    public function getUserAgent()
    {
        return $this->userAgent;
    }

    public function getLimit()
    {
        return $this->limit;
    }

    public function getCacheMethod()
    {
        return $this->cacheMethod;
    }

    public function getDataDirectory()
    {
        return $this->dataDirectory;
    }
    /** @var RobotsTxt */
    protected $virtualRobots;

    public function getVirtualRobots()
    {
        // … collapsed in the report
    }

    public function setId(string $id): self
    {
        $this->id = $id;

        return $this;
    }
    public function getDataFromPreviousCrawl()
    {
        $dataFilePath = $this->getDataFolder().'/data.csv';
        if (!file_exists($dataFilePath)) {
            throw new \Exception('Previous crawl\'s data not found (data.csv)');
        }

        $urls = [];
        $counter = 0;

        $csv = Reader::createFromPath($dataFilePath, 'r');
        $csv->setHeaderOffset(0);

        $records = $csv->getRecords();
        foreach ($records as $r) {
            $urls[$r['uri']] = new Url($this->base.$r['uri'], 0);
            foreach ($r as $k => $v) {
                if ('can_be_crawled' == $k && !empty($v)) {
                    $v = (bool) $v;
                }
                $urls[$r['uri']]->$k = $v;
            }
            if (!empty($r['can_be_crawled'])) {
                ++$counter;
            }
        }

        $currentClick = $r['click'] ?? 0;

        return [
            'urls' => $urls,
            'counter' => $counter,
            'currentClick' => $currentClick,
        ];
    }
    // could be moved to another class
    protected $index;

    protected function getIndexFromPreviousCrawl()
    {
        if (null !== $this->index) {
            return $this->index;
        }

        $this->index = [];

        $indexFilePath = $this->getDataFolder().'/index.csv';
        if (!file_exists($indexFilePath)) {
            throw new \Exception('Previous crawl\'s data not found (index.csv)');
        }

        $csv = Reader::createFromPath($indexFilePath, 'r');
        $csv->setHeaderOffset(0);

        $records = $csv->getRecords();
        foreach ($records as $r) {
            $this->index[$r['id']] = new Url($this->base.$r['uri'], 0);
            $this->index[$r['id']]->id = $r['id'];
        }

        return $this->index;
    }
    public function getUrlFromId($id, $base = true)
    {
        // … collapsed in the report
    }

    public function cacheRequest($harvest)
    {
        if ($harvest instanceof Harvest && null !== $harvest->getResponse()->getRequest()) {
            // … collapsed in the report
    }

    public function getRequestCached()
    {
        return $this->request;
    }

    public function cacheRobotsTxt($harvest)
    {
        if (null === $this->robotsTxt && $harvest instanceof Harvest) {
            $this->robotsTxt = $harvest->getRobotsTxt();
        }

        return $this;
    }

    public function getRobotsTxtCached()
    {
        // … collapsed in the report
    }

    public function getRecorder()
    {
        // … collapsed in the report
    }
}
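To support the Extract Interface step mentioned above, one option is a read-only interface covering the getters that consumers of CrawlerConfig actually call. The sketch below is hypothetical: the interface name and the selection of methods are illustrative only, and the concrete getters would need matching return type declarations before the class could implement it.

<?php

// Hypothetical sketch only: a read-only view of the configuration.
// CrawlerConfig would declare `implements CrawlConfigInterface`.
interface CrawlConfigInterface
{
    public function getBase(): string;

    public function getStartUrl(): string;

    public function getUserAgent(): string;

    public function getLimit(): int;

    public function getCacheMethod(): int;

    public function getWait(): int;

    public function getDataFolder(): string;
}

Classes that only read the configuration could then type-hint CrawlConfigInterface, leaving the mutable and cache-related parts of CrawlerConfig out of their contract.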