| Metric | Value |
| --- | --- |
| Total Complexity | 44 |
| Total Lines | 279 |
| Duplicated Lines | 0 % |
| Changes | 1 |
| Bugs | 0 |
| Features | 1 |
Complex classes like CrawlerConfig often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use CrawlerConfig, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
8 | class CrawlerConfig |
||
9 | { |
||
/**
 * @var string user agent sent with every HTTP request during the crawl
 */
protected $userAgent;

/**
 * @var string crawl id — also the name of the data sub-folder for this crawl
 */
protected $id;

/**
 * @var string page(s) to ignore during the crawl
 *             (exact format — single URI or pattern — not visible here; confirm in the constructor)
 */
protected $ignore;

/**
 * @var int maximum depth to crawl
 */
protected $limit;

/**
 * @var string scheme + host extracted from the start URL,
 *             e.g. `https://domain.tld` (see setBaseAndStartUrl())
 */
protected $base;

/**
 * @var int presumably a delay applied between requests — confirm in the crawler loop
 */
protected $wait;

/**
 * @var int cache strategy selector (semantics of each value not visible here)
 */
protected $cacheMethod;

/**
 * @var string root directory where crawl data folders are stored
 */
protected $dataDirectory;

/**
 * @var string path part of the start URL; always begins with `/`
 *             (see setBaseAndStartUrl())
 */
protected $startUrl;
||
54 | |||
55 | public function __construct( |
||
73 | } |
||
74 | |||
75 | /** |
||
76 | * @return string id |
||
77 | */ |
||
/**
 * Find the id of the most recently modified crawl directory.
 *
 * Each crawl is stored in its own sub-folder of $dataDirectory, named after
 * its id; the "last" crawl is the sub-folder with the newest mtime.
 *
 * @param string $dataDirectory folder containing one sub-folder per crawl
 *
 * @return string id of the last crawl
 *
 * @throws \Exception when the directory cannot be read or no crawl exists
 */
public static function getLastCrawl(string $dataDirectory): string
{
    $entries = scandir($dataDirectory);
    // scandir() returns false (with a warning) on an unreadable path; fail
    // loudly instead of silently iterating over nothing.
    if (false === $entries) {
        throw new \Exception('Data directory not readable `'.$dataDirectory.'`');
    }

    $lastCrawl = null;
    $lastRunAt = null;

    foreach ($entries as $entry) {
        $path = $dataDirectory.'/'.$entry;
        if ('.' != $entry && '..' != $entry
            && is_dir($path)
            && filemtime($path) > $lastRunAt) {
            $lastCrawl = $entry;
            $lastRunAt = filemtime($path);
        }
    }

    if (null === $lastCrawl) {
        // Message grammar fixed ("runned" -> "run").
        throw new \Exception('No crawl previously run');
    }

    return $lastCrawl;
}
||
99 | |||
/**
 * Load a previously recorded crawl configuration from its config.json.
 *
 * @param string      $crawlId       crawl id, or the special value 'last'
 *                                   to load the most recent crawl
 * @param string|null $dataDirectory root data folder (defaults inside
 *                                   getDataFolderFrom())
 *
 * @throws \Exception when the crawl, its config file, or its JSON content
 *                    cannot be found/parsed
 */
public static function loadFrom(string $crawlId, ?string $dataDirectory = null): self
{
    if ('last' === $crawlId) {
        $crawlId = self::getLastCrawl(rtrim(self::getDataFolderFrom('', $dataDirectory), '/'));
    }

    $configFilePath = self::getDataFolderFrom($crawlId, $dataDirectory).'/config.json';
    if (!file_exists($configFilePath)) {
        throw new \Exception('Crawl `'.$crawlId.'` not found.');
    }

    $config = json_decode(file_get_contents($configFilePath), true);
    // A corrupted or hand-edited config.json previously surfaced as opaque
    // "undefined index" errors below; fail with an explicit message instead.
    if (!is_array($config)) {
        throw new \Exception('Invalid config file `'.$configFilePath.'`');
    }

    return (new self(
        $config['base'].$config['startUrl'],
        $config['ignore'],
        intval($config['limit']),
        (string) $config['userAgent'],
        intval($config['cacheMethod']),
        intval($config['wait']),
        $dataDirectory
    ))->setId($crawlId);
}
||
122 | |||
/**
 * Persist the current configuration as config.json inside the crawl's data
 * folder, so the crawl can later be re-hydrated with loadFrom().
 */
public function recordConfig()
{
    $config = [
        'startUrl' => $this->startUrl,
        'base' => $this->base,
        'ignore' => $this->ignore,
        'limit' => $this->limit,
        'userAgent' => $this->userAgent,
        'cacheMethod' => $this->cacheMethod,
        'wait' => $this->wait,
    ];

    file_put_contents($this->getDataFolder().'/config.json', json_encode($config));
}
||
135 | |||
/**
 * Split the start URL into $base (scheme + host) and $startUrl (path).
 *
 * @param string $url full start URL, e.g. https://example.org/page
 *
 * @throws \Exception when $url does not pass FILTER_VALIDATE_URL
 */
protected function setBaseAndStartUrl(string $url)
{
    if (!filter_var($url, FILTER_VALIDATE_URL)) {
        throw new \Exception('start is not a valid URL `'.$url.'`');
    }

    // Everything up to the first '/', '?' or '#' after the optional scheme,
    // e.g. `https://example.org`; falls back to the whole URL if the regex
    // does not match.
    $this->base = preg_match('@^(http://|https://)?[^/\?#]+@', $url, $match) ? $match[0] : $url;

    // Remainder after the base is the path (+query/fragment).
    $url = substr($url, strlen($this->base));

    // Guarantee a leading '/': an empty remainder becomes '/'.
    $this->startUrl = (!isset($url[0]) || '/' != $url[0] ? '/' : '').$url;
}
||
148 | |||
/**
 * Build the data folder path for a given crawl id.
 *
 * @param string      $id   crawl id (may be '' to address the root folder)
 * @param string|null $path root data folder; defaults to <src>/../data
 */
public static function getDataFolderFrom(string $id, ?string $path)
{
    $root = $path ?? __DIR__.'/../data';

    return $root.'/'.$id;
}
||
153 | |||
/**
 * Data folder of the current crawl: root data directory + crawl id.
 */
public function getDataFolder()
{
    return sprintf('%s/%s', $this->dataDirectory, $this->id);
}
||
158 | |||
/**
 * @return string crawl id
 */
public function getId()
{
    return $this->id;
}

/**
 * @return string scheme + host of the start URL, e.g. `https://example.org`
 */
public function getBase()
{
    return $this->base;
}
||
168 | |||
169 | public function getStartUrl() |
||
172 | } |
||
173 | |||
/**
 * @return int wait value — presumably a delay between requests; confirm at call sites
 */
public function getWait()
{
    return $this->wait;
}

/**
 * @return string user agent used during the crawl
 */
public function getUserAgent()
{
    return $this->userAgent;
}

/**
 * @return int maximum depth to crawl
 */
public function getLimit()
{
    return $this->limit;
}

/**
 * @return int cache strategy selector
 */
public function getCacheMethod()
{
    return $this->cacheMethod;
}
||
193 | |||
/**
 * @return string root directory where crawl data folders are stored
 */
public function getDataDirectory()
{
    // Bug fix: the original evaluated the property without returning it,
    // so this getter always returned null.
    return $this->dataDirectory;
}
||
198 | |||
/**
 * @var RobotsTxt in-memory robots.txt representation — presumably consulted
 *                to decide which URLs may be crawled; confirm in getVirtualRobots()
 */
protected $virtualRobots;
||
201 | |||
202 | public function getVirtualRobots() |
||
209 | } |
||
210 | |||
/**
 * @param string $id crawl id (also the name of the crawl's data sub-folder)
 *
 * @return self fluent interface
 */
public function setId(string $id): self
{
    $this->id = $id;

    return $this;
}
||
217 | |||
/**
 * Re-hydrate the URL list of a previous crawl from its data.csv file.
 *
 * @return array{urls: array<string, Url>, counter: int, currentClick: mixed}
 *               urls keyed by URI, count of crawlable URLs, and the click
 *               depth of the last record
 *
 * @throws \Exception when data.csv is missing in the crawl's data folder
 */
public function getDataFromPreviousCrawl()
{
    $dataFilePath = $this->getDataFolder().'/data.csv';
    if (!file_exists($dataFilePath)) {
        // Bug fix: the message previously referenced index.csv although this
        // method reads data.csv.
        throw new \Exception('Previous crawl\'s data not found (data.csv)');
    }

    $urls = [];
    $counter = 0;

    $csv = Reader::createFromPath($dataFilePath, 'r');
    $csv->setHeaderOffset(0);

    $records = $csv->getRecords();
    foreach ($records as $r) {
        $urls[$r['uri']] = new Url($this->base.$r['uri'], 0);
        foreach ($r as $k => $v) {
            // Only cast when non-empty so '' stays '' rather than becoming false.
            if ('can_be_crawled' == $k && !empty($v)) {
                $v = (bool) $v;
            }
            $urls[$r['uri']]->$k = $v;
        }
        if (!empty($r['can_be_crawled'])) {
            ++$counter;
        }
    }

    // $r still holds the last record after the loop; `??` also covers the
    // empty-file case where $r was never set.
    $currentClick = $r['click'] ?? 0;

    return [
        'urls' => $urls,
        'counter' => $counter,
        'currentClick' => $currentClick,
    ];
}
||
253 | |||
// NOTE: could be extracted into another class.
protected $index;

/**
 * Lazily load the id => Url index of a previous crawl from index.csv.
 *
 * The result is memoised in $this->index; subsequent calls return the
 * cached array without touching the filesystem again.
 *
 * @return array memoised index of Url objects keyed by record id
 *
 * @throws \Exception when index.csv is missing in the crawl's data folder
 */
protected function getIndexFromPreviousCrawl()
{
    if (null !== $this->index) {
        return $this->index;
    }

    $this->index = [];

    $indexFilePath = $this->getDataFolder().'/index.csv';
    if (!file_exists($indexFilePath)) {
        throw new \Exception('Previous crawl\'s data not found (index.csv)');
    }

    $reader = Reader::createFromPath($indexFilePath, 'r');
    $reader->setHeaderOffset(0);

    foreach ($reader->getRecords() as $record) {
        $url = new Url($this->base.$record['uri'], 0);
        $url->id = $record['id'];
        $this->index[$record['id']] = $url;
    }

    return $this->index;
}
||
281 | |||
282 | public function getUrlFromId($id, $base = true) |
||
287 | } |
||
288 | } |
||
289 |