Total Complexity | 55 |
Total Lines | 350 |
Duplicated Lines | 60.57 % |
Changes | 0 |
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like hyperactive.memory often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | # Author: Simon Blanke |
||
2 | # Email: [email protected] |
||
3 | # License: MIT License |
||
4 | |||
5 | import os |
||
6 | import glob |
||
7 | import json |
||
8 | import dill |
||
9 | import time |
||
10 | import datetime |
||
11 | import hashlib |
||
12 | import inspect |
||
13 | |||
14 | import numpy as np |
||
15 | import pandas as pd |
||
16 | |||
def apply_tobytes(df):
    """Return the raw byte representation of *df*'s underlying value array.

    Used to turn a row of integer positions into a hashable dictionary key.
    """
    raw_values = df.values
    return raw_values.tobytes()
class Memory:
    """Shared state for tracking evaluated positions and the best score.

    Sub-classes build short-term (in-process) and long-term (on-disk)
    storage on top of this state.
    """

    def __init__(self, _space_, _main_args_, _cand_):
        self._space_ = _space_
        self._main_args_ = _main_args_

        # Which memory backend was requested (taken from the main arguments).
        self.memory_type = _main_args_.memory
        # Maps an already-evaluated position (serialized to bytes) to its score.
        self.memory_dict = {}

        # Best position/score seen so far; nothing has been evaluated yet.
        self.pos_best = None
        self.score_best = -np.inf

        # Set to True once persisted meta-data has been read successfully.
        self.meta_data_found = False

        # Dimensionality of loaded meta-data; filled in when memory is loaded.
        self.n_dims = None
class ShortTermMemory(Memory):
    """Memory variant with no on-disk persistence.

    Adds nothing beyond the base-class state; evaluated positions live only
    in the process-local ``memory_dict``.
    """

    def __init__(self, _space_, _main_args_, _cand_):
        super().__init__(_space_, _main_args_, _cand_)
class LongTermMemory(Memory):
    """Disk-backed memory that persists evaluation meta-data across runs.

    Meta-data (parameter settings, scores, evaluation times) is written as
    CSV files under ``<module dir>/meta_data/<objective-source-hash>/`` so
    later runs with the same objective function and dataset can warm-start
    from previous evaluations.
    """

    def __init__(self, _space_, _main_args_, _cand_):
        super().__init__(_space_, _main_args_, _cand_)

        self.nth_process = _cand_.nth_process

        self.score_col_name = "mean_test_score"

        # Dataset identity: hashes of the raw feature/label buffers.
        self.feature_hash = self._get_hash(_main_args_.X)
        self.label_hash = self._get_hash(_main_args_.y)

        current_path = os.path.realpath(__file__)
        meta_learn_path, _ = current_path.rsplit("/", 1)

        # One sub-directory per run, stamped with the start time.
        self.datetime = "run_data/" + datetime.datetime.now().strftime(
            "%d.%m.%Y - %H:%M:%S"
        )
        # Objective functions are identified by a hash of their source code.
        func_str = self._get_func_str(_cand_.func_)
        self.func_path_ = self._get_hash(func_str.encode("utf-8")) + "/"

        self.meta_path = meta_learn_path + "/meta_data/"
        self.func_path = self.meta_path + self.func_path_
        self.date_path = self.meta_path + self.func_path_ + self.datetime + "/"

        if not os.path.exists(self.date_path):
            os.makedirs(self.date_path, exist_ok=True)

    def load_memory(self, _cand_, _verb_):
        """Load persisted meta-data for this objective function into memory.

        Does nothing if no meta-data is found on disk.
        """
        para, score = self._read_func_metadata(_cand_.func_, _verb_)
        if para is None or score is None:
            return

        _verb_.load_samples(para)
        _cand_.eval_time = list(para["eval_time"])

        self._load_data_into_memory(para, score)
        self.n_dims = len(para.columns)

    def save_memory(self, _main_args_, _opt_args_, _cand_):
        """Persist meta-data, objective source, search config and run info."""
        path = self._get_file_path(_cand_.func_)
        meta_data = self._collect(_cand_)

        meta_data["run"] = self.datetime
        self._save_toCSV(meta_data, path)

        # Save the objective function's source once per function hash.
        obj_func_path = self.func_path + "objective_function.py"
        if not os.path.exists(obj_func_path):
            # BUGFIX: use a context manager so the handle closes on error.
            with open(obj_func_path, "w") as file:
                file.write(self._get_func_str(_cand_.func_))

        search_config_path = self.date_path + "search_config.py"
        search_config_temp = dict(self._main_args_.search_config)

        # Replace callable keys by their names so the dict is printable.
        # BUGFIX: iterate over a snapshot of the keys — deleting from a dict
        # while iterating its live key view raises RuntimeError in Python 3.
        for key in list(search_config_temp.keys()):
            if isinstance(key, str):
                continue
            search_config_temp[key.__name__] = search_config_temp[key]
            del search_config_temp[key]

        search_config_str = "search_config = " + str(search_config_temp)

        if not os.path.exists(search_config_path):
            with open(search_config_path, "w") as file:
                file.write(search_config_str)

        run_data = {
            "random_state": self._main_args_.random_state,
            # BUGFIX: was `self._main_args_.random_state` (copy-paste error).
            "max_time": self._main_args_.max_time,
            "n_iter": self._main_args_.n_iter,
            "optimizer": self._main_args_.optimizer,
            "n_jobs": self._main_args_.n_jobs,
            "eval_time": np.array(_cand_.eval_time).sum(),
            "total_time": _cand_.total_time,
        }

        with open(self.date_path + "run_data.json", "w") as f:
            json.dump(run_data, f, indent=4)

    def _save_toCSV(self, meta_data_new, path):
        """Append new meta-data to the CSV at *path*, de-duplicating rows.

        Rows are considered duplicates when all non-score columns match.
        Refuses to save (with a warning) on a column-count mismatch.
        """
        if os.path.exists(path):
            meta_data_old = pd.read_csv(path)

            if len(meta_data_old.columns) != len(meta_data_new.columns):
                print("Warning meta data dimensionality does not match")
                print("Meta data will not be saved")
                return

            # BUGFIX: DataFrame.append was deprecated and removed in
            # pandas 2.0 — use pd.concat instead.
            meta_data = pd.concat(
                [meta_data_old, meta_data_new], ignore_index=True
            )

            columns = list(meta_data.columns)
            noScore = ["mean_test_score", "cv_default_score", "eval_time", "run"]
            columns_noScore = [c for c in columns if c not in noScore]

            meta_data = meta_data.drop_duplicates(subset=columns_noScore)
        else:
            meta_data = meta_data_new

        meta_data.to_csv(path, index=False)

    def _read_func_metadata(self, model_func, _verb_):
        """Read all matching meta-data CSVs.

        Returns a (parameters, scores) DataFrame pair, or (None, None) when
        no meta-data exists for this function/dataset combination.
        """
        paths = self._get_func_data_names()

        meta_data_list = []
        for path in paths:
            meta_data = pd.read_csv(path)
            meta_data_list.append(meta_data)
            self.meta_data_found = True

        if len(meta_data_list) > 0:
            meta_data = pd.concat(meta_data_list, ignore_index=True)

            column_names = meta_data.columns
            # Score columns are those whose name contains score_col_name.
            score_name = [name for name in column_names if self.score_col_name in name]

            para = meta_data.drop(score_name, axis=1)
            score = meta_data[score_name]

            _verb_.load_meta_data()
            return para, score

        else:
            _verb_.no_meta_data(model_func)
            return None, None

    def _get_opt_meta_data(self):
        """Convert the in-memory position->score dict into a results dict.

        Non-primitive parameter values are pickled to disk and replaced by
        their hash so the meta-data stays CSV-serializable.  Entries with a
        score of 0 are skipped.
        """
        results_dict = {}
        para_list = []
        score_list = []

        for pos_key in self.memory_dict.keys():
            # BUGFIX: np.fromstring is deprecated for binary input; the keys
            # were produced by ndarray.tobytes(), so frombuffer is the
            # correct counterpart.
            pos = np.frombuffer(pos_key, dtype=int)
            para = self._space_.pos2para(pos)
            score = self.memory_dict[pos_key]

            # BUGFIX: inner loop previously shadowed the outer variable `key`.
            for para_key in para.keys():
                if (
                    not isinstance(para[para_key], int)
                    and not isinstance(para[para_key], float)
                    and not isinstance(para[para_key], str)
                ):
                    para_dill = dill.dumps(para[para_key])
                    para_hash = self._get_hash(para_dill)

                    with open(
                        self.func_path + str(para_hash) + ".pkl", "wb"
                    ) as pickle_file:
                        dill.dump(para_dill, pickle_file)

                    para[para_key] = para_hash

            if score != 0:
                para_list.append(para)
                score_list.append(score)

        results_dict["params"] = para_list
        results_dict["mean_test_score"] = score_list

        return results_dict

    def _load_data_into_memory(self, paras, scores):
        """Fill memory_dict from loaded meta-data and track the best entry."""
        # Replace persisted hashes by the original (unpickled) objects.
        paras = paras.replace(self._hash2obj())
        pos = self.para2pos(paras)

        if len(pos) == 0:
            return

        df_temp = pd.DataFrame()
        df_temp["pos_str"] = pos.apply(apply_tobytes, axis=1)
        df_temp["score"] = scores

        self.memory_dict = df_temp.set_index("pos_str").to_dict()["score"]

        scores = np.array(scores)
        paras = np.array(paras)

        idx = np.argmax(scores)
        self.score_best = scores[idx]
        # NOTE(review): this stores the best *parameter row*, not a position
        # in search-space coordinates — confirm callers expect that.
        self.pos_best = paras[idx]

    def apply_index(self, pos_key, df):
        """Return the index of value *df* in the search-space list for
        *pos_key*, or None if the value is not part of the search space."""
        space_values = self._space_.search_space[pos_key]
        return space_values.index(df) if df in space_values else None

    def para2pos(self, paras):
        """Translate parameter values into integer search-space positions.

        Rows containing values absent from the search space are dropped.
        """
        from functools import partial

        paras = paras[self._space_.para_names]
        pos = paras.copy()

        for pos_key in self._space_.search_space:
            pos[pos_key] = paras[pos_key].apply(partial(self.apply_index, pos_key))

        pos.dropna(how="any", inplace=True)
        pos = pos.astype("int64")

        return pos

    def _collect(self, _cand_):
        """Assemble parameters, scores and eval times into one DataFrame."""
        results_dict = self._get_opt_meta_data()

        para_pd = pd.DataFrame(results_dict["params"])
        metric_pd = pd.DataFrame(
            results_dict["mean_test_score"], columns=["mean_test_score"]
        )

        # Only the eval times belonging to the collected rows are kept.
        eval_time = pd.DataFrame(
            _cand_.eval_time[-len(para_pd):], columns=["eval_time"]
        )
        md_model = pd.concat(
            [para_pd, metric_pd, eval_time], axis=1, ignore_index=False
        )

        return md_model

    def _get_hash(self, data):
        """Return the hex SHA-1 digest of a bytes-like object.

        (Parameter renamed from `object`, which shadowed the builtin.)
        """
        return hashlib.sha1(data).hexdigest()

    def _get_func_str(self, func):
        """Return the source code of *func* as a string."""
        return inspect.getsource(func)

    def _get_subdirs(self):
        """List all sub-directories below this function's meta-data path."""
        return glob.glob(self.func_path + "*/")

    def _get_func_data_names1(self):
        """Collect every meta-data CSV across all run sub-directories."""
        path_list = []
        for subdir in self._get_subdirs():
            path_list += glob.glob(subdir + "*.csv")

        return path_list

    def _get_func_data_names(self):
        """Return CSV paths matching this run's feature/label hashes."""
        return glob.glob(
            self.func_path + (self.feature_hash + "_" + self.label_hash + "_.csv")
        )

    def _read_dill(self, value):
        """Load the pickled object stored for *value*'s hash.

        Returns *value* unchanged when no matching pickle file exists.
        """
        paths = self._get_pkl_hash(value)
        for path in paths:
            with open(path, "rb") as fp:
                value = dill.load(fp)
                value = dill.loads(value)
            break  # only the first match is used

        return value

    def _hash2obj(self):
        """Map every persisted parameter hash back to its unpickled object."""
        hash2obj_dict = {}

        for para_hash in self._get_para_hash_list():
            hash2obj_dict[para_hash] = self._read_dill(para_hash)

        return hash2obj_dict

    def _get_para_hash_list(self):
        """Hashes of all non-primitive values appearing in the search space."""
        para_hash_list = []
        for key in self._space_.search_space.keys():
            values = self._space_.search_space[key]

            for value in values:
                if (
                    not isinstance(value, int)
                    and not isinstance(value, float)
                    and not isinstance(value, str)
                ):
                    para_dill = dill.dumps(value)
                    para_hash = self._get_hash(para_dill)
                    para_hash_list.append(para_hash)

        return para_hash_list

    def _get_pkl_hash(self, hash):
        """Return all pickle files whose name starts with *hash*."""
        return glob.glob(self.func_path + hash + "*.pkl")

    def _get_file_path(self, model_func):
        """Ensure the run directory exists; return this dataset's CSV path."""
        if not os.path.exists(self.date_path):
            os.makedirs(self.date_path)

        return self.func_path + (self.feature_hash + "_" + self.label_hash + "_.csv")