Completed: Push to master ( 75ac4c...d2135c ) by Simon, created 12:39

LongTermMemory._get_func_str()   A

Complexity:   Conditions 1
Size:         Total Lines 2, Code Lines 2
Duplication:  Lines 0, Ratio 0 %
Importance:   Changes 0

Metric   Value
eloc     2
dl       0
loc      2
rs       10
c        0
b        0
f        0
cc       1
nop      2
# Author: Simon Blanke
# Email: [email protected]
# License: MIT License

import os
import glob
import json
import dill
import datetime
import hashlib
import inspect

import numpy as np
import pandas as pd

from functools import partial


def apply_tobytes(df):
    # Serialize the underlying numpy values to raw bytes so they can be used as dict keys.
    return df.values.tobytes()


class Memory:
    def __init__(self, _space_, _main_args_, _cand_):
        self._space_ = _space_
        self._main_args_ = _main_args_

        self.pos_best = None
        self.score_best = -np.inf

        self.memory_type = _main_args_.memory
        self.memory_dict = {}

        self.meta_data_found = False

        self.n_dims = None


class ShortTermMemory(Memory):
    def __init__(self, _space_, _main_args_, _cand_):
        super().__init__(_space_, _main_args_, _cand_)


class LongTermMemory(Memory):
    def __init__(self, _space_, _main_args_, _cand_):
        super().__init__(_space_, _main_args_, _cand_)

        self.nth_process = _cand_.nth_process

        self.score_col_name = "mean_test_score"

        self.feature_hash = self._get_hash(_main_args_.X)
        self.label_hash = self._get_hash(_main_args_.y)

        current_path = os.path.realpath(__file__)
        meta_learn_path, _ = current_path.rsplit("/", 1)

        self.datetime = "run_data/" + datetime.datetime.now().strftime(
            "%d.%m.%Y - %H:%M:%S"
        )
        # The objective function's source is hashed to build a stable per-function directory.
        func_str = self._get_func_str(_cand_.func_)
        self.func_path_ = self._get_hash(func_str.encode("utf-8")) + "/"

        self.meta_path = meta_learn_path + "/meta_data/"
        self.func_path = self.meta_path + self.func_path_
        self.date_path = self.meta_path + self.func_path_ + self.datetime + "/"

        if not os.path.exists(self.date_path):
            os.makedirs(self.date_path, exist_ok=True)

    def load_memory(self, _cand_, _verb_):
        para, score = self._read_func_metadata(_cand_.func_, _verb_)
        if para is None or score is None:
            return

        _verb_.load_samples(para)
        _cand_.eval_time = list(para["eval_time"])

        self._load_data_into_memory(para, score)
        self.n_dims = len(para.columns)

    def save_memory(self, _main_args_, _opt_args_, _cand_):
        path = self._get_file_path(_cand_.func_)
        meta_data = self._collect(_cand_)

        meta_data["run"] = self.datetime
        self._save_toCSV(meta_data, path)

        obj_func_path = self.func_path + "objective_function.py"
        if not os.path.exists(obj_func_path):
            with open(obj_func_path, "w") as file:
                file.write(self._get_func_str(_cand_.func_))

        search_config_path = self.date_path + "search_config.py"
        search_config_temp = dict(self._main_args_.search_config)

        # Iterate over a copy of the keys: non-string keys (model callables) are
        # replaced by their names, which mutates the dict during the loop.
        for key in list(search_config_temp.keys()):
            if isinstance(key, str):
                continue
            search_config_temp[key.__name__] = search_config_temp[key]
            del search_config_temp[key]

        search_config_str = "search_config = " + str(search_config_temp)

        if not os.path.exists(search_config_path):
            with open(search_config_path, "w") as file:
                file.write(search_config_str)

        """
        os.chdir(self.date_path)
        os.system("black search_config.py")
        os.getcwd()
        """

        run_data = {
            "random_state": self._main_args_.random_state,
            # assumption: _main_args_ exposes max_time (the source reused random_state here)
            "max_time": self._main_args_.max_time,
            "n_iter": self._main_args_.n_iter,
            "optimizer": self._main_args_.optimizer,
            "n_jobs": self._main_args_.n_jobs,
            "eval_time": np.array(_cand_.eval_time).sum(),
            "total_time": _cand_.total_time,
        }

        with open(self.date_path + "run_data.json", "w") as f:
            json.dump(run_data, f, indent=4)

        """
        print("_opt_args_.kwargs_opt", _opt_args_.kwargs_opt)

        opt_para = pd.DataFrame.from_dict(_opt_args_.kwargs_opt, dtype=object)
        print("opt_para", opt_para)
        opt_para.to_csv(self.date_path + "opt_para", index=False)
        """

    def _save_toCSV(self, meta_data_new, path):
        if os.path.exists(path):
            meta_data_old = pd.read_csv(path)

            if len(meta_data_old.columns) != len(meta_data_new.columns):
                print("Warning: meta data dimensionality does not match")
                print("Meta data will not be saved")
                return

            # DataFrame.append was removed in pandas 2.x; concat is the supported replacement.
            meta_data = pd.concat([meta_data_old, meta_data_new])

            columns = list(meta_data.columns)
            noScore = ["mean_test_score", "cv_default_score", "eval_time", "run"]
            columns_noScore = [c for c in columns if c not in noScore]

            meta_data = meta_data.drop_duplicates(subset=columns_noScore)
        else:
            meta_data = meta_data_new

        meta_data.to_csv(path, index=False)

    def _read_func_metadata(self, model_func, _verb_):
        paths = self._get_func_data_names()

        meta_data_list = []
        for path in paths:
            meta_data = pd.read_csv(path)
            meta_data_list.append(meta_data)
            self.meta_data_found = True

        if len(meta_data_list) > 0:
            meta_data = pd.concat(meta_data_list, ignore_index=True)

            column_names = meta_data.columns
            score_name = [name for name in column_names if self.score_col_name in name]

            para = meta_data.drop(score_name, axis=1)
            score = meta_data[score_name]

            _verb_.load_meta_data()
            return para, score

        else:
            _verb_.no_meta_data(model_func)
            return None, None

    def _get_opt_meta_data(self):
        results_dict = {}
        para_list = []
        score_list = []

        for key in self.memory_dict.keys():
            # keys are the raw position bytes produced by apply_tobytes
            pos = np.frombuffer(key, dtype=int)
            para = self._space_.pos2para(pos)
            score = self.memory_dict[key]

            # non-primitive parameter values are pickled and replaced by their hash
            for para_key in para.keys():
                if (
                    not isinstance(para[para_key], int)
                    and not isinstance(para[para_key], float)
                    and not isinstance(para[para_key], str)
                ):

                    para_dill = dill.dumps(para[para_key])
                    para_hash = self._get_hash(para_dill)

                    with open(
                        self.func_path + str(para_hash) + ".pkl", "wb"
                    ) as pickle_file:
                        dill.dump(para_dill, pickle_file)

                    para[para_key] = para_hash

            if score != 0:
                para_list.append(para)
                score_list.append(score)

        results_dict["params"] = para_list
        results_dict["mean_test_score"] = score_list

        return results_dict

    def _load_data_into_memory(self, paras, scores):

        paras = paras.replace(self._hash2obj())
        pos = self.para2pos(paras)

        if len(pos) == 0:
            return

        df_temp = pd.DataFrame()
        df_temp["pos_str"] = pos.apply(apply_tobytes, axis=1)
        df_temp["score"] = scores

        self.memory_dict = df_temp.set_index("pos_str").to_dict()["score"]

        scores = np.array(scores)
        paras = np.array(paras)

        idx = np.argmax(scores)
        self.score_best = scores[idx]
        self.pos_best = paras[idx]

    def apply_index(self, pos_key, df):
        return (
            self._space_.search_space[pos_key].index(df)
            if df in self._space_.search_space[pos_key]
            else None
        )

    def para2pos(self, paras):
        paras = paras[self._space_.para_names]
        pos = paras.copy()

        for pos_key in self._space_.search_space:
            apply_index = partial(self.apply_index, pos_key)
            pos[pos_key] = paras[pos_key].apply(apply_index)

        pos.dropna(how="any", inplace=True)
        pos = pos.astype("int64")

        return pos

    def _collect(self, _cand_):
        results_dict = self._get_opt_meta_data()

        para_pd = pd.DataFrame(results_dict["params"])
        metric_pd = pd.DataFrame(
            results_dict["mean_test_score"], columns=["mean_test_score"]
        )
        n_rows = len(para_pd)
        eval_time = pd.DataFrame(_cand_.eval_time[-n_rows:], columns=["eval_time"])
        md_model = pd.concat(
            [para_pd, metric_pd, eval_time], axis=1, ignore_index=False
        )

        return md_model

    def _get_hash(self, obj):
        return hashlib.sha1(obj).hexdigest()

    def _get_func_str(self, func):
        return inspect.getsource(func)

    def _get_func_data_names(self):
        paths = glob.glob(
            self.func_path + (self.feature_hash + "_" + self.label_hash + "_.csv")
        )

        return paths

    def _read_dill(self, value):
        paths = self._get_pkl_hash(value)
        for path in paths:
            with open(path, "rb") as fp:
                value = dill.load(fp)
                value = dill.loads(value)
                break

        return value

    def _hash2obj(self):
        hash2obj_dict = {}
        para_hash_list = self._get_para_hash_list()

        for para_hash in para_hash_list:
            obj = self._read_dill(para_hash)
            hash2obj_dict[para_hash] = obj

        return hash2obj_dict

    def _get_para_hash_list(self):
        para_hash_list = []
        for key in self._space_.search_space.keys():
            values = self._space_.search_space[key]

            for value in values:
                if (
                    not isinstance(value, int)
                    and not isinstance(value, float)
                    and not isinstance(value, str)
                ):

                    para_dill = dill.dumps(value)
                    para_hash = self._get_hash(para_dill)
                    para_hash_list.append(para_hash)

        return para_hash_list

    def _get_pkl_hash(self, hash):
        paths = glob.glob(self.func_path + hash + "*.pkl")

        return paths

    def _get_file_path(self, model_func):
        if not os.path.exists(self.date_path):
            os.makedirs(self.date_path)

        return self.func_path + (self.feature_hash + "_" + self.label_hash + "_.csv")
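
For context, LongTermMemory._get_func_str() (the two-line method graded above) just returns the objective function's source via inspect.getsource, and _get_hash() SHA-1-hashes it to key the meta_data/<hash>/ directory used throughout the class. A minimal standalone sketch of that path construction, with a hypothetical objective function standing in for _cand_.func_:

import inspect
import hashlib


def objective(para):
    # hypothetical objective function, stands in for _cand_.func_
    return -(para["x"] ** 2)


func_str = inspect.getsource(objective)                          # what _get_func_str returns
func_hash = hashlib.sha1(func_str.encode("utf-8")).hexdigest()   # what _get_hash returns
func_path = "meta_data/" + func_hash + "/"                       # per-function folder, as built in __init__

print(func_path)

Because the hash is derived from the function's source text, repeated runs with an unchanged objective function resolve to the same folder, which is what lets load_memory find previously collected meta data.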