Total Complexity | 55 |
Total Lines | 350 |
Duplicated Lines | 60.57 % |
Changes | 0 |
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like hyperactive.memory often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | # Author: Simon Blanke |
||
2 | # Email: [email protected] |
||
3 | # License: MIT License |
||
4 | |||
5 | import os |
||
6 | import glob |
||
7 | import json |
||
8 | import dill |
||
9 | import time |
||
10 | import datetime |
||
11 | import hashlib |
||
12 | import inspect |
||
13 | |||
14 | import numpy as np |
||
15 | import pandas as pd |
||
16 | |||
def apply_tobytes(df):
    """Return the raw byte representation of *df*'s underlying value array.

    Used to turn a row of integer positions into a hashable dictionary key.
    """
    raw_values = df.values
    return raw_values.tobytes()
class Memory:
    """Shared state for tracking evaluated positions and the best score.

    Sub-classes build short-term (in-process) and long-term (on-disk)
    storage on top of this state.
    """

    def __init__(self, _space_, _main_args_, _cand_):
        self._space_ = _space_
        self._main_args_ = _main_args_

        # Which memory backend was requested (taken from the main arguments).
        self.memory_type = _main_args_.memory
        # Maps an already-evaluated position (serialized to bytes) to its score.
        self.memory_dict = {}

        # Best position/score seen so far; nothing has been evaluated yet.
        self.pos_best = None
        self.score_best = -np.inf

        # Set to True once persisted meta-data has been read successfully.
        self.meta_data_found = False

        # Dimensionality of loaded meta-data; filled in when memory is loaded.
        self.n_dims = None
class ShortTermMemory(Memory):
    """Memory variant with no on-disk persistence.

    Adds nothing beyond the base-class state; evaluated positions live only
    in the process-local ``memory_dict``.
    """

    def __init__(self, _space_, _main_args_, _cand_):
        super().__init__(_space_, _main_args_, _cand_)
class LongTermMemory(Memory):
    """Disk-backed memory that persists evaluation meta-data across runs.

    Meta-data (parameter settings, scores, evaluation times) is written as
    CSV files under ``<module dir>/meta_data/<objective-source-hash>/`` so
    later runs with the same objective function and dataset can warm-start
    from previous evaluations.
    """

    def __init__(self, _space_, _main_args_, _cand_):
        super().__init__(_space_, _main_args_, _cand_)

        self.nth_process = _cand_.nth_process

        self.score_col_name = "mean_test_score"

        # Dataset identity: hashes of the raw feature/label buffers.
        self.feature_hash = self._get_hash(_main_args_.X)
        self.label_hash = self._get_hash(_main_args_.y)

        current_path = os.path.realpath(__file__)
        meta_learn_path, _ = current_path.rsplit("/", 1)

        # One sub-directory per run, stamped with the start time.
        self.datetime = "run_data/" + datetime.datetime.now().strftime(
            "%d.%m.%Y - %H:%M:%S"
        )
        # Objective functions are identified by a hash of their source code.
        func_str = self._get_func_str(_cand_.func_)
        self.func_path_ = self._get_hash(func_str.encode("utf-8")) + "/"

        self.meta_path = meta_learn_path + "/meta_data/"
        self.func_path = self.meta_path + self.func_path_
        self.date_path = self.meta_path + self.func_path_ + self.datetime + "/"

        if not os.path.exists(self.date_path):
            os.makedirs(self.date_path, exist_ok=True)

    def load_memory(self, _cand_, _verb_):
        """Load persisted meta-data for this objective function into memory.

        Does nothing if no meta-data is found on disk.
        """
        para, score = self._read_func_metadata(_cand_.func_, _verb_)
        if para is None or score is None:
            return

        _verb_.load_samples(para)
        _cand_.eval_time = list(para["eval_time"])

        self._load_data_into_memory(para, score)
        self.n_dims = len(para.columns)

    def save_memory(self, _main_args_, _opt_args_, _cand_):
        """Persist meta-data, objective source, search config and run info."""
        path = self._get_file_path(_cand_.func_)
        meta_data = self._collect(_cand_)

        meta_data["run"] = self.datetime
        self._save_toCSV(meta_data, path)

        # Save the objective function's source once per function hash.
        obj_func_path = self.func_path + "objective_function.py"
        if not os.path.exists(obj_func_path):
            # BUGFIX: use a context manager so the handle closes on error.
            with open(obj_func_path, "w") as file:
                file.write(self._get_func_str(_cand_.func_))

        search_config_path = self.date_path + "search_config.py"
        search_config_temp = dict(self._main_args_.search_config)

        # Replace callable keys by their names so the dict is printable.
        # BUGFIX: iterate over a snapshot of the keys — deleting from a dict
        # while iterating its live key view raises RuntimeError in Python 3.
        for key in list(search_config_temp.keys()):
            if isinstance(key, str):
                continue
            search_config_temp[key.__name__] = search_config_temp[key]
            del search_config_temp[key]

        search_config_str = "search_config = " + str(search_config_temp)

        if not os.path.exists(search_config_path):
            with open(search_config_path, "w") as file:
                file.write(search_config_str)

        run_data = {
            "random_state": self._main_args_.random_state,
            # BUGFIX: was `self._main_args_.random_state` (copy-paste error).
            "max_time": self._main_args_.max_time,
            "n_iter": self._main_args_.n_iter,
            "optimizer": self._main_args_.optimizer,
            "n_jobs": self._main_args_.n_jobs,
            "eval_time": np.array(_cand_.eval_time).sum(),
            "total_time": _cand_.total_time,
        }

        with open(self.date_path + "run_data.json", "w") as f:
            json.dump(run_data, f, indent=4)

    def _save_toCSV(self, meta_data_new, path):
        """Append new meta-data to the CSV at *path*, de-duplicating rows.

        Rows are considered duplicates when all non-score columns match.
        Refuses to save (with a warning) on a column-count mismatch.
        """
        if os.path.exists(path):
            meta_data_old = pd.read_csv(path)

            if len(meta_data_old.columns) != len(meta_data_new.columns):
                print("Warning meta data dimensionality does not match")
                print("Meta data will not be saved")
                return

            # BUGFIX: DataFrame.append was deprecated and removed in
            # pandas 2.0 — use pd.concat instead.
            meta_data = pd.concat(
                [meta_data_old, meta_data_new], ignore_index=True
            )

            columns = list(meta_data.columns)
            noScore = ["mean_test_score", "cv_default_score", "eval_time", "run"]
            columns_noScore = [c for c in columns if c not in noScore]

            meta_data = meta_data.drop_duplicates(subset=columns_noScore)
        else:
            meta_data = meta_data_new

        meta_data.to_csv(path, index=False)

    def _read_func_metadata(self, model_func, _verb_):
        """Read all matching meta-data CSVs.

        Returns a (parameters, scores) DataFrame pair, or (None, None) when
        no meta-data exists for this function/dataset combination.
        """
        paths = self._get_func_data_names()

        meta_data_list = []
        for path in paths:
            meta_data = pd.read_csv(path)
            meta_data_list.append(meta_data)
            self.meta_data_found = True

        if len(meta_data_list) > 0:
            meta_data = pd.concat(meta_data_list, ignore_index=True)

            column_names = meta_data.columns
            # Score columns are those whose name contains score_col_name.
            score_name = [name for name in column_names if self.score_col_name in name]

            para = meta_data.drop(score_name, axis=1)
            score = meta_data[score_name]

            _verb_.load_meta_data()
            return para, score

        else:
            _verb_.no_meta_data(model_func)
            return None, None

    def _get_opt_meta_data(self):
        """Convert the in-memory position->score dict into a results dict.

        Non-primitive parameter values are pickled to disk and replaced by
        their hash so the meta-data stays CSV-serializable.  Entries with a
        score of 0 are skipped.
        """
        results_dict = {}
        para_list = []
        score_list = []

        for pos_key in self.memory_dict.keys():
            # BUGFIX: np.fromstring is deprecated for binary input; the keys
            # were produced by ndarray.tobytes(), so frombuffer is the
            # correct counterpart.
            pos = np.frombuffer(pos_key, dtype=int)
            para = self._space_.pos2para(pos)
            score = self.memory_dict[pos_key]

            # BUGFIX: inner loop previously shadowed the outer variable `key`.
            for para_key in para.keys():
                if (
                    not isinstance(para[para_key], int)
                    and not isinstance(para[para_key], float)
                    and not isinstance(para[para_key], str)
                ):
                    para_dill = dill.dumps(para[para_key])
                    para_hash = self._get_hash(para_dill)

                    with open(
                        self.func_path + str(para_hash) + ".pkl", "wb"
                    ) as pickle_file:
                        dill.dump(para_dill, pickle_file)

                    para[para_key] = para_hash

            if score != 0:
                para_list.append(para)
                score_list.append(score)

        results_dict["params"] = para_list
        results_dict["mean_test_score"] = score_list

        return results_dict

    def _load_data_into_memory(self, paras, scores):
        """Fill memory_dict from loaded meta-data and track the best entry."""
        # Replace persisted hashes by the original (unpickled) objects.
        paras = paras.replace(self._hash2obj())
        pos = self.para2pos(paras)

        if len(pos) == 0:
            return

        df_temp = pd.DataFrame()
        df_temp["pos_str"] = pos.apply(apply_tobytes, axis=1)
        df_temp["score"] = scores

        self.memory_dict = df_temp.set_index("pos_str").to_dict()["score"]

        scores = np.array(scores)
        paras = np.array(paras)

        idx = np.argmax(scores)
        self.score_best = scores[idx]
        # NOTE(review): this stores the best *parameter row*, not a position
        # in search-space coordinates — confirm callers expect that.
        self.pos_best = paras[idx]

    def apply_index(self, pos_key, df):
        """Return the index of value *df* in the search-space list for
        *pos_key*, or None if the value is not part of the search space."""
        space_values = self._space_.search_space[pos_key]
        return space_values.index(df) if df in space_values else None

    def para2pos(self, paras):
        """Translate parameter values into integer search-space positions.

        Rows containing values absent from the search space are dropped.
        """
        from functools import partial

        paras = paras[self._space_.para_names]
        pos = paras.copy()

        for pos_key in self._space_.search_space:
            pos[pos_key] = paras[pos_key].apply(partial(self.apply_index, pos_key))

        pos.dropna(how="any", inplace=True)
        pos = pos.astype("int64")

        return pos

    def _collect(self, _cand_):
        """Assemble parameters, scores and eval times into one DataFrame."""
        results_dict = self._get_opt_meta_data()

        para_pd = pd.DataFrame(results_dict["params"])
        metric_pd = pd.DataFrame(
            results_dict["mean_test_score"], columns=["mean_test_score"]
        )

        # Only the eval times belonging to the collected rows are kept.
        eval_time = pd.DataFrame(
            _cand_.eval_time[-len(para_pd):], columns=["eval_time"]
        )
        md_model = pd.concat(
            [para_pd, metric_pd, eval_time], axis=1, ignore_index=False
        )

        return md_model

    def _get_hash(self, data):
        """Return the hex SHA-1 digest of a bytes-like object.

        (Parameter renamed from `object`, which shadowed the builtin.)
        """
        return hashlib.sha1(data).hexdigest()

    def _get_func_str(self, func):
        """Return the source code of *func* as a string."""
        return inspect.getsource(func)

    def _get_subdirs(self):
        """List all sub-directories below this function's meta-data path."""
        return glob.glob(self.func_path + "*/")

    def _get_func_data_names1(self):
        """Collect every meta-data CSV across all run sub-directories."""
        path_list = []
        for subdir in self._get_subdirs():
            path_list += glob.glob(subdir + "*.csv")

        return path_list

    def _get_func_data_names(self):
        """Return CSV paths matching this run's feature/label hashes."""
        return glob.glob(
            self.func_path + (self.feature_hash + "_" + self.label_hash + "_.csv")
        )

    def _read_dill(self, value):
        """Load the pickled object stored for *value*'s hash.

        Returns *value* unchanged when no matching pickle file exists.
        """
        paths = self._get_pkl_hash(value)
        for path in paths:
            with open(path, "rb") as fp:
                value = dill.load(fp)
                value = dill.loads(value)
            break  # only the first match is used

        return value

    def _hash2obj(self):
        """Map every persisted parameter hash back to its unpickled object."""
        hash2obj_dict = {}

        for para_hash in self._get_para_hash_list():
            hash2obj_dict[para_hash] = self._read_dill(para_hash)

        return hash2obj_dict

    def _get_para_hash_list(self):
        """Hashes of all non-primitive values appearing in the search space."""
        para_hash_list = []
        for key in self._space_.search_space.keys():
            values = self._space_.search_space[key]

            for value in values:
                if (
                    not isinstance(value, int)
                    and not isinstance(value, float)
                    and not isinstance(value, str)
                ):
                    para_dill = dill.dumps(value)
                    para_hash = self._get_hash(para_dill)
                    para_hash_list.append(para_hash)

        return para_hash_list

    def _get_pkl_hash(self, hash):
        """Return all pickle files whose name starts with *hash*."""
        return glob.glob(self.func_path + hash + "*.pkl")

    def _get_file_path(self, model_func):
        """Ensure the run directory exists; return this dataset's CSV path."""
        if not os.path.exists(self.date_path):
            os.makedirs(self.date_path)

        return self.func_path + (self.feature_hash + "_" + self.label_hash + "_.csv")