|
1
|
|
|
from abc import ABC, abstractmethod |
|
2
|
|
|
import inspect |
|
3
|
|
|
import attr |
|
4
|
|
|
import pandas as pd |
|
5
|
|
|
from so_magic.utils import SubclassRegistry |
|
6
|
|
|
|
|
7
|
|
|
|
|
8
|
|
|
class DiscretizerInterface(ABC):
    """Top-level contract shared by every discretizer in this module."""

    def discretize(self, *args, **kwargs):
        """Discretize the given input; concrete subclasses supply the logic.

        Raises:
            NotImplementedError: always, unless a subclass overrides this method.
        """
        raise NotImplementedError
|
11
|
|
|
|
|
12
|
|
|
|
|
13
|
|
|
class AbstractDiscretizer(DiscretizerInterface):
    """Intermediate base class; it adds no behaviour on top of the interface."""

    def discretize(self, *args, **kwargs):
        """Placeholder to be overridden by concrete discretizers.

        Raises:
            NotImplementedError: always, unless a subclass overrides this method.
        """
        raise NotImplementedError
|
16
|
|
|
|
|
17
|
|
|
|
|
18
|
|
|
@attr.s
class BaseDiscretizer(AbstractDiscretizer):
    """Discretizer that delegates the actual binning to its ``binner``."""

    # Collaborator whose ``bin(values, bins, **kwargs)`` method performs the binning.
    binner = attr.ib()

    def discretize(self, *args, **kwargs):
        """Bin a single column of a dataset.

        Positional arguments, in order: the dataset (must expose ``column(name)``),
        the name of the attribute/column to process and the bins specification.
        Keyword arguments are forwarded unchanged to ``self.binner.bin``.

        Returns:
            Whatever ``self.binner.bin`` returns.

        Raises:
            IndexError: if fewer than three positional arguments are given.
            TypeError: re-raised from the binner, with the column name prepended to the message.
        """
        dataset, column_name, bin_spec = args[0], args[1], args[2]
        try:
            return self.binner.bin(dataset.column(column_name), bin_spec, **kwargs)
        except TypeError as type_error:
            # Add the column name so the failing feature can be identified from the message alone.
            raise TypeError(
                f'Table column being processed: {column_name}. Exception text: {str(type_error)}'
            ) from type_error
|
34
|
|
|
|
|
35
|
|
|
|
|
36
|
|
|
@attr.s
class FeatureDiscretizer(BaseDiscretizer):
    """Discretizer permanently bound to one feature."""

    # The feature (column identifier) every ``discretize`` call operates on.
    feature = attr.ib(init=True)

    def discretize(self, *args, **kwargs):
        """Bin the bound feature of a dataset.

        Positional arguments, in order: the dataset and the bins specification.
        Keyword arguments are accepted but are not passed on to the parent call.
        """
        dataset, bin_spec = args[0], args[1]
        return super().discretize(dataset, self.feature, bin_spec)
|
43
|
|
|
|
|
44
|
|
|
@attr.s
class FeatureDiscretizerFactory:
    """Builds ``FeatureDiscretizer`` objects backed by binners from ``binner_factory``."""

    # Object exposing ``create_binner(binner_type)``.
    binner_factory = attr.ib(init=True)

    @staticmethod
    def _binner_type(options):
        """Return 'quantisized' when the truthy 'quantisized' option is set, else 'same-length'."""
        return 'quantisized' if options.get('quantisized', False) else 'same-length'

    def categorical(self, feature, **kwargs) -> FeatureDiscretizer:
        """Create a discretizer for a categorical feature.

        Only the ``quantisized`` keyword is inspected; it selects the binner type.
        """
        binner = self.binner_factory.create_binner(self._binner_type(kwargs))
        return FeatureDiscretizer(binner, feature)

    def numerical(self, feature, **kwargs) -> FeatureDiscretizer:
        """Create a discretizer for a numerical feature.

        Only the ``quantisized`` keyword is inspected; it selects the binner type.
        """
        binner = self.binner_factory.create_binner(self._binner_type(kwargs))
        return FeatureDiscretizer(binner, feature)
|
59
|
|
|
|
|
60
|
|
|
|
|
61
|
|
|
######################################### |
|
62
|
|
|
|
|
63
|
|
|
class BinnerInterface(ABC):
    """Contract for objects able to place values into bins."""

    @abstractmethod
    def bin(self, values, bins):
        """Bin ``values`` according to ``bins``; must be overridden by subclasses.

        Raises:
            NotImplementedError: when reached through ``super()`` from an override.
        """
        raise NotImplementedError
|
67
|
|
|
|
|
68
|
|
|
|
|
69
|
|
|
@attr.s
class BaseBinner(BinnerInterface):
    """Binner that hands the work over to a pluggable ``algorithm``."""

    # Object exposing ``run(values, bins)``.
    algorithm = attr.ib()

    def bin(self, values, bins):
        """Run the algorithm on ``values`` with the given ``bins``.

        The values are assumed to come from a numerical (ratio or interval) variable
        or from an ordinal (not nominal) categorical variable.

        Returns:
            Whatever ``self.algorithm.run`` returns.

        Raises:
            TypeError: re-raised from the algorithm with a hint about likely causes.
        """
        try:
            return self.algorithm.run(values, bins)
        except TypeError as type_error:
            hint = (
                f'Exception text: {str(type_error)}. Possible reasons: preprocessing is needed to make sure'
                f' suitable values are places in missing entries and/or all entries are of the same type'
            )
            raise TypeError(hint) from type_error
|
81
|
|
|
|
|
82
|
|
|
|
|
83
|
|
|
class BinnerClass(metaclass=SubclassRegistry):
    """Empty root class created through the ``SubclassRegistry`` metaclass.

    NOTE(review): presumably binner implementations register themselves under
    this class via the metaclass - confirm against ``so_magic.utils``.
    """
|
84
|
|
|
|
|
85
|
|
|
|
|
86
|
|
|
class BinnerFactory:
    """Factory skeleton for binners; none of the creation methods is implemented yet."""

    # Root class associated with this factory.
    parent_class = BinnerClass

    def equal_length_binner(self, *args, **kwargs) -> BaseBinner:
        """Create a binner whose bins all have equal size (max_value - min_value).

        Raises:
            NotImplementedError: always; the method is a placeholder.
        """
        raise NotImplementedError

    def quantisized_binner(self, *args, **kwargs) -> BaseBinner:
        """Create a binner that adjusts bin sizes so observations are evenly spread over the bins.

        Raises:
            NotImplementedError: always; the method is a placeholder.
        """
        raise NotImplementedError

    def create_binner(self, *args, **kwargs) -> BaseBinner:
        """Create a binner from the given arguments.

        Raises:
            NotImplementedError: always; the method is a placeholder.
        """
        raise NotImplementedError
|
106
|
|
|
|
|
107
|
|
|
|
|
108
|
|
|
class AlgorithmInterface(ABC):
    """Contract for runnable algorithms."""

    @abstractmethod
    def run(self, *args, **kwargs):
        """Execute the algorithm; must be overridden by subclasses.

        Raises:
            NotImplementedError: when reached through ``super()`` from an override.
        """
        raise NotImplementedError
|
112
|
|
|
|
|
113
|
|
|
|
|
114
|
|
|
@attr.s
class AlgorithmArguments:
    """An algorithm's expected positional arguments."""

    # Expected type of each positional argument, in call order.
    arg_types = attr.ib()
    # Default values; they are matched against the trailing positional arguments.
    default_values = attr.ib()
    # Number of positional arguments the algorithm takes (an int, derived from ``arg_types``).
    _required_args = attr.ib(init=False, default=attr.Factory(lambda self: len(self.arg_types), takes_self=True))

    def values(self, *args):
        """Return the full positional argument list, completing ``args`` with trailing defaults.

        Args:
            *args: the positional arguments supplied by the caller.

        Returns:
            list: ``args`` followed by the defaults of the arguments that were not supplied.

        Raises:
            AlgorithmArgumentsError: if too many arguments are given, if the defaults cannot
                cover the missing ones, or if a value does not match its expected type.
        """
        required = self._required_args  # already a count, must not be wrapped in len()
        if len(args) > required:
            raise AlgorithmArgumentsError(f'Given more than the supported naumber of arguments. '
                                          f'{len(args)} > {required}')
        missing = required - len(args)
        # Guard the slice: ``seq[-0:]`` is the whole sequence, not an empty one.
        defaults = list(self.default_values[-missing:]) if missing else []
        if len(defaults) < missing:
            raise AlgorithmArgumentsError(f'Given fewer than the required number of arguments. '
                                          f'{len(args)} given, {len(defaults)} defaults available, '
                                          f'{required} required')
        computed_args_list = list(args) + defaults
        if not all(isinstance(arg_value, arg_type)
                   for arg_value, arg_type in zip(computed_args_list, self.arg_types)):
            raise AlgorithmArgumentsError('Type missmatch')
        return computed_args_list
|
130
|
|
|
|
|
131
|
|
|
|
|
132
|
|
|
@attr.s
class AbstractAlgorithm(AlgorithmInterface, ABC):
    """State shared by algorithms; ``run`` is not implemented here, so the class stays abstract."""

    # The callable that carries out the actual computation.
    callback: callable = attr.ib()
    # Positional-argument description; empty list unless provided.
    arguments: list = attr.ib(default=attr.Factory(list))
    # Maps a parameter name to a dict that holds (at least) a 'value' entry.
    parameters: dict = attr.ib(default=attr.Factory(dict))
    # Snapshot {name: value} of the parameter values present at construction time.
    default_parameter_values = attr.ib(
        init=False,
        default=attr.Factory(
            lambda instance: {name: spec['value'] for name, spec in instance.parameters.items()},
            takes_self=True,
        ),
    )
    # Internal list of positional arguments; starts empty.
    _args = attr.ib(init=False, default=attr.Factory(list))
|
140
|
|
|
|
|
141
|
|
|
|
|
142
|
|
|
@attr.s
class MagicAlgorithm(AbstractAlgorithm):
    """Runnable algorithm: validates its inputs, invokes ``callback`` and records the outcome.

    ``arguments`` holds the expected type of each positional runtime argument and
    ``parameters`` maps each keyword parameter name to a ``{'type': ..., 'value': ...}`` spec.
    """

    # Signature of the wrapped callback, computed once when the instance is created.
    _signature = attr.ib(init=False,
                         default=attr.Factory(lambda self: inspect.signature(self.callback), takes_self=True))
    # 'settings' and 'result' of the latest run; the same dict object is updated and returned by every run.
    _output = attr.ib(init=False, default=attr.Factory(dict))

    def run(self, *args, **kwargs):
        """Validate the inputs, execute the callback and return the output dict.

        Args:
            *args: positional arguments for the callback; their number and types must match ``self.arguments``.
            **kwargs: parameter overrides; they are stored in ``self.parameters`` before the callback runs.

        Returns:
            dict: the instance's output dict with the keys 'settings' and 'result'.

        Raises:
            MagicAlgorithmError: on a wrong number or wrong types of positional arguments.
            MagicAlgorithmParametersError: on unknown or badly typed keyword parameters.
        """
        if len(args) != len(self.arguments):
            raise MagicAlgorithmError(
                f'Number of runtime positional arguments do not match the expected number of positional argumnets. '
                f'Given {len(args)} arguments: [{", ".join(str(_) for _ in args)}]. Expected {len(self.arguments)} '
                f'arguments: [{", ".join(str(_) for _ in self.arguments)}].')
        if not all(isinstance(argument, expected_type) for argument, expected_type in zip(args, self.arguments)):
            # ``self.arguments`` holds types, so they are converted to str before joining;
            # the "Instead got" part reports the types that were actually received.
            raise MagicAlgorithmError(f'Bad positional argument for algorithm. Expected arguments with types '
                                      f'[{", ".join(str(_) for _ in self.arguments)}]. '
                                      f'Instead got [{", ".join(str(type(_)) for _ in args)}].')
        self._args = list(args)
        self.update_parameters(**kwargs)
        result = self._run_callback()
        self._output['settings'] = self._get_settings(result)
        self._output['result'] = self._get_result(result)
        return self._output

    def _run_callback(self):
        """Call ``callback`` with the stored positional arguments and the current parameter values."""
        return self.callback(*self._args, **{k: v['value'] for k, v in self.parameters.items()})

    @property
    def output(self):
        """The output dict filled in by the most recent ``run``."""
        return self._output

    def _get_result(self, result):
        """Extract the reported result from the raw callback return value (identity here)."""
        return result

    def _get_settings(self, _result):
        """Describe the positional arguments and parameter values used for the run."""
        return {
            'arguments': self._args,
            'parameters': {
                param_name: param_data['value'] for param_name, param_data in self.parameters.items()
            },
        }

    def update_parameters(self, **kwargs):
        """Validate the given parameter values and store them in ``self.parameters``.

        Raises:
            MagicAlgorithmParametersError: if a name is not a known parameter or a value
                is not an instance of the type declared for that parameter.
        """
        def _is_acceptable(parameter_name, parameter_value):
            spec = self.parameters.get(parameter_name)
            return spec is not None and isinstance(parameter_value, spec['type'])

        if not all(_is_acceptable(name, value) for name, value in kwargs.items()):
            raise MagicAlgorithmParametersError(
                f'Bad algorithm parameters. Allowed parameters with types '
                f'[{", ".join(f"{k}: {v}" for k, v in self.parameters.items())}]. '
                f'Instead got [{", ".join(f"{k}: {v}" for k, v in kwargs.items())}].')
        self._update_params(**kwargs)

    def set_default_parameters(self):
        """Restore the parameter values captured when the instance was created."""
        self._update_params(**self.default_parameter_values)

    def _update_params(self, **kwargs):
        """Overwrite the 'value' entry of each named parameter (no validation)."""
        for key, value in kwargs.items():
            self.parameters[key]['value'] = value
|
197
|
|
|
|
|
198
|
|
|
|
|
199
|
|
|
class MagicAlgorithmError(Exception):
    """Raised when an algorithm is run with unsuitable positional arguments."""
|
200
|
|
|
class MagicAlgorithmParametersError(Exception):
    """Raised when an algorithm is given unsuitable keyword parameters."""
|
201
|
|
|
class AlgorithmArgumentsError(Exception):
    """Raised when positional arguments do not match an algorithm's declaration."""
|
202
|
|
|
|
|
203
|
|
|
|
|
204
|
|
|
def call_method(a_callable):
    """Wrap ``a_callable`` so that it can be installed as a method.

    The returned function accepts the instance as its first argument and
    discards it; every other argument is passed to ``a_callable`` unchanged.
    """

    def _call(_self, *args, **kwargs):
        # The instance is deliberately ignored.
        outcome = a_callable(*args, **kwargs)
        return outcome

    return _call
|
208
|
|
|
|
|
209
|
|
|
|
|
210
|
|
|
@attr.s
class Discretizer(BaseDiscretizer):
    """Discretizer that exposes the algorithm of its binner."""

    @property
    def algorithm(self):
        """The algorithm object held by ``self.binner``."""
        return self.binner.algorithm

    @classmethod
    def from_algorithm(cls, alg):
        """Alternate constructor: wrap ``alg`` in a ``BaseBinner`` and build an instance around it.

        The instance is created through ``cls`` so that subclasses get an
        instance of their own type instead of a plain ``Discretizer``.
        """
        return cls(BaseBinner(alg))
|
221
|
|
|
|
|
222
|
|
|
|
|
223
|
|
|
class BinningAlgorithm(metaclass=SubclassRegistry):
    """Entry point for creating binning algorithms through the subclass registry."""

    @classmethod
    def from_built_in(cls, algorithm_id):
        """Create the algorithm registered as ``algorithm_id``, backed by ``pandas.cut``.

        ``cls.create`` receives the id, ``pd.cut``, the positional-argument types
        and the keyword-parameter specs (type and default value of each parameter).
        NOTE(review): ``create`` is presumably supplied by ``SubclassRegistry`` - confirm.
        """
        # TODO replace with call to dataclass
        positional_types = [object, object]
        # Rebuilt on every call so that each created algorithm owns its own spec dicts.
        parameter_specs = {
            'right': {'type': bool, 'value': True},
            'labels': {'type': object, 'value': None},
            'retbins': {'type': bool, 'value': True},
            'precision': {'type': int, 'value': 3},
            'include_lowest': {'type': bool, 'value': False},
            'duplicates': {'type': str, 'value': 'raise'},
        }
        return cls.create(algorithm_id, pd.cut, positional_types, parameter_specs)
|
258
|
|
|
|
|
259
|
|
|
|
|
260
|
|
|
@BinningAlgorithm.register_as_subclass('pd.cut')
class PDCutBinningAlgorithm(MagicAlgorithm):
    """``pandas.cut`` based algorithm that separates the binned values from the computed bins."""

    def _bins_returned(self):
        """Tell whether the callback was asked to also return the bins.

        The decision reads the *value* of the 'retbins' parameter; the parameter
        spec itself is a non-empty dict and would therefore always be truthy.
        """
        return bool(self.parameters['retbins']['value'])

    def _get_settings(self, result):
        """Extend the base settings with 'used_bins' (``None`` when bins were not returned)."""
        used_bins = result[1] if self._bins_returned() else None
        return dict(super()._get_settings(result), **{'used_bins': used_bins})

    def _get_result(self, result):
        """Return only the binned values, dropping the bins when they were returned alongside."""
        raw_result = super()._get_result(result)
        if self._bins_returned():
            # With retbins enabled the callback returns a pair: (binned values, bins).
            return raw_result[0]
        return raw_result
|
270
|
|
|
|