|
1
|
|
|
from .data_manager import DataManager |
|
2
|
|
|
from .features.phi import PhiFunctionRegistrator |
|
3
|
|
|
from .features import FeatureManager |
|
4
|
|
|
from .command_factories import MegaCommandFactory |
|
5
|
|
|
|
|
6
|
|
|
|
|
7
|
|
|
def init_data_manager(a_backend):
    """Create a DataManager for `a_backend` and define its pandas-based helpers.

    The function builds the DataManager, attaches a MegaCommandFactory to the
    data manager's command accumulator and then defines, inside its own scope:

    * two functions decorated with the data manager's `backend.engine.dec()`
      (`encode_nominal_subsets` and `observations`),
    * two NominalAttributeEncoder subclasses ('one_hot', 'one_hot_list'),
    * two DataManagerCommandFactory subclasses ('one_hot_encoding',
      'one_hot_encoding_list').

    Args:
        a_backend: backend object passed as first argument to DataManager.

    Returns:
        The DataManager instance created here.
    """
    # Imports stay function-local, as in the original, so that they are only
    # resolved when a data manager is actually initialised.
    import numpy as np
    import pandas as pd

    from so_magic.data.command_factories import DataManagerCommandFactory
    from so_magic.data.encoding import NominalAttributeEncoder
    from so_magic.utils import Command

    # A fresh PhiFunctionRegistrator subclass is created per call and handed to
    # the DataManager together with an empty FeatureManager.
    data_manager = DataManager(
        a_backend,
        type('PhiFunction', (PhiFunctionRegistrator,), {}),
        FeatureManager([]),
    )
    mega_cmd_factory = MegaCommandFactory(data_manager)
    mega_cmd_factory.attach(data_manager.commands_manager.command.accumulator)

    # NOTE(review): presumably builds the command registered under this name
    # and notifies the attached accumulator - confirm in MegaCommandFactory.
    mega_cmd_factory('select_variables')

    # NOTE(review): `engine.dec()` presumably registers the decorated function
    # with the backend engine under its own name - confirm before renaming.
    @data_manager.backend.engine.dec()
    def encode_nominal_subsets(datapoints, attribute, new_attribute):
        """Compute values for `attribute` with a phi function and add them as `new_attribute`."""
        # Imported lazily, exactly where the original imported it.
        from so_magic.data.features.phis import ListOfCategoricalPhi, DatapointsAttributePhi

        phi = ListOfCategoricalPhi(DatapointsAttributePhi(datapoints))
        new_values = phi(attribute)
        datapoints.mutator.add_column(datapoints, new_values, new_attribute)

    @data_manager.backend.engine.dec()
    def observations(file_path):
        """Read a JSON-lines file (one JSON document per line) into a DataFrame."""
        return pd.read_json(file_path, lines=True)

    @NominalAttributeEncoder.register_as_subclass('one_hot')
    class OneHotEncoder(NominalAttributeEncoder):
        """One-hot encoder for an attribute, built on `pandas.get_dummies`."""

        def encode(self, *args, **kwargs):
            """Return the dummy-variable DataFrame for one attribute.

            Args:
                *args: `args[0]` is the datapoints object (its `observations`
                    is indexed by attribute), `args[1]` is the attribute name.
                **kwargs: accepted but not used.

            Returns:
                The DataFrame produced by `pandas.get_dummies`. As a side
                effect `self.columns` holds its column names and
                `self.values_set` the same names without the leading
                '<attribute>_' prefix.
            """
            datapoints = args[0]
            attribute = args[1]
            prefix_separator = '_'
            dataframe = pd.get_dummies(
                datapoints.observations[attribute],
                prefix=attribute,
                prefix_sep=prefix_separator,
                drop_first=False,
            )
            prefix = f'{attribute}{prefix_separator}'
            # Strip only the leading prefix: str.replace would also delete any
            # later occurrence of '<attribute>_' inside the value itself.
            self.values_set = [
                name[len(prefix):] if name.startswith(prefix) else name
                for name in dataframe.columns
            ]
            self.columns = list(dataframe.columns)
            return dataframe

    @DataManagerCommandFactory.register_as_subclass('one_hot_encoding')
    class EncodeNominalCommandFactory(DataManagerCommandFactory):
        """Command factory for the 'one_hot_encoding' command."""

        def construct(self, *args, **kwargs) -> Command:
            """Build the command; `args[0]` is the data manager, the rest are command arguments."""
            _data_manager = args[0]

            def one_hot_encoding(_datapoints, _attribute):
                """Append the one-hot columns of `_attribute` to the observations."""
                dataframe = OneHotEncoder().encode(_datapoints, _attribute)
                _data_manager.datapoints.observations = pd.concat(
                    [_data_manager.datapoints.observations, dataframe], axis=1)

            return Command(one_hot_encoding, '__call__', *args[1:])

    mega_cmd_factory('one_hot_encoding')

    @NominalAttributeEncoder.register_as_subclass('one_hot_list')
    class OneHotListEncoder(NominalAttributeEncoder):
        """One-hot encoder for an attribute whose values are lists of categories."""

        # Maps a membership test result to the float stored in the vector.
        binary_transformer = {True: 1.0, False: 0.0}

        def encode(self, *args, **kwargs):
            """Return one row of 0.0/1.0 flags per datapoint.

            Args:
                *args: `args[0]` is the datapoints object (must provide
                    `observations[attribute]` and `iterrows()`), `args[1]` is
                    the attribute name.
                **kwargs: accepted but not used.

            Returns:
                A DataFrame with one column per distinct value found in the
                list-valued entries of the attribute. Rows whose value is not
                a list are all zeros.
            """
            datapoints = args[0]
            attribute = args[1]
            list_values = [
                value for value in datapoints.observations[attribute]
                if isinstance(value, list)
            ]
            # set().union(*...) always yields a set: no TypeError when there is
            # no list-valued row, and no duplicated columns when there is
            # exactly one (reduce without initializer returned that raw list).
            self.values_set = set().union(*list_values)
            self.columns = list(self.values_set)
            return pd.DataFrame(
                [self._yield_vector(datarow, attribute) for _, datarow in datapoints.iterrows()],
                columns=self.columns,
            )

        def _yield_vector(self, datarow, attribute):
            """Encode one row, using the all-zeros vector for non-list values."""
            if isinstance(datarow[attribute], list):
                return self._encode(datarow, attribute)
            return self._encode_none(datarow, attribute)

        def _encode(self, datarow, attribute):
            """Return 1.0 for every column present in the row's list, else 0.0."""
            # The values were already hashed to build `values_set`, so a set
            # gives the same membership answers with constant-time lookups.
            members = set(datarow[attribute])
            return [OneHotListEncoder.binary_transformer[column in members] for column in self.columns]

        def _encode_none(self, datarow, attribute):
            """Return the all-zeros vector, one entry per output column."""
            return [0.0] * len(self.columns)

    @DataManagerCommandFactory.register_as_subclass('one_hot_encoding_list')
    class EncodeNominalListCommandFactory(DataManagerCommandFactory):
        """Command factory for the 'one_hot_encoding_list' command."""

        def construct(self, *args, **kwargs) -> Command:
            """Build the command; `args[0]` is the data manager, the rest are command arguments."""
            _data_manager = args[0]

            def one_hot_encoding_list(_datapoints, _attribute):
                """Normalise missing values, then append the list one-hot columns."""
                frame = _data_manager.datapoints.observations
                # Assign the filled column back instead of calling
                # fillna(inplace=True) on the selected column: the in-place
                # call on an intermediate Series is not guaranteed to reach the
                # parent DataFrame (pandas Copy-on-Write).
                frame[_attribute] = frame[_attribute].fillna(value=np.nan)
                dataframe = OneHotListEncoder().encode(_datapoints, _attribute)
                _data_manager.datapoints.observations = pd.concat(
                    [_data_manager.datapoints.observations, dataframe], axis=1)

            return Command(one_hot_encoding_list, '__call__', *args[1:])

    mega_cmd_factory('one_hot_encoding_list')

    return data_manager
|
98
|
|
|
|