"""Defines backend-dependent (e.g. using pandas as the backend library) functions that will serve as engine commands.

These commands should be "built" using a suitable function/decorator.
These commands should be able to be defined at runtime, as part of client code (with respect to this library).
"""

from functools import reduce

import numpy as np
import pandas as pd

from so_magic.data.encoding import NominalAttributeEncoder


# Public API of this module: the two tuples of command functions defined at the bottom of the file.
__all__ = ['data_manager_commands', 'arbitrary_commands']


# CMD 1
def observations_command(file_path):
    """Load observations from a JSON Lines source into a pandas DataFrame.

    Args:
        file_path: forwarded unchanged to ``pandas.read_json``; the content is parsed with
            ``lines=True``, i.e. as one JSON object per line

    Returns:
        pandas.DataFrame: the parsed observations
    """
    return pd.read_json(file_path, lines=True)


class OneHotEncoder(NominalAttributeEncoder):
    """Encode a nominal attribute as one indicator column per distinct value, using ``pandas.get_dummies``."""

    def encode(self, *args, **kwargs):
        """One-hot encode an attribute of the given datapoints.

        Besides returning the encoded columns, this stores on the instance:
        ``values_set`` (the generated column names without the ``<attribute>_`` prefix) and
        ``columns`` (the generated column names).

        Args:
            *args: ``args[0]`` is the datapoints object, whose ``observations`` is indexed with the
                attribute name; ``args[1]`` is the name of the attribute to encode
            **kwargs: accepted but not used

        Returns:
            pandas.DataFrame: the indicator columns, named ``<attribute>_<value>``
        """
        datapoints, attribute = args[0], args[1]
        prefix_separator = '_'
        dataframe = pd.get_dummies(datapoints.observations[attribute], prefix=attribute,
                                   prefix_sep=prefix_separator, drop_first=False)
        # Strip only the leading '<attribute>_' prefix. A plain str.replace would also delete any later
        # occurrence of that text inside the value itself (eg value 'x_color_y' for attribute 'color').
        prefix = f'{attribute}{prefix_separator}'
        self.values_set = [column[len(prefix):] if column.startswith(prefix) else column
                           for column in dataframe.columns]
        self.columns = list(dataframe.columns)
        return dataframe


# CMD 2
def one_hot_encoding_command(_data_manager, _datapoints, _attribute):
    """One-hot encode an attribute and append the encoded columns to the data manager's observations.

    Args:
        _data_manager: object whose ``datapoints.observations`` is replaced by the column-wise
            concatenation of the current observations and the encoded columns
        _datapoints: datapoints object handed to ``OneHotEncoder.encode``
        _attribute: name of the attribute to encode

    Returns:
        None: the result is stored on ``_data_manager.datapoints.observations``
    """
    dataframe = OneHotEncoder().encode(_datapoints, _attribute)
    # TODO add an add_columns method to the mutator interface
    # replace below with datapoints.mutator.add_columns(...) (similar to the encode_nominal_subsets_command)
    _data_manager.datapoints.observations = pd.concat([_data_manager.datapoints.observations, dataframe], axis=1)


class OneHotListEncoder(NominalAttributeEncoder):
    """Encode a list-valued nominal attribute as one 0.0/1.0 column per distinct list element."""

    # Maps the outcome of the membership test to the value written in the encoded vector.
    binary_transformer = {True: 1.0, False: 0.0}

    def encode(self, *args, **kwargs):
        """Encode an attribute whose cells hold lists of values.

        Besides returning the encoded columns, this stores on the instance:
        ``values_set`` (the set of distinct elements found in the list-valued cells) and
        ``columns`` (the same elements as a list, in order of first appearance).

        Args:
            *args: ``args[0]`` is the datapoints object; its ``observations`` is indexed with the
                attribute name and its ``iterrows()`` is iterated as ``(index, row)`` pairs;
                ``args[1]`` is the name of the attribute to encode
            **kwargs: accepted but not used

        Returns:
            pandas.DataFrame: one row per iterated datarow and one column per distinct element;
            rows whose cell is not a list are all zeros
        """
        datapoints, attribute = args[0], args[1]
        # Collect distinct elements in first-seen order. Using a dict (instead of an un-seeded reduce
        # over sets) keeps the column order reproducible across runs, works when no row holds a list,
        # and de-duplicates even when exactly one row holds a list.
        ordered_values = {}
        for cell in datapoints.observations[attribute]:
            if isinstance(cell, list):  # non-list cells (eg NaN) contribute no values
                ordered_values.update(dict.fromkeys(cell))
        self.values_set = set(ordered_values)
        self.columns = list(ordered_values)
        return pd.DataFrame([self._yield_vector(datarow, attribute) for _, datarow in datapoints.iterrows()],
                            columns=self.columns)

    def _yield_vector(self, datarow, attribute):
        """Return the encoded vector of a row: a real encoding for list cells, all zeros otherwise."""
        if isinstance(datarow[attribute], list):
            return self._encode(datarow, attribute)
        return self._encode_none(datarow, attribute)

    def _encode(self, datarow, attribute):
        """Return, for each column, 1.0 if the column's value is in the row's list and 0.0 otherwise."""
        # Build the lookup set once per row instead of scanning the list once per column.
        present = set(datarow[attribute])
        return [OneHotListEncoder.binary_transformer[column in present] for column in self.columns]

    def _encode_none(self, _datarow, _attribute):
        """Return an all-zeros vector with one entry per encoded column."""
        return [0.0] * len(self.columns)


# CMD 3
def one_hot_encoding_list_command(_data_manager, _datapoints, _attribute):
    """Encode a list-valued attribute and append the encoded columns to the data manager's observations.

    Args:
        _data_manager: object whose ``datapoints.observations`` first gets the missing values of the
            attribute column filled with ``numpy.nan`` and is then replaced by the column-wise
            concatenation of the observations and the encoded columns
        _datapoints: datapoints object handed to ``OneHotListEncoder.encode``
        _attribute: name of the attribute to encode

    Returns:
        None: the result is stored on ``_data_manager.datapoints.observations``
    """
    observations = _data_manager.datapoints.observations
    # Assign the filled column back instead of calling fillna(inplace=True) on the column selection:
    # the chained in-place form acts on a temporary under pandas Copy-on-Write and would be a no-op.
    observations[_attribute] = observations[_attribute].fillna(value=np.nan)
    dataframe = OneHotListEncoder().encode(_datapoints, _attribute)
    _data_manager.datapoints.observations = pd.concat([_data_manager.datapoints.observations, dataframe], axis=1)


# Commands whose first parameter is the data manager.
data_manager_commands = (one_hot_encoding_list_command, one_hot_encoding_command)
# Commands that do not take a data manager.
arbitrary_commands = (observations_command,)