Passed
Push — master ( 5f2d49...176334 )
by Konstantinos
49s queued 12s
created

so_magic.data.pd_commands   A

Complexity

Total Complexity 9

Size/Duplication

Total Lines 73
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 45
dl 0
loc 73
rs 10
c 0
b 0
f 0
wmc 9

5 Methods

Rating   Name   Duplication   Size   Complexity  
A OneHotListEncoder.encode() 0 8 2
A OneHotListEncoder._yield_vector() 0 3 1
A OneHotListEncoder._encode() 0 2 1
A OneHotListEncoder._encode_none() 0 2 1
A OneHotEncoder.encode() 0 9 1

3 Functions

Rating   Name   Duplication   Size   Complexity  
A observations_command() 0 2 1
A one_hot_encoding_list_command() 0 4 1
A one_hot_encoding_command() 0 5 1
"""Defines backend-dependent (e.g. using pandas as the backend library) functions that will serve as engine commands.
These commands should be "built" using a suitable function/decorator.
These commands should be able to be defined at runtime, as part of client code (with respect to this library).
"""
5
from functools import reduce
6
import numpy as np
7
import pandas as pd
8
9
from so_magic.data.encoding import NominalAttributeEncoder
10
11
12
# Public names of this module: the two command registries defined at the bottom of the file.
__all__ = [
    'data_manager_commands',
    'arbitrary_commands',
]
13
14
15
# CMD 1
16
def observations_command(file_path):
    """Read a JSON Lines file (one JSON object per line) into a pandas DataFrame.

    Args:
        file_path: location of the file, passed as-is to ``pandas.read_json``.

    Returns:
        The DataFrame built by ``pandas.read_json(file_path, lines=True)``.
    """
    observations = pd.read_json(file_path, lines=True)
    return observations
18
19
20
class OneHotEncoder(NominalAttributeEncoder):
    """One-hot encode a single-valued nominal attribute with ``pandas.get_dummies``.

    After :meth:`encode` has run, ``columns`` holds the generated column names
    and ``values_set`` holds the same names with the leading
    ``<attribute>_`` prefix removed.
    """

    def encode(self, *args, **kwargs):
        """Build the one-hot (dummy) columns for one attribute.

        Args:
            *args: ``args[0]`` is the datapoints object, whose ``observations``
                attribute is indexed with the attribute name; ``args[1]`` is
                the name of the attribute to encode.
            **kwargs: accepted but not used.

        Returns:
            The frame produced by ``pandas.get_dummies``; its columns are named
            ``<attribute>_<value>``.
        """
        datapoints = args[0]
        attribute = args[1]
        prefix_separator = '_'
        dataframe = pd.get_dummies(datapoints.observations[attribute], prefix=attribute,
                                   prefix_sep=prefix_separator, drop_first=False)
        self.columns = list(dataframe.columns)
        # Strip only the leading '<attribute>_' prefix. A plain str.replace would also
        # delete the same text wherever it re-occurs inside a value (e.g. attribute 'a'
        # and value 'b_a_c' would wrongly become 'b_c').
        prefix = f'{attribute}{prefix_separator}'
        self.values_set = [column[len(prefix):] if column.startswith(prefix) else column
                           for column in self.columns]
        return dataframe
31
32
33
# CMD 2
34
def one_hot_encoding_command(_data_manager, _datapoints, _attribute):
    """One-hot encode an attribute and append the new columns to the manager's observations.

    Args:
        _data_manager: object whose ``datapoints.observations`` frame is replaced
            by the original frame concatenated (column-wise) with the encoded columns.
        _datapoints: datapoints object handed to ``OneHotEncoder.encode``.
        _attribute: name of the attribute to encode.
    """
    encoded_columns = OneHotEncoder().encode(_datapoints, _attribute)
    # TODO add a add_columns method to the mutator interface
    # and replace the concatenation below with datapoints.mutator.add_columns(...)
    current_observations = _data_manager.datapoints.observations
    _data_manager.datapoints.observations = pd.concat([current_observations, encoded_columns], axis=1)
39
40
41
class OneHotListEncoder(NominalAttributeEncoder):
    """One-hot encode an attribute whose values are lists of nominal values.

    Each distinct element found in any of the lists becomes one output column.
    A row gets 1.0 in the columns of the elements its list contains and 0.0
    elsewhere; rows whose value is not a list get an all-zero vector.
    """

    # Maps the result of a membership test to the float stored in the encoded vector.
    binary_transformer = {True: 1.0, False: 0.0}

    def encode(self, *args, **kwargs):
        """Compute the set of distinct list elements and build the encoded frame.

        Args:
            *args: ``args[0]`` is the datapoints object (its ``observations``
                attribute is indexed with the attribute name and its
                ``iterrows()`` is used to walk the rows); ``args[1]`` is the
                name of the attribute to encode.
            **kwargs: accepted but not used.

        Returns:
            A ``pandas.DataFrame`` with one column per distinct element and one
            row per item yielded by ``datapoints.iterrows()``.
        """
        datapoints = args[0]
        attribute = args[1]
        value_lists = [_ for _ in datapoints.observations[attribute] if isinstance(_, list)]
        # Fold from an empty set so the result is always a set: without the initial value,
        # reduce() raises TypeError when no row holds a list and hands back the single list
        # unchanged (duplicates included) when exactly one row does.
        self.values_set = reduce(lambda accumulated, values: accumulated.union(values), value_lists, set())
        # Column order follows set iteration order.
        self.columns = list(self.values_set)
        # NOTE(review): rows come from datapoints.iterrows() while the values above are read
        # from datapoints.observations -- confirm the datapoints object exposes iterrows().
        return pd.DataFrame([self._yield_vector(datarow, attribute) for index, datarow in datapoints.iterrows()],
                            columns=self.columns)

    def _yield_vector(self, datarow, attribute):
        """Return the encoded vector of a row, dispatching on whether its value is a list."""
        decision = {True: self._encode, False: self._encode_none}
        return decision[isinstance(datarow[attribute], list)](datarow, attribute)

    def _encode(self, datarow, attribute):
        """Return 1.0/0.0 per column depending on membership in the row's list."""
        return [OneHotListEncoder.binary_transformer[column in datarow[attribute]] for column in self.columns]

    def _encode_none(self, _datarow, _attribute):
        """Return the all-zero vector used for rows whose value is not a list."""
        return [0.0] * len(self.columns)
62
63
64
# CMD 3
65
def one_hot_encoding_list_command(_data_manager, _datapoints, _attribute):
    """One-hot encode a list-valued attribute and append the new columns to the observations.

    Args:
        _data_manager: object whose ``datapoints.observations`` frame has its
            ``_attribute`` column filled and is then replaced by the frame
            concatenated (column-wise) with the encoded columns.
        _datapoints: datapoints object handed to ``OneHotListEncoder.encode``.
        _attribute: name of the list-valued attribute to encode.
    """
    observations = _data_manager.datapoints.observations
    # Assign the filled column back instead of calling fillna(inplace=True) on the
    # selected column: the chained in-place form is deprecated in pandas and does not
    # update the parent frame when Copy-on-Write is enabled.
    observations[_attribute] = observations[_attribute].fillna(value=np.nan)
    dataframe = OneHotListEncoder().encode(_datapoints, _attribute)
    _data_manager.datapoints.observations = pd.concat([_data_manager.datapoints.observations, dataframe], axis=1)
69
70
71
# Commands whose first parameter is the data manager.
data_manager_commands = (
    one_hot_encoding_list_command,
    one_hot_encoding_command,
)
# Commands that take no data manager.
arbitrary_commands = (
    observations_command,
)
73