Completed
Push — appveyor (280314...2c0e2c)
by Konstantinos
created 02:09

so_magic.data.backend.panda_handling.df_operations   A

Complexity

Total Complexity 35

Size/Duplication

Total Lines 237
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 80
dl 0
loc 237
rs 9.6
c 0
b 0
f 0
wmc 35

25 Functions

Rating   Name   Duplication   Size   Complexity  
A string_map() 0 3 1
A from_csv() 0 2 1
A add_qbin() 0 3 1
A drop_columns() 0 3 1
A add_reg() 0 3 1
A split_attributes() 0 4 1
A get_df_from_json() 0 2 1
A binned_indices() 0 4 1
A map_replace_string() 0 15 2
A _map() 0 4 1
A encode_bands() 0 4 1
A label_encode() 0 2 1
A missing_values() 0 2 1
A _op_gen() 0 7 4
A complete_numerical_with_median() 0 2 1
A encode_continuous() 0 3 1
A bin_column() 0 2 1
A add_bin_for_continuous() 0 2 1
A create_column() 0 2 1
A qbin_column() 0 2 1
A add_column() 0 3 1
A encode_continuous_many() 0 3 1
A df_map() 0 2 1
A encode_bands_many() 0 4 1
A complete_categorical_with_most_freq() 0 2 1

4 Methods

Rating   Name   Duplication   Size   Complexity  
A MedianFiller._condition() 0 2 2
A MedianFiller._convert() 0 2 1
A MedianFiller._set_value() 0 3 1
A MedianFiller.__call__() 0 11 2
import pandas as pd
import numpy as np  # needed by binned_indices below, which relies on np.digitize
from itertools import product
from functools import reduce

# __all__ = []

def split_attributes(dataframe):
    """Return the categorical and the numerical columns/attributes of the given dataframe."""
    numerical = dataframe._get_numeric_data().columns.values
    return list(set(dataframe.columns) - set(numerical)), numerical


def missing_values(dataframe):
    """Return a dict mapping each column name to its number of missing values; complete columns are omitted."""
    return {k: v for k, v in dataframe.isnull().sum().to_dict().items() if v != 0}

# train_data.describe(include=['O'])

def drop_columns(dataframe, *columns):
    """Remove the given columns from the given dataframe and return a new dataframe reference."""
    return dataframe.drop([*columns], axis=1)


def add_column(dataframe, name, values):
    """Add a new column with the given values and return a new dataframe reference."""
    return dataframe.assign(**{name: values})
########
def bin_column(column_ref, nb_bins):
    """Cut the given column into nb_bins equal-width bins (pd.cut)."""
    return pd.cut(column_ref, nb_bins)


def qbin_column(column_ref, nb_bins):
    """Cut the given column into nb_bins quantile-based bins, each holding roughly the same number of values (pd.qcut)."""
    return pd.qcut(column_ref, nb_bins)
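
# Illustrative sketch (not part of the original module; 'ages' is a hypothetical Series),
# contrasting equal-width and quantile bins on the same data:
# ages = pd.Series([1, 2, 3, 4, 50])
# bin_column(ages, 2)   # equal-width edges ~(0.951, 25.5], (25.5, 50.0] -> four values land in the first bin
# qbin_column(ages, 2)  # quantile edges ~(0.999, 3.0], (3.0, 50.0] -> roughly equal counts per bin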

def string_map(column_ref, strings, target_form):
    """Replace the values found in the 'strings' list with the target form, given a column (i.e. Series) reference, and return the reference."""
    return column_ref.replace(strings, target_form)

####
def add_bin_for_continuous(dataframe, column, new_column, nb_bins):
    """Add a new column holding the equal-width bin that each value of the given continuous column falls into."""
    return add_column(dataframe, new_column, list(bin_column(dataframe[column], nb_bins)))

def add_reg(dataframe, name, regex, target):
    """Add a new column by applying a regex extractor to an existing (target) column; return a new dataframe reference."""
    return add_column(dataframe, name, list(dataframe[target].str.extract(regex, expand=False)))

def map_replace_string(dataframe, column, norm):
    """Replace strings in the given column with their normalized form, according to the input mapping.

    Example input:
    norm = {
        'Rare': ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
        'Miss': ['Mlle', 'Ms'],
        'Mrs': ['Mme'],
    }
    """
    for target_form, strings in norm.items():
        dataframe[column] = string_map(dataframe[column], strings, target_form)
    return dataframe
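
# Illustrative sketch (not part of the original module; 'df' is hypothetical): normalizing a column of titles.
# df = pd.DataFrame({'Title': ['Mlle', 'Mme', 'Capt', 'Miss']})
# df = map_replace_string(df, 'Title', {'Miss': ['Mlle', 'Ms'], 'Mrs': ['Mme'], 'Rare': ['Capt']})
# df.Title.tolist()  # -> ['Miss', 'Mrs', 'Rare', 'Miss']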


###############################
def df_map(a_callable, dataframes, *args, **kwargs):
    """Apply the given callable to each of the given dataframes, forwarding any extra arguments, and return the list of results."""
    return [a_callable(x, *args, **kwargs) for x in dataframes]
#########################

# # DROP 'Ticket' and 'Cabin' columns
# train_data, test_data = df_map(drop_columns, [train_data, test_data], 'Ticket', 'Cabin')

def complete_categorical_with_most_freq(dataframe, column):
    """Fill missing values of a categorical column with the column's most frequent value (mode)."""
    return dataframe.assign(**{column: dataframe[column].fillna(dataframe[column].dropna().mode()[0])})


def complete_numerical_with_median(dataframe, column):
    """Fill missing values of a numerical column with the column's median."""
    return dataframe.assign(**{column: dataframe[column].fillna(dataframe[column].dropna().median())})
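
# Illustrative sketch (not part of the original module; 'df' is hypothetical): completing missing values.
# df = complete_categorical_with_most_freq(df, 'Embarked')  # NaN -> most frequent port of embarkation
# df = complete_numerical_with_median(df, 'Fare')           # NaN -> median fare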


class MedianFiller:
    def __call__(self, dataframe, column, columns):
        """Fill missing values in a dataframe's column with the median computed per combination of values of correlated columns.

        :param pandas.DataFrame dataframe:
        :param str column: column with missing values
        :param list columns: correlated columns
        :return: a dataframe reference with the column completed
        """
        for vector in product(*[list(dataframe[c].unique()) for c in columns]):
            self._set_value(dataframe, column, self._condition(dataframe, columns, vector))
        return dataframe.assign(**{column: dataframe[column].astype(int)})

    def _set_value(self, dataframe, column, condition):
        dataframe.loc[(dataframe[column].isnull()) & condition, column] = self._convert(
            dataframe[condition][column].dropna().median())

    def _condition(self, dataframe, columns, values_vector):
        # boolean mask selecting the rows that match the given combination of values
        return reduce(lambda i, j: i & j, [dataframe[c] == values_vector[e] for e, c in enumerate(columns)])

    def _convert(self, value):
        return value
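
# Illustrative sketch (not part of the original module; 'df' and the column names are hypothetical):
# fill missing 'Age' values with the median age of each (Sex, Pclass) group.
# df = MedianFiller()(df, 'Age', ['Sex', 'Pclass'])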


def create_column(dataframe, name, a_callable):
    """Add a new column with values computed by calling the given callable on the dataframe; return a new dataframe reference."""
    return dataframe.assign(**{name: a_callable(dataframe)})

# # CREATE 5 BINS for 'Age' column (discretize) and add a column to the 'train' data
# train_data = train_data.assign(**{'AgeBand': pd.cut(train_data.Age.astype(int), 5)})

def add_qbin(dataframe, target, nb_bins, destination):
    """Create a column (named 'destination') holding quantile-based bins of the continuous variable in the target column."""
    return dataframe.assign(**{destination: pd.qcut(dataframe[target], nb_bins)})


def binned_indices(values, left_boundaries):
    """Return an array with the index of the bin each of the given values belongs to, based on the input boundaries.

    If a value lies beyond the bounds of `left_boundaries`, 0 or ``len(left_boundaries)`` is returned as appropriate."""
    return np.digitize(values, left_boundaries)
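
# Illustrative sketch (not part of the original module): np.digitize against a list of left boundaries.
# binned_indices([0.5, 1.5, 25.0], [1, 10, 20])  # -> array([0, 1, 3]): below the first edge, inside [1, 10), past the last edge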


def _map(intervals_list):
    """Return a dictionary mapping Interval objects to numerical codes (0, 1, ...).

    Assumes that the input Intervals list is sorted."""
    return {interval_obj: index for index, interval_obj in enumerate(intervals_list)}


## operations for dfs with constructed bins/bands
def encode_bands(dataframe, target_column, intervals_list, destination_column):
    """Add a destination column holding the numerical code (0, 1, ...) of the Interval found in each row of the target column.

    Assumes that the input Intervals list is sorted."""
    return dataframe.assign(**{destination_column: dataframe[target_column].map(_map(intervals_list)).astype(int)})


def encode_bands_many(dataframe, targets, intervals_lists, destinations):
    """Encode several interval-valued columns at once; see encode_bands."""
    return dataframe.assign(**{dest_c: dataframe[target_c].map(_map(intervals_list)).astype(int)
                               for target_c, intervals_list, dest_c in zip(targets, intervals_lists, destinations)})
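
# Illustrative sketch (not part of the original module; 'df' is hypothetical): encoding an interval-valued column.
# df = pd.DataFrame({'AgeBand': pd.cut(pd.Series([5, 25, 45]), 3)})
# intervals = sorted(df['AgeBand'].unique())
# encode_bands(df, 'AgeBand', intervals, 'Age_Code')  # Age_Code -> 0, 1, 2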


def encode_continuous(dataframe, target_column, intervals_list, destination_column):
    """Encode raw continuous values into the code of the band they fall into, given the (sorted) bands' Intervals."""
    # np.digitize expects an array-like of boundaries, so the left edges are materialized as a list
    return dataframe.assign(
        **{destination_column: binned_indices(dataframe[target_column], [x.left for x in intervals_list]) - 1})


def encode_continuous_many(dataframe, targets, intervals_lists, destinations):
    """Encode several continuous columns at once; see encode_continuous."""
    return dataframe.assign(**{dest_c: binned_indices(dataframe[target_c], [x.left for x in intervals_list]) - 1
                               for target_c, intervals_list, dest_c in zip(targets, intervals_lists, destinations)})


def _op_gen(dataframe, columns, band_str='Band', post_str='_Code'):
    """Yield encoding operations: the first yielded callable encodes the band columns of the reference dataframe;
    every subsequent one encodes raw continuous columns against the same intervals."""
    intervals_lists, coded = [list(_) for _ in zip(*[(sorted(dataframe[c + band_str].unique()), '{}{}'.format(c, post_str)) for c in columns])]
    yield lambda x: encode_bands_many(x, [c + band_str for c in columns], intervals_lists, coded)
    while True:
        yield lambda x: encode_continuous_many(x, columns, intervals_lists, coded)


#### CONSTANTS #####
POST_STR = '_Code'  # postfix string for encoded variables
BAND_STR = 'Band'

#### SETTINGS ###
# PICK columns with categorical variables to encode with sklearn's LabelEncoder
TO_ENCODE_WITH_SKLEARN = ['Embarked', 'Sex', 'Title']

TO_ENCODE_WITH_INTERVALS = ['Age', 'Fare']


def label_encode(dataframe, columns, encode_callback, code_str='_Code'):
    """Add a '<column><code_str>' column for each of the given columns, with values produced by the encode callback."""
    return dataframe.assign(**{c + code_str: encode_callback(dataframe[c]) for c in columns})

# train_data, test_data = df_map(label_encode,
#                                [train_data, test_data],
#                                ['Embarked', 'Sex', 'Title'],
#                                LabelEncoder().fit_transform)  # encodes categorical objects into indices starting from 0


# ENCODE 'Age' and 'Fare' by creating the 'Age_Code' and 'Fare_Code' columns in 'train_data' and 'test_data'
#
# op_gen = _op_gen(train_data, ['Age', 'Fare'], band_str='Band', post_str='_Code')
# train_data, test_data = [next(op_gen)(df) for df in [train_data, test_data]]


#### SELECTION
#
# # define y variable aka target/outcome
# Target = ['Survived']
#
# # FEATURE SELECTION
# # define variables (original and encoded)
# feature_titles = ['Sex', 'Pclass', 'Embarked', 'Title', 'SibSp', 'Parch', 'Age', 'Fare', 'FamilySize', 'IsAlone']  # pretty names/values for charts
# feature_names = ['Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code', 'SibSp', 'Parch', 'Age', 'Fare']  # coded for algorithm calculation
# # data1_xy = Target + data1_x
# # print('Original X Y: ', data1_xy, '\n')
#
# # define x variables for original w/bin variables to remove continuous variables
# data1_x_bin = ['Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code', 'FamilySize', 'Age_Code', 'Fare_Code']
# # data1_xy_bin = Target + data1_x_bin
# # print('Bin X Y: ', data1_xy_bin, '\n')
#
# # define x and y variables for dummy variables original
# data1_dummy = pd.get_dummies(train_data[feature_titles])
# data1_x_dummy = data1_dummy.columns.tolist()
# # data1_xy_dummy = Target + data1_x_dummy
# # print('Dummy X Y: ', data1_xy_dummy, '\n')
#
# # SELECT variables
# numerical_feats = ['Pclass', 'Fare_Code', 'Age_Code', 'FamilySize']  # ordering makes sense: e.g. class_1 < class_2
# binary_feats = ['Sex_Code', 'IsAlone']
# categorical_feats = ['Embarked']
#
# assert all(all(x in train_data.columns for x in y) for y in [numerical_feats, binary_feats, categorical_feats])
#
# # convert e.g. a column "color" that takes {'white', 'black'} as values into
# # 2 columns: 'color_white' and 'color_black' (each taking 0 or 1)
# pd.get_dummies(train_data[categorical_feats]).head()
#
# X_train = pd.concat([train_data[numerical_feats + binary_feats], pd.get_dummies(train_data[categorical_feats])], axis=1)
# X_test = pd.concat([test_data[numerical_feats + binary_feats], pd.get_dummies(test_data[categorical_feats])], axis=1)
# X_train.head()


def get_df_from_json(json_path):
    """Read a dataframe from a json file."""
    return pd.read_json(path_or_buf=json_path)


def from_csv(file_path):
    """Read a dataframe from a csv file."""
    return pd.read_csv(file_path)