import pandas as pd
import numpy as np  # needed by binned_indices (np.digitize)
from itertools import product
from functools import reduce

# __all__ = []


def split_attributes(dataframe):
    """Return the categorical and numerical columns/attributes of the given dataframe."""
    numerical = dataframe._get_numeric_data().columns.values
    return list(set(dataframe.columns) - set(numerical)), numerical
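
# Usage sketch (assumes a loaded dataframe such as a Titanic 'train_data'; names are illustrative):
# categorical, numerical = split_attributes(train_data)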


def missing_values(dataframe):
    """Return a dict mapping each column that has missing values to its count of missing values."""
    return {k: v for k, v in dataframe.isnull().sum().to_dict().items() if v != 0}
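
# Usage sketch (hypothetical 'train_data'; on the Titanic train set this returns something like
# {'Age': 177, 'Cabin': 687, 'Embarked': 2} - counts are illustrative):
# missing_values(train_data)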

# train_data.describe(include=['O'])


def drop_columns(dataframe, *columns):
    """Remove the given columns from the given dataframe and return a new dataframe reference."""
    return dataframe.drop([*columns], axis=1)


def add_column(dataframe, name, values):
    """Add a new column with the given values and return a new dataframe reference."""
    return dataframe.assign(**{name: values})

########

def bin_column(column_ref, nb_bins):
    """Bin the given column (Series) into nb_bins equal-width intervals."""
    return pd.cut(column_ref, nb_bins)


def qbin_column(column_ref, nb_bins):
    """Bin the given column (Series) into nb_bins quantile-based (roughly equal-sized) intervals."""
    return pd.qcut(column_ref, nb_bins)


def string_map(column_ref, strings, target_form):
    """Replace values found in the 'strings' list with the target form, given a column (i.e. Series) reference, and return the reference."""
    return column_ref.replace(strings, target_form)

####

def add_bin_for_continuous(dataframe, column, new_column, nb_bins):
    """Add a column of equal-width bins computed from a continuous column and return a new dataframe reference."""
    return add_column(dataframe, new_column, list(bin_column(dataframe[column], nb_bins)))


def add_reg(dataframe, name, regex, target):
    """Add a new column by applying a regex extractor to an existing column and return a new dataframe reference."""
    return add_column(dataframe, name, list(dataframe[target].str.extract(regex, expand=False)))
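
# Usage sketch (assumes a Titanic-style 'Name' column; the hypothetical regex captures the title word before a dot):
# train_data = add_reg(train_data, 'Title', r' ([A-Za-z]+)\.', 'Name')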


def map_replace_string(dataframe, column, norm):
    """Replace strings with their normalized form, given the input mapping.

    Example input:
        norm = {
            'Rare': ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
            'Miss': ['Mlle', 'Ms'],
            'Mrs': ['Mme']
        }
    """
    for k, v in norm.items():
        dataframe[column] = string_map(dataframe[column], v, k)
    return dataframe
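
# Usage sketch (assumes the 'Title' column added above; mapping trimmed for illustration):
# train_data = map_replace_string(train_data, 'Title', {'Miss': ['Mlle', 'Ms'], 'Mrs': ['Mme']})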


###############################
def df_map(a_callable, dataframes, *args, **kwargs):
    """Apply the callable to each dataframe in the given iterable and return the list of results."""
    return [a_callable(x, *args, **kwargs) for x in dataframes]
#########################

# DROP 'Ticket' and 'Cabin' columns
# train_data, test_data = df_map(drop_columns, [train_data, test_data], 'Ticket', 'Cabin')

def complete_categorical_with_most_freq(dataframe, column):
    """Fill missing values in a categorical column with its most frequent value and return a new dataframe reference."""
    return dataframe.assign(**{column: dataframe[column].fillna(dataframe[column].dropna().mode()[0])})


def complete_numerical_with_median(dataframe, column):
    """Fill missing values in a numerical column with its median and return a new dataframe reference."""
    return dataframe.assign(**{column: dataframe[column].fillna(dataframe[column].dropna().median())})
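
# Usage sketch (Titanic column names, assumed for illustration):
# train_data = complete_categorical_with_most_freq(train_data, 'Embarked')
# train_data = complete_numerical_with_median(train_data, 'Fare')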


class MedianFiller:
    def __call__(self, dataframe, column, columns):
        """
        Fill missing values in a dataframe's column according to the medians computed on groups defined by correlated columns.

        :param pandas.DataFrame dataframe: dataframe to complete
        :param str column: column with missing values
        :param list columns: correlated columns
        :return: a dataframe reference with the column completed
        """
        for vector in product(*[list(dataframe[c].unique()) for c in columns]):
            self._set_value(dataframe, column, self._condition(dataframe, columns, vector))
        return dataframe.assign(**{column: dataframe[column].astype(int)})

    def _set_value(self, dataframe, column, condition):
        dataframe.loc[(dataframe[column].isnull()) & condition, column] = self._convert(
            dataframe[condition][column].dropna().median())

    def _condition(self, dataframe, columns, values_vector):
        # build a boolean mask requiring each correlated column to equal its value in the vector
        return reduce(lambda i, j: i & j, [dataframe[c] == values_vector[e] for e, c in enumerate(columns)])

    def _convert(self, value):
        return value
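
# Usage sketch (assumes 'Age' is missing values and correlates with 'Sex' and 'Pclass', as in the Titanic data):
# train_data = MedianFiller()(train_data, 'Age', ['Sex', 'Pclass'])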


def create_column(dataframe, name, a_callable):
    """Add a column computed by applying the callable to the dataframe and return a new dataframe reference."""
    return dataframe.assign(**{name: a_callable(dataframe)})
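
# Usage sketch ('FamilySize' from 'SibSp' + 'Parch' appears in the feature lists below; assumed here for illustration):
# train_data = create_column(train_data, 'FamilySize', lambda df: df['SibSp'] + df['Parch'] + 1)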

# CREATE 5 BINS for the 'Age' column (discretize) and add a column to the 'train' data
# train_data = train_data.assign(**{'AgeBand': pd.cut(train_data.Age.astype(int), 5)})


def add_qbin(dataframe, target, nb_bins, destination):
    """Create a column (with the 'destination' name) of quantile-based bins of the continuous variable given in the target column."""
    return dataframe.assign(**{destination: pd.qcut(dataframe[target], nb_bins)})
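
# Usage sketch (four fare quantiles, mirroring the 'AgeBand' example above):
# train_data = add_qbin(train_data, 'Fare', 4, 'FareBand')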


def binned_indices(values, left_boundaries):
    """Return an array with the index of the bin each of the given values belongs to, based on the input boundaries.

    If values are beyond the bounds of `left_boundaries`, 0 or ``len(left_boundaries)`` is returned as appropriate."""
    return np.digitize(values, left_boundaries)
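
# Sketch of the boundary behaviour (values are illustrative):
# binned_indices([3, 11, 25], [0, 10, 20])  # -> array([1, 2, 3]); a value below 0 would map to 0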


def _map(intervals_list):
    """Return a dictionary mapping Interval objects to numerical codes (0, 1, ..).

    Assumes that the input intervals list is sorted."""
    return {interval_obj: index for index, interval_obj in enumerate(intervals_list)}


## operations for dfs with constructed bins/bands
def encode_bands(dataframe, target_column, intervals_list, destination_column):
    """Encode a column of Interval objects into numerical codes (0, 1, ..) in the destination column and return a new dataframe reference.

    Assumes that the input intervals list is sorted."""
    return dataframe.assign(**{destination_column: dataframe[target_column].map(_map(intervals_list)).astype(int)})
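
# Usage sketch (assumes an 'AgeBand' column of pd.cut Intervals, as in the 'AgeBand' example above):
# train_data = encode_bands(train_data, 'AgeBand', sorted(train_data['AgeBand'].unique()), 'Age_Code')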


def encode_bands_many(dataframe, targets, intervals_lists, destinations):
    """Encode multiple Interval columns into numerical codes, one destination column per target column."""
    return dataframe.assign(**{dest_c: dataframe[target_c].map(_map(intervals_list)).astype(int)
                               for target_c, intervals_list, dest_c in zip(targets, intervals_lists, destinations)})


def encode_continuous(dataframe, target_column, intervals_list, destination_column):
    """Encode a continuous column into the code of the interval each value falls into and return a new dataframe reference."""
    return dataframe.assign(
        **{destination_column: binned_indices(dataframe[target_column], [x.left for x in intervals_list]) - 1})


def encode_continuous_many(dataframe, targets, intervals_lists, destinations):
    """Encode multiple continuous columns against their interval lists, one destination column per target column."""
    return dataframe.assign(**{dest_c: binned_indices(dataframe[target_c], [x.left for x in intervals_list]) - 1
                               for target_c, intervals_list, dest_c in zip(targets, intervals_lists, destinations)})


def _op_gen(dataframe, columns, band_str='Band', post_str='_Code'):
    """Yield an encode_bands_many operation once, then encode_continuous_many operations forever."""
    interval_lists = [sorted(dataframe[c + band_str].unique()) for c in columns]
    coded = ['{}{}'.format(c, post_str) for c in columns]
    yield lambda x: encode_bands_many(x, [c + band_str for c in columns], interval_lists, coded)
    while True:
        yield lambda x: encode_continuous_many(x, columns, interval_lists, coded)


#### CONSTANTS #####
POST_STR = '_Code'  # postfix string for encoded variables
BAND_STR = 'Band'

#### SETTINGS ###
# PICK columns with categorical variables to encode with sklearn LabelEncoder
TO_ENCODE_WITH_SKLEARN = ['Embarked', 'Sex', 'Title']

TO_ENCODE_WITH_INTERVALS = ['Age', 'Fare']


def label_encode(dataframe, columns, encode_callback, code_str='_Code'):
    """Encode the given categorical columns with the callback, adding one '<column><code_str>' column per input column."""
    return dataframe.assign(**{c + code_str: encode_callback(dataframe[c]) for c in columns})

# from sklearn.preprocessing import LabelEncoder
# train_data, test_data = df_map(label_encode,
#                                [train_data, test_data],
#                                ['Embarked', 'Sex', 'Title'],
#                                LabelEncoder().fit_transform)  # encodes categorical objects into indices starting from 0

# ENCODE 'Age' and 'Fare' by creating the 'Age_Code' and 'Fare_Code' columns in 'train_data' and 'test_data':
# op_gen = _op_gen(train_data, ['Age', 'Fare'], band_str='Band', post_str='_Code')
# train_data, test_data = [next(op_gen)(df) for df in [train_data, test_data]]


#### SELECTION
#
# # define y variable aka target/outcome
# Target = ['Survived']
#
# # FEATURE SELECTION
# # define variables (original and encoded)
# feature_titles = ['Sex', 'Pclass', 'Embarked', 'Title', 'SibSp', 'Parch', 'Age', 'Fare', 'FamilySize', 'IsAlone']  # pretty names/values for charts
# feature_names = ['Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code', 'SibSp', 'Parch', 'Age', 'Fare']  # coded for algorithm calculation
# # data1_xy = Target + data1_x
# # print('Original X Y: ', data1_xy, '\n')
#
#
# # define x variables for original w/bin variables to remove continuous variables
# data1_x_bin = ['Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code', 'FamilySize', 'Age_Code', 'Fare_Code']
# # data1_xy_bin = Target + data1_x_bin
# # print('Bin X Y: ', data1_xy_bin, '\n')
#
#
# # define x and y variables for dummy variables original
# data1_dummy = pd.get_dummies(train_data[feature_titles])
# data1_x_dummy = data1_dummy.columns.tolist()
# # data1_xy_dummy = Target + data1_x_dummy
# # print('Dummy X Y: ', data1_xy_dummy, '\n')
#
#
# # SELECT variables
# numerical_feats = ['Pclass', 'Fare_Code', 'Age_Code', 'FamilySize']  # ordering makes sense: e.g. class_1 < class_2
# binary_feats = ['Sex_Code', 'IsAlone']
# categorical_feats = ['Embarked']
#
# assert all(all(x in train_data.columns for x in y) for y in [numerical_feats, binary_feats, categorical_feats])
#
# # convert e.g. column "color" that takes {'white', 'black'} as values to
# # 2 columns: 'color_white' and 'color_black' (that take 0 or 1)
# pd.get_dummies(train_data[categorical_feats]).head()
#
#
# X_train = pd.concat([train_data[numerical_feats + binary_feats], pd.get_dummies(train_data[categorical_feats])], axis=1)
# X_test = pd.concat([test_data[numerical_feats + binary_feats], pd.get_dummies(test_data[categorical_feats])], axis=1)
# X_train.head()


def get_df_from_json(json_path):
    """Read a dataframe from a json file."""
    return pd.read_json(path_or_buf=json_path)


def from_csv(file_path):
    """Read a dataframe from a csv file."""
    return pd.read_csv(file_path)