1
|
|
|
''' |
2
|
|
|
Functions for data preprocessing. |
3
|
|
|
|
4
|
|
|
:author: Andreas Kanz |
5
|
|
|
|
6
|
|
|
''' |
7
|
|
|
|
8
|
|
|
# Imports |
9
|
|
|
import numpy as np |
10
|
|
|
import pandas as pd |
11
|
|
|
|
12
|
|
|
from sklearn.model_selection import train_test_split |
13
|
|
|
|
14
|
|
|
from .describe import corr_mat |
15
|
|
|
from .utils import _missing_vals |
16
|
|
|
from .utils import _validate_input_int |
17
|
|
|
from .utils import _validate_input_range |
18
|
|
|
|
19
|
|
|
|
20
|
|
|
def mv_col_handler(data, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3):
    '''
    Converts columns with a high ratio of missing values into binary features and eventually drops them based on
    their correlation with other features and the target variable. This function follows a three step process:
    - 1) Identify features with a high ratio of missing values
    - 2) Identify high correlations of these features among themselves and with other features in the dataset.
    - 3) Features with high ratio of missing values and high correlation among each other are dropped unless
         they correlate reasonably well with the target variable.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Target used in step 3. A string is looked up as a column of data, a list or np.array is aligned
        positionally with the rows of data. If None, every feature identified in step 2 is dropped.

    mv_threshold: float, default 0.1
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are
        candidates for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value between 0 <= threshold <= 1. Maximum absolute correlation a previously identified feature with a
        high mv-ratio is allowed to have with another feature. If this threshold is overstepped, the feature
        undergoes further analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. feature with
        a high mv-ratio and high correlation to another existing feature) with the target. If this threshold is
        not met the feature is ultimately dropped.

    Returns
    -------
    data: Updated Pandas DataFrame
    drop_cols: List of dropped columns
    '''

    # Validate Inputs
    _validate_input_range(mv_threshold, 'mv_threshold', 0, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1)

    data = pd.DataFrame(data).copy()

    # Resolve the target to a Series before any column is binarized, so that a target given as a column name
    # or as a plain sequence can be correlated just like a pd.Series.
    if isinstance(target, str):
        target = data[target].copy()
    elif target is not None and not isinstance(target, (pd.Series, pd.DataFrame)):
        target = pd.Series(np.asarray(target), index=data.index)

    mv_ratios = _missing_vals(data)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()

    # Missing-value indicator: 1.0 where a value is present, 0.0 where it is missing.
    data_mv_binary = data[cols_mv].notna().astype(float)

    for col in cols_mv:
        data[col] = data_mv_binary[col]

    high_corr_features = []
    data_temp = data.copy()
    corrmat = None  # computed lazily and refreshed only after data_temp has changed
    for col in cols_mv:
        if corrmat is None:
            corrmat = corr_mat(data_temp, colored=False)

        # The largest absolute correlation is presumably the feature with itself, hence the second largest is
        # inspected. Fewer than two values remain if the correlations are undefined (e.g. constant indicator).
        strongest = corrmat[col].abs().nlargest(2)
        if len(strongest) > 1 and strongest.iloc[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])
            corrmat = None

    if target is None:
        # Without a target all highly correlated mv-features are dropped; report them as such.
        data = data_temp
        drop_cols = high_corr_features
    else:
        # NOTE(review): the comparison uses the signed correlation, so strongly negative correlations with the
        # target are dropped as well - confirm whether the absolute value is intended.
        drop_cols = [col for col in high_corr_features
                     if data_mv_binary[col].corr(target) < corr_thresh_target]
        data = data.drop(columns=drop_cols)

    return data, drop_cols
87
|
|
|
|
88
|
|
|
|
89
|
|
|
def train_dev_test_split(data, target, dev_size=0.1, test_size=0.1, stratify=None, random_state=1234):
    '''
    Split a dataset and a label column into train, dev and test sets.

    Parameters:
    ----------

    data: 2D dataset. If target is given as a string, data must be a Pandas DataFrame containing that column.

    target: string, list, np.array or pd.Series
        Label data. A string is interpreted as the name of the label column in data; the column is removed from
        the features before splitting. Any other supported input is used as the label values directly.

    dev_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the dev
        split.

    test_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test
        split.

    stratify: array-like, default None
        If not None, data is split in a stratified fashion, using the input as the class labels. The same labels
        are used for both the train/holdout split and the dev/test split.

    random_state: integer
        Random_state is the seed used by the random number generator.

    Returns
    -------
    tuple: Tuple containing train-dev-test split of inputs, i.e. (X_train, X_dev, X_test, y_train, y_dev, y_test).
        If dev_size or test_size is 0, only a single holdout set is created and the tuple is
        (X_train, X_holdout, y_train, y_holdout).

    Raises
    ------
    ValueError: If target is not a string, list, np.array or pd.Series.
    '''

    # Validate Inputs
    _validate_input_int(random_state, 'random_state')
    _validate_input_range(dev_size, 'dev_size', 0, 1)
    _validate_input_range(test_size, 'test_size', 0, 1)

    if isinstance(target, str):
        target_data = data[target]
        data = data.drop(target, axis=1)
    elif isinstance(target, (list, pd.Series, np.ndarray)):
        target_data = pd.Series(target)
    else:
        raise ValueError('target must be a column name, list, np.array or pd.Series.')

    holdout_size = dev_size + test_size

    if stratify is None:
        X_train, X_dev_test, y_train, y_dev_test = train_test_split(data, target_data,
                                                                    test_size=holdout_size,
                                                                    random_state=random_state)
        stratify_dev_test = None
    else:
        # Split the stratification labels alongside the data so the matching subset is available for the
        # second split.
        X_train, X_dev_test, y_train, y_dev_test, _, stratify_dev_test = train_test_split(
            data, target_data, stratify,
            test_size=holdout_size,
            random_state=random_state,
            stratify=stratify)

    if (dev_size == 0) or (test_size == 0):
        return X_train, X_dev_test, y_train, y_dev_test

    # Only stratify the dev/test split if stratification was requested by the caller.
    X_dev, X_test, y_dev, y_test = train_test_split(X_dev_test, y_dev_test,
                                                    test_size=test_size / holdout_size,
                                                    random_state=random_state,
                                                    stratify=stratify_dev_test)
    return X_train, X_dev, X_test, y_train, y_dev, y_test
150
|
|
|
|