1
|
|
|
''' |
2
|
|
|
Functions for data preprocessing. |
3
|
|
|
|
4
|
|
|
:author: Andreas Kanz |
5
|
|
|
|
6
|
|
|
''' |
7
|
|
|
|
8
|
|
|
# Imports |
9
|
|
|
import numpy as np |
10
|
|
|
import pandas as pd |
11
|
|
|
|
12
|
|
|
from sklearn.model_selection import train_test_split |
13
|
|
|
|
14
|
|
|
from .describe import corr_mat |
15
|
|
|
from .utils import _missing_vals |
16
|
|
|
from .utils import _validate_input_int |
17
|
|
|
from .utils import _validate_input_range |
18
|
|
|
|
19
|
|
|
|
20
|
|
|
def mv_col_handler(data, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3):
    '''
    Converts columns with a high ratio of missing values into binary features and eventually drops them based on \
    their correlation with other features and the target variable. This function follows a three step process:
    - 1) Identify features with a high ratio of missing values
    - 2) Identify high correlations of these features among themselves and with other features in the dataset.
    - 3) Features with high ratio of missing values and high correlation among each other are dropped unless \
    they correlate reasonably well with the target variable.

    Note: If no target is provided, the process exits after step two and drops columns identified up to this point.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. E.g. label column to generate only the correlations between each feature \
        and the label. A string is interpreted as a column name in data.

    mv_threshold: float, default 0.1
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are \
        candidates for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified feature with a high \
        mv-ratio is allowed to have with another feature. If this threshold is overstepped, the feature undergoes \
        further analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. feature with \
        a high mv-ratio and high correlation to another existing feature) with the target. If this threshold is \
        not met the feature is ultimately dropped.

    Returns
    -------
    data: Updated Pandas DataFrame
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    '''

    # Validate Inputs
    _validate_input_range(mv_threshold, 'mv_threshold', 0, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1)

    data = pd.DataFrame(data).copy()

    # Step 1: find columns whose missing-value ratio exceeds the threshold and
    # binarize them (1 = value present, 0 = value missing).
    data_local = data.copy()
    mv_ratios = _missing_vals(data_local)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    # notna().astype(int) replaces the deprecated applymap(...)/fillna(0) combination
    # and produces the same 0/1 indicator columns.
    data_local[cols_mv] = data_local[cols_mv].notna().astype(int)

    # Step 2: flag binarized columns that correlate strongly with any other feature.
    # Flagged columns are removed from the working frame so they do not dominate
    # the correlation matrix of subsequent iterations.
    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        # nlargest(2): the largest correlation is the column with itself (1.0),
        # so the second entry is the strongest correlation with another feature.
        top_corrs = corrmat[col].abs().nlargest(2)
        # .iloc avoids the deprecated positional fallback of label-based [] indexing;
        # the length guard protects against frames with fewer than two columns.
        if len(top_corrs) > 1 and top_corrs.iloc[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    # Step 3: drop flagged columns unless they correlate reasonably with the target.
    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
        # Report what was actually dropped; previously an empty list was returned
        # here even though columns were removed, contradicting the docstring.
        drop_cols = high_corr_features
    else:
        if isinstance(target, str):
            # Allow specifying the target by column name, as documented.
            target = data[target]
        # Coerce list / np.ndarray targets so corrwith works for every documented type.
        target = pd.Series(target)
        for col in high_corr_features:
            # corrwith returns a one-element Series for the single column; abs()
            # treats strong negative correlation as equally informative.
            if abs(data_local[[col]].corrwith(target).iloc[0]) < corr_thresh_target:
                drop_cols.append(col)
                data = data.drop(columns=[col])

    return data, cols_mv, drop_cols
88
|
|
|
|
89
|
|
|
|
90
|
|
|
def train_dev_test_split(data, target, dev_size=0.1, test_size=0.1, stratify=None, random_state=1234):
    '''
    Split a dataset and a label column into train, dev and test sets.

    Parameters:
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the \
    index/column information is used to label the plots.

    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. E.g. label column to generate only the correlations between each feature \
        and the label. A string is interpreted as a column name in data and is removed from the features.

    dev_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the dev \
        split.

    test_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the \
        test split.

    stratify: target column, default None
        If not None, data is split in a stratified fashion, using the input as the class labels.

    random_state: integer
        Random_state is the seed used by the random number generator.

    Returns
    -------
    tuple: Tuple containing train-dev-test split of inputs. If dev_size or test_size is 0, only a single split \
    (X_train, X_dev_test, y_train, y_dev_test) is returned.
    '''

    # Validate Inputs
    _validate_input_range(dev_size, 'dev_size', 0, 1)
    _validate_input_range(test_size, 'test_size', 0, 1)
    _validate_input_int(random_state, 'random_state')

    target_data = []
    if isinstance(target, str):
        # Target given as a column name: separate the labels from the features.
        target_data = data[target]
        data = data.drop(target, axis=1)
    elif isinstance(target, (list, pd.Series, np.ndarray)):
        target_data = pd.Series(target)
        # BUG FIX: only pd.Series has a .name attribute; plain lists and
        # np.ndarrays previously raised AttributeError on `target.name`.
        target = getattr(target, 'name', None)

    # First split: carve the combined dev+test share off the training data.
    X_train, X_dev_test, y_train, y_dev_test = train_test_split(data, target_data,
                                                                test_size=dev_size + test_size,
                                                                random_state=random_state,
                                                                stratify=stratify)

    if (dev_size == 0) or (test_size == 0):
        # One of the two holdout sets is empty, so the single split is the result.
        return X_train, X_dev_test, y_train, y_dev_test

    else:
        # BUG FIX: only stratify the second split when stratification was requested.
        # Previously y_dev_test was always passed, which stratified even for
        # stratify=None and crashed on continuous (regression) targets.
        X_dev, X_test, y_dev, y_test = train_test_split(X_dev_test, y_dev_test,
                                                        test_size=test_size / (dev_size + test_size),
                                                        random_state=random_state,
                                                        stratify=y_dev_test if stratify is not None else None)
        return X_train, X_dev, X_test, y_train, y_dev, y_test
151
|
|
|
|