|
1
|
|
|
''' |
|
2
|
|
|
Functions for data preprocessing. |
|
3
|
|
|
|
|
4
|
|
|
:author: Andreas Kanz |
|
5
|
|
|
|
|
6
|
|
|
''' |
|
7
|
|
|
|
|
8
|
|
|
# Imports |
|
9
|
|
|
import numpy as np |
|
10
|
|
|
import pandas as pd |
|
11
|
|
|
|
|
12
|
|
|
from sklearn.model_selection import train_test_split |
|
13
|
|
|
|
|
14
|
|
|
from .describe import corr_mat |
|
15
|
|
|
from .utils import _missing_vals |
|
16
|
|
|
from .utils import _validate_input_int |
|
17
|
|
|
from .utils import _validate_input_range |
|
18
|
|
|
|
|
19
|
|
|
|
|
20
|
|
|
def mv_col_handler(data, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3):
    '''
    Converts columns with a high ratio of missing values into binary features and eventually drops them based on \
    their correlation with other features and the target variable. This function follows a three step process:
    - 1) Identify features with a high ratio of missing values
    - 2) Identify high correlations of these features among themselves and with other features in the dataset.
    - 3) Features with high ratio of missing values and high correlation among each other are dropped unless \
    they correlate reasonably well with the target variable.

    Note: If no target is provided, the process exits after step two and drops columns identified up to this point.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. E.g. label column to generate only the correlations between each feature \
        and the label.

    mv_threshold: float, default 0.1
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are \
        candidates for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified feature with a high mv-ratio \
        is allowed to have with another feature. If this threshold is overstepped, the feature undergoes further \
        analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. feature with a \
        high mv-ratio and high correlation to another existing feature) with the target. If this threshold is not \
        met the feature is ultimately dropped.

    Returns
    -------
    data: Updated Pandas DataFrame
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    '''

    # Validate Inputs
    _validate_input_range(mv_threshold, 'mv_threshold', 0, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()

    # Step 1: features whose missing-value ratio exceeds the threshold
    mv_ratios = _missing_vals(data_local)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    # Binarize the candidates: 1 where a value is present, 0 where it is missing.
    # notna() replaces the previous applymap(...) + fillna(0) round-trip, which
    # relied on the deprecated DataFrame.applymap.
    data_local[cols_mv] = data_local[cols_mv].notna().astype(int)

    # Step 2: among the candidates, flag those highly correlated with any other feature
    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        # nlargest(2)[1] skips the trivial self-correlation of 1.0
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    # Step 3: drop flagged features unless they correlate well with the target
    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        # Coerce target into a Series so corrwith() works for every documented
        # input type. Previously the raw `target` argument was passed through,
        # which raised inside corrwith() for string / list / np.array targets.
        target_series = data[target] if isinstance(target, str) else pd.Series(target)
        for col in high_corr_features:
            if pd.DataFrame(data_local[col]).corrwith(target_series)[0] < corr_thresh_target:
                drop_cols.append(col)
                data = data.drop(columns=[col])

    return data, cols_mv, drop_cols
|
88
|
|
|
|
|
89
|
|
|
|
|
90
|
|
|
def train_dev_test_split(data, target, dev_size=0.1, test_size=0.1, stratify=None, random_state=1234):
    '''
    Split a dataset and a label column into train, dev and test sets.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
    information is used to label the plots.

    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. E.g. label column to generate only the correlations between each feature \
        and the label.

    dev_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the dev \
        split.

    test_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test \
        split.

    stratify: target column, default None
        If not None, data is split in a stratified fashion, using the input as the class labels.

    random_state: integer
        Random_state is the seed used by the random number generator.

    Returns
    -------
    tuple: Tuple containing train-dev-test split of inputs.
    '''

    # Validate Inputs
    _validate_input_range(dev_size, 'dev_size', 0, 1)
    _validate_input_range(test_size, 'test_size', 0, 1)
    _validate_input_int(random_state, 'random_state')

    target_data = []
    if isinstance(target, str):
        # Target given as a column name: separate the label from the features
        target_data = data[target]
        data = data.drop(target, axis=1)

    elif isinstance(target, (list, pd.Series, np.ndarray)):
        # Target given as the label values themselves.
        # NOTE: the previous `target = target.name` raised AttributeError for
        # list / np.ndarray inputs and was dead code (target is never read
        # again) — removed.
        target_data = pd.Series(target)

    # First split: carve off the combined dev+test portion
    X_train, X_dev_test, y_train, y_dev_test = train_test_split(data, target_data,
                                                                test_size=dev_size + test_size,
                                                                random_state=random_state,
                                                                stratify=stratify)

    if (dev_size == 0) or (test_size == 0):
        # One of the two holdout splits is empty — return the two-way split as-is
        return X_train, X_dev_test, y_train, y_dev_test

    # Second split: divide the holdout into dev and test, stratified on its labels
    X_dev, X_test, y_dev, y_test = train_test_split(X_dev_test, y_dev_test,
                                                    test_size=test_size / (dev_size + test_size),
                                                    random_state=random_state,
                                                    stratify=y_dev_test)

    return X_train, X_dev, X_test, y_train, y_dev, y_test
|
151
|
|
|
|