|
1
|
|
|
''' |
|
2
|
|
|
Functions for data preprocessing. |
|
3
|
|
|
|
|
4
|
|
|
:author: Andreas Kanz |
|
5
|
|
|
|
|
6
|
|
|
''' |
|
7
|
|
|
|
|
8
|
|
|
# Imports |
|
9
|
|
|
import pandas as pd |
|
10
|
|
|
|
|
11
|
|
|
from .describe import corr_mat |
|
12
|
|
|
from .utils import _missing_vals |
|
13
|
|
|
from .utils import _validate_input_range |
|
14
|
|
|
|
|
15
|
|
|
|
|
16
|
|
|
def mv_col_handler(data, target=None, mv_threshold=0.25, corr_thresh_features=0.65, corr_thresh_target=0.2):
    '''
    Drops columns with a high ratio of missing values based on correlation with other features and the target variable.

    Candidate columns are analysed through a binary "value present" indicator (1 where a value exists, 0 where it is
    missing). The returned DataFrame keeps the original values of every column that is not dropped.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. E.g. label column to generate only the correlations between each feature
        and the label. A string is interpreted as the name of a column in data. If None, no target correlation can
        justify keeping a candidate, so every candidate that is highly correlated with another feature is dropped.

    mv_threshold: float, default 0.25
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are candidates
        for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.65
        Value between 0 <= threshold <= 1. Previously identified features with a high mv-ratio with a correlation
        larger than corr_thresh_features with any other feature undergo further analysis.

    corr_thresh_target: float, default 0.2
        Value between 0 <= threshold <= 1. The remaining features (with a high mv-ratio and high correlation to an
        existing feature) are dropped unless their absolute correlation with the target is at least
        corr_thresh_target.

    Returns
    -------
    data: Updated Pandas DataFrame
    drop_cols: List of dropped columns
    '''

    # Validate Inputs
    # NOTE(review): the lower bound passed here is -1 although the docstring documents 0 <= threshold <= 1;
    # confirm the semantics of _validate_input_range before tightening it.
    _validate_input_range(mv_threshold, 'mv_threshold', -1, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', -1, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', -1, 1)

    data = pd.DataFrame(data).copy()
    mv_ratios = _missing_vals(data)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()

    # Presence indicator for every candidate column: 1.0 where a value exists, 0.0 where it is missing.
    data_mv_binary = data[cols_mv].notna().astype(float)

    # Run the correlation analysis on a separate frame so that the original values of the candidate columns
    # survive in the returned data.
    data_analysis = data.copy()
    for col in cols_mv:
        data_analysis[col] = data_mv_binary[col]

    high_corr_features = []
    data_temp = data_analysis
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        # The largest absolute entry is normally the column's correlation with itself, so the runner-up is its
        # strongest relation to another feature. Positional access (.iloc) keeps this correct for integer labels.
        top_two = corrmat[col].abs().nlargest(2)
        if len(top_two) > 1 and top_two.iloc[1] > corr_thresh_features:
            high_corr_features.append(col)
            # Remove the flagged column so later candidates are only compared against features that remain.
            data_temp = data_temp.drop(columns=[col])

    if target is None:
        # Without a target nothing can rescue a redundant candidate.
        drop_cols = list(high_corr_features)
    else:
        if isinstance(target, str):
            target_values = data[target]
        elif isinstance(target, (pd.Series, pd.DataFrame)):
            target_values = target
        else:
            # Lists / arrays carry no index; align them positionally with the rows of data.
            target_values = pd.Series(target, index=data.index)

        drop_cols = []
        for col in high_corr_features:
            target_corr = data_mv_binary[[col]].corrwith(target_values).iloc[0]
            # A NaN correlation compares False here, so such a column is kept.
            if abs(target_corr) < corr_thresh_target:
                drop_cols.append(col)

    data = data.drop(columns=drop_cols)

    return data, drop_cols
|
74
|
|
|
|