Conditions | 8 |
Total Lines | 58 |
Code Lines | 23 |
Lines | 0 |
Ratio | 0 % |
Changes | 0 |
Small methods make your code easier to understand, especially when each one has a descriptive name. A small method is also usually much easier to name well.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include:
If many parameters/temporary variables are present:
1 | ''' |
||
def _resolve_target_series(data, target):
    '''
    Return `target` as a pd.Series that can be correlated with columns of `data`, or None if no target was given.

    A string is treated as a column label of `data`. A pd.Series is used unchanged (pandas aligns on the index when
    correlating). Any other sequence is assumed to hold exactly one value per row of `data`.
    '''

    if target is None:
        return None
    if isinstance(target, str):
        return data[target]
    if isinstance(target, pd.Series):
        return target
    return pd.Series(target, index=data.index)


def _max_abs_corr_with_others(corrmat, col):
    '''
    Return the largest absolute correlation between `col` and any other column of `corrmat`.

    The self-correlation entry is removed by label instead of relying on it being the largest value, and NaN is
    returned if no other column is left (NaN never exceeds a threshold).
    '''

    return corrmat[col].drop(labels=[col], errors='ignore').abs().max()


def mv_col_handler(data, target=None, mv_threshold=0.25, corr_thresh_features=0.65, corr_thresh_target=0.2):
    '''
    Drops columns with a high ratio of missing values based on correlation with other features and the target variable.

    The correlations are computed on 0/1 "value present" indicators of the candidate columns. The indicators are
    only used internally; columns that are kept are returned with their original values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. A string is interpreted as a column label of data, and that column is never
        a candidate for dropping. If None, every feature that passes the mv_threshold and corr_thresh_features
        checks is dropped, since there is no target correlation that could justify keeping it.

    mv_threshold: float, default 0.25
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are
        candidates for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.65
        Value between 0 <= threshold <= 1. Previously identified features with a high mv-ratio with an absolute
        correlation larger than corr_thresh_features with any other feature undergo further analysis.

    corr_thresh_target: float, default 0.2
        Value between 0 <= threshold <= 1. The remaining features (with a high mv-ratio and high correlation to an
        existing feature) are dropped if their absolute correlation with the target is smaller than
        corr_thresh_target.

    Returns
    -------
    data: Updated Pandas DataFrame
    drop_cols: List of dropped columns
    '''

    # Validate Inputs
    # NOTE(review): _validate_input_range is defined elsewhere; presumably it raises when the value lies outside
    # [lower, upper]. The lower bound is 0 to match the documented range; negative thresholds are meaningless here.
    _validate_input_range(mv_threshold, 'mv_threshold', 0, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1)

    data = pd.DataFrame(data).copy()
    target_series = _resolve_target_series(data, target)

    # NOTE(review): _missing_vals is defined elsewhere; 'mv_cols_ratio' is used as a Series of per-column
    # missing-value ratios indexed by column label.
    mv_ratios = _missing_vals(data)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    if isinstance(target, str):
        # The target column itself must never be analysed as a feature to drop.
        cols_mv = [col for col in cols_mv if col != target]

    if not cols_mv:
        return data, []

    # 1 where a value is present, 0 where it is missing.
    data_mv_binary = data[cols_mv].notna().astype(int)

    # The indicators go into a working copy so the returned frame keeps the original values.
    data_temp = data.copy()
    data_temp[cols_mv] = data_mv_binary

    high_corr_features = []
    corrmat = None
    for col in cols_mv:
        # Recompute the matrix only after a column was removed; otherwise the previous one is still valid.
        if corrmat is None:
            corrmat = corr_mat(data_temp, colored=False)
        if _max_abs_corr_with_others(corrmat, col) > corr_thresh_features:
            high_corr_features.append(col)
            # Remove the flagged column so later candidates are not flagged merely for correlating with it.
            data_temp = data_temp.drop(columns=[col])
            corrmat = None

    if target_series is None:
        drop_cols = list(high_corr_features)
    else:
        # A NaN correlation (e.g. constant indicator) compares False and therefore keeps the column.
        drop_cols = [
            col for col in high_corr_features
            if abs(data_mv_binary[col].corr(target_series)) < corr_thresh_target
        ]

    data = data.drop(columns=drop_cols)

    return data, drop_cols
||
74 |