| Conditions | 8 |
| Total Lines | 58 |
| Code Lines | 23 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 0 | ||
Small methods make your code easier to understand, especially when each one has a good name. Moreover, when a method is small, finding a good name for it is usually much easier.
For example, if you find yourself adding comments inside a method's body, that is usually a good sign that the commented part should be extracted into a new method, with the comment serving as a starting point for the new method's name.
Commonly applied refactorings include:
If many parameters/temporary variables are present:
| 1 | ''' |
||
def mv_col_handler(data, target=None, mv_threshold=0.25, corr_thresh_features=0.65, corr_thresh_target=0.2):
    '''
    Drops columns with a high ratio of missing values based on correlation with other features and the target variable.

    The analysis runs on a scratch copy in which every candidate column is replaced by a 0/1 "value present"
    indicator. The returned frame keeps the original values of all columns that are not dropped.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. E.g. label column to generate only the correlations between each feature
        and the label. A string is looked up as a column of data (that column is never a drop candidate); a list
        or array must have one entry per row of data. If None, no target correlation can protect a feature, so
        every feature that passes the first two checks is dropped.

    mv_threshold: float, default 0.25
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are
        candidates for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.65
        Value between 0 <= threshold <= 1. Previously identified features with a high mv-ratio with a correlation
        larger than corr_thresh_features with any other feature undergo further analysis.

    corr_thresh_target: float, default 0.2
        Value between 0 <= threshold <= 1. The remaining features (with a high mv-ratio and high correlation to an
        existing feature) are dropped unless their correlation with the target is larger than corr_thresh_target.

    Returns
    -------
    data: Updated Pandas DataFrame
    drop_cols: List of dropped columns
    '''

    # Validate Inputs
    # NOTE(review): the docstring documents the range [0, 1] but the bounds passed here are -1 and 1.
    # Left unchanged so existing callers are not affected - confirm which range is intended.
    _validate_input_range(mv_threshold, 'mv_threshold', -1, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', -1, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', -1, 1)

    data = pd.DataFrame(data).copy()

    # Bring the target into a form that corrwith() accepts.
    target_col = None
    if target is None:
        target_values = None
    elif isinstance(target, str):
        target_col = target
        target_values = data[target]
    elif isinstance(target, (pd.Series, pd.DataFrame)):
        target_values = target
    else:
        # list / np.array: align with the rows of data so corrwith() pairs values correctly.
        target_values = pd.Series(target, index=data.index)

    mv_ratios = _missing_vals(data)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    if target_col is not None:
        # The label column itself must never be considered for dropping.
        cols_mv = [col for col in cols_mv if col != target_col]

    # 1 where a value is present, 0 where it is missing.
    data_mv_binary = data[cols_mv].notna().astype(int)

    # Scratch copy used only for the correlation analysis; `data` keeps its original values.
    data_work = data.copy()
    for col in cols_mv:
        data_work[col] = data_mv_binary[col]

    high_corr_features = []
    corrmat = None
    for col in cols_mv:
        # The matrix only changes after a column was removed, so recompute it lazily.
        if corrmat is None:
            corrmat = corr_mat(data_work, colored=False)

        strongest = abs(corrmat[col]).nlargest(2)
        # Position 0 is expected to be the column's correlation with itself, position 1 the strongest
        # correlation with another column. Use .iloc: integer column labels would otherwise be
        # interpreted as labels instead of positions.
        if len(strongest) > 1 and strongest.iloc[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_work = data_work.drop(columns=[col])
            corrmat = None

    if target_values is None:
        drop_cols = list(high_corr_features)
    else:
        drop_cols = []
        for col in high_corr_features:
            corr_with_target = pd.DataFrame(data_mv_binary[col]).corrwith(target_values).iloc[0]
            # A NaN correlation compares False and therefore keeps the column.
            if corr_with_target < corr_thresh_target:
                drop_cols.append(col)

    data = data.drop(columns=drop_cols)

    return data, drop_cols
||
| 74 |