| Conditions | 8 |
| Total Lines | 68 |
| Code Lines | 24 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 0 | ||
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments inside a method's body, that is usually a good sign that the commented part should be extracted into a new method; the comment itself is a useful starting point for choosing a good name for that new method.
Commonly applied refactorings include:
If many parameters/temporary variables are present:
| 1 | ''' |
||
def mv_col_handler(data, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3):
    '''
    Converts columns with a high ratio of missing values into binary features and eventually drops them based on \
    their correlation with other features and the target variable. This function follows a three step process:
    - 1) Identify features with a high ratio of missing values
    - 2) Identify high correlations of these features among themselves and with other features in the dataset.
    - 3) Features with high ratio of missing values and high correlation among each other are dropped unless \
         they correlate reasonably well with the target variable.

    Note: If no target is provided, the process exits after step two and drops columns identified up to this point.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. A string is looked up as a column of data, a pd.Series is aligned on its \
        index and any other sequence is matched to the rows of data by position.

    mv_threshold: float, default 0.1
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are candidates \
        for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value between 0 <= threshold <= 1. Maximum absolute correlation a previously identified feature with a high \
        mv-ratio is allowed to have with another feature. If this threshold is overstepped, the feature undergoes \
        further analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 <= threshold <= 1. Minimum required absolute correlation of a remaining feature (i.e. feature \
        with a high mv-ratio and high correlation to another existing feature) with the target. If this threshold is \
        not met the feature is ultimately dropped.

    Returns
    -------
    data: Updated Pandas DataFrame
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    '''

    # Validate Inputs
    _validate_input_range(mv_threshold, 'mv_threshold', 0, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1)

    # "data" keeps the original values and is what gets returned; "data_local" is a scratch copy in which the
    # candidate columns are replaced by 0/1 "value present" indicators for the correlation analysis.
    data = pd.DataFrame(data).copy()
    data_local = data.copy()
    mv_ratios = _missing_vals(data_local)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    if cols_mv:
        # 1 where a value is present, 0 where it is missing.
        data_local[cols_mv] = data_local[cols_mv].notna().astype(int)

    # Step 2: flag candidates whose indicator correlates strongly with any remaining column. A flagged column is
    # removed from the working copy so that later candidates are only compared against columns still in play.
    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        # NOTE(review): assumes corr_mat(..., colored=False) returns a square, label-indexed correlation frame.
        corrmat = corr_mat(data_temp, colored=False)
        # Exclude the self-correlation explicitly instead of relying on it being the largest entry; max() of an
        # empty or all-NaN series is NaN, which compares False and leaves the column untouched.
        corr_with_others = abs(corrmat[col]).drop(labels=[col], errors='ignore')
        if corr_with_others.max() > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    # Step 3: without a target every flagged column is dropped; otherwise only those that do not correlate
    # strongly enough (in absolute terms) with the target.
    if target is None:
        drop_cols = list(high_corr_features)
    else:
        if isinstance(target, str):
            target_values = data[target]
        elif isinstance(target, pd.Series):
            target_values = target
        else:
            target_values = pd.Series(target, index=data.index)

        # A NaN correlation compares False, so such columns are kept.
        drop_cols = [
            col for col in high_corr_features
            if abs(data_local[col].corr(target_values)) < corr_thresh_target
        ]

    data = data.drop(columns=drop_cols)

    return data, cols_mv, drop_cols
||
| 88 | |||
| 151 |