| 1 |  |  | ''' | 
            
                                                        
            
                                    
            
            
                | 2 |  |  | Functions for data preprocessing. | 
            
                                                        
            
                                    
            
            
                | 3 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 4 |  |  | :author: Andreas Kanz | 
            
                                                        
            
                                    
            
            
                | 5 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 6 |  |  | ''' | 
            
                                                        
            
                                    
            
            
                | 7 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 8 |  |  | # Imports | 
            
                                                        
            
                                    
            
            
                | 9 |  |  | import numpy as np | 
            
                                                        
            
                                    
            
            
                | 10 |  |  | import pandas as pd | 
            
                                                        
            
                                    
            
            
                | 11 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 12 |  |  | from sklearn.model_selection import train_test_split | 
            
                                                        
            
                                    
            
            
                | 13 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 14 |  |  | from .describe import corr_mat | 
            
                                                        
            
                                    
            
            
                | 15 |  |  | from .utils import _missing_vals | 
            
                                                        
            
                                    
            
            
                | 16 |  |  | from .utils import _validate_input_int | 
            
                                                        
            
                                    
            
            
                | 17 |  |  | from .utils import _validate_input_range | 
            
                                                        
            
                                    
            
            
                | 18 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 19 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 20 |  |  | def mv_col_handler(data, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3): | 
            
                                                        
            
                                    
            
            
                | 21 |  |  |     ''' | 
            
                                                        
            
                                    
            
            
                | 22 |  |  |     Converts columns with a high ratio of missing values into binary features and eventually drops them based on \ | 
            
                                                        
            
                                    
            
            
                | 23 |  |  |     their correlation with other features and the target variable. This function follows a three step process: | 
            
                                                        
            
                                    
            
            
                | 24 |  |  |     - 1) Identify features with a high ratio of missing values | 
            
                                                        
            
                                    
            
            
                | 25 |  |  |     - 2) Identify high correlations of these features among themselves and with other features in the dataset. | 
            
                                                        
            
                                    
            
            
                | 26 |  |  |     - 3) Features with high ratio of missing values and high correlation among each other are dropped unless \ | 
            
                                                        
            
                                    
            
            
                | 27 |  |  |          they correlate reasonably well with the target variable. | 
            
                                                        
            
                                    
            
            
                | 28 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 29 |  |  |     Parameters | 
            
                                                        
            
                                    
            
            
                | 30 |  |  |     ---------- | 
            
                                                        
            
                                    
            
            
                | 31 |  |  |     data: 2D dataset that can be coerced into Pandas DataFrame. | 
            
                                                        
            
                                    
            
            
                | 32 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 33 |  |  |     target: string, list, np.array or pd.Series, default None | 
            
                                                        
            
                                    
            
            
                | 34 |  |  |         Specify target for correlation. E.g. label column to generate only the correlations between each feature \ | 
            
                                                        
            
                                    
            
            
                | 35 |  |  |         and the label. | 
            
                                                        
            
                                    
            
            
                | 36 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 37 |  |  |     mv_threshold: float, default 0.1 | 
            
                                                        
            
                                    
            
            
                | 38 |  |  |         Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are candidates \ | 
            
                                                        
            
                                    
            
            
                | 39 |  |  |         for dropping and undergo further analysis. | 
            
                                                        
            
                                    
            
            
                | 40 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 41 |  |  |     corr_thresh_features: float, default 0.6 | 
            
                                                        
            
                                    
            
            
                | 42 |  |  |         Value between 0 <= threshold <= 1. Maximum correlation a previously identified features with a high mv-ratio is\ | 
            
                                                        
            
                                    
            
            
                | 43 |  |  |          allowed to have with another feature. If this threshold is overstepped, the feature undergoes further analysis. | 
            
                                                        
            
                                    
            
            
                | 44 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 45 |  |  |     corr_thresh_target: float, default 0.3 | 
            
                                                        
            
                                    
            
            
                | 46 |  |  |         Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. feature with a \ | 
            
                                                        
            
                                    
            
            
                | 47 |  |  |         high mv-ratio and high correlation to another existing feature) with the target. If this threshold is not met \ | 
            
                                                        
            
                                    
            
            
                | 48 |  |  |         the feature is ultimately dropped. | 
            
                                                        
            
                                    
            
            
                | 49 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 50 |  |  |     Returns | 
            
                                                        
            
                                    
            
            
                | 51 |  |  |     ------- | 
            
                                                        
            
                                    
            
            
                | 52 |  |  |     data: Updated Pandas DataFrame | 
            
                                                        
            
                                    
            
            
                | 53 |  |  |     drop_cols: List of dropped columns | 
            
                                                        
            
                                    
            
            
                | 54 |  |  |     ''' | 
            
                                                        
            
                                    
            
            
                | 55 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 56 |  |  |     # Validate Inputs | 
            
                                                        
            
                                    
            
            
                | 57 |  |  |     _validate_input_range(mv_threshold, 'mv_threshold', 0, 1) | 
            
                                                        
            
                                    
            
            
                | 58 |  |  |     _validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1) | 
            
                                                        
            
                                    
            
            
                | 59 |  |  |     _validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1) | 
            
                                                        
            
                                    
            
            
                | 60 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 61 |  |  |     data = pd.DataFrame(data).copy() | 
            
                                                        
            
                                    
            
            
                | 62 |  |  |     mv_ratios = _missing_vals(data)['mv_cols_ratio'] | 
            
                                                        
            
                                    
            
            
                | 63 |  |  |     cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist() | 
            
                                                        
            
                                    
            
            
                | 64 |  |  |     data_mv_binary = data[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0) | 
            
                                                        
            
                                    
            
            
                | 65 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 66 |  |  |     for col in cols_mv: | 
            
                                                        
            
                                    
            
            
                | 67 |  |  |         data[col] = data_mv_binary[col] | 
            
                                                        
            
                                    
            
            
                | 68 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 69 |  |  |     high_corr_features = [] | 
            
                                                        
            
                                    
            
            
                | 70 |  |  |     data_temp = data.copy() | 
            
                                                        
            
                                    
            
            
                | 71 |  |  |     for col in cols_mv: | 
            
                                                        
            
                                    
            
            
                | 72 |  |  |         corrmat = corr_mat(data_temp, colored=False) | 
            
                                                        
            
                                    
            
            
                | 73 |  |  |         if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features: | 
            
                                                        
            
                                    
            
            
                | 74 |  |  |             high_corr_features.append(col) | 
            
                                                        
            
                                    
            
            
                | 75 |  |  |             data_temp = data_temp.drop(columns=[col]) | 
            
                                                        
            
                                    
            
            
                | 76 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 77 |  |  |     drop_cols = [] | 
            
                                                        
            
                                    
            
            
                | 78 |  |  |     if target is None: | 
            
                                                        
            
                                    
            
            
                | 79 |  |  |         data = data_temp | 
            
                                                        
            
                                    
            
            
                | 80 |  |  |     else: | 
            
                                                        
            
                                    
            
            
                | 81 |  |  |         for col in high_corr_features: | 
            
                                                        
            
                                    
            
            
                | 82 |  |  |             if pd.DataFrame(data_mv_binary[col]).corrwith(target)[0] < corr_thresh_target: | 
            
                                                        
            
                                    
            
            
                | 83 |  |  |                 drop_cols.append(col) | 
            
                                                        
            
                                    
            
            
                | 84 |  |  |                 data = data.drop(columns=[col]) | 
            
                                                        
            
                                    
            
            
                | 85 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 86 |  |  |     return data, drop_cols | 
            
                                                        
            
                                    
            
            
                | 87 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 88 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 89 |  |  | def train_dev_test_split(data, target, dev_size=0.1, test_size=0.1, stratify=None, random_state=1234): | 
            
                                                        
            
                                    
            
            
                | 90 |  |  |     ''' | 
            
                                                        
            
                                    
            
            
                | 91 |  |  |     Split a dataset and a label column into train, dev and test sets. | 
            
                                                        
            
                                    
            
            
                | 92 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 93 |  |  |     Parameters: | 
            
                                                        
            
                                    
            
            
                | 94 |  |  |     ---------- | 
            
                                                        
            
                                    
            
            
                | 95 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 96 |  |  |     data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \ | 
            
                                                        
            
                                    
            
            
                | 97 |  |  |     information is used to label the plots. | 
            
                                                        
            
                                    
            
            
                | 98 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 99 |  |  |     target: string, list, np.array or pd.Series, default None | 
            
                                                        
            
                                    
            
            
                | 100 |  |  |         Specify target for correlation. E.g. label column to generate only the correlations between each feature \ | 
            
                                                        
            
                                    
            
            
                | 101 |  |  |         and the label. | 
            
                                                        
            
                                    
            
            
                | 102 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 103 |  |  |     dev_size: float, default 0.1 | 
            
                                                        
            
                                    
            
            
                | 104 |  |  |         If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the dev \ | 
            
                                                        
            
                                    
            
            
                | 105 |  |  |         split. | 
            
                                                        
            
                                    
            
            
                | 106 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 107 |  |  |     test_size: float, default 0.1 | 
            
                                                        
            
                                    
            
            
                | 108 |  |  |         If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test \ | 
            
                                                        
            
                                    
            
            
                | 109 |  |  |         split. | 
            
                                                        
            
                                    
            
            
                | 110 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 111 |  |  |     stratify: target column, default None | 
            
                                                        
            
                                    
            
            
                | 112 |  |  |         If not None, data is split in a stratified fashion, using the input as the class labels. | 
            
                                                        
            
                                    
            
            
                | 113 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 114 |  |  |     random_state: integer | 
            
                                                        
            
                                    
            
            
                | 115 |  |  |         Random_state is the seed used by the random number generator. | 
            
                                                        
            
                                    
            
            
                | 116 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 117 |  |  |     Returns | 
            
                                                        
            
                                    
            
            
                | 118 |  |  |     ------- | 
            
                                                        
            
                                    
            
            
                | 119 |  |  |     tuple: Tuple containing train-dev-test split of inputs. | 
            
                                                        
            
                                    
            
            
                | 120 |  |  |     ''' | 
            
                                                        
            
                                    
            
            
                | 121 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 122 |  |  |     # Validate Inputs | 
            
                                                        
            
                                    
            
            
                | 123 |  |  |     _validate_input_int(random_state, 'random_state') | 
            
                                                        
            
                                    
            
            
                | 124 |  |  |     _validate_input_range(dev_size, 'dev_size', 0, 1) | 
            
                                                        
            
                                    
            
            
                | 125 |  |  |     _validate_input_range(test_size, 'test_size', 0, 1) | 
            
                                                        
            
                                    
            
            
                | 126 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 127 |  |  |     target_data = [] | 
            
                                                        
            
                                    
            
            
                | 128 |  |  |     if isinstance(target, str): | 
            
                                                        
            
                                    
            
            
                | 129 |  |  |         target_data = data[target] | 
            
                                                        
            
                                    
            
            
                | 130 |  |  |         data = data.drop(target, axis=1) | 
            
                                                        
            
                                    
            
            
                | 131 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 132 |  |  |     elif isinstance(target, (list, pd.Series, np.ndarray)): | 
            
                                                        
            
                                    
            
            
                | 133 |  |  |         target_data = pd.Series(target) | 
            
                                                        
            
                                    
            
            
                | 134 |  |  |         target = target.name | 
            
                                                        
            
                                    
            
            
                | 135 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 136 |  |  |     X_train, X_dev_test, y_train, y_dev_test = train_test_split(data, target_data, | 
            
                                                        
            
                                    
            
            
                | 137 |  |  |                                                                 test_size=dev_size+test_size, | 
            
                                                        
            
                                    
            
            
                | 138 |  |  |                                                                 random_state=random_state, | 
            
                                                        
            
                                    
            
            
                | 139 |  |  |                                                                 stratify=stratify) | 
            
                                                        
            
                                    
            
            
                | 140 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 141 |  |  |     if (dev_size == 0) or (test_size == 0): | 
            
                                                        
            
                                    
            
            
                | 142 |  |  |         return X_train, X_dev_test, y_train, y_dev_test | 
            
                                                        
            
                                    
            
            
                | 143 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 144 |  |  |     else: | 
            
                                                        
            
                                    
            
            
                | 145 |  |  |         X_dev, X_test, y_dev, y_test = train_test_split(X_dev_test, y_dev_test, | 
            
                                                        
            
                                    
            
            
                | 146 |  |  |                                                         test_size=test_size/(dev_size+test_size), | 
            
                                                        
            
                                    
            
            
                | 147 |  |  |                                                         random_state=random_state, | 
            
                                                        
            
                                    
            
            
                | 148 |  |  |                                                         stratify=y_dev_test) | 
            
                                                        
            
                                    
            
            
                | 149 |  |  |         return X_train, X_dev, X_test, y_train, y_dev, y_test | 
            
                                                        
            
                                    
            
            
                | 150 |  |  |  |