| 
                    1
                 | 
                                    
                                                     | 
                
                 | 
                '''  | 
            
            
                                                        
            
                                    
            
            
                | 
                    2
                 | 
                                    
                                                     | 
                
                 | 
                Functions for data cleaning.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    3
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    4
                 | 
                                    
                                                     | 
                
                 | 
                :author: Andreas Kanz  | 
            
            
                                                        
            
                                    
            
            
                | 
                    5
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    6
                 | 
                                    
                                                     | 
                
                 | 
                '''  | 
            
            
                                                        
            
                                    
            
            
                | 
                    7
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    8
                 | 
                                    
                                                     | 
                
                 | 
                # Imports  | 
            
            
                                                        
            
                                    
            
            
                | 
                    9
                 | 
                                    
                                                     | 
                
                 | 
                import itertools  | 
            
            
                                                        
            
                                    
            
            
                | 
                    10
                 | 
                                    
                                                     | 
                
                 | 
                import numpy as np  | 
            
            
                                                        
            
                                    
            
            
                | 
                    11
                 | 
                                    
                                                     | 
                
                 | 
                import pandas as pd  | 
            
            
                                                        
            
                                    
            
            
                | 
                    12
                 | 
                                    
                                                     | 
                
                 | 
                from sklearn.base import BaseEstimator, TransformerMixin  | 
            
            
                                                        
            
                                    
            
            
                | 
                    13
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    14
                 | 
                                    
                                                     | 
                
                 | 
                from .describe import corr_mat  | 
            
            
                                                        
            
                                    
            
            
                | 
                    15
                 | 
                                    
                                                     | 
                
                 | 
                from .utils import (_diff_report,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    16
                 | 
                                    
                                                     | 
                
                 | 
                                    _drop_duplicates,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    17
                 | 
                                    
                                                     | 
                
                 | 
                                    _missing_vals,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    18
                 | 
                                    
                                                     | 
                
                 | 
                                    _validate_input_bool,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    19
                 | 
                                    
                                                     | 
                
                 | 
                                    _validate_input_range)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    20
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    21
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    22
                 | 
                                    
                                                     | 
                
                 | 
                __all__ = ['convert_datatypes',  | 
            
            
                                                        
            
                                    
            
            
                | 
                    23
                 | 
                                    
                                                     | 
                
                 | 
                           'data_cleaning',  | 
            
            
                                                        
            
                                    
            
            
                | 
                    24
                 | 
                                    
                                                     | 
                
                 | 
                           'drop_missing',  | 
            
            
                                                        
            
                                    
            
            
                | 
                    25
                 | 
                                    
                                                     | 
                
                 | 
                           'mv_col_handling']  | 
            
            
                                                        
            
                                    
            
            
                | 
                    26
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    27
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    28
                 | 
                                    
                                                     | 
                
                 | 
                def optimize_ints(data):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    29
                 | 
                                    
                                                     | 
                
                 | 
                    data = pd.DataFrame(data).copy()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    30
                 | 
                                    
                                                     | 
                
                 | 
                    ints = data.select_dtypes(include=['int64']).columns.tolist()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    31
                 | 
                                    
                                                     | 
                
                 | 
                    data[ints] = data[ints].apply(pd.to_numeric, downcast='integer')  | 
            
            
                                                        
            
                                    
            
            
                | 
                    32
                 | 
                                    
                                                     | 
                
                 | 
                    return data  | 
            
            
                                                        
            
                                    
            
            
                | 
                    33
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    34
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    35
                 | 
                                    
                                                     | 
                
                 | 
                def optimize_floats(data):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    36
                 | 
                                    
                                                     | 
                
                 | 
                    data = pd.DataFrame(data).copy()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    37
                 | 
                                    
                                                     | 
                
                 | 
                    floats = data.select_dtypes(include=['float64']).columns.tolist()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    38
                 | 
                                    
                                                     | 
                
                 | 
                    data[floats] = data[floats].apply(pd.to_numeric, downcast='float')  | 
            
            
                                                        
            
                                    
            
            
                | 
                    39
                 | 
                                    
                                                     | 
                
                 | 
                    return data  | 
            
            
                                                        
            
                                    
            
            
                | 
                    40
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    41
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    42
                 | 
                                    
                                                     | 
                
                 | 
                def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=None):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    43
                 | 
                                    
                                                     | 
                
                 | 
                    '''  | 
            
            
                                                        
            
                                    
            
            
                | 
                    44
                 | 
                                    
                                                     | 
                
                 | 
                    Converts columns to best possible dtypes using dtypes supporting pd.NA. Temporarily not converting to integers \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    45
                 | 
                                    
                                                     | 
                
                 | 
                        due to an issue in pandas. This is expected to be fixed in pandas 1.1. \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    46
                 | 
                                    
                                                     | 
                
                 | 
                        See https://github.com/pandas-dev/pandas/issues/33803  | 
            
            
                                                        
            
                                    
            
            
                | 
                    47
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    48
                 | 
                                    
                                                     | 
                
                 | 
                    Parameters  | 
            
            
                                                        
            
                                    
            
            
                | 
                    49
                 | 
                                    
                                                     | 
                
                 | 
                    ----------  | 
            
            
                                                        
            
                                    
            
            
                | 
                    50
                 | 
                                    
                                                     | 
                
                 | 
                    data: 2D dataset that can be coerced into Pandas DataFrame.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    51
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    52
                 | 
                                    
                                                     | 
                
                 | 
                    category: bool, default True  | 
            
            
                                                        
            
                                    
            
            
                | 
                    53
                 | 
                                    
                                                     | 
                
                 | 
                        Change dtypes of columns with dtype "object" to "category". Set threshold using cat_threshold or exclude \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    54
                 | 
                                    
                                                     | 
                
                 | 
                        columns using cat_exclude.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    55
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    56
                 | 
                                    
                                                     | 
                
                 | 
                    cat_threshold: float, default 0.05  | 
            
            
                                                        
            
                                    
            
            
                | 
                    57
                 | 
                                    
                                                     | 
                
                 | 
                        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    58
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    59
                 | 
                                    
                                                     | 
                
                 | 
                    cat_exclude: list, default None  | 
            
            
                                                        
            
                                    
            
            
                | 
                    60
                 | 
                                    
                                                     | 
                
                 | 
                        List of columns to exclude from categorical conversion.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    61
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    62
                 | 
                                    
                                                     | 
                
                 | 
                    Returns  | 
            
            
                                                        
            
                                    
            
            
                | 
                    63
                 | 
                                    
                                                     | 
                
                 | 
                    -------  | 
            
            
                                                        
            
                                    
            
            
                | 
                    64
                 | 
                                    
                                                     | 
                
                 | 
                    data: Pandas DataFrame  | 
            
            
                                                        
            
                                    
            
            
                | 
                    65
                 | 
                                    
                                                     | 
                
                 | 
                    '''  | 
            
            
                                                        
            
                                    
            
            
                | 
                    66
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    67
                 | 
                                    
                                                     | 
                
                 | 
                    # Validate Inputs  | 
            
            
                                                        
            
                                    
            
            
                | 
                    68
                 | 
                                    
                                                     | 
                
                 | 
                    _validate_input_bool(category, 'Category')  | 
            
            
                                                        
            
                                    
            
            
                | 
                    69
                 | 
                                    
                                                     | 
                
                 | 
                    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    70
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    71
                 | 
                                    
                                                     | 
                
                 | 
                    cat_exclude = [] if cat_exclude is None else cat_exclude.copy()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    72
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    73
                 | 
                                    
                                                     | 
                
                 | 
                    data = pd.DataFrame(data).copy()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    74
                 | 
                                    
                                                     | 
                
                 | 
                    for col in data.columns:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    75
                 | 
                                    
                                                     | 
                
                 | 
                        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]  | 
            
            
                                                        
            
                                    
            
            
                | 
                    76
                 | 
                                    
                                                     | 
                
                 | 
                        if (category and  | 
            
            
                                                        
            
                                    
            
            
                | 
                    77
                 | 
                                    
                                                     | 
                
                 | 
                            unique_vals_ratio < cat_threshold and  | 
            
            
                                                        
            
                                    
            
            
                | 
                    78
                 | 
                                    
                                                     | 
                
                 | 
                            col not in cat_exclude and  | 
            
            
                                                        
            
                                    
            
            
                | 
                    79
                 | 
                                    
                                                     | 
                
                 | 
                                data[col].dtype == 'object'):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    80
                 | 
                                    
                                                     | 
                
                 | 
                            data[col] = data[col].astype('category') | 
            
            
                                                        
            
                                    
            
            
                | 
                    81
                 | 
                                    
                                                     | 
                
                 | 
                        data[col] = data[col].convert_dtypes(infer_objects=True, convert_string=True,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    82
                 | 
                                    
                                                     | 
                
                 | 
                                                             convert_integer=False, convert_boolean=True)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    83
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    84
                 | 
                                    
                                                     | 
                
                 | 
                    data = optimize_ints(data)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    85
                 | 
                                    
                                                     | 
                
                 | 
                    data = optimize_floats(data)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    86
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    87
                 | 
                                    
                                                     | 
                
                 | 
                    return data  | 
            
            
                                                        
            
                                    
            
            
                | 
                    88
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    89
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    90
                 | 
                                    
                                                     | 
                
                 | 
                def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    91
                 | 
                                    
                                                     | 
                
                 | 
                    '''  | 
            
            
                                                        
            
                                    
            
            
                | 
                    92
                 | 
                                    
                                                     | 
                
                 | 
                    Drops completely empty columns and rows by default and optionally provides flexibility to loosen restrictions to \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    93
                 | 
                                    
                                                     | 
                
                 | 
                    drop additional columns and rows based on the fraction of remaining NA-values.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    94
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    95
                 | 
                                    
                                                     | 
                
                 | 
                    Parameters  | 
            
            
                                                        
            
                                    
            
            
                | 
                    96
                 | 
                                    
                                                     | 
                
                 | 
                    ----------  | 
            
            
                                                        
            
                                    
            
            
                | 
                    97
                 | 
                                    
                                                     | 
                
                 | 
                    data: 2D dataset that can be coerced into Pandas DataFrame.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    98
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    99
                 | 
                                    
                                                     | 
                
                 | 
                    drop_threshold_cols: float, default 1  | 
            
            
                                                        
            
                                    
            
            
                | 
                    100
                 | 
                                    
                                                     | 
                
                 | 
                        Drop columns with NA-ratio above the specified threshold.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    101
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    102
                 | 
                                    
                                                     | 
                
                 | 
                    drop_threshold_rows: float, default 1  | 
            
            
                                                        
            
                                    
            
            
                | 
                    103
                 | 
                                    
                                                     | 
                
                 | 
                        Drop rows with NA-ratio above the specified threshold.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    104
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    105
                 | 
                                    
                                                     | 
                
                 | 
                    Returns  | 
            
            
                                                        
            
                                    
            
            
                | 
                    106
                 | 
                                    
                                                     | 
                
                 | 
                    -------  | 
            
            
                                                        
            
                                    
            
            
                | 
                    107
                 | 
                                    
                                                     | 
                
                 | 
                    data_cleaned: Pandas DataFrame  | 
            
            
                                                        
            
                                    
            
            
                | 
                    108
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    109
                 | 
                                    
                                                     | 
                
                 | 
                    Notes  | 
            
            
                                                        
            
                                    
            
            
                | 
                    110
                 | 
                                    
                                                     | 
                
                 | 
                    -----  | 
            
            
                                                        
            
                                    
            
            
                | 
                    111
                 | 
                                    
                                                     | 
                
                 | 
                    Columns are dropped first. Rows are dropped based on the remaining data.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    112
                 | 
                                    
                                                     | 
                
                 | 
                    '''  | 
            
            
                                                        
            
                                    
            
            
                | 
                    113
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    114
                 | 
                                    
                                                     | 
                
                 | 
                    # Validate Inputs  | 
            
            
                                                        
            
                                    
            
            
                | 
                    115
                 | 
                                    
                                                     | 
                
                 | 
                    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    116
                 | 
                                    
                                                     | 
                
                 | 
                    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    117
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    118
                 | 
                                    
                                                     | 
                
                 | 
                    data = pd.DataFrame(data).copy()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    119
                 | 
                                    
                                                     | 
                
                 | 
                    data = data.dropna(axis=0, how='all').dropna(axis=1, how='all')  | 
            
            
                                                        
            
                                    
            
            
                | 
                    120
                 | 
                                    
                                                     | 
                
                 | 
                    data = data.drop(columns=data.loc[:, _missing_vals(data)['mv_cols_ratio'] > drop_threshold_cols].columns)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    121
                 | 
                                    
                                                     | 
                
                 | 
                    data_cleaned = data.drop(index=data.loc[_missing_vals(data)['mv_rows_ratio'] > drop_threshold_rows, :].index)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    122
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    123
                 | 
                                    
                                                     | 
                
                 | 
                    return data_cleaned  | 
            
            
                                                        
            
                                    
            
            
                | 
                    124
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    125
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    126
                 | 
                                    
                                                     | 
                
                 | 
                def data_cleaning(data, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    127
                 | 
                                    
                                                     | 
                
                 | 
                                  convert_dtypes=True, category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    128
                 | 
                                    
                                                     | 
                
                 | 
                    '''  | 
            
            
                                                        
            
                                    
            
            
                | 
                    129
                 | 
                                    
                                                     | 
                
                 | 
                    Perform initial data cleaning tasks on a dataset, such as dropping single valued and empty rows, empty \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    130
                 | 
                                    
                                                     | 
                
                 | 
                        columns as well as optimizing the datatypes.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    131
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    132
                 | 
                                    
                                                     | 
                
                 | 
                    Parameters  | 
            
            
                                                        
            
                                    
            
            
                | 
                    133
                 | 
                                    
                                                     | 
                
                 | 
                    ----------  | 
            
            
                                                        
            
                                    
            
            
                | 
                    134
                 | 
                                    
                                                     | 
                
                 | 
                    data: 2D dataset that can be coerced into Pandas DataFrame.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    135
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    136
                 | 
                                    
                                                     | 
                
                 | 
                    drop_threshold_cols: float, default 0.9  | 
            
            
                                                        
            
                                    
            
            
                | 
                    137
                 | 
                                    
                                                     | 
                
                 | 
                        Drop columns with NA-ratio above the specified threshold.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    138
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    139
                 | 
                                    
                                                     | 
                
                 | 
                    drop_threshold_rows: float, default 0.9  | 
            
            
                                                        
            
                                    
            
            
                | 
                    140
                 | 
                                    
                                                     | 
                
                 | 
                        Drop rows with NA-ratio above the specified threshold.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    141
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    142
                 | 
                                    
                                                     | 
                
                 | 
                    drop_duplicates: bool, default True  | 
            
            
                                                        
            
                                    
            
            
                | 
                    143
                 | 
                                    
                                                     | 
                
                 | 
                        Drop duplicate rows, keeping the first occurence. This step comes after the dropping of missing values.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    144
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    145
                 | 
                                    
                                                     | 
                
                 | 
                    convert_dtypes: bool, default True  | 
            
            
                                                        
            
                                    
            
            
                | 
                    146
                 | 
                                    
                                                     | 
                
                 | 
                        Convert dtypes using pd.convert_dtypes().  | 
            
            
                                                        
            
                                    
            
            
                | 
                    147
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    148
                 | 
                                    
                                                     | 
                
                 | 
                    category: bool, default True  | 
            
            
                                                        
            
                                    
            
            
                | 
                    149
                 | 
                                    
                                                     | 
                
                 | 
                        Enable changing dtypes of 'object' columns to "category". Set threshold using cat_threshold. Requires \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    150
                 | 
                                    
                                                     | 
                
                 | 
                        convert_dtypes=True.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    151
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    152
                 | 
                                    
                                                     | 
                
                 | 
                    cat_threshold: float, default 0.03  | 
            
            
                                                        
            
                                    
            
            
                | 
                    153
                 | 
                                    
                                                     | 
                
                 | 
                        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    154
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    155
                 | 
                                    
                                                     | 
                
                 | 
                    cat_exclude: list, default None  | 
            
            
                                                        
            
                                    
            
            
                | 
                    156
                 | 
                                    
                                                     | 
                
                 | 
                        List of columns to exclude from categorical conversion.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    157
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    158
                 | 
                                    
                                                     | 
                
                 | 
                    show: {'all', 'changes', None} default 'all' | 
            
            
                                                        
            
                                    
            
            
                | 
                    159
                 | 
                                    
                                                     | 
                
                 | 
                        Specify verbosity of the output.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    160
                 | 
                                    
                                                     | 
                
                 | 
                        * 'all': Print information about the data before and after cleaning as well as information about changes.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    161
                 | 
                                    
                                                     | 
                
                 | 
                        * 'changes': Print out differences in the data before and after cleaning.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    162
                 | 
                                    
                                                     | 
                
                 | 
                        * None: No information about the data and the data cleaning is printed.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    163
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    164
                 | 
                                    
                                                     | 
                
                 | 
                    Returns  | 
            
            
                                                        
            
                                    
            
            
                | 
                    165
                 | 
                                    
                                                     | 
                
                 | 
                    -------  | 
            
            
                                                        
            
                                    
            
            
                | 
                    166
                 | 
                                    
                                                     | 
                
                 | 
                    data_cleaned: Pandas DataFrame  | 
            
            
                                                        
            
                                    
            
            
                | 
                    167
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    168
                 | 
                                    
                                                     | 
                
                 | 
                    See Also  | 
            
            
                                                        
            
                                    
            
            
                | 
                    169
                 | 
                                    
                                                     | 
                
                 | 
                    --------  | 
            
            
                                                        
            
                                    
            
            
                | 
                    170
                 | 
                                    
                                                     | 
                
                 | 
                    convert_datatypes: Convert columns to best possible dtypes.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    171
                 | 
                                    
                                                     | 
                
                 | 
                    drop_missing : Flexibly drop columns and rows.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    172
                 | 
                                    
                                                     | 
                
                 | 
                    _memory_usage: Gives the total memory usage in kilobytes.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    173
                 | 
                                    
                                                     | 
                
                 | 
                    _missing_vals: Metrics about missing values in the dataset.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    174
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    175
                 | 
                                    
                                                     | 
                
                 | 
                    Notes  | 
            
            
                                                        
            
                                    
            
            
                | 
                    176
                 | 
                                    
                                                     | 
                
                 | 
                    -----  | 
            
            
                                                        
            
                                    
            
            
                | 
                    177
                 | 
                                    
                                                     | 
                
                 | 
                    The category dtype is not grouped in the summary, unless it contains exactly the same categories.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    178
                 | 
                                    
                                                     | 
                
                 | 
                    '''  | 
            
            
                                                        
            
                                    
            
            
                | 
                    179
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    180
                 | 
                                    
                                                     | 
                
                 | 
                    # Validate Inputs  | 
            
            
                                                        
            
                                    
            
            
                | 
                    181
                 | 
                                    
                                                     | 
                
                 | 
                    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    182
                 | 
                                    
                                                     | 
                
                 | 
                    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    183
                 | 
                                    
                                                     | 
                
                 | 
                    _validate_input_bool(drop_duplicates, 'drop_duplicates')  | 
            
            
                                                        
            
                                    
            
            
                | 
                    184
                 | 
                                    
                                                     | 
                
                 | 
                    _validate_input_bool(convert_dtypes, 'convert_datatypes')  | 
            
            
                                                        
            
                                    
            
            
                | 
                    185
                 | 
                                    
                                                     | 
                
                 | 
                    _validate_input_bool(category, 'category')  | 
            
            
                                                        
            
                                    
            
            
                | 
                    186
                 | 
                                    
                                                     | 
                
                 | 
                    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    187
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    188
                 | 
                                    
                                                     | 
                
                 | 
                    data = pd.DataFrame(data).copy()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    189
                 | 
                                    
                                                     | 
                
                 | 
                    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    190
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    191
                 | 
                                    
                                                     | 
                
                 | 
                    single_val_cols = data_cleaned.columns[data_cleaned.nunique(dropna=False) == 1].tolist()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    192
                 | 
                                    
                                                     | 
                
                 | 
                    data_cleaned = data_cleaned.drop(columns=single_val_cols)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    193
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    194
                 | 
                                    
                                                     | 
                
                 | 
                    dupl_rows = None  | 
            
            
                                                        
            
                                    
            
            
                | 
                    195
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    196
                 | 
                                    
                                                     | 
                
                 | 
                    if drop_duplicates:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    197
                 | 
                                    
                                                     | 
                
                 | 
                        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    198
                 | 
                                    
                                                     | 
                
                 | 
                    if convert_dtypes:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    199
                 | 
                                    
                                                     | 
                
                 | 
                        data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    200
                 | 
                                    
                                                     | 
                
                 | 
                                                         cat_exclude=cat_exclude)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    201
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    202
                 | 
                                    
                                                     | 
                
                 | 
                    _diff_report(data, data_cleaned, dupl_rows=dupl_rows, single_val_cols=single_val_cols, show=show)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    203
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    204
                 | 
                                    
                                                     | 
                
                 | 
                    return data_cleaned  | 
            
            
                                                        
            
                                    
            
            
                | 
                    205
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    206
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    207
                 | 
                                    
                                                     | 
                
                 | 
                class DataCleaner(BaseEstimator, TransformerMixin):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    208
                 | 
                                    
                                                     | 
                
                 | 
                    '''  | 
            
            
                                                        
            
                                    
            
            
                | 
                    209
                 | 
                                    
                                                     | 
                
                 | 
                    Wrapper for data_cleaning(). Allows data_cleaning() to be put into a pipeline with similar \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    210
                 | 
                                    
                                                     | 
                
                 | 
                    functions (e.g. using MVColHandler() or SubsetPooler()).  | 
            
            
                                                        
            
                                    
            
            
                | 
                    211
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    212
                 | 
                                    
                                                     | 
                
                 | 
                    Parameters:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    213
                 | 
                                    
                                                     | 
                
                 | 
                    ---------´  | 
            
            
                                                        
            
                                    
            
            
                | 
                    214
                 | 
                                    
                                                     | 
                
                 | 
                    drop_threshold_cols: float, default 0.9  | 
            
            
                                                        
            
                                    
            
            
                | 
                    215
                 | 
                                    
                                                     | 
                
                 | 
                        Drop columns with NA-ratio above the specified threshold.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    216
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    217
                 | 
                                    
                                                     | 
                
                 | 
                    drop_threshold_rows: float, default 0.9  | 
            
            
                                                        
            
                                    
            
            
                | 
                    218
                 | 
                                    
                                                     | 
                
                 | 
                        Drop rows with NA-ratio above the specified threshold.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    219
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    220
                 | 
                                    
                                                     | 
                
                 | 
                    drop_duplicates: bool, default True  | 
            
            
                                                        
            
                                    
            
            
                | 
                    221
                 | 
                                    
                                                     | 
                
                 | 
                        Drop duplicate rows, keeping the first occurence. This step comes after the dropping of missing values.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    222
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    223
                 | 
                                    
                                                     | 
                
                 | 
                    convert_dtypes: bool, default True  | 
            
            
                                                        
            
                                    
            
            
                | 
                    224
                 | 
                                    
                                                     | 
                
                 | 
                        Convert dtypes using pd.convert_dtypes().  | 
            
            
                                                        
            
                                    
            
            
                | 
                    225
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    226
                 | 
                                    
                                                     | 
                
                 | 
                    category: bool, default True  | 
            
            
                                                        
            
                                    
            
            
                | 
                    227
                 | 
                                    
                                                     | 
                
                 | 
                        Change dtypes of columns to "category". Set threshold using cat_threshold. Requires convert_dtypes=True  | 
            
            
                                                        
            
                                    
            
            
                | 
                    228
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    229
                 | 
                                    
                                                     | 
                
                 | 
                    cat_threshold: float, default 0.03  | 
            
            
                                                        
            
                                    
            
            
                | 
                    230
                 | 
                                    
                                                     | 
                
                 | 
                        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    231
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    232
                 | 
                                    
                                                     | 
                
                 | 
                    cat_exclude: list, default None  | 
            
            
                                                        
            
                                    
            
            
                | 
                    233
                 | 
                                    
                                                     | 
                
                 | 
                        List of columns to exclude from categorical conversion.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    234
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    235
                 | 
                                    
                                                     | 
                
                 | 
                    show: {'all', 'changes', None} default 'all' | 
            
            
                                                        
            
                                    
            
            
                | 
                    236
                 | 
                                    
                                                     | 
                
                 | 
                        Specify verbosity of the output.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    237
                 | 
                                    
                                                     | 
                
                 | 
                        * 'all': Print information about the data before and after cleaning as well as information about changes.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    238
                 | 
                                    
                                                     | 
                
                 | 
                        * 'changes': Print out differences in the data before and after cleaning.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    239
                 | 
                                    
                                                     | 
                
                 | 
                        * None: No information about the data and the data cleaning is printed.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    240
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    241
                 | 
                                    
                                                     | 
                
                 | 
                    Returns:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    242
                 | 
                                    
                                                     | 
                
                 | 
                    -------  | 
            
            
                                                        
            
                                    
            
            
                | 
                    243
                 | 
                                    
                                                     | 
                
                 | 
                    data_cleaned: Pandas DataFrame  | 
            
            
                                                        
            
                                    
            
            
                | 
                    244
                 | 
                                    
                                                     | 
                
                 | 
                    '''  | 
            
            
                                                        
            
                                    
            
            
                | 
                    245
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    246
                 | 
                                    
                                                     | 
                
                 | 
                    def __init__(self, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True, convert_dtypes=True,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    247
                 | 
                                    
                                                     | 
                
                 | 
                                 category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    248
                 | 
                                    
                                                     | 
                
                 | 
                        self.drop_threshold_cols = drop_threshold_cols  | 
            
            
                                                        
            
                                    
            
            
                | 
                    249
                 | 
                                    
                                                     | 
                
                 | 
                        self.drop_threshold_rows = drop_threshold_rows  | 
            
            
                                                        
            
                                    
            
            
                | 
                    250
                 | 
                                    
                                                     | 
                
                 | 
                        self.drop_duplicates = drop_duplicates  | 
            
            
                                                        
            
                                    
            
            
                | 
                    251
                 | 
                                    
                                                     | 
                
                 | 
                        self.convert_dtypes = convert_dtypes  | 
            
            
                                                        
            
                                    
            
            
                | 
                    252
                 | 
                                    
                                                     | 
                
                 | 
                        self.category = category  | 
            
            
                                                        
            
                                    
            
            
                | 
                    253
                 | 
                                    
                                                     | 
                
                 | 
                        self.cat_threshold = cat_threshold  | 
            
            
                                                        
            
                                    
            
            
                | 
                    254
                 | 
                                    
                                                     | 
                
                 | 
                        self.cat_exclude = cat_exclude  | 
            
            
                                                        
            
                                    
            
            
                | 
                    255
                 | 
                                    
                                                     | 
                
                 | 
                        self.show = show  | 
            
            
                                                        
            
                                    
            
            
                | 
                    256
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    257
                 | 
                                    
                                                     | 
                
                 | 
                    def fit(self, data, target=None):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    258
                 | 
                                    
                                                     | 
                
                 | 
                        return self  | 
            
            
                                                        
            
                                    
            
            
                | 
                    259
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    260
                 | 
                                    
                                                     | 
                
                 | 
                    def transform(self, data, target=None):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    261
                 | 
                                    
                                                     | 
                
                 | 
                        data_cleaned = data_cleaning(data, drop_threshold_cols=self.drop_threshold_cols,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    262
                 | 
                                    
                                                     | 
                
                 | 
                                                     drop_threshold_rows=self.drop_threshold_rows, drop_duplicates=self.drop_duplicates,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    263
                 | 
                                    
                                                     | 
                
                 | 
                                                     convert_dtypes=self.convert_dtypes, category=self.category, cat_threshold=self.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    264
                 | 
                                    
                                                     | 
                
                 | 
                                                     cat_threshold, cat_exclude=self.cat_exclude, show=self.show)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    265
                 | 
                                    
                                                     | 
                
                 | 
                        return data_cleaned  | 
            
            
                                                        
            
                                    
            
            
                | 
                    266
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    267
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    268
                 | 
                                    
                                                     | 
                
                 | 
                def mv_col_handling(data, target=None, mv_threshold=0.1, corr_thresh_features=0.5, corr_thresh_target=0.3,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    269
                 | 
                                    
                                                     | 
                
                 | 
                                    return_details=False):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    270
                 | 
                                    
                                                     | 
                
                 | 
                    '''  | 
            
            
                                                        
            
                                    
            
            
                | 
                    271
                 | 
                                    
                                                     | 
                
                 | 
                    Converts columns with a high ratio of missing values into binary features and eventually drops them based on \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    272
                 | 
                                    
                                                     | 
                
                 | 
                    their correlation with other features and the target variable. This function follows a three step process:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    273
                 | 
                                    
                                                     | 
                
                 | 
                    - 1) Identify features with a high ratio of missing values (above 'mv_threshold').  | 
            
            
                                                        
            
                                    
            
            
                | 
                    274
                 | 
                                    
                                                     | 
                
                 | 
                    - 2) Identify high correlations of these features among themselves and with other features in the dataset (above \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    275
                 | 
                                    
                                                     | 
                
                 | 
                         'corr_thresh_features').  | 
            
            
                                                        
            
                                    
            
            
                | 
                    276
                 | 
                                    
                                                     | 
                
                 | 
                    - 3) Features with high ratio of missing values and high correlation among each other are dropped unless \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    277
                 | 
                                    
                                                     | 
                
                 | 
                         they correlate reasonably well with the target variable (above 'corr_thresh_target').  | 
            
            
                                                        
            
                                    
            
            
                | 
                    278
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    279
                 | 
                                    
                                                     | 
                
                 | 
                    Note: If no target is provided, the process exits after step two and drops columns identified up to this point.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    280
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    281
                 | 
                                    
                                                     | 
                
                 | 
                    Parameters  | 
            
            
                                                        
            
                                    
            
            
                | 
                    282
                 | 
                                    
                                                     | 
                
                 | 
                    ----------  | 
            
            
                                                        
            
                                    
            
            
                | 
                    283
                 | 
                                    
                                                     | 
                
                 | 
                    data: 2D dataset that can be coerced into Pandas DataFrame.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    284
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    285
                 | 
                                    
                                                     | 
                
                 | 
                    target: string, list, np.array or pd.Series, default None  | 
            
            
                                                        
            
                                    
            
            
                | 
                    286
                 | 
                                    
                                                     | 
                
                 | 
                        Specify target for correlation. I.e. label column to generate only the correlations between each feature \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    287
                 | 
                                    
                                                     | 
                
                 | 
                        and the label.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    288
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    289
                 | 
                                    
                                                     | 
                
                 | 
                    mv_threshold: float, default 0.1  | 
            
            
                                                        
            
                                    
            
            
                | 
                    290
                 | 
                                    
                                                     | 
                
                 | 
                        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are candidates \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    291
                 | 
                                    
                                                     | 
                
                 | 
                        for dropping and undergo further analysis.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    292
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    293
                 | 
                                    
                                                     | 
                
                 | 
                    corr_thresh_features: float, default 0.5  | 
            
            
                                                        
            
                                    
            
            
                | 
                    294
                 | 
                                    
                                                     | 
                
                 | 
                        Value between 0 <= threshold <= 1. Maximum correlation a previously identified features (with a high mv-ratio) \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    295
                 | 
                                    
                                                     | 
                
                 | 
                        is allowed to have with another feature. If this threshold is overstepped, the feature undergoes further \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    296
                 | 
                                    
                                                     | 
                
                 | 
                        analysis.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    297
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    298
                 | 
                                    
                                                     | 
                
                 | 
                    corr_thresh_target: float, default 0.3  | 
            
            
                                                        
            
                                    
            
            
                | 
                    299
                 | 
                                    
                                                     | 
                
                 | 
                        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. feature with a \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    300
                 | 
                                    
                                                     | 
                
                 | 
                        high mv-ratio and high correlation to another existing feature) with the target. If this threshold is not met \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    301
                 | 
                                    
                                                     | 
                
                 | 
                        the feature is ultimately dropped.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    302
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    303
                 | 
                                    
                                                     | 
                
                 | 
                    return_details: bool, default False  | 
            
            
                                                        
            
                                    
            
            
                | 
                    304
                 | 
                                    
                                                     | 
                
                 | 
                        Provdies flexibility to return intermediary results.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    305
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    306
                 | 
                                    
                                                     | 
                
                 | 
                    Returns  | 
            
            
                                                        
            
                                    
            
            
                | 
                    307
                 | 
                                    
                                                     | 
                
                 | 
                    -------  | 
            
            
                                                        
            
                                    
            
            
                | 
                    308
                 | 
                                    
                                                     | 
                
                 | 
                    data: Updated Pandas DataFrame  | 
            
            
                                                        
            
                                    
            
            
                | 
                    309
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    310
                 | 
                                    
                                                     | 
                
                 | 
                    optional:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    311
                 | 
                                    
                                                     | 
                
                 | 
                    cols_mv: Columns with missing values included in the analysis  | 
            
            
                                                        
            
                                    
            
            
                | 
                    312
                 | 
                                    
                                                     | 
                
                 | 
                    drop_cols: List of dropped columns  | 
            
            
                                                        
            
                                    
            
            
                | 
                    313
                 | 
                                    
                                                     | 
                
                 | 
                    '''  | 
            
            
                                                        
            
                                    
            
            
                | 
                    314
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    315
                 | 
                                    
                                                     | 
                
                 | 
                    # Validate Inputs  | 
            
            
                                                        
            
                                    
            
            
                | 
                    316
                 | 
                                    
                                                     | 
                
                 | 
                    _validate_input_range(mv_threshold, 'mv_threshold', 0, 1)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    317
                 | 
                                    
                                                     | 
                
                 | 
                    _validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    318
                 | 
                                    
                                                     | 
                
                 | 
                    _validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    319
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    320
                 | 
                                    
                                                     | 
                
                 | 
                    data = pd.DataFrame(data).copy()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    321
                 | 
                                    
                                                     | 
                
                 | 
                    data_local = data.copy()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    322
                 | 
                                    
                                                     | 
                
                 | 
                    mv_ratios = _missing_vals(data_local)['mv_cols_ratio']  | 
            
            
                                                        
            
                                    
            
            
                | 
                    323
                 | 
                                    
                                                     | 
                
                 | 
                    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    324
                 | 
                                    
                                                     | 
                
                 | 
                    data_local[cols_mv] = data_local[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    325
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    326
                 | 
                                    
                                                     | 
                
                 | 
                    high_corr_features = []  | 
            
            
                                                        
            
                                    
            
            
                | 
                    327
                 | 
                                    
                                                     | 
                
                 | 
                    data_temp = data_local.copy()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    328
                 | 
                                    
                                                     | 
                
                 | 
                    for col in cols_mv:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    329
                 | 
                                    
                                                     | 
                
                 | 
                        corrmat = corr_mat(data_temp, colored=False)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    330
                 | 
                                    
                                                     | 
                
                 | 
                        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    331
                 | 
                                    
                                                     | 
                
                 | 
                            high_corr_features.append(col)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    332
                 | 
                                    
                                                     | 
                
                 | 
                            data_temp = data_temp.drop(columns=[col])  | 
            
            
                                                        
            
                                    
            
            
                | 
                    333
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    334
                 | 
                                    
                                                     | 
                
                 | 
                    drop_cols = []  | 
            
            
                                                        
            
                                    
            
            
                | 
                    335
                 | 
                                    
                                                     | 
                
                 | 
                    if target is None:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    336
                 | 
                                    
                                                     | 
                
                 | 
                        data = data.drop(columns=high_corr_features)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    337
                 | 
                                    
                                                     | 
                
                 | 
                    else:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    338
                 | 
                                    
                                                     | 
                
                 | 
                        corrs = corr_mat(data_local, target=target, colored=False).loc[high_corr_features]  | 
            
            
                                                        
            
                                    
            
            
                | 
                    339
                 | 
                                    
                                                     | 
                
                 | 
                        drop_cols = corrs.loc[abs(corrs.iloc[:, 0]) < corr_thresh_target].index.tolist()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    340
                 | 
                                    
                                                     | 
                
                 | 
                        data = data.drop(columns=drop_cols)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    341
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    342
                 | 
                                    
                                                     | 
                
                 | 
                    if return_details:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    343
                 | 
                                    
                                                     | 
                
                 | 
                        return data, cols_mv, drop_cols  | 
            
            
                                                        
            
                                    
            
            
                | 
                    344
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    345
                 | 
                                    
                                                     | 
                
                 | 
                    return data  | 
            
            
                                                        
            
                                    
            
            
                | 
                    346
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    347
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    348
                 | 
                                    
                                                     | 
                
                 | 
                class MVColHandler(BaseEstimator, TransformerMixin):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    349
                 | 
                                    
                                                     | 
                
                 | 
                    '''  | 
            
            
                                                        
            
                                    
            
            
                | 
                    350
                 | 
                                    
                                                     | 
                
                 | 
                    Wrapper for mv_col_handling(). Allows mv_col_handling() to be put into a pipeline with similar \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    351
                 | 
                                    
                                                     | 
                
                 | 
                    functions (e.g. using DataCleaner() or SubsetPooler()).  | 
            
            
                                                        
            
                                    
            
            
                | 
                    352
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    353
                 | 
                                    
                                                     | 
                
                 | 
                    Parameters  | 
            
            
                                                        
            
                                    
            
            
                | 
                    354
                 | 
                                    
                                                     | 
                
                 | 
                    ----------  | 
            
            
                                                        
            
                                    
            
            
                | 
                    355
                 | 
                                    
                                                     | 
                
                 | 
                    target: string, list, np.array or pd.Series, default None  | 
            
            
                                                        
            
                                    
            
            
                | 
                    356
                 | 
                                    
                                                     | 
                
                 | 
                        Specify target for correlation. E.g. label column to generate only the correlations between each feature \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    357
                 | 
                                    
                                                     | 
                
                 | 
                        and the label.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    358
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    359
                 | 
                                    
                                                     | 
                
                 | 
                    mv_threshold: float, default 0.1  | 
            
            
                                                        
            
                                    
            
            
                | 
                    360
                 | 
                                    
                                                     | 
                
                 | 
                        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are candidates \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    361
                 | 
                                    
                                                     | 
                
                 | 
                        for dropping and undergo further analysis.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    362
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    363
                 | 
                                    
                                                     | 
                
                 | 
                    corr_thresh_features: float, default 0.6  | 
            
            
                                                        
            
                                    
            
            
                | 
                    364
                 | 
                                    
                                                     | 
                
                 | 
                        Value between 0 <= threshold <= 1. Maximum correlation a previously identified features with a high mv-ratio is\  | 
            
            
                                                        
            
                                    
            
            
                | 
                    365
                 | 
                                    
                                                     | 
                
                 | 
                         allowed to have with another feature. If this threshold is overstepped, the feature undergoes further analysis.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    366
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    367
                 | 
                                    
                                                     | 
                
                 | 
                    corr_thresh_target: float, default 0.3  | 
            
            
                                                        
            
                                    
            
            
                | 
                    368
                 | 
                                    
                                                     | 
                
                 | 
                        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. feature with a \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    369
                 | 
                                    
                                                     | 
                
                 | 
                        high mv-ratio and high correlation to another existing feature) with the target. If this threshold is not met \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    370
                 | 
                                    
                                                     | 
                
                 | 
                        the feature is ultimately dropped.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    371
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    372
                 | 
                                    
                                                     | 
                
                 | 
                    return_details: bool, default True  | 
            
            
                                                        
            
                                    
            
            
                | 
                    373
                 | 
                                    
                                                     | 
                
                 | 
                        Provdies flexibility to return intermediary results.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    374
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    375
                 | 
                                    
                                                     | 
                
                 | 
                    Returns  | 
            
            
                                                        
            
                                    
            
            
                | 
                    376
                 | 
                                    
                                                     | 
                
                 | 
                    -------  | 
            
            
                                                        
            
                                    
            
            
                | 
                    377
                 | 
                                    
                                                     | 
                
                 | 
                    data: Updated Pandas DataFrame  | 
            
            
                                                        
            
                                    
            
            
                | 
                    378
                 | 
                                    
                                                     | 
                
                 | 
                    '''  | 
            
            
                                                        
            
                                    
            
            
                | 
                    379
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    380
                 | 
                                    
                                                     | 
                
                 | 
                    def __init__(self, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    381
                 | 
                                    
                                                     | 
                
                 | 
                                 return_details=True):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    382
                 | 
                                    
                                                     | 
                
                 | 
                        self.target = target  | 
            
            
                                                        
            
                                    
            
            
                | 
                    383
                 | 
                                    
                                                     | 
                
                 | 
                        self.mv_threshold = mv_threshold  | 
            
            
                                                        
            
                                    
            
            
                | 
                    384
                 | 
                                    
                                                     | 
                
                 | 
                        self.corr_thresh_features = corr_thresh_features  | 
            
            
                                                        
            
                                    
            
            
                | 
                    385
                 | 
                                    
                                                     | 
                
                 | 
                        self.corr_thresh_target = corr_thresh_target  | 
            
            
                                                        
            
                                    
            
            
                | 
                    386
                 | 
                                    
                                                     | 
                
                 | 
                        self.return_details = return_details  | 
            
            
                                                        
            
                                    
            
            
                | 
                    387
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    388
                 | 
                                    
                                                     | 
                
                 | 
                    def fit(self, data, target=None):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    389
                 | 
                                    
                                                     | 
                
                 | 
                        return self  | 
            
            
                                                        
            
                                    
            
            
                | 
                    390
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    391
                 | 
                                    
                                                     | 
                
                 | 
                    def transform(self, data, target=None):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    392
                 | 
                                    
                                                     | 
                
                 | 
                        data, cols_mv, dropped_cols = mv_col_handling(data, target=self.target, mv_threshold=self.mv_threshold,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    393
                 | 
                                    
                                                     | 
                
                 | 
                                                                      corr_thresh_features=self.corr_thresh_features,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    394
                 | 
                                    
                                                     | 
                
                 | 
                                                                      corr_thresh_target=self.corr_thresh_target,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    395
                 | 
                                    
                                                     | 
                
                 | 
                                                                      return_details=self.return_details)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    396
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    397
                 | 
                                    
                                                     | 
                
                 | 
                        print(f'\nFeatures with MV-ratio > {self.mv_threshold}: {len(cols_mv)}') | 
            
            
                                                        
            
                                    
            
            
                | 
                    398
                 | 
                                    
                                                     | 
                
                 | 
                        print('Features dropped:', len(dropped_cols), dropped_cols) | 
            
            
                                                        
            
                                    
            
            
                | 
                    399
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    400
                 | 
                                    
                                                     | 
                
                 | 
                        return data  | 
            
            
                                                        
            
                                    
            
            
                | 
                    401
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    402
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    403
                 | 
                                    
                                                     | 
                
                 | 
                def pool_duplicate_subsets(data, col_dupl_thresh=0.2, subset_thresh=0.2, min_col_pool=3, exclude=None,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    404
                 | 
                                    
                                                     | 
                
                 | 
                                           return_details=False):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    405
                 | 
                                    
                                                     | 
                
                 | 
                    '''  | 
            
            
                                                        
            
                                    
            
            
                | 
                    406
                 | 
                                    
                                                     | 
                
                 | 
                    Checks for duplicates in subsets of columns and pools them. This can reduce the number of columns in the data \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    407
                 | 
                                    
                                                     | 
                
                 | 
                    without loosing much information. Suitable columns are combined to subsets and tested for duplicates. In case \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    408
                 | 
                                    
                                                     | 
                
                 | 
                    sufficient duplicates can be found, the respective columns are aggregated into a 'pooled_var' column. Identical \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    409
                 | 
                                    
                                                     | 
                
                 | 
                    numbers in the 'pooled_var' column indicate identical information in the respective rows.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    410
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    411
                 | 
                                    
                                                     | 
                
                 | 
                    Parameters  | 
            
            
                                                        
            
                                    
            
            
                | 
                    412
                 | 
                                    
                                                     | 
                
                 | 
                    ----------  | 
            
            
                                                        
            
                                    
            
            
                | 
                    413
                 | 
                                    
                                                     | 
                
                 | 
                    data: 2D dataset that can be coerced into Pandas DataFrame.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    414
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    415
                 | 
                                    
                                                     | 
                
                 | 
                    col_dupl_thresh: float, default 0.2  | 
            
            
                                                        
            
                                    
            
            
                | 
                    416
                 | 
                                    
                                                     | 
                
                 | 
                        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    417
                 | 
                                    
                                                     | 
                
                 | 
                        Columns with a lower ratio are not considered for pooling.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    418
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    419
                 | 
                                    
                                                     | 
                
                 | 
                    subset_thresh: float, default 0.2  | 
            
            
                                                        
            
                                    
            
            
                | 
                    420
                 | 
                                    
                                                     | 
                
                 | 
                        The first subset with a duplicate threshold higher than 'subset_thresh' is chosen and aggregated. If no subset \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    421
                 | 
                                    
                                                     | 
                
                 | 
                        reaches the threshold, the algorithm continues with continuously smaller subsets until 'min_col_pool' is \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    422
                 | 
                                    
                                                     | 
                
                 | 
                        reached.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    423
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    424
                 | 
                                    
                                                     | 
                
                 | 
                    min_col_pool: integer, default 3  | 
            
            
                                                        
            
                                    
            
            
                | 
                    425
                 | 
                                    
                                                     | 
                
                 | 
                        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible to suitable \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    426
                 | 
                                    
                                                     | 
                
                 | 
                        subsets and stops when 'min_col_pool' is reached.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    427
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    428
                 | 
                                    
                                                     | 
                
                 | 
                    exclude. list, default None  | 
            
            
                                                        
            
                                    
            
            
                | 
                    429
                 | 
                                    
                                                     | 
                
                 | 
                        List of column names to be excluded from the analysis. These columns are passed through without modification.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    430
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    431
                 | 
                                    
                                                     | 
                
                 | 
                    return_details: bool, default False  | 
            
            
                                                        
            
                                    
            
            
                | 
                    432
                 | 
                                    
                                                     | 
                
                 | 
                        Provdies flexibility to return intermediary results.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    433
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    434
                 | 
                                    
                                                     | 
                
                 | 
                    Returns:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    435
                 | 
                                    
                                                     | 
                
                 | 
                    -------  | 
            
            
                                                        
            
                                    
            
            
                | 
                    436
                 | 
                                    
                                                     | 
                
                 | 
                    data: pd.DataFrame  | 
            
            
                                                        
            
                                    
            
            
                | 
                    437
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    438
                 | 
                                    
                                                     | 
                
                 | 
                    optional:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    439
                 | 
                                    
                                                     | 
                
                 | 
                    subset_cols: List of columns used as subset.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    440
                 | 
                                    
                                                     | 
                
                 | 
                    '''  | 
            
            
                                                        
            
                                    
            
            
                | 
                    441
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    442
                 | 
                                    
                                                     | 
                
                 | 
                    # Input validation  | 
            
            
                                                        
            
                                    
            
            
                | 
                    443
                 | 
                                    
                                                     | 
                
                 | 
                    _validate_input_range(col_dupl_thresh, 'col_dupl_thresh', 0, 1)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    444
                 | 
                                    
                                                     | 
                
                 | 
                    _validate_input_range(subset_thresh, 'subset_thresh', 0, 1)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    445
                 | 
                                    
                                                     | 
                
                 | 
                    _validate_input_range(min_col_pool, 'min_col_pool', 0, data.shape[1])  | 
            
            
                                                        
            
                                    
            
            
                | 
                    446
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    447
                 | 
                                    
                                                     | 
                
                 | 
                    excluded_cols = []  | 
            
            
                                                        
            
                                    
            
            
                | 
                    448
                 | 
                                    
                                                     | 
                
                 | 
                    if exclude is not None:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    449
                 | 
                                    
                                                     | 
                
                 | 
                        excluded_cols = data[exclude]  | 
            
            
                                                        
            
                                    
            
            
                | 
                    450
                 | 
                                    
                                                     | 
                
                 | 
                        data = data.drop(columns=exclude)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    451
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    452
                 | 
                                    
                                                     | 
                
                 | 
                    subset_cols = []  | 
            
            
                                                        
            
                                    
            
            
                | 
                    453
                 | 
                                    
                                                     | 
                
                 | 
                    for i in range(data.shape[1]+1-min_col_pool):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    454
                 | 
                                    
                                                     | 
                
                 | 
                        check_list = [col for col in data.columns if data.duplicated(subset=col).mean() > col_dupl_thresh]  | 
            
            
                                                        
            
                                    
            
            
                | 
                    455
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    456
                 | 
                                    
                                                     | 
                
                 | 
                        if len(check_list) > 0:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    457
                 | 
                                    
                                                     | 
                
                 | 
                            combinations = itertools.combinations(check_list, len(check_list)-i)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    458
                 | 
                                    
                                                     | 
                
                 | 
                        else:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    459
                 | 
                                    
                                                     | 
                
                 | 
                            continue  | 
            
            
                                                        
            
                                    
            
            
                | 
                    460
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    461
                 | 
                                    
                                                     | 
                
                 | 
                        ratios = [data[list(comb)].duplicated().mean() for comb in combinations]  | 
            
            
                                                        
            
                                    
            
            
                | 
                    462
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    463
                 | 
                                    
                                                     | 
                
                 | 
                        max_ratio = max(ratios)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    464
                 | 
                                    
                                                     | 
                
                 | 
                        max_idx = np.argmax(ratios)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    465
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    466
                 | 
                                    
                                                     | 
                
                 | 
                        if max_ratio > subset_thresh:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    467
                 | 
                                    
                                                     | 
                
                 | 
                            best_subset = itertools.islice(itertools.combinations(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    468
                 | 
                                    
                                                     | 
                
                 | 
                                check_list, len(check_list)-i), max_idx, max_idx+1)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    469
                 | 
                                    
                                                     | 
                
                 | 
                            best_subset = data[list(list(best_subset)[0])]  | 
            
            
                                                        
            
                                    
            
            
                | 
                    470
                 | 
                                    
                                                     | 
                
                 | 
                            subset_cols = best_subset.columns.tolist()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    471
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    472
                 | 
                                    
                                                     | 
                
                 | 
                            unique_subset = best_subset.drop_duplicates().reset_index().rename(columns={'index': 'pooled_vars'}) | 
            
            
                                                        
            
                                    
            
            
                | 
                    473
                 | 
                                    
                                                     | 
                
                 | 
                            data = data.merge(unique_subset, how='left', on=best_subset.columns.tolist()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    474
                 | 
                                    
                                                     | 
                
                 | 
                                              ).drop(columns=best_subset.columns.tolist())  | 
            
            
                                                        
            
                                    
            
            
                | 
                    475
                 | 
                                    
                                                     | 
                
                 | 
                            data.index = pd.RangeIndex(len(data))  | 
            
            
                                                        
            
                                    
            
            
                | 
                    476
                 | 
                                    
                                                     | 
                
                 | 
                            break  | 
            
            
                                                        
            
                                    
            
            
                | 
                    477
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    478
                 | 
                                    
                                                     | 
                
                 | 
                    data = pd.concat([data, pd.DataFrame(excluded_cols)], axis=1)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    479
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    480
                 | 
                                    
                                                     | 
                
                 | 
                    if return_details:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    481
                 | 
                                    
                                                     | 
                
                 | 
                        return data, subset_cols  | 
            
            
                                                        
            
                                    
            
            
                | 
                    482
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    483
                 | 
                                    
                                                     | 
                
                 | 
                    return data  | 
            
            
                                                        
            
                                    
            
            
                | 
                    484
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    485
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    486
                 | 
                                    
                                                     | 
                
                 | 
                class SubsetPooler(BaseEstimator, TransformerMixin):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    487
                 | 
                                    
                                                     | 
                
                 | 
                    '''  | 
            
            
                                                        
            
                                    
            
            
                | 
                    488
                 | 
                                    
                                                     | 
                
                 | 
                    Wrapper for pool_duplicate_subsets(). Allows pool_duplicate_subsets() to be put into a pipeline with similar \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    489
                 | 
                                    
                                                     | 
                
                 | 
                    functions (e.g. using DataCleaner() or MVColHandler()).  | 
            
            
                                                        
            
                                    
            
            
                | 
                    490
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    491
                 | 
                                    
                                                     | 
                
                 | 
                    Parameters  | 
            
            
                                                        
            
                                    
            
            
                | 
                    492
                 | 
                                    
                                                     | 
                
                 | 
                    ----------  | 
            
            
                                                        
            
                                    
            
            
                | 
                    493
                 | 
                                    
                                                     | 
                
                 | 
                    col_dupl_ratio: float, default 0.2  | 
            
            
                                                        
            
                                    
            
            
                | 
                    494
                 | 
                                    
                                                     | 
                
                 | 
                        Columns with a ratio of duplicates higher than 'col_dupl_ratio' are considered in the further analysis. \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    495
                 | 
                                    
                                                     | 
                
                 | 
                        Columns with a lower ratio are not considered for pooling.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    496
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    497
                 | 
                                    
                                                     | 
                
                 | 
                    dupl_thresh: float, default 0.2  | 
            
            
                                                        
            
                                    
            
            
                | 
                    498
                 | 
                                    
                                                     | 
                
                 | 
                        The first subset with a duplicate threshold higher than 'dupl_thresh' is chosen and aggregated. If no subset \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    499
                 | 
                                    
                                                     | 
                
                 | 
                        reaches the threshold, the algorithm continues with continuously smaller subsets until 'min_col_pool' is \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    500
                 | 
                                    
                                                     | 
                
                 | 
                        reached.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    501
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    502
                 | 
                                    
                                                     | 
                
                 | 
                    min_col_pool: integer, default 3  | 
            
            
                                                        
            
                                    
            
            
                | 
                    503
                 | 
                                    
                                                     | 
                
                 | 
                        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible to suitable \  | 
            
            
                                                        
            
                                    
            
            
                | 
                    504
                 | 
                                    
                                                     | 
                
                 | 
                        subsets and stops when 'min_col_pool' is reached.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    505
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    506
                 | 
                                    
                                                     | 
                
                 | 
                    return_details: bool, default False  | 
            
            
                                                        
            
                                    
            
            
                | 
                    507
                 | 
                                    
                                                     | 
                
                 | 
                        Provdies flexibility to return intermediary results.  | 
            
            
                                                        
            
                                    
            
            
                | 
                    508
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    509
                 | 
                                    
                                                     | 
                
                 | 
                    Returns:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    510
                 | 
                                    
                                                     | 
                
                 | 
                    -------  | 
            
            
                                                        
            
                                    
            
            
                | 
                    511
                 | 
                                    
                                                     | 
                
                 | 
                    data: pd.DataFrame  | 
            
            
                                                        
            
                                    
            
            
                | 
                    512
                 | 
                                    
                                                     | 
                
                 | 
                    '''  | 
            
            
                                                        
            
                                    
            
            
                | 
                    513
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    514
                 | 
                                    
                                                     | 
                
                 | 
                    def __init__(self, col_dupl_thresh=0.2, subset_thresh=0.2, min_col_pool=3, return_details=True):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    515
                 | 
                                    
                                                     | 
                
                 | 
                        self.col_dupl_thresh = col_dupl_thresh  | 
            
            
                                                        
            
                                    
            
            
                | 
                    516
                 | 
                                    
                                                     | 
                
                 | 
                        self.subset_thresh = subset_thresh  | 
            
            
                                                        
            
                                    
            
            
                | 
                    517
                 | 
                                    
                                                     | 
                
                 | 
                        self.min_col_pool = min_col_pool  | 
            
            
                                                        
            
                                    
            
            
                | 
                    518
                 | 
                                    
                                                     | 
                
                 | 
                        self.return_details = return_details  | 
            
            
                                                        
            
                                    
            
            
                | 
                    519
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    520
                 | 
                                    
                                                     | 
                
                 | 
                    def fit(self, data, target=None):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    521
                 | 
                                    
                                                     | 
                
                 | 
                        return self  | 
            
            
                                                        
            
                                    
            
            
                | 
                    522
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    523
                 | 
                                    
                                                     | 
                
                 | 
                    def transform(self, data, target=None):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    524
                 | 
                                    
                                                     | 
                
                 | 
                        data, subset_cols = pool_duplicate_subsets(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    525
                 | 
                                    
                                                     | 
                
                 | 
                            data, col_dupl_thresh=0.2, subset_thresh=0.2, min_col_pool=3, return_details=True)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    526
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    527
                 | 
                                    
                                                     | 
                
                 | 
                        print('Combined columns:', len(subset_cols), subset_cols) | 
            
            
                                                        
            
                                    
            
            
                | 
                    528
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    529
                 | 
                                    
                                                     | 
                
                 | 
                        return data  | 
            
            
                                                        
            
                                    
            
            
                | 
                    530
                 | 
                                    
                                                     | 
                
                 | 
                 |