Passed: Push to master (758151...a39973) by Andreas, created 01:26

klib.clean.pool_duplicate_subsets() (rated C)

Complexity
    Conditions: 9

Size
    Total Lines: 87
    Code Lines: 38

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric    Value
cc        9
eloc      38
nop       6
dl        0
loc       87
rs        6.6346
c         0
b         0
f         0

How to fix: Long Method

Small methods make your code easier to understand, particularly when combined with a good name. Conversely, when a method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a good sign that the commented part should be extracted into a new method, with the comment serving as a starting point for the new method's name.

Commonly applied refactorings include the Extract Method refactoring, sketched below.
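To make the advice concrete, here is a minimal before/after sketch of Extract Method in Python; the function and names are invented for illustration and do not come from klib:

# Before: one long function whose sections need comments to stay readable
def clean(df):
    # normalize column names
    df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]
    # drop completely empty rows
    return df.dropna(how='all')

# After: each commented section becomes a small, well-named helper
def normalize_column_names(df):
    df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]
    return df

def drop_empty_rows(df):
    return df.dropna(how='all')

def clean(df):
    return drop_empty_rows(normalize_column_names(df))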

'''
Functions for data cleaning.

:author: Andreas Kanz

'''

# Imports
import itertools
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

from .describe import corr_mat
from .utils import (_diff_report,
                    _drop_duplicates,
                    _missing_vals,
                    _validate_input_bool,
                    _validate_input_range)


__all__ = ['convert_datatypes',
           'data_cleaning',
           'drop_missing',
           'mv_col_handling']

def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=None):
    '''
    Converts columns to best possible dtypes using dtypes supporting pd.NA. Temporarily not converting integers.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    category: bool, default True
        Change dtypes of columns with dtype "object" to "category". Set threshold using cat_threshold or exclude \
        columns using cat_exclude.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    Returns
    -------
    data: Pandas DataFrame
    '''

    # Validate Inputs
    _validate_input_bool(category, 'Category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    cat_exclude = [] if cat_exclude is None else cat_exclude.copy()

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if (category and
            unique_vals_ratio < cat_threshold and
            col not in cat_exclude and
                data[col].dtype == 'object'):
            data[col] = data[col].astype('category')
        data[col] = data[col].convert_dtypes(infer_objects=True, convert_string=True,
                                             convert_integer=False, convert_boolean=True)

    return data

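A minimal usage sketch for convert_datatypes(); the example DataFrame is invented for illustration:

import pandas as pd
from klib.clean import convert_datatypes

df = pd.DataFrame({'fruit': ['apple', 'banana'] * 50,   # 2 unique values in 100 rows
                   'price': [0.5, 0.25] * 50,
                   'in_stock': [True, False] * 50})
converted = convert_datatypes(df, cat_threshold=0.05)
print(converted.dtypes)  # 'fruit' falls below the 0.05 unique-value ratio and becomes 'category'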
def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
    '''
    Drops completely empty columns and rows by default and optionally provides flexibility to loosen restrictions to \
    drop additional columns and rows based on the fraction of remaining NA-values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 1
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 1
        Drop rows with NA-ratio above the specified threshold.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    Notes
    -----
    Columns are dropped first. Rows are dropped based on the remaining data.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)

    data = pd.DataFrame(data).copy()
    data = data.dropna(axis=0, how='all').dropna(axis=1, how='all')
    data = data.drop(columns=data.loc[:, _missing_vals(data)['mv_cols_ratio'] > drop_threshold_cols].columns)
    data_cleaned = data.drop(index=data.loc[_missing_vals(data)['mv_rows_ratio'] > drop_threshold_rows, :].index)

    return data_cleaned

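A short example of drop_missing() on invented data:

import numpy as np
import pandas as pd
from klib.clean import drop_missing

df = pd.DataFrame({'a': [1, 2, np.nan, 4],
                   'b': [np.nan] * 4,                  # completely empty -> always dropped
                   'c': [1, np.nan, np.nan, np.nan]})
cleaned = drop_missing(df, drop_threshold_cols=0.5)    # 'c' exceeds the 50 % NA threshold
print(cleaned.columns.tolist())                        # ['a']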
def data_cleaning(data, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True,
                  convert_dtypes=True, category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
    '''
    Perform initial data cleaning tasks on a dataset, such as dropping single-valued and empty rows, empty \
    columns as well as optimizing the datatypes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 0.9
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.convert_dtypes().

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold. Requires convert_dtypes=True.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None}, default 'changes'
        Specify verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    See Also
    --------
    convert_datatypes: Convert columns to best possible dtypes.
    drop_missing: Flexibly drop columns and rows.
    _memory_usage: Gives the total memory usage in kilobytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)
    _validate_input_bool(drop_duplicates, 'drop_duplicates')
    _validate_input_bool(convert_dtypes, 'convert_datatypes')
    _validate_input_bool(category, 'category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)

    single_val_cols = data_cleaned.columns[data_cleaned.nunique(dropna=False) == 1].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    dupl_rows = None

    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)
    if convert_dtypes:
        data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,
                                         cat_exclude=cat_exclude)

    _diff_report(data, data_cleaned, dupl_rows=dupl_rows, single_val_cols=single_val_cols, show=show)

    return data_cleaned

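A usage sketch of data_cleaning(); the DataFrame below is invented and chosen so that each cleaning step has something to do:

import numpy as np
import pandas as pd
from klib.clean import data_cleaning

df = pd.DataFrame({'id': [1, 1, 2, 3],                 # rows 0 and 1 are exact duplicates
                   'const': ['x'] * 4,                 # single-valued -> dropped
                   'empty': [np.nan] * 4,              # completely empty -> dropped
                   'value': [10.0, 10.0, 20.0, np.nan]})
cleaned = data_cleaning(df, show=None)                 # show=None suppresses the diff report
print(cleaned)                                         # 3 rows, columns 'id' and 'value'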
class DataCleaner(BaseEstimator, TransformerMixin):
    '''
    Wrapper for data_cleaning(). Allows data_cleaning() to be put into a pipeline with similar \
    functions (e.g. using MVColHandler() or SubsetPooler()).

    Parameters
    ----------
    drop_threshold_cols: float, default 0.9
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.convert_dtypes().

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold. Requires convert_dtypes=True.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None}, default 'changes'
        Specify verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame
    '''

    def __init__(self, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True, convert_dtypes=True,
                 category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
        self.drop_threshold_cols = drop_threshold_cols
        self.drop_threshold_rows = drop_threshold_rows
        self.drop_duplicates = drop_duplicates
        self.convert_dtypes = convert_dtypes
        self.category = category
        self.cat_threshold = cat_threshold
        self.cat_exclude = cat_exclude
        self.show = show

    def fit(self, data, target=None):
        # Stateless transformer: nothing to fit
        return self

    def transform(self, data, target=None):
        data_cleaned = data_cleaning(data, drop_threshold_cols=self.drop_threshold_cols,
                                     drop_threshold_rows=self.drop_threshold_rows,
                                     drop_duplicates=self.drop_duplicates, convert_dtypes=self.convert_dtypes,
                                     category=self.category, cat_threshold=self.cat_threshold,
                                     cat_exclude=self.cat_exclude, show=self.show)
        return data_cleaned

def mv_col_handling(data, target=None, mv_threshold=0.1, corr_thresh_features=0.5, corr_thresh_target=0.3,
                    return_details=False):
    '''
    Converts columns with a high ratio of missing values into binary features and eventually drops them based on \
    their correlation with other features and the target variable. This function follows a three-step process:
    - 1) Identify features with a high ratio of missing values.
    - 2) Identify high correlations of these features among themselves and with other features in the dataset.
    - 3) Features with a high ratio of missing values and high correlation among each other are dropped unless \
         they correlate reasonably well with the target variable.

    Note: If no target is provided, the process exits after step two and drops columns identified up to this point.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. I.e. label column to generate only the correlations between each feature \
        and the label.

    mv_threshold: float, default 0.1
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are candidates \
        for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.5
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified feature (with a high mv-ratio) \
        is allowed to have with another feature. If this threshold is overstepped, the feature undergoes further \
        analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. feature with a \
        high mv-ratio and high correlation to another existing feature) with the target. If this threshold is not met, \
        the feature is ultimately dropped.

    return_details: bool, default False
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: Updated Pandas DataFrame

    optional:
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    '''

    # Validate Inputs
    _validate_input_range(mv_threshold, 'mv_threshold', 0, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()
    mv_ratios = _missing_vals(data_local)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    # Convert high-MV columns into binary indicators (1 = value present, 0 = missing)
    data_local[cols_mv] = data_local[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0)

    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        corrs = corr_mat(data_local, target=target, colored=False).loc[high_corr_features]
        drop_cols = corrs.loc[abs(corrs.iloc[:, 0]) < corr_thresh_target].index.tolist()
        data = data.drop(columns=drop_cols)

    if return_details:
        return data, cols_mv, drop_cols

    return data

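A sketch of mv_col_handling() on synthetic data where the missingness pattern itself carries signal; all names and values are invented for illustration:

import numpy as np
import pandas as pd
from klib.clean import mv_col_handling

rng = np.random.default_rng(42)
base = rng.normal(size=200)
df = pd.DataFrame({'base': base,
                   'sparse': np.where(base > 0, base, np.nan),  # missing whenever 'base' <= 0
                   'label': 2 * base + rng.normal(scale=0.1, size=200)})
data, cols_mv, drop_cols = mv_col_handling(df, target='label', return_details=True)
print(cols_mv)    # ['sparse']: high MV-ratio, so it enters the analysis
print(drop_cols)  # []: its binary indicator still correlates with the target, so it is kept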
class MVColHandler(BaseEstimator, TransformerMixin):
    '''
    Wrapper for mv_col_handling(). Allows mv_col_handling() to be put into a pipeline with similar \
    functions (e.g. using DataCleaner() or SubsetPooler()).

    Parameters
    ----------
    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. E.g. label column to generate only the correlations between each feature \
        and the label.

    mv_threshold: float, default 0.1
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are candidates \
        for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified feature with a high mv-ratio is \
        allowed to have with another feature. If this threshold is overstepped, the feature undergoes further analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. feature with a \
        high mv-ratio and high correlation to another existing feature) with the target. If this threshold is not met, \
        the feature is ultimately dropped.

    return_details: bool, default True
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: Updated Pandas DataFrame
    '''

    def __init__(self, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3,
                 return_details=True):
        self.target = target
        self.mv_threshold = mv_threshold
        self.corr_thresh_features = corr_thresh_features
        self.corr_thresh_target = corr_thresh_target
        self.return_details = return_details

    def fit(self, data, target=None):
        # Stateless transformer: nothing to fit
        return self

    def transform(self, data, target=None):
        data, cols_mv, dropped_cols = mv_col_handling(data, target=self.target, mv_threshold=self.mv_threshold,
                                                      corr_thresh_features=self.corr_thresh_features,
                                                      corr_thresh_target=self.corr_thresh_target,
                                                      return_details=self.return_details)

        print(f'\nFeatures with MV-ratio > {self.mv_threshold}: {len(cols_mv)}')
        print('Features dropped:', len(dropped_cols), dropped_cols)

        return data

def pool_duplicate_subsets(data, col_dupl_thresh=0.2, subset_thresh=0.2, min_col_pool=3, exclude=None,
                           return_details=False):
    '''
    Checks for duplicates in subsets of columns and pools them. This reduces the number of columns in the data without \
    losing any information. Suitable columns are combined to subsets and tested for duplicates. If sufficient \
    duplicates are found, the respective columns are aggregated into a 'pooled_vars' column. Identical numbers in \
    the 'pooled_vars' column indicate identical information in the respective rows.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    col_dupl_thresh: float, default 0.2
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
        Columns with a lower ratio are not considered for pooling.

    subset_thresh: float, default 0.2
        The first subset with a duplicate threshold higher than 'subset_thresh' is chosen and aggregated. If no subset \
        reaches the threshold, the algorithm continues with successively smaller subsets until 'min_col_pool' is \
        reached.

    min_col_pool: integer, default 3
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible to suitable \
        subsets and stops when 'min_col_pool' is reached.

    exclude: list, default None
        List of column names to be excluded from the analysis. These columns are passed through without modification.

    return_details: bool, default False
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: pd.DataFrame

    optional:
    subset_cols: List of columns used as subset.
    '''

    # Input validation
    _validate_input_range(col_dupl_thresh, 'col_dupl_thresh', 0, 1)
    _validate_input_range(subset_thresh, 'subset_thresh', 0, 1)
    _validate_input_range(min_col_pool, 'min_col_pool', 0, data.shape[1])

    excluded_cols = []
    if exclude is not None:
        excluded_cols = data[exclude]
        data = data.drop(columns=exclude)

    subset_cols = []
    for i in range(data.shape[1] + 1 - min_col_pool):
        # Consider only columns with a sufficient ratio of duplicate values
        check_list = []
        for col in data.columns:
            cdr = data.duplicated(subset=col).mean()
            if cdr > col_dupl_thresh:
                check_list.append(col)

        if len(check_list) > 0:
            combinations = itertools.combinations(check_list, len(check_list) - i)
        else:
            continue

        # Duplicate ratio for each candidate subset of the current size
        ratios = []
        for comb in combinations:
            ratios.append(data[list(comb)].duplicated().mean())

        max_ratio = max(ratios)
        max_idx = np.argmax(ratios)

        if max_ratio > subset_thresh:
            # Aggregate the best subset into a single 'pooled_vars' column
            best_subset = itertools.islice(itertools.combinations(
                check_list, len(check_list) - i), max_idx, max_idx + 1)
            best_subset = data[list(list(best_subset)[0])]
            subset_cols = best_subset.columns.tolist()

            unique_subset = best_subset.drop_duplicates().reset_index().rename(columns={'index': 'pooled_vars'})
            data = data.merge(unique_subset, how='left', on=best_subset.columns.tolist()
                              ).drop(columns=best_subset.columns.tolist())
            data.index = pd.RangeIndex(len(data))
            break

    data = pd.concat([data, pd.DataFrame(excluded_cols)], axis=1)

    if return_details:
        return data, subset_cols

    return data

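A minimal sketch of pool_duplicate_subsets(); the columns are invented so that 'city', 'state' and 'zip' carry redundant information:

import pandas as pd
from klib.clean import pool_duplicate_subsets

df = pd.DataFrame({'city': ['Berlin', 'Hamburg'] * 50,
                   'state': ['BE', 'HH'] * 50,
                   'zip': ['10115', '20095'] * 50,
                   'order_id': range(100)})
pooled, subset_cols = pool_duplicate_subsets(df, return_details=True)
print(subset_cols)                # ['city', 'state', 'zip']
print(pooled.columns.tolist())    # ['order_id', 'pooled_vars']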
class SubsetPooler(BaseEstimator, TransformerMixin):
    '''
    Wrapper for pool_duplicate_subsets(). Allows pool_duplicate_subsets() to be put into a pipeline with similar \
    functions (e.g. using DataCleaner() or MVColHandler()).

    Parameters
    ----------
    col_dupl_thresh: float, default 0.2
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
        Columns with a lower ratio are not considered for pooling.

    subset_thresh: float, default 0.2
        The first subset with a duplicate threshold higher than 'subset_thresh' is chosen and aggregated. If no subset \
        reaches the threshold, the algorithm continues with successively smaller subsets until 'min_col_pool' is \
        reached.

    min_col_pool: integer, default 3
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible to suitable \
        subsets and stops when 'min_col_pool' is reached.

    return_details: bool, default True
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: pd.DataFrame
    '''

    def __init__(self, col_dupl_thresh=0.2, subset_thresh=0.2, min_col_pool=3, return_details=True):
        self.col_dupl_thresh = col_dupl_thresh
        self.subset_thresh = subset_thresh
        self.min_col_pool = min_col_pool
        self.return_details = return_details

    def fit(self, data, target=None):
        # Stateless transformer: nothing to fit
        return self

    def transform(self, data, target=None):
        # Pass the instance attributes through instead of hard-coded values,
        # so parameters set in __init__ actually take effect. return_details
        # stays True because the tuple is unpacked below.
        data, subset_cols = pool_duplicate_subsets(
            data, col_dupl_thresh=self.col_dupl_thresh, subset_thresh=self.subset_thresh,
            min_col_pool=self.min_col_pool, return_details=True)

        print('Combined columns:', len(subset_cols), subset_cols)

        return data
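Since DataCleaner, MVColHandler and SubsetPooler all implement fit/transform, they can be chained in a scikit-learn Pipeline. An end-to-end sketch on invented data:

import pandas as pd
from sklearn.pipeline import Pipeline
from klib.clean import DataCleaner, MVColHandler, SubsetPooler

pipe = Pipeline([('clean', DataCleaner(show=None)),
                 ('mv_cols', MVColHandler()),
                 ('pool', SubsetPooler())])

df = pd.DataFrame({'a': [1, 1, 2, 2] * 25,
                   'b': ['x', 'x', 'y', 'y'] * 25,
                   'c': range(100)})
df_processed = pipe.fit_transform(df)   # 'a' and 'b' end up pooled into 'pooled_vars'
print(df_processed.head())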