Passed — Push to master ( 125b89...3ddec6 ) by Andreas, created 03:38

klib.clean.SubsetPooler.fit()    Rating: A

Complexity:    Conditions 1
Size:          Total Lines 2, Code Lines 2
Duplication:   Lines 0, Ratio 0 %
Importance:    Changes 0

Metric   Value
cc       1
eloc     2
nop      3
dl       0
loc      2
rs       10
c        0
b        0
f        0
'''
Functions for data cleaning.

:author: Andreas Kanz

'''

# Imports
import itertools
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

from .describe import corr_mat
from .utils import (_diff_report,
                    _drop_duplicates,
                    _missing_vals,
                    _validate_input_bool,
                    _validate_input_range)


__all__ = ['convert_datatypes',
           'data_cleaning',
           'drop_missing',
           'mv_col_handling']


def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=None):
    '''
    Converts columns to the best possible dtypes using dtypes that support pd.NA. Integers are temporarily \
    not converted.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    category: bool, default True
        Change dtypes of columns with dtype "object" to "category". Set the threshold using cat_threshold or exclude \
        columns using cat_exclude.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and the column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    Returns
    -------
    data: Pandas DataFrame
    '''

    # Validate Inputs
    _validate_input_bool(category, 'category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    cat_exclude = [] if cat_exclude is None else cat_exclude.copy()

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if (category and
            unique_vals_ratio < cat_threshold and
            col not in cat_exclude and
                data[col].dtype == 'object'):
            data[col] = data[col].astype('category')
        data[col] = data[col].convert_dtypes(infer_objects=True, convert_string=True,
                                             convert_integer=False, convert_boolean=True)

    return data

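# Usage sketch (illustrative, not part of the module): with only two distinct
# values across 300 rows, 'grade' falls below cat_threshold and is converted
# to a categorical dtype, while the float column is handled by
# convert_dtypes(). Column names and data are made up for the example.
#
#   df = pd.DataFrame({'grade': ['a', 'b', 'a'] * 100,
#                      'score': [1.0, 2.5, 3.0] * 100})
#   convert_datatypes(df).dtypes  # 'grade' -> category
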
def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
    '''
    Drops completely empty columns and rows by default and optionally provides flexibility to loosen restrictions to \
    drop additional columns and rows based on the fraction of remaining NA-values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    drop_threshold_cols: float, default 1
        Drop columns with an NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 1
        Drop rows with an NA-ratio above the specified threshold.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    Notes
    -----
    Columns are dropped first. Rows are dropped based on the remaining data.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)

    data = pd.DataFrame(data).copy()
    data = data.dropna(axis=0, how='all').dropna(axis=1, how='all')
    data = data.drop(columns=data.loc[:, _missing_vals(data)['mv_cols_ratio'] > drop_threshold_cols].columns)
    data_cleaned = data.drop(index=data.loc[_missing_vals(data)['mv_rows_ratio'] > drop_threshold_rows, :].index)

    return data_cleaned

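# Usage sketch (illustrative): completely empty rows and columns are always
# dropped; lowering the thresholds also drops 'a', which is still two-thirds
# missing after the empty row and column are removed. The toy data below is
# made up for the example.
#
#   df = pd.DataFrame({'a': [1, None, None, None],
#                      'b': [1, 2, 3, None],
#                      'c': [None] * 4})
#   drop_missing(df)  # drops empty column 'c' and the all-NA last row
#   drop_missing(df, drop_threshold_cols=0.5, drop_threshold_rows=0.5)  # also drops 'a'
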
def data_cleaning(data, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True,
                  convert_dtypes=True, category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
    '''
    Perform initial data cleaning tasks on a dataset, such as dropping single-valued and empty columns, \
    empty rows, as well as optimizing the datatypes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    drop_threshold_cols: float, default 0.9
        Drop columns with an NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with an NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.DataFrame.convert_dtypes().

    category: bool, default True
        Change dtypes of columns to "category". Set the threshold using cat_threshold. Requires convert_dtypes=True.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and the column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None}, default 'changes'
        Specify the verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    See Also
    --------
    convert_datatypes: Convert columns to the best possible dtypes.
    drop_missing: Flexibly drop columns and rows.
    _memory_usage: Gives the total memory usage in kilobytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)
    _validate_input_bool(drop_duplicates, 'drop_duplicates')
    _validate_input_bool(convert_dtypes, 'convert_dtypes')
    _validate_input_bool(category, 'category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)

    # Drop columns that contain only a single value
    single_val_cols = data_cleaned.columns[data_cleaned.nunique(dropna=False) == 1].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    dupl_rows = None

    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)
    if convert_dtypes:
        data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,
                                         cat_exclude=cat_exclude)

    _diff_report(data, data_cleaned, dupl_rows=dupl_rows, single_val_cols=single_val_cols, show=show)

    return data_cleaned

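# Usage sketch (illustrative): a single call chains drop_missing(), duplicate
# removal and convert_datatypes(), and prints a before/after report depending
# on 'show'.
#
#   df_clean = data_cleaning(df, drop_threshold_cols=0.9, drop_threshold_rows=0.9,
#                            show='changes')
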
class DataCleaner(BaseEstimator, TransformerMixin):
    '''
    Wrapper for data_cleaning(). Allows data_cleaning() to be put into a pipeline with similar
    functions (e.g. MVColHandler()).

    Parameters
    ----------
    drop_threshold_cols: float, default 0.9
        Drop columns with an NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with an NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.DataFrame.convert_dtypes().

    category: bool, default True
        Change dtypes of columns to "category". Set the threshold using cat_threshold. Requires convert_dtypes=True.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and the column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None}, default 'changes'
        Specify the verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame
    '''

    def __init__(self, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True, convert_dtypes=True,
                 category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
        self.drop_threshold_cols = drop_threshold_cols
        self.drop_threshold_rows = drop_threshold_rows
        self.drop_duplicates = drop_duplicates
        self.convert_dtypes = convert_dtypes
        self.category = category
        self.cat_threshold = cat_threshold
        self.cat_exclude = cat_exclude
        self.show = show

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data_cleaned = data_cleaning(data, drop_threshold_cols=self.drop_threshold_cols,
                                     drop_threshold_rows=self.drop_threshold_rows,
                                     drop_duplicates=self.drop_duplicates, convert_dtypes=self.convert_dtypes,
                                     category=self.category, cat_threshold=self.cat_threshold,
                                     cat_exclude=self.cat_exclude, show=self.show)
        return data_cleaned

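# Usage sketch (illustrative): since DataCleaner implements fit() and
# transform(), it works standalone or as a step in a scikit-learn Pipeline
# (see the combined example at the end of the module).
#
#   cleaner = DataCleaner(drop_threshold_cols=0.85, show=None)
#   df_clean = cleaner.fit_transform(df)
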
def mv_col_handling(data, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3):
    '''
    Converts columns with a high ratio of missing values into binary features and drops them based on \
    their correlation with other features and the target variable. This function follows a three-step process:
    - 1) Identify features with a high ratio of missing values.
    - 2) Identify high correlations of these features among themselves and with other features in the dataset.
    - 3) Features with a high ratio of missing values and high correlation among each other are dropped unless \
         they correlate reasonably well with the target variable.

    Note: If no target is provided, the process exits after step two and drops the columns identified up to this point.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Specify the target for correlation. E.g. a label column to generate only the correlations between each \
        feature and the label.

    mv_threshold: float, default 0.1
        Value between 0 and 1. Features with a missing-value-ratio larger than mv_threshold are candidates \
        for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value between 0 and 1. Maximum correlation a previously identified feature with a high mv-ratio is \
        allowed to have with another feature. If this threshold is exceeded, the feature undergoes further analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 and 1. Minimum required correlation of a remaining feature (i.e. a feature with a \
        high mv-ratio and high correlation with another existing feature) with the target. If this threshold is not \
        met, the feature is ultimately dropped.

    Returns
    -------
    data: Updated Pandas DataFrame
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    '''

    # Validate Inputs
    _validate_input_range(mv_threshold, 'mv_threshold', 0, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()
    mv_ratios = _missing_vals(data_local)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    # Binarize the candidate columns: 1 where a value is present, 0 where it is missing
    data_local[cols_mv] = data_local[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0)

    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        for col in high_corr_features:
            # Keep the feature only if it correlates sufficiently with the target
            if pd.DataFrame(data_local[col]).corrwith(target)[0] < corr_thresh_target:
                drop_cols.append(col)
                data = data.drop(columns=[col])

    return data, cols_mv, drop_cols

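# Usage sketch (illustrative): 'label' stands in for a target column and is
# made up for the example. Columns with more than 10% missing values are
# binarized and dropped if they correlate strongly (> 0.6) with another
# feature but only weakly (< 0.3) with the target.
#
#   data, cols_mv, drop_cols = mv_col_handling(df, target=df['label'],
#                                              mv_threshold=0.1,
#                                              corr_thresh_features=0.6,
#                                              corr_thresh_target=0.3)
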
class MVColHandler(BaseEstimator, TransformerMixin):
    '''
    Wrapper for mv_col_handling(). Allows mv_col_handling() to be put into a pipeline with similar
    functions (e.g. DataCleaner()).

    Parameters
    ----------
    target: string, list, np.array or pd.Series, default None
        Specify the target for correlation. E.g. a label column to generate only the correlations between each \
        feature and the label.

    mv_threshold: float, default 0.1
        Value between 0 and 1. Features with a missing-value-ratio larger than mv_threshold are candidates \
        for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value between 0 and 1. Maximum correlation a previously identified feature with a high mv-ratio is \
        allowed to have with another feature. If this threshold is exceeded, the feature undergoes further analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 and 1. Minimum required correlation of a remaining feature (i.e. a feature with a \
        high mv-ratio and high correlation with another existing feature) with the target. If this threshold is not \
        met, the feature is ultimately dropped.

    Returns
    -------
    data: Updated Pandas DataFrame
    '''

    def __init__(self, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3):
        self.target = target
        self.mv_threshold = mv_threshold
        self.corr_thresh_features = corr_thresh_features
        self.corr_thresh_target = corr_thresh_target

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data, cols_mv, dropped_cols = mv_col_handling(data, target=self.target, mv_threshold=self.mv_threshold,
                                                      corr_thresh_features=self.corr_thresh_features,
                                                      corr_thresh_target=self.corr_thresh_target)

        print(f'\nFeatures with MV-ratio > {self.mv_threshold}: {len(cols_mv)}')
        print('Features dropped:', len(dropped_cols), dropped_cols)

        return data

def pool_duplicate_subsets(data, col_dupl_thresh=0.2, subset_thresh=0.2, min_col_pool=3):
    '''
    Checks for duplicates in subsets of columns and pools them. This reduces the number of columns in the data \
    without losing any information. Suitable columns are combined into subsets and tested for duplicates. If \
    sufficient duplicates are found, the respective columns are aggregated into a 'pooled_vars' column. Identical \
    numbers in the 'pooled_vars' column indicate identical information in the respective rows.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    col_dupl_thresh: float, default 0.2
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
        Columns with a lower ratio are not considered for pooling.

    subset_thresh: float, default 0.2
        The first subset with a duplicate ratio higher than 'subset_thresh' is chosen and aggregated. If no subset \
        reaches the threshold, the algorithm continues with successively smaller subsets until 'min_col_pool' is \
        reached.

    min_col_pool: integer, default 3
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible into \
        suitable subsets and stops when 'min_col_pool' is reached.

    Returns
    -------
    data: pd.DataFrame
    subset_cols: List of pooled columns
    '''

    # Input validation
    _validate_input_range(col_dupl_thresh, 'col_dupl_thresh', 0, 1)
    _validate_input_range(subset_thresh, 'subset_thresh', 0, 1)
    _validate_input_range(min_col_pool, 'min_col_pool', 0, data.shape[1])

    subset_cols = []  # ensure a defined return value even if the loop below is never entered
    for i in range(data.shape[1]+1-min_col_pool):
        # Collect columns whose duplicate ratio exceeds the threshold
        check_list = []
        for col in data.columns:
            cdr = data.duplicated(subset=col).mean()
            if cdr > col_dupl_thresh:
                check_list.append(col)

        combinations = itertools.combinations(check_list, len(check_list)-i)

        # Duplicate ratio of each candidate subset
        ratios = []
        for comb in combinations:
            ratios.append(data[list(comb)].duplicated().mean())

        max_ratio = pd.DataFrame(ratios).max()
        max_idx = pd.DataFrame(ratios).idxmax()

        if max_ratio[0] > subset_thresh:
            best_subset = itertools.islice(itertools.combinations(
                check_list, len(check_list)-i), max_idx[0], max_idx[0]+1)
            best_subset = data[list(list(best_subset)[0])]
            subset_cols = best_subset.columns.tolist()

            # Replace the pooled columns with a single 'pooled_vars' identifier
            unique_subset = best_subset.drop_duplicates().reset_index().rename(columns={'index': 'pooled_vars'})
            data = data.merge(unique_subset, how='inner', on=best_subset.columns.tolist()
                              ).drop(columns=best_subset.columns.tolist())
            data.index = pd.RangeIndex(len(data))
            break

    return data, subset_cols

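# Usage sketch (illustrative): the pooled columns are replaced by a single
# 'pooled_vars' column; subset_cols lists which columns were combined.
#
#   pooled, subset_cols = pool_duplicate_subsets(df, col_dupl_thresh=0.2,
#                                                subset_thresh=0.2, min_col_pool=3)
#   print(subset_cols)
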
class SubsetPooler(BaseEstimator, TransformerMixin):
    '''
    Wrapper for pool_duplicate_subsets(). Checks for duplicates in subsets of columns and pools them. This reduces \
    the number of columns in the data without losing any information. Suitable columns are combined into subsets \
    and tested for duplicates. If sufficient duplicates are found, the respective columns are aggregated into a \
    'pooled_vars' column. Identical numbers in the 'pooled_vars' column indicate identical information in the \
    respective rows.

    Parameters
    ----------
    col_dupl_thresh: float, default 0.2
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
        Columns with a lower ratio are not considered for pooling.

    subset_thresh: float, default 0.2
        The first subset with a duplicate ratio higher than 'subset_thresh' is chosen and aggregated. If no subset \
        reaches the threshold, the algorithm continues with successively smaller subsets until 'min_col_pool' is \
        reached.

    min_col_pool: integer, default 3
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible into \
        suitable subsets and stops when 'min_col_pool' is reached.

    Returns
    -------
    data: pd.DataFrame
    '''

    def __init__(self, col_dupl_thresh=0.2, subset_thresh=0.2, min_col_pool=3):
        self.col_dupl_thresh = col_dupl_thresh
        self.subset_thresh = subset_thresh
        self.min_col_pool = min_col_pool

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        # Use the instance parameters instead of hard-coded values
        data, subset_cols = pool_duplicate_subsets(
            data, col_dupl_thresh=self.col_dupl_thresh, subset_thresh=self.subset_thresh,
            min_col_pool=self.min_col_pool)

        print('Combined columns:', len(subset_cols), subset_cols)

        return data
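
# Usage sketch (illustrative): the three wrapper classes share the
# fit/transform interface, so they can be chained in a scikit-learn Pipeline.
# The step names are arbitrary and chosen for the example.
#
#   from sklearn.pipeline import Pipeline
#
#   pipe = Pipeline([('cleaner', DataCleaner(show=None)),
#                    ('mv_handler', MVColHandler()),
#                    ('pooler', SubsetPooler())])
#   df_prepared = pipe.fit_transform(df)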