
klib.clean (Rating: A)

Complexity
    Total Complexity: 38

Size/Duplication
    Total Lines: 534
    Duplicated Lines: 0%

Importance
    Changes: 0
Metric   Value
eloc     185
dl       0
loc      534
rs       9.36
c        0
b        0
f        0
wmc      38

9 Methods

Rating  Name                      Duplication  Size  Complexity
A       MVColHandler.__init__()   0            7     1
A       DataCleaner.__init__()    0            10    1
A       DataCleaner.transform()   0            6     1
A       MVColHandler.fit()        0            2     1
A       MVColHandler.transform()  0            10    1
A       SubsetPooler.transform()  0            7     1
A       SubsetPooler.fit()        0            2     1
A       SubsetPooler.__init__()   0            5     1
A       DataCleaner.fit()         0            2     1

7 Functions

Rating  Name                      Duplication  Size  Complexity
C       pool_duplicate_subsets()  0            87    9
A       optimize_ints()           0            5     1
A       drop_missing()            0            34    1
A       optimize_floats()         0            5     1
B       convert_datatypes()       0            46    7
A       data_cleaning()           0            78    3
B       mv_col_handling()         0            77    7
'''
Functions for data cleaning.

:author: Andreas Kanz
'''

# Imports
import itertools
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

from .describe import corr_mat
from .utils import (_diff_report,
                    _drop_duplicates,
                    _missing_vals,
                    _validate_input_bool,
                    _validate_input_range)


__all__ = ['convert_datatypes',
           'data_cleaning',
           'drop_missing',
           'mv_col_handling']


def optimize_ints(data):
    # Downcast int64 columns to the smallest integer dtype that fits the values.
    data = pd.DataFrame(data).copy()
    ints = data.select_dtypes(include=['int64']).columns.tolist()
    data[ints] = data[ints].apply(pd.to_numeric, downcast='integer')
    return data


def optimize_floats(data):
    # Downcast float64 columns to float32 where the values survive the cast unchanged.
    data = pd.DataFrame(data).copy()
    floats = data.select_dtypes(include=['float64']).columns.tolist()
    data[floats] = data[floats].apply(pd.to_numeric, downcast='float')
    return data


def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=None):
    '''
    Converts columns to the best possible dtypes using dtypes that support pd.NA. Temporarily not converting to \
        integers due to an issue in pandas. This is expected to be fixed in pandas 1.1. \
        See https://github.com/pandas-dev/pandas/issues/33803

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    category: bool, default True
        Change dtypes of columns with dtype "object" to "category". Set the threshold using cat_threshold or exclude \
        columns using cat_exclude.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and the column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    Returns
    -------
    data: Pandas DataFrame
    '''

    # Validate Inputs
    _validate_input_bool(category, 'category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    cat_exclude = [] if cat_exclude is None else cat_exclude.copy()

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if (category and
            unique_vals_ratio < cat_threshold and
            col not in cat_exclude and
                data[col].dtype == 'object'):
            data[col] = data[col].astype('category')
        data[col] = data[col].convert_dtypes(infer_objects=True, convert_string=True,
                                             convert_integer=False, convert_boolean=True)

    data = optimize_ints(data)
    data = optimize_floats(data)

    return data
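
# Usage sketch (illustrative only; `df` is a hypothetical DataFrame and the resulting
# dtypes depend on the installed pandas version):
# df = pd.DataFrame({'city': ['Berlin', 'Hamburg'] * 50, 'price': [1.5, 2.5] * 50})
# df = convert_datatypes(df)
# df.dtypes  # 'city' -> category (unique ratio 0.02 < 0.05), 'price' -> downcast float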


def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
    '''
    Drops completely empty columns and rows by default and optionally provides flexibility to loosen restrictions to \
    drop additional columns and rows based on the fraction of remaining NA-values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    drop_threshold_cols: float, default 1
        Drop columns with an NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 1
        Drop rows with an NA-ratio above the specified threshold.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    Notes
    -----
    Columns are dropped first. Rows are dropped based on the remaining data.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)

    data = pd.DataFrame(data).copy()
    data = data.dropna(axis=0, how='all').dropna(axis=1, how='all')
    data = data.drop(columns=data.loc[:, _missing_vals(data)['mv_cols_ratio'] > drop_threshold_cols].columns)
    data_cleaned = data.drop(index=data.loc[_missing_vals(data)['mv_rows_ratio'] > drop_threshold_rows, :].index)

    return data_cleaned
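
# Usage sketch (illustrative; values chosen to show the thresholds):
# df = pd.DataFrame({'a': [1, 2, np.nan, 4], 'b': [1, np.nan, np.nan, np.nan]})
# drop_missing(df)                           # drops only the all-NaN row (index 2)
# drop_missing(df, drop_threshold_cols=0.5)  # additionally drops 'b' (2/3 NaN > 0.5)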


def data_cleaning(data, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True,
                  convert_dtypes=True, category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
    '''
    Performs initial data cleaning tasks on a dataset, such as dropping empty rows and columns as well as \
        single-valued columns, dropping duplicate rows and optimizing the datatypes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    drop_threshold_cols: float, default 0.9
        Drop columns with an NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with an NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.convert_dtypes().

    category: bool, default True
        Change dtypes of columns to "category". Set the threshold using cat_threshold. Requires convert_dtypes=True.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and the column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None}, default 'changes'
        Specify the verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about the changes.
        * 'changes': Print out the differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    See Also
    --------
    convert_datatypes: Convert columns to the best possible dtypes.
    drop_missing: Flexibly drop columns and rows.
    _memory_usage: Gives the total memory usage in kilobytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)
    _validate_input_bool(drop_duplicates, 'drop_duplicates')
    _validate_input_bool(convert_dtypes, 'convert_dtypes')
    _validate_input_bool(category, 'category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)

    # Drop columns with only a single unique value (they carry no information)
    single_val_cols = data_cleaned.columns[data_cleaned.nunique(dropna=False) == 1].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    dupl_rows = None

    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)
    if convert_dtypes:
        data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,
                                         cat_exclude=cat_exclude)

    _diff_report(data, data_cleaned, dupl_rows=dupl_rows, single_val_cols=single_val_cols, show=show)

    return data_cleaned
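
# Usage sketch (illustrative; `df` is any messy, hypothetical DataFrame):
# df_cleaned = data_cleaning(df)             # drop >90% NA columns/rows, dedupe, convert dtypes
# df_cleaned = data_cleaning(df, show=None)  # same, without the printed diff report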


class DataCleaner(BaseEstimator, TransformerMixin):
    '''
    Wrapper for data_cleaning(). Allows data_cleaning() to be put into a pipeline with similar \
    functions (e.g. using MVColHandler() or SubsetPooler()).

    Parameters
    ----------
    drop_threshold_cols: float, default 0.9
        Drop columns with an NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with an NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.convert_dtypes().

    category: bool, default True
        Change dtypes of columns to "category". Set the threshold using cat_threshold. Requires convert_dtypes=True.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and the column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None}, default 'changes'
        Specify the verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about the changes.
        * 'changes': Print out the differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame
    '''

    def __init__(self, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True, convert_dtypes=True,
                 category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
        self.drop_threshold_cols = drop_threshold_cols
        self.drop_threshold_rows = drop_threshold_rows
        self.drop_duplicates = drop_duplicates
        self.convert_dtypes = convert_dtypes
        self.category = category
        self.cat_threshold = cat_threshold
        self.cat_exclude = cat_exclude
        self.show = show

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data_cleaned = data_cleaning(data, drop_threshold_cols=self.drop_threshold_cols,
                                     drop_threshold_rows=self.drop_threshold_rows,
                                     drop_duplicates=self.drop_duplicates, convert_dtypes=self.convert_dtypes,
                                     category=self.category, cat_threshold=self.cat_threshold,
                                     cat_exclude=self.cat_exclude, show=self.show)
        return data_cleaned
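
# Pipeline sketch (assumes scikit-learn; `df` is a hypothetical DataFrame):
# from sklearn.pipeline import Pipeline
# pipe = Pipeline([('cleaning', DataCleaner(drop_threshold_cols=0.85))])
# df_cleaned = pipe.fit_transform(df)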


def mv_col_handling(data, target=None, mv_threshold=0.1, corr_thresh_features=0.5, corr_thresh_target=0.3,
                    return_details=False):
    '''
    Converts columns with a high ratio of missing values into binary features and potentially drops them, based on \
    their correlation with other features and the target variable. This function follows a three-step process:
    - 1) Identify features with a high ratio of missing values.
    - 2) Identify high correlations of these features among themselves and with other features in the dataset.
    - 3) Features with a high ratio of missing values and high correlation among each other are dropped unless \
         they correlate reasonably well with the target variable.

    Note: If no target is provided, the process exits after step two and drops columns identified up to this point.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Specify a target for correlation, i.e. a label column to generate only the correlations between each feature \
        and the label.

    mv_threshold: float, default 0.1
        Value in the range 0 <= threshold <= 1. Features with a missing-value ratio larger than mv_threshold are \
        candidates for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.5
        Value in the range 0 <= threshold <= 1. Maximum correlation a previously identified feature (with a high \
        mv-ratio) is allowed to have with another feature. If this threshold is exceeded, the feature undergoes \
        further analysis.

    corr_thresh_target: float, default 0.3
        Value in the range 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. a feature \
        with a high mv-ratio and high correlation with another existing feature) with the target. If this threshold \
        is not met, the feature is ultimately dropped.

    return_details: bool, default False
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: Updated Pandas DataFrame

    optional:
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    '''

    # Validate Inputs
    _validate_input_range(mv_threshold, 'mv_threshold', 0, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()

    # Step 1: binarize columns with a high missing-value ratio (1 = value present, 0 = missing)
    mv_ratios = _missing_vals(data_local)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    data_local[cols_mv] = data_local[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0)

    # Step 2: flag binarized columns that correlate strongly with another feature
    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    # Step 3: keep flagged columns only if they correlate reasonably well with the target
    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        corrs = corr_mat(data_local, target=target, colored=False).loc[high_corr_features]
        drop_cols = corrs.loc[abs(corrs.iloc[:, 0]) < corr_thresh_target].index.tolist()
        data = data.drop(columns=drop_cols)

    if return_details:
        return data, cols_mv, drop_cols

    return data
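
# Usage sketch (illustrative; `df` and the 'label' column are hypothetical):
# df_reduced, cols_mv, dropped = mv_col_handling(df, target='label', return_details=True)
# cols_mv  # columns with an MV-ratio above mv_threshold that entered the analysis
# dropped  # the subset of those columns that was ultimately dropped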


class MVColHandler(BaseEstimator, TransformerMixin):
    '''
    Wrapper for mv_col_handling(). Allows mv_col_handling() to be put into a pipeline with similar \
    functions (e.g. using DataCleaner() or SubsetPooler()).

    Parameters
    ----------
    target: string, list, np.array or pd.Series, default None
        Specify a target for correlation, e.g. a label column to generate only the correlations between each feature \
        and the label.

    mv_threshold: float, default 0.1
        Value in the range 0 <= threshold <= 1. Features with a missing-value ratio larger than mv_threshold are \
        candidates for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value in the range 0 <= threshold <= 1. Maximum correlation a previously identified feature with a high \
        mv-ratio is allowed to have with another feature. If this threshold is exceeded, the feature undergoes \
        further analysis.

    corr_thresh_target: float, default 0.3
        Value in the range 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. a feature \
        with a high mv-ratio and high correlation with another existing feature) with the target. If this threshold \
        is not met, the feature is ultimately dropped.

    return_details: bool, default True
        Provides flexibility to return intermediary results. Must remain True for transform() to unpack the \
        detailed results.

    Returns
    -------
    data: Updated Pandas DataFrame
    '''

    def __init__(self, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3,
                 return_details=True):
        self.target = target
        self.mv_threshold = mv_threshold
        self.corr_thresh_features = corr_thresh_features
        self.corr_thresh_target = corr_thresh_target
        self.return_details = return_details

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data, cols_mv, dropped_cols = mv_col_handling(data, target=self.target, mv_threshold=self.mv_threshold,
                                                      corr_thresh_features=self.corr_thresh_features,
                                                      corr_thresh_target=self.corr_thresh_target,
                                                      return_details=self.return_details)

        print(f'\nFeatures with MV-ratio > {self.mv_threshold}: {len(cols_mv)}')
        print('Features dropped:', len(dropped_cols), dropped_cols)

        return data
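
# Pipeline sketch (assumes scikit-learn; the 'label' target column is hypothetical):
# from sklearn.pipeline import Pipeline
# pipe = Pipeline([('mv_handling', MVColHandler(target='label'))])
# df_reduced = pipe.fit_transform(df)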


def pool_duplicate_subsets(data, col_dupl_thresh=0.2, subset_thresh=0.2, min_col_pool=3, exclude=None,
                           return_details=False):
    '''
    Checks for duplicates in subsets of columns and pools them. This reduces the number of columns in the data \
    without losing any information. Suitable columns are combined into subsets and tested for duplicates. If \
    sufficient duplicates are found, the respective columns are aggregated into a 'pooled_vars' column. Identical \
    numbers in the 'pooled_vars' column indicate identical information in the respective rows.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    col_dupl_thresh: float, default 0.2
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
        Columns with a lower ratio are not considered for pooling.

    subset_thresh: float, default 0.2
        The first subset with a duplicate ratio higher than 'subset_thresh' is chosen and aggregated. If no subset \
        reaches the threshold, the algorithm continues with continuously smaller subsets until 'min_col_pool' is \
        reached.

    min_col_pool: integer, default 3
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible into \
        suitable subsets and stops when 'min_col_pool' is reached.

    exclude: list, default None
        List of column names to be excluded from the analysis. These columns are passed through without modification.

    return_details: bool, default False
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: pd.DataFrame

    optional:
    subset_cols: List of columns used as subset.
    '''

    # Input validation
    _validate_input_range(col_dupl_thresh, 'col_dupl_thresh', 0, 1)
    _validate_input_range(subset_thresh, 'subset_thresh', 0, 1)
    _validate_input_range(min_col_pool, 'min_col_pool', 0, data.shape[1])

    excluded_cols = []
    if exclude is not None:
        excluded_cols = data[exclude]
        data = data.drop(columns=exclude)

    subset_cols = []
    # Try the largest possible subsets first, then shrink them until 'min_col_pool' is reached
    for i in range(data.shape[1] + 1 - min_col_pool):
        # Candidate columns: those with a sufficiently high ratio of duplicated values
        check_list = []
        for col in data.columns:
            cdr = data.duplicated(subset=col).mean()
            if cdr > col_dupl_thresh:
                check_list.append(col)

        if len(check_list) > 0:
            combinations = itertools.combinations(check_list, len(check_list) - i)
        else:
            continue

        # Duplicate ratio for every candidate subset of the current size
        ratios = []
        for comb in combinations:
            ratios.append(data[list(comb)].duplicated().mean())

        max_ratio = max(ratios)
        max_idx = np.argmax(ratios)

        if max_ratio > subset_thresh:
            # Pool the best subset into a single 'pooled_vars' column
            best_subset = itertools.islice(itertools.combinations(
                check_list, len(check_list) - i), max_idx, max_idx + 1)
            best_subset = data[list(list(best_subset)[0])]
            subset_cols = best_subset.columns.tolist()

            unique_subset = best_subset.drop_duplicates().reset_index().rename(columns={'index': 'pooled_vars'})
            data = data.merge(unique_subset, how='left', on=best_subset.columns.tolist()
                              ).drop(columns=best_subset.columns.tolist())
            data.index = pd.RangeIndex(len(data))
            break

    data = pd.concat([data, pd.DataFrame(excluded_cols)], axis=1)

    if return_details:
        return data, subset_cols

    return data
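
# Usage sketch (illustrative; a tiny frame where 'a', 'b' and 'c' carry redundant information):
# df = pd.DataFrame({'a': [1, 1, 2, 2], 'b': [3, 3, 4, 4], 'c': [5, 5, 6, 6], 'd': range(4)})
# pooled, subset_cols = pool_duplicate_subsets(df, return_details=True)
# subset_cols  # ['a', 'b', 'c'], replaced by a single 'pooled_vars' column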


class SubsetPooler(BaseEstimator, TransformerMixin):
    '''
    Wrapper for pool_duplicate_subsets(). Allows pool_duplicate_subsets() to be put into a pipeline with similar \
    functions (e.g. using DataCleaner() or MVColHandler()).

    Parameters
    ----------
    col_dupl_thresh: float, default 0.2
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
        Columns with a lower ratio are not considered for pooling.

    subset_thresh: float, default 0.2
        The first subset with a duplicate ratio higher than 'subset_thresh' is chosen and aggregated. If no subset \
        reaches the threshold, the algorithm continues with continuously smaller subsets until 'min_col_pool' is \
        reached.

    min_col_pool: integer, default 3
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible into \
        suitable subsets and stops when 'min_col_pool' is reached.

    return_details: bool, default True
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: pd.DataFrame
    '''

    def __init__(self, col_dupl_thresh=0.2, subset_thresh=0.2, min_col_pool=3, return_details=True):
        self.col_dupl_thresh = col_dupl_thresh
        self.subset_thresh = subset_thresh
        self.min_col_pool = min_col_pool
        self.return_details = return_details

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        # Pass the stored thresholds through instead of hard-coded values; return_details
        # must be True here so subset_cols can be unpacked below.
        data, subset_cols = pool_duplicate_subsets(data, col_dupl_thresh=self.col_dupl_thresh,
                                                   subset_thresh=self.subset_thresh,
                                                   min_col_pool=self.min_col_pool, return_details=True)

        print('Combined columns:', len(subset_cols), subset_cols)

        return data
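
# Pipeline sketch chaining all three transformers (assumes scikit-learn; `df` and 'label' are hypothetical):
# from sklearn.pipeline import Pipeline
# pipe = Pipeline([('cleaning', DataCleaner()),
#                  ('mv_handling', MVColHandler(target='label')),
#                  ('pooling', SubsetPooler())])
# df_transformed = pipe.fit_transform(df)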