GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( bfbbb5...96b70c )
by Andreas
01:50
created

klib.clean.DataCleaner.fit()   A

Complexity

Conditions 1

Size

Total Lines 2
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 3
dl 0
loc 2
rs 10
c 0
b 0
f 0
1
'''
Functions for data cleaning.

:author: Andreas Kanz

'''

# Imports
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

from .describe import corr_mat
from .utils import (_diff_report,
                    _drop_duplicates,
                    _missing_vals,
                    _validate_input_bool,
                    _validate_input_range)

__all__ = ['convert_datatypes',
           'data_cleaning',
           'drop_missing',
           'mv_col_handling']

def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=None):
    '''
    Converts columns to best possible dtypes using dtypes supporting pd.NA. Temporarily not converting integers.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    category: bool, default True
        Change dtypes of columns with dtype "object" to "category". Set threshold using cat_threshold or exclude \
        columns using cat_exclude.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    Returns
    -------
    data: Pandas DataFrame
    '''

    # Validate Inputs
    _validate_input_bool(category, 'Category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    # Work on a copy of the exclusion list so the caller's list is never mutated.
    excluded = [] if cat_exclude is None else cat_exclude.copy()

    data = pd.DataFrame(data).copy()
    n_rows = data.shape[0]
    for col in data.columns:
        # Cast low-cardinality object columns to "category" before the generic conversion.
        if category and col not in excluded and data[col].dtype == 'object':
            unique_ratio = data[col].nunique(dropna=False) / n_rows
            if unique_ratio < cat_threshold:
                data[col] = data[col].astype('category')
        # Integer conversion is intentionally disabled (see docstring).
        data[col] = data[col].convert_dtypes(infer_objects=True,
                                             convert_string=True,
                                             convert_integer=False,
                                             convert_boolean=True)

    return data

def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
    '''
    Drops completely empty columns and rows by default and optionally provides flexibility to loosen restrictions to \
    drop additional columns and rows based on the fraction of remaining NA-values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 1
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 1
        Drop rows with NA-ratio above the specified threshold.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    Notes
    -----
    Columns are dropped first. Rows are dropped based on the remaining data.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)

    frame = pd.DataFrame(data).copy()

    # Remove rows/columns that contain no data at all.
    frame = frame.dropna(axis=0, how='all')
    frame = frame.dropna(axis=1, how='all')

    # Threshold-based drops: columns first, then rows on the remaining data.
    cols_above_thresh = frame.loc[:, _missing_vals(frame)['mv_cols_ratio'] > drop_threshold_cols].columns
    frame = frame.drop(columns=cols_above_thresh)

    rows_above_thresh = frame.loc[_missing_vals(frame)['mv_rows_ratio'] > drop_threshold_rows, :].index
    data_cleaned = frame.drop(index=rows_above_thresh)

    return data_cleaned

def data_cleaning(data, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True,
                  convert_dtypes=True, category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
    '''
    Perform initial data cleaning tasks on a dataset, such as dropping single valued and empty rows, empty \
        columns as well as optimizing the datatypes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 0.9
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.convert_dtypes().

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold. Requires convert_dtypes=True

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None} default 'changes'
        Specify verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    See Also
    --------
    convert_datatypes: Convert columns to best possible dtypes.
    drop_missing : Flexibly drop columns and rows.
    _memory_usage: Gives the total memory usage in kilobytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)
    _validate_input_bool(drop_duplicates, 'drop_duplicates')
    _validate_input_bool(convert_dtypes, 'convert_datatypes')
    _validate_input_bool(category, 'category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)

    # Drop columns that hold a single value only (incl. all-NA treated as one value).
    single_val_cols = data_cleaned.columns[data_cleaned.nunique(dropna=False) == 1].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    # BUGFIX: dupl_rows was previously unbound when drop_duplicates=False,
    # causing a NameError in the _diff_report call below.
    dupl_rows = None
    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)
    if convert_dtypes:
        data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,
                                         cat_exclude=cat_exclude)

    _diff_report(data, data_cleaned, dupl_rows=dupl_rows, single_val_cols=single_val_cols, show=show)

    return data_cleaned

class DataCleaner(BaseEstimator, TransformerMixin):
    '''
    Scikit-learn compatible transformer wrapping data_cleaning().

    Possible component of a cleaning pipeline, e.g. followed by MVColHandler.
    All constructor arguments are stored unchanged and forwarded to
    data_cleaning() in transform(); see that function for their meaning.
    '''
    # NOTE: the original class carried two placeholder string literals here;
    # the second was a dead no-op statement and has been removed.

    def __init__(self, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True, convert_dtypes=True,
                 category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
        self.drop_threshold_cols = drop_threshold_cols
        self.drop_threshold_rows = drop_threshold_rows
        self.drop_duplicates = drop_duplicates
        self.convert_dtypes = convert_dtypes
        self.category = category
        self.cat_threshold = cat_threshold
        self.cat_exclude = cat_exclude
        self.show = show

    def fit(self, data, target=None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, data, target=None):
        # Delegate the full cleaning pipeline to data_cleaning().
        data_cleaned = data_cleaning(data,
                                     drop_threshold_cols=self.drop_threshold_cols,
                                     drop_threshold_rows=self.drop_threshold_rows,
                                     drop_duplicates=self.drop_duplicates,
                                     convert_dtypes=self.convert_dtypes,
                                     category=self.category,
                                     cat_threshold=self.cat_threshold,
                                     cat_exclude=self.cat_exclude,
                                     show=self.show)
        return data_cleaned

def mv_col_handling(data, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3):
    '''
    Converts columns with a high ratio of missing values into binary features and eventually drops them based on \
    their correlation with other features and the target variable. This function follows a three step process:
    - 1) Identify features with a high ratio of missing values
    - 2) Identify high correlations of these features among themselves and with other features in the dataset.
    - 3) Features with high ratio of missing values and high correlation among each other are dropped unless \
         they correlate reasonably well with the target variable.

    Note: If no target is provided, the process exits after step two and drops columns identified up to this point.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. E.g. label column to generate only the correlations between each feature \
        and the label.

    mv_threshold: float, default 0.1
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are candidates \
        for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified features with a high mv-ratio is\
         allowed to have with another feature. If this threshold is overstepped, the feature undergoes further analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. feature with a \
        high mv-ratio and high correlation to another existing feature) with the target. If this threshold is not met \
        the feature is ultimately dropped.

    Returns
    -------
    data: Updated Pandas DataFrame
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    '''

    # Validate Inputs
    _validate_input_range(mv_threshold, 'mv_threshold', 0, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1)

    data = pd.DataFrame(data).copy()

    # Step 1: binarize candidate columns (non-missing -> 1, missing -> 0)
    # on a working copy so the original data stays intact.
    binary_view = data.copy()
    mv_ratios = _missing_vals(binary_view)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    binary_view[cols_mv] = binary_view[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0)

    # Step 2: flag candidates whose strongest correlation with any other
    # feature exceeds the threshold, removing each flagged column before
    # evaluating the next candidate.
    high_corr_features = []
    remaining = binary_view.copy()
    for col in cols_mv:
        corrmat = corr_mat(remaining, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            remaining = remaining.drop(columns=[col])

    # Step 3: without a target, drop all flagged columns; with a target,
    # keep those that correlate well enough with it.
    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        for col in high_corr_features:
            if pd.DataFrame(binary_view[col]).corrwith(target)[0] < corr_thresh_target:
                drop_cols.append(col)
                data = data.drop(columns=[col])

    return data, cols_mv, drop_cols

class MVColHandler(BaseEstimator, TransformerMixin):
    '''
    Scikit-learn compatible transformer wrapping mv_col_handling().

    Possible component of a cleaning pipeline --> follows DataCleaning.
    '''

    def __init__(self, target=None, mch_mv_thresh=0.1, mch_feature_thresh=0.6, mch_target_thresh=0.3):
        self.target = target
        self.mch_mv_thresh = mch_mv_thresh
        self.mch_feature_thresh = mch_feature_thresh
        self.mch_target_thresh = mch_target_thresh

    def fit(self, data, target=None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, data, target=None):
        # Delegate to mv_col_handling() and report what was dropped.
        cleaned, cols_mv, dropped_cols = mv_col_handling(
            data,
            target=self.target,
            mv_threshold=self.mch_mv_thresh,
            corr_thresh_features=self.mch_feature_thresh,
            corr_thresh_target=self.mch_target_thresh)

        print(f'\nFeatures with MV-ratio > {self.mch_mv_thresh}: {len(cols_mv)}')
        print('Features dropped:', len(dropped_cols), dropped_cols)

        return cleaned
299