klib.clean.DataCleaner.transform() - Code Metrics - Inspection of "reset_" - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( b2514e...e55ee5 )

by Andreas

created 2020-04-28 10:29 UTC

klib.clean.DataCleaner.transform() A

↳ Parent: klib.clean

Complexity

Conditions

Size

Total Lines	6
Code Lines	6

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	6
nop	3
dl	0
loc	6
rs	10
c	0
b	0
f	0

'''
Functions for data cleaning.

:author: Andreas Kanz

'''

# Imports
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

# from .preprocess import mv_col_handler
from .utils import _diff_report
from .utils import _drop_duplicates
from .utils import _missing_vals
from .utils import _validate_input_range
from .utils import _validate_input_bool


def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=None):
    '''
    Converts columns to best possible dtypes using dtypes supporting pd.NA. Temporarily not converting integers.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    category: bool, default True
        Change dtypes of columns with dtype "object" to "category". Set threshold using cat_threshold or exclude \
        columns using cat_exclude.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    Returns
    -------
    data: Pandas DataFrame
        A converted copy; the input object is not modified.

    Notes
    -----
    For a dataset without rows no unique-value ratio can be computed, so no column is converted to "category" in \
    that case. The remaining dtype conversion is still applied.
    '''

    # Validate Inputs
    _validate_input_bool(category, 'Category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    cat_exclude = [] if cat_exclude is None else cat_exclude.copy()

    data = pd.DataFrame(data).copy()
    n_rows = data.shape[0]
    for col in data.columns:
        # Cheap checks come first so the unique count is only computed for actual candidates.
        # The n_rows guard avoids a ZeroDivisionError on a dataset that has columns but no rows.
        if (category and
                n_rows > 0 and
                col not in cat_exclude and
                data[col].dtype == 'object' and
                data[col].nunique(dropna=False) / n_rows < cat_threshold):
            data[col] = data[col].astype('category')
        # convert_integer=False: integer conversion is deliberately skipped (see summary line above).
        data[col] = data[col].convert_dtypes(infer_objects=True, convert_string=True,
                                             convert_integer=False, convert_boolean=True)

    return data


def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
    '''
    Removes rows and columns that consist only of NA-values and, if the thresholds are lowered, additionally \
    removes columns and rows whose NA-ratio exceeds the respective threshold.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 1
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 1
        Drop rows with NA-ratio above the specified threshold.

    Returns
    -------
    data_cleaned: Pandas DataFrame
        A cleaned copy; the input object is not modified.

    Notes
    -----
    Completely empty rows and columns are removed first. The column threshold is applied next and the row \
    threshold is applied last, based on the remaining data.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)

    frame = pd.DataFrame(data).copy()

    # Entirely empty rows go first, then entirely empty columns.
    frame = frame.dropna(axis=0, how='all')
    frame = frame.dropna(axis=1, how='all')

    # Columns whose NA-ratio exceeds the column threshold.
    sparse_col_mask = _missing_vals(frame)['mv_cols_ratio'] > drop_threshold_cols
    sparse_cols = frame.loc[:, sparse_col_mask].columns
    frame = frame.drop(columns=sparse_cols)

    # Row ratios are recomputed on the columns that are left.
    sparse_row_mask = _missing_vals(frame)['mv_rows_ratio'] > drop_threshold_rows
    sparse_rows = frame.loc[sparse_row_mask, :].index
    data_cleaned = frame.drop(index=sparse_rows)

    return data_cleaned


def data_cleaning(data, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True,
                  convert_dtypes=True, category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
    '''
    Perform initial data cleaning tasks on a dataset, such as dropping single valued and empty rows, empty \
        columns as well as optimizing the datatypes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 0.9
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using convert_datatypes().

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold. Requires convert_dtypes=True

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None} default 'changes'
        Specify verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    See Also
    --------
    convert_datatypes: Convert columns to best possible dtypes.
    drop_missing : Flexibly drop columns and rows.
    _memory_usage: Gives the total memory usage in kilobytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)
    _validate_input_bool(drop_duplicates, 'drop_duplicates')
    _validate_input_bool(convert_dtypes, 'convert_datatypes')
    _validate_input_bool(category, 'category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    # Keep an untouched copy of the input so the report can compare before and after.
    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)

    # Columns holding a single value (NA counts as a value) are dropped.
    single_val_cols = data_cleaned.columns[data_cleaned.nunique(dropna=False) == 1].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    # Must be bound even when duplicate dropping is disabled; it is always passed to _diff_report below.
    # NOTE(review): an empty list is assumed to mean "no duplicate rows removed" for _diff_report - confirm.
    dupl_rows = []
    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)
    if convert_dtypes:
        data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,
                                         cat_exclude=cat_exclude)

    _diff_report(data, data_cleaned, dupl_rows=dupl_rows, single_val_cols=single_val_cols, show=show)

    return data_cleaned


class DataCleaner(BaseEstimator, TransformerMixin):
    '''
    Transformer wrapper around data_cleaning(), built on the scikit-learn BaseEstimator and TransformerMixin \
    base classes so that it can be used as a possible component of a cleaning pipeline.

    The constructor parameters are stored unchanged on the instance and forwarded to data_cleaning() under the \
    same names when transform() is called.
    '''

    def __init__(self, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True, convert_dtypes=True,
                 category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
        # Parameters are only stored here; they are first used when transform() is called.
        self.drop_threshold_cols = drop_threshold_cols
        self.drop_threshold_rows = drop_threshold_rows
        self.drop_duplicates = drop_duplicates
        self.convert_dtypes = convert_dtypes
        self.category = category
        self.cat_threshold = cat_threshold
        self.cat_exclude = cat_exclude
        self.show = show

    def fit(self, data, target=None):
        '''Nothing is learned from the data; returns the instance itself.'''
        return self

    def transform(self, data, target=None):
        '''Runs data_cleaning() on data with the stored settings and returns its result. target is unused.'''
        cleaning_options = {
            'drop_threshold_cols': self.drop_threshold_cols,
            'drop_threshold_rows': self.drop_threshold_rows,
            'drop_duplicates': self.drop_duplicates,
            'convert_dtypes': self.convert_dtypes,
            'category': self.category,
            'cat_threshold': self.cat_threshold,
            'cat_exclude': self.cat_exclude,
            'show': self.show,
        }
        return data_cleaning(data, **cleaning_options)


1			'''
2			Functions for data cleaning.
3
4			:author: Andreas Kanz
5
6			'''
7
8			# Imports
9			import pandas as pd
10			from sklearn.base import BaseEstimator, TransformerMixin
11
12			# from .preprocess import mv_col_handler
13			from .utils import _diff_report
14			from .utils import _drop_duplicates
15			from .utils import _missing_vals
16			from .utils import _validate_input_range
17			from .utils import _validate_input_bool
18
19
20			def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=None):
21			'''
22			Converts columns to best possible dtypes using dtypes supporting pd.NA. Temporarily not converting integers.
23
24			Parameters
25			----------
26			data: 2D dataset that can be coerced into Pandas DataFrame.
27
28			category: bool, default True
29			Change dtypes of columns with dtype "object" to "category". Set threshold using cat_threshold or exclude \
30			columns using cat_exclude.
31
32			cat_threshold: float, default 0.05
33			Ratio of unique values below which categories are inferred and column dtype is changed to categorical.
34
35			cat_exclude: list, default None
36			List of columns to exclude from categorical conversion.
37
38			Returns
39			-------
40			data: Pandas DataFrame
41			'''
42
43			# Validate Inputs
44			_validate_input_bool(category, 'Category')
45			_validate_input_range(cat_threshold, 'cat_threshold', 0, 1)
46
47			cat_exclude = [] if cat_exclude is None else cat_exclude.copy()
48
49			data = pd.DataFrame(data).copy()
50			for col in data.columns:
51			unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
52			if (category and
53			unique_vals_ratio < cat_threshold and
54			col not in cat_exclude and
55			data[col].dtype == 'object'):
56			data[col] = data[col].astype('category')
57			data[col] = data[col].convert_dtypes(infer_objects=True, convert_string=True,
58			convert_integer=False, convert_boolean=True)
59
60			return data
61
62
63			def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
64			'''
65			Drops completely empty columns and rows by default and optionally provides flexibility to loosen restrictions to \
66			drop additional columns and rows based on the fraction of remaining NA-values.
67
68			Parameters
69			----------
70			data: 2D dataset that can be coerced into Pandas DataFrame.
71
72			drop_threshold_cols: float, default 1
73			Drop columns with NA-ratio above the specified threshold.
74
75			drop_threshold_rows: float, default 1
76			Drop rows with NA-ratio above the specified threshold.
77
78			Returns
79			-------
80			data_cleaned: Pandas DataFrame
81
82			Notes
83			-----
84			Columns are dropped first. Rows are dropped based on the remaining data.
85			'''
86
87			# Validate Inputs
88			_validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
89			_validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)
90
91			data = pd.DataFrame(data).copy()
92			data = data.dropna(axis=0, how='all').dropna(axis=1, how='all')
93			data = data.drop(columns=data.loc[:, _missing_vals(data)['mv_cols_ratio'] > drop_threshold_cols].columns)
94			data_cleaned = data.drop(index=data.loc[_missing_vals(data)['mv_rows_ratio'] > drop_threshold_rows, :].index)
95
96			return data_cleaned
97
98
99			def data_cleaning(data, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True,
100			convert_dtypes=True, category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
101			'''
102			Perform initial data cleaning tasks on a dataset, such as dropping single valued and empty rows, empty \
103			columns as well as optimizing the datatypes.
104
105			Parameters
106			----------
107			data: 2D dataset that can be coerced into Pandas DataFrame.
108
109			drop_threshold_cols: float, default 0.9
110			Drop columns with NA-ratio above the specified threshold.
111
112			drop_threshold_rows: float, default 0.9
113			Drop rows with NA-ratio above the specified threshold.
114
115			drop_duplicates: bool, default True
116			Drop duplicate rows, keeping the first occurence. This step comes after the dropping of missing values.
117
118			convert_dtypes: bool, default True
119			Convert dtypes using pd.convert_dtypes().
120
121			category: bool, default True
122			Change dtypes of columns to "category". Set threshold using cat_threshold. Requires convert_dtypes=True
123
124			cat_threshold: float, default 0.03
125			Ratio of unique values below which categories are inferred and column dtype is changed to categorical.
126
127			cat_exclude: list, default None
128			List of columns to exclude from categorical conversion.
129
130			show: {'all', 'changes', None} default 'all'
131			Specify verbosity of the output.
132			* 'all': Print information about the data before and after cleaning as well as information about changes.
133			* 'changes': Print out differences in the data before and after cleaning.
134			* None: No information about the data and the data cleaning is printed.
135
136			Returns
137			-------
138			data_cleaned: Pandas DataFrame
139
140			See Also
141			--------
142			convert_datatypes: Convert columns to best possible dtypes.
143			drop_missing : Flexibly drop columns and rows.
144			_memory_usage: Gives the total memory usage in kilobytes.
145			_missing_vals: Metrics about missing values in the dataset.
146
147			Notes
148			-----
149			The category dtype is not grouped in the summary, unless it contains exactly the same categories.
150			'''
151
152			# Validate Inputs
153			_validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
154			_validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)
155			_validate_input_bool(drop_duplicates, 'drop_duplicates')
156			_validate_input_bool(convert_dtypes, 'convert_datatypes')
157			_validate_input_bool(category, 'category')
158			_validate_input_range(cat_threshold, 'cat_threshold', 0, 1)
159
160			data = pd.DataFrame(data).copy()
161			data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)
162
163			single_val_cols = data_cleaned.columns[data_cleaned.nunique(dropna=False) == 1].tolist()
164			data_cleaned = data_cleaned.drop(columns=single_val_cols)
165
166			if drop_duplicates:
167			data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)
168			if convert_dtypes:
169			data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,
170			cat_exclude=cat_exclude)
171
172			_diff_report(data, data_cleaned, dupl_rows=dupl_rows, single_val_cols=single_val_cols, show=show)
173
174			return data_cleaned
175
176
177			class DataCleaner(BaseEstimator, TransformerMixin):
178			'''Docstring of a class? methods also have docstrings or commments?'''
179			'''possible component of a cleaning pipeline --> e.g. followed by MCH'''
180
181			def __init__(self, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True, convert_dtypes=True,
182			category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
183			self.drop_threshold_cols = drop_threshold_cols
184			self.drop_threshold_rows = drop_threshold_rows
185			self.drop_duplicates = drop_duplicates
186			self.convert_dtypes = convert_dtypes
187			self.category = category
188			self.cat_threshold = cat_threshold
189			self.cat_exclude = cat_exclude
190			self.show = show
191
192			def fit(self, data, target=None):
193			return self
194
195			def transform(self, data, target=None):
196			data_cleaned = data_cleaning(data, drop_threshold_cols=self.drop_threshold_cols,
197			drop_threshold_rows=self.drop_threshold_rows, drop_duplicates=self.drop_duplicates,
198			convert_dtypes=self.convert_dtypes, category=self.category, cat_threshold=self.
199			cat_threshold, cat_exclude=self.cat_exclude, show=self.show)
200			return data_cleaned
201

akanz1 / klib

GitHub Access Token became invalid

Push — master ( b2514e...e55ee5 )

klib.clean.DataCleaner.transform() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like