klib.clean.data_cleaning() - Code Metrics - Inspection of "Move auxiliary functions to utils.py" - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 0be18a...fe1083 )

by Andreas

created 2020-04-11 09:56 UTC

klib.clean.data_cleaning() A

↳ Parent: klib.clean

Complexity

Conditions

Size

Total Lines	82
Code Lines	31

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	31
dl	0
loc	82
rs	9.1359
c	0
b	0
f	0
cc	3
nop	7

How to fix Long Method

'''
Functions for data cleaning.

:author: Andreas Kanz

'''

# Imports
import pandas as pd

from .utils import _memory_usage
from .utils import _missing_vals
from .utils import _validate_input_0_1


def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=None):
    '''
    Converts columns to best possible dtypes using dtypes supporting pd.NA.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. The conversion is applied to a copy, the input \
    itself is not modified.

    category: bool, default True
        Change dtypes of columns with dtype "object" to "category". Set threshold using cat_threshold or exclude \
        columns using cat_exclude.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion. None (the default) excludes no columns.

    Returns
    -------
    Pandas DataFrame.

    Notes
    -----
    For a dataset without rows the ratio of unique values is undefined. In that case no column is changed to \
    "category" and only the regular dtype conversion is applied.

    '''

    _validate_input_0_1(cat_threshold, 'cat_threshold')

    # A None default avoids sharing one mutable list object between calls
    excluded = [] if cat_exclude is None else cat_exclude

    data = pd.DataFrame(data).copy()
    n_rows = data.shape[0]
    for col in data.columns:
        # The cheap checks come first, so the unique-value count is only computed for real candidates.
        # "n_rows > 0" prevents a ZeroDivisionError for datasets that have columns but no rows.
        if (category and
            n_rows > 0 and
            col not in excluded and
            data[col].dtype == 'object' and
                data[col].nunique(dropna=False) / n_rows < cat_threshold):
            data[col] = data[col].astype('category')
        data[col] = data[col].convert_dtypes()

    return data


def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
    '''
    Removes columns and rows that contain too many missing values. With the default thresholds only completely \
    empty columns and rows are removed, lower thresholds additionally remove sparsely populated ones.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 1
        Columns with an NA-ratio above this threshold are dropped.

    drop_threshold_rows: float, default 1
        Rows with an NA-ratio above this threshold are dropped.

    Returns
    -------
    Pandas DataFrame.

    Notes
    -----
    Completely empty rows and completely empty columns are removed first. Afterwards the column threshold is \
    applied, and finally the row threshold is applied to the data that remains.

    '''

    for threshold, name in ((drop_threshold_cols, 'drop_threshold_cols'),
                            (drop_threshold_rows, 'drop_threshold_rows')):
        _validate_input_0_1(threshold, name)

    frame = pd.DataFrame(data).dropna(axis=0, how='all').dropna(axis=1, how='all')

    # Column pass: the ratios are measured on the frame without fully empty rows/columns
    sparse_cols = frame.loc[:, _missing_vals(frame)['mv_cols_ratio'] > drop_threshold_cols].columns
    frame = frame.drop(columns=sparse_cols)

    # Row pass: the ratios are measured again, on the columns that survived the column pass
    sparse_rows = frame.loc[_missing_vals(frame)['mv_rows_ratio'] > drop_threshold_rows, :].index

    return frame.drop(index=sparse_rows)


def _print_cleaning_summary(data, data_cleaned, detailed):
    '''
    Prints the report of data_cleaning(): optionally a before/after overview, followed by the list of changes.

    Parameters
    ----------
    data: Pandas DataFrame
        The dataset before cleaning.

    data_cleaned: Pandas DataFrame
        The dataset after cleaning.

    detailed: bool
        If True, the "Before data cleaning" / "After data cleaning" overview is printed ahead of the changes.

    Returns
    -------
    None.

    '''

    # Neither frame changes while the report is printed, so every metric is computed once and reused
    mv_before = _missing_vals(data)['mv_total']
    mv_after = _missing_vals(data_cleaned)['mv_total']
    mem_before = _memory_usage(data)
    mem_after = _memory_usage(data_cleaned)

    if detailed:
        print('Before data cleaning:\n')
        print(f'dtypes:\n{data.dtypes.value_counts()}')
        print(f'\nNumber of rows: {data.shape[0]}')
        print(f'Number of cols: {data.shape[1]}')
        print(f"Missing values: {mv_before}")
        print(f'Memory usage: {mem_before} KB')
        print('_______________________________________________________\n')
        print('After data cleaning:\n')
        print(f'dtypes:\n{data_cleaned.dtypes.value_counts()}')
        print(f'\nNumber of rows: {data_cleaned.shape[0]}')
        print(f'Number of cols: {data_cleaned.shape[1]}')
        print(f"Missing values: {mv_after}")
        print(f'Memory usage: {mem_after} KB')
        print('_______________________________________________________\n')

    print(f"Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {mv_after}")
    print('\nChanges:')
    print(f'Dropped rows: {data.shape[0]-data_cleaned.shape[0]}')
    print(f'Dropped columns: {data.shape[1]-data_cleaned.shape[1]}')
    print(f"Dropped missing values: {mv_before-mv_after}")
    mem_change = mem_before - mem_after
    # Guard the percentage against a zero baseline instead of dividing by zero
    mem_change_pct = round(100*mem_change/mem_before, 1) if mem_before else 0.0
    print(f'Reduced memory by: {round(mem_change,2)} KB (-{mem_change_pct}%)')


def data_cleaning(data, drop_threshold_cols=0.95, drop_threshold_rows=0.95, category=True,
                  cat_threshold=0.03, cat_exclude=None, show='changes'):
    '''
    Perform initial data cleaning tasks on a dataset, such as dropping empty rows and columns and optimizing the \
    datatypes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 0.95
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.95
        Drop rows with NA-ratio above the specified threshold.

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion. None (the default) excludes no columns.

    show: {'all', 'changes', None} default 'changes'
        Specify verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: no information about the data is printed.

    Returns
    -------
    Pandas DataFrame.

    See Also
    --------
    convert_datatypes: Converts columns to best possible dtypes.
    drop_missing : Flexibly drops columns and rows.
    _memory_usage: Gives the total memory usage in kilobytes.
    _missing_vals: Metrics about missing values in the dataset.


    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.

    '''

    # A None default avoids sharing one mutable list object between calls; convert_datatypes receives a list
    cat_exclude = [] if cat_exclude is None else cat_exclude

    data = pd.DataFrame(data)
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)
    data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,
                                     cat_exclude=cat_exclude)

    if show in ['changes', 'all']:
        _print_cleaning_summary(data, data_cleaned, detailed=(show == 'all'))

    return data_cleaned


1			'''
2			Functions for data cleaning.
3
4			:author: Andreas Kanz
5
6			'''
7
8			# Imports
9			import pandas as pd
10
11			from .utils import _memory_usage
12			from .utils import _missing_vals
13			from .utils import _validate_input_0_1
14
15
16			def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=[]):
17			'''
18			Converts columns to best possible dtypes using dtypes supporting pd.NA.
19
20			Parameters
21			----------
22			data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
23			information is used to label the plots.
24
25			category: bool, default True
26			Change dtypes of columns with dtype "object" to "category". Set threshold using cat_threshold or exclude \
27			columns using cat_exclude.
28
29			cat_threshold: float, default 0.05
30			Ratio of unique values below which categories are inferred and column dtype is changed to categorical.
31
32			cat_exclude: default [] (empty list)
33			List of columns to exclude from categorical conversion.
34
35			Returns
36			-------
37			Pandas DataFrame.
38
39			'''
40
41			_validate_input_0_1(cat_threshold, 'cat_threshold')
42
43			data = pd.DataFrame(data).copy()
44			for col in data.columns:
45			unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
46			if (category and
47			unique_vals_ratio < cat_threshold and
48			col not in cat_exclude and
49			data[col].dtype == 'object'):
50			data[col] = data[col].astype('category')
51			data[col] = data[col].convert_dtypes()
52
53			return data
54
55
56			def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
57			'''
58			Drops completely empty columns and rows by default and optionally provides flexibility to loosen restrictions to \
59			drop additional columns and rows based on the fraction of NA-values.
60
61			Parameters
62			----------
63			data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
64			information is used to label the plots.
65
66			drop_threshold_cols: float, default 1
67			Drop columns with NA-ratio above the specified threshold.
68
69			drop_threshold_rows: float, default 1
70			Drop rows with NA-ratio above the specified threshold.
71
72			Returns
73			-------
74			Pandas DataFrame.
75
76			Notes
77			-----
78			Columns are dropped first. Rows are dropped based on the remaining data.
79
80			'''
81
82			_validate_input_0_1(drop_threshold_cols, 'drop_threshold_cols')
83			_validate_input_0_1(drop_threshold_rows, 'drop_threshold_rows')
84
85			data = pd.DataFrame(data)
86			data = data.dropna(axis=0, how='all')
87			data = data.dropna(axis=1, how='all')
88			data = data.drop(columns=data.loc[:, _missing_vals(data)['mv_cols_ratio'] > drop_threshold_cols].columns)
89			data_cleaned = data.drop(index=data.loc[_missing_vals(data)['mv_rows_ratio'] > drop_threshold_rows, :].index)
90
91			return data_cleaned
92
93
94			def data_cleaning(data, drop_threshold_cols=0.95, drop_threshold_rows=0.95, category=True,
95			cat_threshold=0.03, cat_exclude=[], show='changes'):
96			'''
97			Perform initial data cleaning tasks on a dataset, such as dropping empty rows and columns and optimizing the \
98			datatypes.
99
100			Parameters
101			----------
102			data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
103			information is used to label the plots.
104
105			drop_threshold_cols: float, default 0.95
106			Drop columns with NA-ratio above the specified threshold.
107
108			drop_threshold_rows: float, default 0.95
109			Drop rows with NA-ratio above the specified threshold.
110
111			category: bool, default True
112			Change dtypes of columns to "category". Set threshold using cat_threshold.
113
114			cat_threshold: float, default 0.03
115			Ratio of unique values below which categories are inferred and column dtype is changed to categorical.
116
117			cat_exclude: default [] (empty list)
118			List of columns to exclude from categorical conversion.
119
120			show: {'all', 'changes', None} default 'all'
121			Specify verbosity of the output.
122			* 'all': Print information about the data before and after cleaning as well as information about changes.
123			* 'changes': Print out differences in the data before and after cleaning.
124			* None: no information about the data is printed.
125
126			Returns
127			-------
128			Pandas DataFrame.
129
130			See Also
131			--------
132			convert_datatypes: Converts columns to best possible dtypes.
133			drop_missing : Flexibly drops columns and rows.
134			_memory_usage: Gives the total memory usage in kilobytes.
135			_missing_vals: Metrics about missing values in the dataset.
136
137
138			Notes
139			-----
140			The category dtype is not grouped in the summary, unless it contains exactly the same categories.
141
142			'''
143
144			data = pd.DataFrame(data)
145			data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)
146			data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,
147			cat_exclude=cat_exclude)
148
149			if show in ['changes', 'all']:
150			if show == 'all':
151			print('Before data cleaning:\n')
152			print(f'dtypes:\n{data.dtypes.value_counts()}')
153			print(f'\nNumber of rows: {data.shape[0]}')
154			print(f'Number of cols: {data.shape[1]}')
155			print(f"Missing values: {_missing_vals(data)['mv_total']}")
156			print(f'Memory usage: {_memory_usage(data)} KB')
157			print('_______________________________________________________\n')
158			print('After data cleaning:\n')
159			print(f'dtypes:\n{data_cleaned.dtypes.value_counts()}')
160			print(f'\nNumber of rows: {data_cleaned.shape[0]}')
161			print(f'Number of cols: {data_cleaned.shape[1]}')
162			print(f"Missing values: {_missing_vals(data_cleaned)['mv_total']}")
163			print(f'Memory usage: {_memory_usage(data_cleaned)} KB')
164			print('_______________________________________________________\n')
165
166			print(
167			f"Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {_missing_vals(data_cleaned)['mv_total']}")
168			print(f'\nChanges:')
169			print(f'Dropped rows: {data.shape[0]-data_cleaned.shape[0]}')
170			print(f'Dropped columns: {data.shape[1]-data_cleaned.shape[1]}')
171			print(f"Dropped missing values: {_missing_vals(data)['mv_total']-_missing_vals(data_cleaned)['mv_total']}")
172			mem_change = _memory_usage(data)-_memory_usage(data_cleaned)
173			print(f'Reduced memory by: {round(mem_change,2)} KB (-{round(100*mem_change/_memory_usage(data),1)}%)')
174
175			return data_cleaned
176

akanz1 / klib

GitHub Access Token became invalid

Push — master ( 0be18a...fe1083 )

klib.clean.data_cleaning() A

Complexity

Size

Duplication

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like