klib.clean._validate_input_0_1() - Code Metrics - Inspection of "remove column names from test data" - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 487c57...0be18a )

by Andreas

created 2020-04-11 09:21 UTC

klib.clean._validate_input_0_1() A

↳ Parent: klib.clean

Complexity

Conditions

Size

Total Lines	3
Code Lines	3

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	3
eloc	3
nop	2
dl	0
loc	3
rs	10
c	0
b	0
f	0

'''
Utilities for data cleaning.

:author: Andreas Kanz

'''

# Imports
import pandas as pd

from .describe import _memory_usage
from .describe import _missing_vals


def _validate_input_0_1(value, desc):
    if value < 0 or value > 1:
        raise ValueError(f'Input value for {desc} is {value} but should be a float in the range 0 <= {desc} <=1.')


def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=None):
    '''
    Converts columns to best possible dtypes using dtypes supporting pd.NA.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.
        The input is copied, the object passed in is not modified.

    category: bool, default True
        Change dtypes of columns with dtype "object" to "category". Set threshold using cat_threshold or exclude
        columns using cat_exclude.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.
        NA counts as a value of its own when the ratio is computed.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion. None is treated like an empty list.

    Returns
    -------
    Pandas DataFrame.

    Raises
    ------
    ValueError
        If cat_threshold is rejected by _validate_input_0_1.

    '''

    _validate_input_0_1(cat_threshold, 'cat_threshold')

    # None replaces the former mutable default ([]) and means "exclude nothing".
    cat_exclude = [] if cat_exclude is None else cat_exclude

    data = pd.DataFrame(data).copy()
    n_rows = data.shape[0]
    for col in data.columns:
        # Cheap checks come first so that nunique() only runs for actual candidates. The n_rows guard prevents a
        # ZeroDivisionError for datasets that have columns but no rows.
        if (category and
            n_rows > 0 and
            col not in cat_exclude and
            data[col].dtype == 'object' and
                data[col].nunique(dropna=False) / n_rows < cat_threshold):
            data[col] = data[col].astype('category')
        # Applied to every column, including those that were just turned into categories.
        data[col] = data[col].convert_dtypes()

    return data


def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
    '''
    Removes rows and columns that consist of NA-values only and, if the thresholds are lowered, also those whose
    NA-ratio exceeds the respective threshold.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 1
        Columns with an NA-ratio above this value are dropped.

    drop_threshold_rows: float, default 1
        Rows with an NA-ratio above this value are dropped.

    Returns
    -------
    Pandas DataFrame.

    Raises
    ------
    ValueError
        If one of the thresholds is rejected by _validate_input_0_1.

    Notes
    -----
    Order of operations: completely empty rows, completely empty columns, columns above drop_threshold_cols and
    finally rows above drop_threshold_rows. The row ratios are taken from the data that is left after the columns
    have been dropped.

    '''

    _validate_input_0_1(drop_threshold_cols, 'drop_threshold_cols')
    _validate_input_0_1(drop_threshold_rows, 'drop_threshold_rows')

    # Get rid of everything that is entirely NA first.
    frame = pd.DataFrame(data).dropna(axis=0, how='all').dropna(axis=1, how='all')

    # Threshold-based column removal.
    cols_above_threshold = _missing_vals(frame)['mv_cols_ratio'] > drop_threshold_cols
    sparse_cols = frame.loc[:, cols_above_threshold].columns
    frame = frame.drop(columns=sparse_cols)

    # Threshold-based row removal, evaluated on the remaining columns only.
    rows_above_threshold = _missing_vals(frame)['mv_rows_ratio'] > drop_threshold_rows
    sparse_rows = frame.loc[rows_above_threshold, :].index

    return frame.drop(index=sparse_rows)


def data_cleaning(data, drop_threshold_cols=0.95, drop_threshold_rows=0.95, category=True,
                  cat_threshold=0.03, cat_exclude=None, show='changes'):
    '''
    Perform initial data cleaning tasks on a dataset, such as dropping empty rows and columns and optimizing the
    datatypes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 0.95
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.95
        Drop rows with NA-ratio above the specified threshold.

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion. None is treated like an empty list.

    show: {'all', 'changes', None} default 'changes'
        Specify verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: no information about the data is printed.

    Returns
    -------
    Pandas DataFrame.

    See Also
    --------
    convert_datatypes: Converts columns to best possible dtypes.
    drop_missing : Flexibly drops columns and rows.
    _memory_usage: Gives the total memory usage in kilobytes.
    _missing_vals: Metrics about missing values in the dataset.


    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.

    '''

    # None replaces the former mutable default ([]). It is normalised here so that convert_datatypes always
    # receives a container it can test membership against.
    cat_exclude = [] if cat_exclude is None else cat_exclude

    data = pd.DataFrame(data)
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)
    data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,
                                     cat_exclude=cat_exclude)

    if show in ['changes', 'all']:
        # Each metric is needed several times below, so compute it once per DataFrame. Nothing is computed when
        # no output is requested.
        mv_before = _missing_vals(data)['mv_total']
        mv_after = _missing_vals(data_cleaned)['mv_total']
        mem_before = _memory_usage(data)
        mem_after = _memory_usage(data_cleaned)

        if show == 'all':
            print('Before data cleaning:\n')
            print(f'dtypes:\n{data.dtypes.value_counts()}')
            print(f'\nNumber of rows: {data.shape[0]}')
            print(f'Number of cols: {data.shape[1]}')
            print(f"Missing values: {mv_before}")
            print(f'Memory usage: {mem_before} KB')
            print('_______________________________________________________\n')
            print('After data cleaning:\n')
            print(f'dtypes:\n{data_cleaned.dtypes.value_counts()}')
            print(f'\nNumber of rows: {data_cleaned.shape[0]}')
            print(f'Number of cols: {data_cleaned.shape[1]}')
            print(f"Missing values: {mv_after}")
            print(f'Memory usage: {mem_after} KB')
            print('_______________________________________________________\n')

        print(f"Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {mv_after}")
        print('\nChanges:')
        print(f'Dropped rows: {data.shape[0]-data_cleaned.shape[0]}')
        print(f'Dropped columns: {data.shape[1]-data_cleaned.shape[1]}')
        print(f"Dropped missing values: {mv_before-mv_after}")
        mem_change = mem_before-mem_after
        # Guard the percentage against a reported memory usage of 0 instead of failing after the cleaning is done.
        mem_change_pct = round(100*mem_change/mem_before, 1) if mem_before else 0.0
        print(f'Reduced memory by: {round(mem_change,2)} KB (-{mem_change_pct}%)')

    return data_cleaned


1			'''
2			Utilities for data cleaning.
3
4			:author: Andreas Kanz
5
6			'''
7
8			# Imports
9			import pandas as pd
10
11			from .describe import _memory_usage
12			from .describe import _missing_vals
13
14
15			def _validate_input_0_1(value, desc):
16			if value < 0 or value > 1:
17			raise ValueError(f'Input value for {desc} is {value} but should be a float in the range 0 <= {desc} <=1.')
18
19
20			def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=[]):
21			'''
22			Converts columns to best possible dtypes using dtypes supporting pd.NA.
23
24			Parameters
25			----------
26			data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
27			information is used to label the plots.
28
29			category: bool, default True
30			Change dtypes of columns with dtype "object" to "category". Set threshold using cat_threshold or exclude \
31			columns using cat_exclude.
32
33			cat_threshold: float, default 0.05
34			Ratio of unique values below which categories are inferred and column dtype is changed to categorical.
35
36			cat_exclude: default [] (empty list)
37			List of columns to exclude from categorical conversion.
38
39			Returns
40			-------
41			Pandas DataFrame.
42
43			'''
44
45			_validate_input_0_1(cat_threshold, 'cat_threshold')
46
47			data = pd.DataFrame(data).copy()
48			for col in data.columns:
49			unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
50			if (category and
51			unique_vals_ratio < cat_threshold and
52			col not in cat_exclude and
53			data[col].dtype == 'object'):
54			data[col] = data[col].astype('category')
55			data[col] = data[col].convert_dtypes()
56
57			return data
58
59
60			def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
61			'''
62			Drops completely empty columns and rows by default and optionally provides flexibility to loosen restrictions to \
63			drop additional columns and rows based on the fraction of NA-values.
64
65			Parameters
66			----------
67			data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
68			information is used to label the plots.
69
70			drop_threshold_cols: float, default 1
71			Drop columns with NA-ratio above the specified threshold.
72
73			drop_threshold_rows: float, default 1
74			Drop rows with NA-ratio above the specified threshold.
75
76			Returns
77			-------
78			Pandas DataFrame.
79
80			Notes
81			-----
82			Columns are dropped first. Rows are dropped based on the remaining data.
83
84			'''
85
86			_validate_input_0_1(drop_threshold_cols, 'drop_threshold_cols')
87			_validate_input_0_1(drop_threshold_rows, 'drop_threshold_rows')
88
89			data = pd.DataFrame(data)
90			data = data.dropna(axis=0, how='all')
91			data = data.dropna(axis=1, how='all')
92			data = data.drop(columns=data.loc[:, _missing_vals(data)['mv_cols_ratio'] > drop_threshold_cols].columns)
93			data_cleaned = data.drop(index=data.loc[_missing_vals(data)['mv_rows_ratio'] > drop_threshold_rows, :].index)
94
95			return data_cleaned
96
97
98			def data_cleaning(data, drop_threshold_cols=0.95, drop_threshold_rows=0.95, category=True,
99			cat_threshold=0.03, cat_exclude=[], show='changes'):
100			'''
101			Perform initial data cleaning tasks on a dataset, such as dropping empty rows and columns and optimizing the \
102			datatypes.
103
104			Parameters
105			----------
106			data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
107			information is used to label the plots.
108
109			drop_threshold_cols: float, default 0.95
110			Drop columns with NA-ratio above the specified threshold.
111
112			drop_threshold_rows: float, default 0.95
113			Drop rows with NA-ratio above the specified threshold.
114
115			category: bool, default True
116			Change dtypes of columns to "category". Set threshold using cat_threshold.
117
118			cat_threshold: float, default 0.03
119			Ratio of unique values below which categories are inferred and column dtype is changed to categorical.
120
121			cat_exclude: default [] (empty list)
122			List of columns to exclude from categorical conversion.
123
124			show: {'all', 'changes', None} default 'all'
125			Specify verbosity of the output.
126			* 'all': Print information about the data before and after cleaning as well as information about changes.
127			* 'changes': Print out differences in the data before and after cleaning.
128			* None: no information about the data is printed.
129
130			Returns
131			-------
132			Pandas DataFrame.
133
134			See Also
135			--------
136			convert_datatypes: Converts columns to best possible dtypes.
137			drop_missing : Flexibly drops columns and rows.
138			_memory_usage: Gives the total memory usage in kilobytes.
139			_missing_vals: Metrics about missing values in the dataset.
140
141
142			Notes
143			-----
144			The category dtype is not grouped in the summary, unless it contains exactly the same categories.
145
146			'''
147
148			data = pd.DataFrame(data)
149			data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)
150			data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,
151			cat_exclude=cat_exclude)
152
153			if show in ['changes', 'all']:
154			if show == 'all':
155			print('Before data cleaning:\n')
156			print(f'dtypes:\n{data.dtypes.value_counts()}')
157			print(f'\nNumber of rows: {data.shape[0]}')
158			print(f'Number of cols: {data.shape[1]}')
159			print(f"Missing values: {_missing_vals(data)['mv_total']}")
160			print(f'Memory usage: {_memory_usage(data)} KB')
161			print('_______________________________________________________\n')
162			print('After data cleaning:\n')
163			print(f'dtypes:\n{data_cleaned.dtypes.value_counts()}')
164			print(f'\nNumber of rows: {data_cleaned.shape[0]}')
165			print(f'Number of cols: {data_cleaned.shape[1]}')
166			print(f"Missing values: {_missing_vals(data_cleaned)['mv_total']}")
167			print(f'Memory usage: {_memory_usage(data_cleaned)} KB')
168			print('_______________________________________________________\n')
169
170			print(
171			f"Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {_missing_vals(data_cleaned)['mv_total']}")
172			print(f'\nChanges:')
173			print(f'Dropped rows: {data.shape[0]-data_cleaned.shape[0]}')
174			print(f'Dropped columns: {data.shape[1]-data_cleaned.shape[1]}')
175			print(f"Dropped missing values: {_missing_vals(data)['mv_total']-_missing_vals(data_cleaned)['mv_total']}")
176			mem_change = _memory_usage(data)-_memory_usage(data_cleaned)
177			print(f'Reduced memory by: {round(mem_change,2)} KB (-{round(100*mem_change/_memory_usage(data),1)}%)')
178
179			return data_cleaned
180

akanz1 / klib

GitHub Access Token became invalid

Push — master ( 487c57...0be18a )

klib.clean._validate_input_0_1() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like