klib.clean.data_cleaning() - Code Metrics - Inspection of "update descriptions and functions" - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( c0ff44...3c81b0 )

by Andreas

created 2020-04-04 12:40 UTC

klib.clean.data_cleaning() A

↳ Parent: klib.clean

Complexity

Conditions

Size

Total Lines	77
Code Lines	28

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	28
dl	0
loc	77
rs	9.208
c	0
b	0
f	0
cc	3
nop	7

How to fix Long Method

'''
Utilities for data cleaning.

:author: Andreas Kanz

'''

# Imports
import pandas as pd

from .describe import _memory_usage
from .describe import _missing_vals


def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=None):
    '''
    Converts columns to best possible dtypes using dtypes supporting pd.NA.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion. None (the default) excludes no column.

    Returns
    -------
    Pandas DataFrame.

    '''

    # None instead of a shared mutable [] default; behaves exactly like an empty list.
    if cat_exclude is None:
        cat_exclude = []

    data = pd.DataFrame(data)
    n_rows = data.shape[0]
    for col in data.columns:
        data[col] = data[col].convert_dtypes()

        # Without rows the unique-value ratio is undefined (0 / 0), so no category is inferred.
        # The remaining checks are done first so that nunique() is only computed when it matters.
        if not category or n_rows == 0 or col in cat_exclude:
            continue

        unique_vals_ratio = data[col].nunique(dropna=False) / n_rows
        if unique_vals_ratio < cat_threshold:
            data[col] = data[col].astype('category')

    return data


def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
    '''
    Drops entirely empty rows and columns and, optionally, further columns and rows whose fraction of NA-values exceeds the given thresholds. Order of operations: fully empty rows, fully empty columns, columns above drop_threshold_cols, and finally rows above drop_threshold_rows (evaluated on the columns that remain).

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 1
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 1
        Drop rows with NA-ratio above the specified threshold.

    Returns
    -------
    Pandas DataFrame.

    '''

    # Remove rows, then columns, that contain nothing but NA-values.
    frame = pd.DataFrame(data).dropna(axis=0, how='all').dropna(axis=1, how='all')

    # NOTE(review): element 3 of _missing_vals is presumably the per-column NA-ratio - confirm in klib.describe.
    col_mask = _missing_vals(frame)[3] > drop_threshold_cols
    cols_to_drop = frame.loc[:, col_mask].columns
    frame = frame.drop(columns=cols_to_drop)

    # NOTE(review): element 4 is presumably the per-row NA-ratio; it is recomputed after the column drop.
    row_mask = _missing_vals(frame)[4] > drop_threshold_rows
    rows_to_drop = frame.loc[row_mask, :].index

    return frame.drop(index=rows_to_drop)


def _print_data_summary(title, frame, missing, memory):
    '''Print dtype counts, shape, number of missing values and memory usage of frame below the given title.'''

    print(f'{title}\n')
    print(f'dtypes:\n{frame.dtypes.value_counts()}')
    print(f'\nNumber of rows: {frame.shape[0]}')
    print(f'Number of cols: {frame.shape[1]}')
    print(f'Missing values: {missing}')
    print(f'Memory usage: {memory} KB')
    print('_______________________________________________________\n')


def data_cleaning(data, drop_threshold_cols=0.9, drop_threshold_rows=0.9, category=True, cat_threshold=0.05, cat_exclude=None, show='all'):
    '''
    Perform initial data cleaning tasks on a dataset, such as dropping empty rows and columns and optimizing the datatypes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 0.9
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with NA-ratio above the specified threshold.

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion. None (the default) excludes no column.

    show: {'all', 'changes', None} default 'all'
        Specify verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: no information about the data is printed.

    Returns
    -------
    Pandas DataFrame.

    See Also
    --------
    convert_datatypes: Converts columns to best possible dtypes.
    drop_missing : Flexibly drops columns and rows.
    _memory_usage: Gives the total memory usage in kilobytes.
    _missing_vals: Metrics about missing values in the dataset.


    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.

    '''

    # None instead of a shared mutable [] default; an actual list is always handed to convert_datatypes.
    if cat_exclude is None:
        cat_exclude = []

    data = pd.DataFrame(data)
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)
    # Forward the caller's settings; previously category and cat_threshold were hard-coded here.
    data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold, cat_exclude=cat_exclude)

    if show in ['changes', 'all']:
        # Compute each metric once instead of on every print.
        missing_before = _missing_vals(data)[0]
        missing_after = _missing_vals(data_cleaned)[0]
        mem_before = _memory_usage(data)
        mem_after = _memory_usage(data_cleaned)

        if show == 'all':
            _print_data_summary('Before data cleaning:', data, missing_before, mem_before)
            _print_data_summary('After data cleaning:', data_cleaned, missing_after, mem_after)

        mem_change = mem_before - mem_after
        # Guard against division by zero when the input reports no memory usage (e.g. empty data).
        mem_change_pct = round(100*mem_change/mem_before, 1) if mem_before else 0.0

        print(f'Shape of cleaned dataset: {data_cleaned.shape} - Remaining NAs: {missing_after}')
        print('\nChanges:')
        print(f'Dropped rows: {data.shape[0]-data_cleaned.shape[0]}')
        print(f'Dropped columns: {data.shape[1]-data_cleaned.shape[1]}')
        print(f'Dropped missing values: {missing_before-missing_after}')
        print(f'Reduced memory by: {mem_change} KB (-{mem_change_pct}%)')

    return data_cleaned


1			'''
2			Utilities for data cleaning.
3
4			:author: Andreas Kanz
5
6			'''
7
8			# Imports
9			import pandas as pd
10
11			from .describe import _memory_usage
12			from .describe import _missing_vals
13
14
15			def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=[]):
16			'''
17			Converts columns to best possible dtypes using dtypes supporting pd.NA.
18
19			Parameters
20			----------
21			data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column information is used to label the plots.
22
23			category: bool, default True
24			Change dtypes of columns to "category". Set threshold using cat_threshold.
25
26			cat_threshold: float, default 0.05
27			Ratio of unique values below which categories are inferred and column dtype is changed to categorical.
28
29			cat_exclude: default [] (empty list)
30			List of columns to exclude from categorical conversion.
31
32			Returns
33			-------
34			Pandas DataFrame.
35
36			'''
37
38			data = pd.DataFrame(data)
39			for col in data.columns:
40			data[col] = data[col].convert_dtypes()
41			unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
42			if category and unique_vals_ratio < cat_threshold and col not in cat_exclude:
43			data[col] = data[col].astype('category')
44
45			return data
46
47
48			def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
49			'''
50			Drops entirely empty columns and rows by default and optionally provides flexibility to loosens restrictions to drop additional columns and rows based on the fraction of NA-values. Note: Columns are dropped first. Rows are dropped based on the remaining data.
51
52			Parameters
53			----------
54			data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column information is used to label the plots.
55
56			drop_threshold_cols: float, default 1
57			Drop columns with NA-ratio above the specified threshold.
58
59			drop_threshold_rows: float, default 1
60			Drop rows with NA-ratio above the specified threshold.
61
62			Returns
63			-------
64			Pandas DataFrame.
65
66			'''
67
68			data = pd.DataFrame(data)
69			data = data.dropna(axis=0, how='all')
70			data = data.dropna(axis=1, how='all')
71			data = data.drop(columns=data.loc[:, _missing_vals(data)[3] > drop_threshold_cols].columns) # drop cols
72			data_cleaned = data.drop(index=data.loc[_missing_vals(data)[4] > drop_threshold_rows, :].index) # drop rows
73
74			return data_cleaned
75
76
77			def data_cleaning(data, drop_threshold_cols=0.9, drop_threshold_rows=0.9, category=True, cat_threshold=0.05, cat_exclude=[], show='all'):
78			'''
79			Perform initial data cleaning tasks on a dataset, such as dropping empty rows and columns and optimizing the datatypes.
80
81			Parameters
82			----------
83			data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column information is used to label the plots.
84
85			drop_threshold_cols: float, default 1
86			Drop columns with NA-ratio above the specified threshold.
87
88			drop_threshold_rows: float, default 1
89			Drop rows with NA-ratio above the specified threshold.
90
91			category: bool, default True
92			Change dtypes of columns to "category". Set threshold using cat_threshold.
93
94			cat_threshold: float, default 0.05
95			Ratio of unique values below which categories are inferred and column dtype is changed to categorical.
96
97			cat_exclude: default [] (empty list)
98			List of columns to exclude from categorical conversion.
99
100			show: {'all', 'changes', None} default 'all'
101			Specify verbosity of the output.
102			* 'all': Print information about the data before and after cleaning as well as information about changes.
103			* 'changes': Print out differences in the data before and after cleaning.
104			* None: no information about the data is printed.
105
106			Returns
107			-------
108			Pandas DataFrame.
109
110			See Also
111			--------
112			convert_datatypes: Converts columns to best possible dtypes.
113			drop_missing : Flexibly drops columns and rows.
114			_memory_usage: Gives the total memory usage in kilobytes.
115			_missing_vals: Metrics about missing values in the dataset.
116
117
118			Notes
119			-----
120			The category dtype is not grouped in the summary, unless it contains exactly the same categories.
121
122			'''
123
124			data = pd.DataFrame(data)
125			data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)
126			data_cleaned = convert_datatypes(data_cleaned, category=True, cat_threshold=0.05, cat_exclude=cat_exclude)
127
128			if show in ['changes', 'all']:
129			if show == 'all':
130			print('Before data cleaning:\n')
131			print(f'dtypes:\n{data.dtypes.value_counts()}')
132			print(f'\nNumber of rows: {data.shape[0]}')
133			print(f'Number of cols: {data.shape[1]}')
134			print(f'Missing values: {_missing_vals(data)[0]}')
135			print(f'Memory usage: {_memory_usage(data)} KB')
136			print('_______________________________________________________\n')
137			print('After data cleaning:\n')
138			print(f'dtypes:\n{data_cleaned.dtypes.value_counts()}')
139			print(f'\nNumber of rows: {data_cleaned.shape[0]}')
140			print(f'Number of cols: {data_cleaned.shape[1]}')
141			print(f'Missing values: {_missing_vals(data_cleaned)[0]}')
142			print(f'Memory usage: {_memory_usage(data_cleaned)} KB')
143			print('_______________________________________________________\n')
144
145			print(f'Shape of cleaned dataset: {data_cleaned.shape} - Remaining NAs: {_missing_vals(data_cleaned)[0]}')
146			print(f'\nChanges:')
147			print(f'Dropped rows: {data.shape[0]-data_cleaned.shape[0]}')
148			print(f'Dropped columns: {data.shape[1]-data_cleaned.shape[1]}')
149			print(f'Dropped missing values: {_missing_vals(data)[0]-_missing_vals(data_cleaned)[0]}')
150			mem_change = _memory_usage(data)-_memory_usage(data_cleaned)
151			print(f'Reduced memory by: {mem_change} KB (-{round(100*mem_change/_memory_usage(data),1)}%)')
152
153			return data_cleaned
154

akanz1 / klib

GitHub Access Token became invalid

Push — master ( c0ff44...3c81b0 )

klib.clean.data_cleaning() A

Complexity

Size

Duplication

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like