1
|
|
|
''' |
2
|
|
|
Functions for data cleaning. |
3
|
|
|
|
4
|
|
|
:author: Andreas Kanz |
5
|
|
|
|
6
|
|
|
''' |
7
|
|
|
|
8
|
|
|
# Imports |
9
|
|
|
import pandas as pd |
10
|
|
|
|
11
|
|
|
from .utils import _drop_duplicates |
12
|
|
|
from .utils import _memory_usage |
13
|
|
|
from .utils import _missing_vals |
14
|
|
|
from .utils import _validate_input_0_1 |
15
|
|
|
from .utils import _validate_input_bool |
16
|
|
|
|
17
|
|
|
|
18
|
|
|
def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=None):
    '''
    Converts columns to best possible dtypes using dtypes supporting pd.NA.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    category: bool, default True
        Change dtypes of columns with dtype "object" to "category". Set threshold using cat_threshold or exclude \
columns using cat_exclude.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    Returns
    -------
    data: Pandas DataFrame
    '''

    # Validate Inputs
    _validate_input_bool(category, 'Category')
    _validate_input_0_1(cat_threshold, 'cat_threshold')

    # Avoid the mutable-default-argument pitfall: a shared list default would be
    # the same object across all calls. None is the safe sentinel.
    cat_exclude = [] if cat_exclude is None else cat_exclude

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        # Guard against ZeroDivisionError on an empty (0-row) DataFrame.
        unique_vals_ratio = (data[col].nunique(dropna=False) / data.shape[0]
                             if data.shape[0] else 0)
        if (category and
                unique_vals_ratio < cat_threshold and
                col not in cat_exclude and
                data[col].dtype == 'object'):
            data[col] = data[col].astype('category')
        # convert_dtypes picks the best pd.NA-aware dtype for every column.
        data[col] = data[col].convert_dtypes()

    return data
57
|
|
|
|
58
|
|
|
|
59
|
|
|
def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
    '''
    Drops completely empty columns and rows by default and optionally provides flexibility to loosen restrictions to \
drop additional columns and rows based on the fraction of NA-values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 1
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 1
        Drop rows with NA-ratio above the specified threshold.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    Notes
    -----
    Columns are dropped first. Rows are dropped based on the remaining data.
    '''

    # Validate Inputs
    _validate_input_0_1(drop_threshold_cols, 'drop_threshold_cols')
    _validate_input_0_1(drop_threshold_rows, 'drop_threshold_rows')

    data = pd.DataFrame(data).copy()

    # Step 1: discard rows and columns that contain nothing but NAs.
    data = data.dropna(axis=0, how='all').dropna(axis=1, how='all')

    # Step 2: drop columns whose NA-ratio exceeds the column threshold.
    cols_above_threshold = data.loc[:, _missing_vals(data)['mv_cols_ratio'] > drop_threshold_cols].columns
    data = data.drop(columns=cols_above_threshold)

    # Step 3: drop rows whose NA-ratio exceeds the row threshold,
    # evaluated on whatever columns remain after step 2.
    rows_above_threshold = data.loc[_missing_vals(data)['mv_rows_ratio'] > drop_threshold_rows, :].index
    data_cleaned = data.drop(index=rows_above_threshold)

    return data_cleaned
95
|
|
|
|
96
|
|
|
|
97
|
|
|
def data_cleaning(data, drop_threshold_cols=0.95, drop_threshold_rows=0.95, drop_duplicates=True, category=True,
                  cat_threshold=0.03, cat_exclude=None, show='changes'):
    '''
    Perform initial data cleaning tasks on a dataset, such as dropping empty rows and columns and optimizing the \
datatypes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 0.95
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.95
        Drop rows with NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drops duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None} default 'changes'
        Specify verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    See Also
    --------
    convert_datatypes: Converts columns to best possible dtypes.
    drop_missing : Flexibly drops columns and rows.
    _memory_usage: Gives the total memory usage in kilobytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.
    '''

    # Validate Inputs
    _validate_input_0_1(drop_threshold_cols, 'drop_threshold_cols')
    _validate_input_0_1(drop_threshold_rows, 'drop_threshold_rows')
    _validate_input_bool(drop_duplicates, 'drop_duplicates')
    _validate_input_bool(category, 'category')
    _validate_input_0_1(cat_threshold, 'cat_threshold')

    # Avoid the mutable-default-argument pitfall; normalize here so a plain
    # list is always forwarded to convert_datatypes.
    cat_exclude = [] if cat_exclude is None else cat_exclude

    data = pd.DataFrame(data).copy()

    data = drop_missing(data, drop_threshold_cols, drop_threshold_rows)
    data, dupl_idx = _drop_duplicates(data)
    data_cleaned = convert_datatypes(data, category=category, cat_threshold=cat_threshold,
                                     cat_exclude=cat_exclude)

    if show in ['changes', 'all']:
        data_mem = _memory_usage(data)
        data_cl_mem = _memory_usage(data_cleaned)
        data_mv_tot = _missing_vals(data)['mv_total']
        data_cl_mv_tot = _missing_vals(data_cleaned)['mv_total']

        if show == 'all':
            print('Before data cleaning:\n')
            print(f'dtypes:\n{data.dtypes.value_counts()}')
            print(f'\nNumber of rows: {data.shape[0]}')
            print(f'Number of cols: {data.shape[1]}')
            print(f'Missing values: {data_mv_tot}')
            print(f'Memory usage: {data_mem} KB')
            print('_______________________________________________________\n')
            print('After data cleaning:\n')
            print(f'dtypes:\n{data_cleaned.dtypes.value_counts()}')
            print(f'\nNumber of rows: {data_cleaned.shape[0]}')
            print(f'Number of cols: {data_cleaned.shape[1]}')
            print(f'Missing values: {data_cl_mv_tot}')
            print(f'Memory usage: {data_cl_mem} KB')
            print('_______________________________________________________\n')

        print(f'Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {data_cl_mv_tot}')
        print('\nChanges:')  # plain string: no placeholders, so no f-string
        print(f'Dropped rows: {data.shape[0] - data_cleaned.shape[0]}')
        print(f'     of which {len(dupl_idx)} were duplicates. (Rows with index: {dupl_idx})')
        print(f'Dropped columns: {data.shape[1] - data_cleaned.shape[1]}')
        print(f'Dropped missing values: {data_mv_tot - data_cl_mv_tot}')
        mem_change = data_mem - data_cl_mem
        # Guard against ZeroDivisionError when the input occupies no memory.
        mem_pct = round(100 * mem_change / data_mem, 1) if data_mem else 0
        print(f'Reduced memory by: {round(mem_change, 2)} KB (-{mem_pct}%)')

    return data_cleaned
195
|
|
|
|