GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( cc6bfd...4f98db )
by Andreas
01:27
created

klib.clean.data_cleaning()   B

Complexity

Conditions 3

Size

Total Lines 98
Code Lines 42

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 42
nop 8
dl 0
loc 98
rs 8.872
c 0
b 0
f 0

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
'''
2
Functions for data cleaning.
3
4
:author: Andreas Kanz
5
6
'''
7
8
# Imports
9
import pandas as pd
10
11
from .utils import _drop_duplicates
12
from .utils import _memory_usage
13
from .utils import _missing_vals
14
from .utils import _validate_input_0_1
15
from .utils import _validate_input_bool
16
17
18
def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=None):
    '''
    Converts columns to best possible dtypes using dtypes supporting pd.NA.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    category: bool, default True
        Change dtypes of columns with dtype "object" to "category". Set threshold using cat_threshold or exclude \
        columns using cat_exclude.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    Returns
    -------
    data: Pandas DataFrame

    '''

    # Validate Inputs
    _validate_input_bool(category, 'Category')
    _validate_input_0_1(cat_threshold, 'cat_threshold')

    # Avoid a mutable default argument: a shared [] default would leak state
    # across calls if a caller ever mutated it.
    cat_exclude = [] if cat_exclude is None else cat_exclude

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        # Only object columns below the uniqueness threshold (and not explicitly
        # excluded) are converted to the memory-efficient "category" dtype.
        if (category and
            unique_vals_ratio < cat_threshold and
            col not in cat_exclude and
                data[col].dtype == 'object'):
            data[col] = data[col].astype('category')
        # convert_dtypes() picks the best pd.NA-aware dtype for every column.
        data[col] = data[col].convert_dtypes()

    return data
57
58
59
def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
    '''
    Drops completely empty columns and rows by default and optionally provides flexibility to loosen restrictions to \
    drop additional columns and rows based on the fraction of NA-values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 1
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 1
        Drop rows with NA-ratio above the specified threshold.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    Notes
    -----
    Columns are dropped first. Rows are dropped based on the remaining data.

    '''

    # Validate Inputs
    _validate_input_0_1(drop_threshold_cols, 'drop_threshold_cols')
    _validate_input_0_1(drop_threshold_rows, 'drop_threshold_rows')

    # Work on a copy; first discard rows and columns that are entirely NA.
    df = pd.DataFrame(data).copy()
    df = df.dropna(axis=0, how='all').dropna(axis=1, how='all')

    # Drop columns whose NA-ratio exceeds the column threshold, then drop rows
    # (based on the remaining columns) whose NA-ratio exceeds the row threshold.
    cols_above_threshold = df.loc[:, _missing_vals(df)['mv_cols_ratio'] > drop_threshold_cols].columns
    df = df.drop(columns=cols_above_threshold)
    rows_above_threshold = df.loc[_missing_vals(df)['mv_rows_ratio'] > drop_threshold_rows, :].index
    data_cleaned = df.drop(index=rows_above_threshold)

    return data_cleaned
95
96
97
def data_cleaning(data, drop_threshold_cols=0.95, drop_threshold_rows=0.95, drop_duplicates=True, category=True,
                  cat_threshold=0.03, cat_exclude=None, show='changes'):
    '''
    Perform initial data cleaning tasks on a dataset, such as dropping empty rows and columns and optimizing the \
    datatypes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 0.95
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.95
        Drop rows with NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drops duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None} default 'changes'
        Specify verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    See Also
    --------
    convert_datatypes: Converts columns to best possible dtypes.
    drop_missing : Flexibly drops columns and rows.
    _memory_usage: Gives the total memory usage in kilobytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.

    '''

    # Validate Inputs
    _validate_input_0_1(drop_threshold_cols, 'drop_threshold_cols')
    _validate_input_0_1(drop_threshold_rows, 'drop_threshold_rows')
    _validate_input_bool(drop_duplicates, 'drop_duplicates')
    _validate_input_bool(category, 'category')
    _validate_input_0_1(cat_threshold, 'cat_threshold')

    data = pd.DataFrame(data).copy()

    # Keep the original frame ("data") untouched so that the before/after
    # statistics below compare the input with the fully cleaned result.
    # (Previously "data" was reassigned at each step, so the dropped-row and
    # dropped-column counts always compared intermediate results and printed 0.)
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)

    # Honor the drop_duplicates flag (previously validated but never used).
    if drop_duplicates:
        data_cleaned, dupl_idx = _drop_duplicates(data_cleaned)
    else:
        dupl_idx = []

    data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,
                                     cat_exclude=cat_exclude)

    if show in ['changes', 'all']:
        data_mem = _memory_usage(data)
        data_cl_mem = _memory_usage(data_cleaned)
        data_mv_tot = _missing_vals(data)['mv_total']
        data_cl_mv_tot = _missing_vals(data_cleaned)['mv_total']

        if show == 'all':
            print('Before data cleaning:\n')
            print(f'dtypes:\n{data.dtypes.value_counts()}')
            print(f'\nNumber of rows: {data.shape[0]}')
            print(f'Number of cols: {data.shape[1]}')
            print(f"Missing values: {data_mv_tot}")
            print(f'Memory usage: {data_mem} KB')
            print('_______________________________________________________\n')
            print('After data cleaning:\n')
            print(f'dtypes:\n{data_cleaned.dtypes.value_counts()}')
            print(f'\nNumber of rows: {data_cleaned.shape[0]}')
            print(f'Number of cols: {data_cleaned.shape[1]}')
            print(f"Missing values: {data_cl_mv_tot}")
            print(f'Memory usage: {data_cl_mem} KB')
            print('_______________________________________________________\n')

        print(
            f"Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {data_cl_mv_tot}")
        print('\nChanges:')
        print(f'Dropped rows: {data.shape[0]-data_cleaned.shape[0]}')
        print(f'    of which {len(dupl_idx)} were duplicates. (Rows with index: {dupl_idx})')
        print(f'Dropped columns: {data.shape[1]-data_cleaned.shape[1]}')
        print(f"Dropped missing values: {data_mv_tot-data_cl_mv_tot}")
        mem_change = data_mem-data_cl_mem
        print(f'Reduced memory by: {round(mem_change,2)} KB (-{round(100*mem_change/data_mem,1)}%)')

    return data_cleaned
195