GitHub Access Token became invalid

It seems that the GitHub access token used for retrieving details about this repository from GitHub has become invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( 487c57...0be18a )
by Andreas
01:18
created

klib.clean._validate_input_0_1()   A

Complexity

Conditions 3

Size

Total Lines 3
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 3
nop 2
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
'''
2
Utilities for data cleaning.
3
4
:author: Andreas Kanz
5
6
'''
7
8
# Imports
9
import pandas as pd
10
11
from .describe import _memory_usage
12
from .describe import _missing_vals
13
14
15
def _validate_input_0_1(value, desc):
16
    if value < 0 or value > 1:
17
        raise ValueError(f'Input value for {desc} is {value} but should be a float in the range 0 <= {desc} <=1.')
18
19
20
def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=None):
    '''
    Converts columns to best possible dtypes using dtypes supporting pd.NA.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    category: bool, default True
        Change dtypes of columns with dtype "object" to "category". Set threshold using cat_threshold or exclude \
        columns using cat_exclude.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion. None is treated as an empty list.

    Returns
    -------
    Pandas DataFrame. The conversion is applied to a copy, the input is not modified.

    Raises
    ------
    ValueError
        If cat_threshold is rejected by _validate_input_0_1, i.e. it is outside the range 0 <= cat_threshold <= 1.

    '''

    _validate_input_0_1(cat_threshold, 'cat_threshold')

    # A None default avoids sharing one mutable list object between calls.
    if cat_exclude is None:
        cat_exclude = []

    data = pd.DataFrame(data).copy()
    n_rows = data.shape[0]

    for col in data.columns:
        # The ratio of unique values is only computed for actual candidates. The n_rows guard prevents a
        # ZeroDivisionError for datasets that have columns but no rows.
        if (category and
                n_rows > 0 and
                col not in cat_exclude and
                data[col].dtype == 'object'):
            unique_vals_ratio = data[col].nunique(dropna=False) / n_rows
            if unique_vals_ratio < cat_threshold:
                data[col] = data[col].astype('category')
        data[col] = data[col].convert_dtypes()

    return data
58
59
60
def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
    '''
    Removes rows and columns that consist of NA-values only and, if the thresholds are lowered, additionally \
    removes columns and rows whose NA-ratio exceeds the respective threshold.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 1
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 1
        Drop rows with NA-ratio above the specified threshold.

    Returns
    -------
    Pandas DataFrame.

    Notes
    -----
    Completely empty rows and columns are removed first. Afterwards columns are dropped based on \
    drop_threshold_cols and finally rows are dropped based on the remaining data.

    '''

    for threshold, label in ((drop_threshold_cols, 'drop_threshold_cols'),
                             (drop_threshold_rows, 'drop_threshold_rows')):
        _validate_input_0_1(threshold, label)

    # Rows, then columns, that hold nothing but NA-values.
    frame = pd.DataFrame(data).dropna(axis=0, how='all').dropna(axis=1, how='all')

    # Columns whose share of missing values exceeds the column threshold.
    cols_above_threshold = _missing_vals(frame)['mv_cols_ratio'] > drop_threshold_cols
    frame = frame.drop(columns=frame.loc[:, cols_above_threshold].columns)

    # The row metrics are evaluated again on the frame with the columns already removed.
    rows_above_threshold = _missing_vals(frame)['mv_rows_ratio'] > drop_threshold_rows
    return frame.drop(index=frame.loc[rows_above_threshold, :].index)
96
97
98
def data_cleaning(data, drop_threshold_cols=0.95, drop_threshold_rows=0.95, category=True,
                  cat_threshold=0.03, cat_exclude=None, show='changes'):
    '''
    Perform initial data cleaning tasks on a dataset, such as dropping empty rows and columns and optimizing the \
    datatypes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 0.95
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.95
        Drop rows with NA-ratio above the specified threshold.

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion. None is treated as an empty list.

    show: {'all', 'changes', None} default 'changes'
        Specify verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: no information about the data is printed.

    Returns
    -------
    Pandas DataFrame.

    Raises
    ------
    ValueError
        Propagated from drop_missing or convert_datatypes if one of the thresholds fails their validation.

    See Also
    --------
    convert_datatypes: Converts columns to best possible dtypes.
    drop_missing : Flexibly drops columns and rows.
    _memory_usage: Gives the total memory usage in kilobytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.

    '''

    # A None default avoids sharing one mutable list object between calls. The list is normalised here so
    # that convert_datatypes always receives a list.
    if cat_exclude is None:
        cat_exclude = []

    data = pd.DataFrame(data)
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)
    data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,
                                     cat_exclude=cat_exclude)

    if show in ['changes', 'all']:
        # Every metric is computed once and reused in all report lines below.
        # NOTE(review): assumes _missing_vals and _memory_usage are side-effect free - confirm in .describe.
        mv_before = _missing_vals(data)['mv_total']
        mv_after = _missing_vals(data_cleaned)['mv_total']
        mem_before = _memory_usage(data)
        mem_after = _memory_usage(data_cleaned)

        if show == 'all':
            print('Before data cleaning:\n')
            print(f'dtypes:\n{data.dtypes.value_counts()}')
            print(f'\nNumber of rows: {data.shape[0]}')
            print(f'Number of cols: {data.shape[1]}')
            print(f"Missing values: {mv_before}")
            print(f'Memory usage: {mem_before} KB')
            print('_______________________________________________________\n')
            print('After data cleaning:\n')
            print(f'dtypes:\n{data_cleaned.dtypes.value_counts()}')
            print(f'\nNumber of rows: {data_cleaned.shape[0]}')
            print(f'Number of cols: {data_cleaned.shape[1]}')
            print(f"Missing values: {mv_after}")
            print(f'Memory usage: {mem_after} KB')
            print('_______________________________________________________\n')

        print(f"Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {mv_after}")
        print('\nChanges:')
        print(f'Dropped rows: {data.shape[0]-data_cleaned.shape[0]}')
        print(f'Dropped columns: {data.shape[1]-data_cleaned.shape[1]}')
        print(f"Dropped missing values: {mv_before-mv_after}")
        mem_change = mem_before-mem_after
        print(f'Reduced memory by: {round(mem_change,2)} KB (-{round(100*mem_change/mem_before,1)}%)')

    return data_cleaned
180