GitHub Access Token became invalid

It seems that the GitHub access token used for retrieving details about this repository from GitHub has become invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( 0be18a...fe1083 )
by Andreas
01:16
created

klib.clean._validate_input_0_1()   A

Complexity

Conditions 3

Size

Total Lines 3
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 3
nop 2
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
'''
2
Functions for data cleaning.
3
4
:author: Andreas Kanz
5
6
'''
7
8
# Imports
9
import pandas as pd
10
11
from .utils import _memory_usage
12
from .utils import _missing_vals
13
from .utils import _validate_input_0_1
14
15
16
def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=[]):
    '''
    Converts columns to best possible dtypes using dtypes supporting pd.NA.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. The input is copied, the original object is not
        modified.

    category: bool, default True
        Change dtypes of columns with dtype "object" to "category". Set threshold using cat_threshold or exclude
        columns using cat_exclude.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: default [] (empty list)
        List of columns to exclude from categorical conversion.

    Returns
    -------
    Pandas DataFrame.

    Notes
    -----
    cat_threshold is checked with _validate_input_0_1 before any conversion takes place.
    For data without rows the ratio of unique values is undefined, therefore no categories are inferred and only \
the generic dtype conversion is applied.

    '''

    _validate_input_0_1(cat_threshold, 'cat_threshold')

    # NOTE: cat_exclude is only read and never mutated, so the shared default list is harmless here.
    data = pd.DataFrame(data).copy()
    n_rows = data.shape[0]

    for col in data.columns:
        # Cheap checks first: the unique count is only needed for candidate columns. Requiring at least one row
        # avoids a ZeroDivisionError when computing the ratio for empty data.
        if (category and
                n_rows > 0 and
                col not in cat_exclude and
                data[col].dtype == 'object'):
            unique_vals_ratio = data[col].nunique(dropna=False) / n_rows
            if unique_vals_ratio < cat_threshold:
                data[col] = data[col].astype('category')
        data[col] = data[col].convert_dtypes()

    return data
54
55
56
def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
    '''
    Removes rows and columns that consist only of NA-values and, optionally, further columns and rows whose \
fraction of NA-values exceeds a given threshold.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 1
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 1
        Drop rows with NA-ratio above the specified threshold.

    Returns
    -------
    Pandas DataFrame.

    Notes
    -----
    Completely empty rows and then completely empty columns are removed first. Afterwards the column threshold is
    applied, and finally the row threshold is applied to the remaining data.

    '''

    # Both thresholds are checked up front, columns before rows.
    for threshold, label in ((drop_threshold_cols, 'drop_threshold_cols'),
                             (drop_threshold_rows, 'drop_threshold_rows')):
        _validate_input_0_1(threshold, label)

    frame = pd.DataFrame(data).dropna(axis=0, how='all').dropna(axis=1, how='all')

    # Columns whose NA-ratio exceeds the column threshold.
    sparse_cols = _missing_vals(frame)['mv_cols_ratio'] > drop_threshold_cols
    frame = frame.drop(columns=frame.loc[:, sparse_cols].columns)

    # Row ratios are recomputed on what is left after the columns were dropped.
    sparse_rows = _missing_vals(frame)['mv_rows_ratio'] > drop_threshold_rows
    return frame.drop(index=frame.loc[sparse_rows, :].index)
92
93
94
def data_cleaning(data, drop_threshold_cols=0.95, drop_threshold_rows=0.95, category=True,
                  cat_threshold=0.03, cat_exclude=[], show='changes'):
    '''
    Perform initial data cleaning tasks on a dataset, such as dropping empty rows and columns and optimizing the
    datatypes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 0.95
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.95
        Drop rows with NA-ratio above the specified threshold.

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: default [] (empty list)
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None} default 'changes'
        Specify verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: no information about the data is printed.
        Any other value also prints nothing.

    Returns
    -------
    Pandas DataFrame.

    See Also
    --------
    convert_datatypes: Converts columns to best possible dtypes.
    drop_missing : Flexibly drops columns and rows.
    _memory_usage: Gives the total memory usage in kilobytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.

    '''

    data = pd.DataFrame(data)
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)
    data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,
                                     cat_exclude=cat_exclude)

    if show not in ('changes', 'all'):
        return data_cleaned

    # Summary metrics are computed once and reused below; neither frame changes from here on.
    # NOTE(review): assumes _missing_vals and _memory_usage are side-effect free - confirm in utils.
    mv_before = _missing_vals(data)['mv_total']
    mv_after = _missing_vals(data_cleaned)['mv_total']
    mem_before = _memory_usage(data)
    mem_after = _memory_usage(data_cleaned)

    if show == 'all':
        print('Before data cleaning:\n')
        print(f'dtypes:\n{data.dtypes.value_counts()}')
        print(f'\nNumber of rows: {data.shape[0]}')
        print(f'Number of cols: {data.shape[1]}')
        print(f"Missing values: {mv_before}")
        print(f'Memory usage: {mem_before} KB')
        print('_______________________________________________________\n')
        print('After data cleaning:\n')
        print(f'dtypes:\n{data_cleaned.dtypes.value_counts()}')
        print(f'\nNumber of rows: {data_cleaned.shape[0]}')
        print(f'Number of cols: {data_cleaned.shape[1]}')
        print(f"Missing values: {mv_after}")
        print(f'Memory usage: {mem_after} KB')
        print('_______________________________________________________\n')

    print(f"Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {mv_after}")
    print('\nChanges:')
    print(f'Dropped rows: {data.shape[0]-data_cleaned.shape[0]}')
    print(f'Dropped columns: {data.shape[1]-data_cleaned.shape[1]}')
    print(f"Dropped missing values: {mv_before-mv_after}")
    mem_change = mem_before - mem_after
    # Guard the relative change: a reported memory usage of 0 would otherwise divide by zero.
    mem_change_pct = round(100*mem_change/mem_before, 1) if mem_before else 0.0
    print(f'Reduced memory by: {round(mem_change,2)} KB (-{mem_change_pct}%)')

    return data_cleaned
176