GitHub Access Token became invalid

It seems that the GitHub access token used for retrieving details about this repository from GitHub has become invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( c0ff44...3c81b0 )
by Andreas
01:16
created

klib.clean.drop_missing()   A

Complexity

Conditions 1

Size

Total Lines 27
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 7
dl 0
loc 27
rs 10
c 0
b 0
f 0
cc 1
nop 3
1
'''
2
Utilities for data cleaning.
3
4
:author: Andreas Kanz
5
6
'''
7
8
# Imports
9
import pandas as pd
10
11
from .describe import _memory_usage
12
from .describe import _missing_vals
13
14
15
def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=None):
    '''
    Converts columns to best possible dtypes using dtypes supporting pd.NA.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion. None is treated as an empty list.

    Returns
    -------
    Pandas DataFrame.

    '''

    # Normalise here instead of using a mutable default argument in the signature.
    excluded = [] if cat_exclude is None else cat_exclude

    data = pd.DataFrame(data)
    n_rows = data.shape[0]

    for col in data.columns:
        data[col] = data[col].convert_dtypes()

        # The unique-value ratio is undefined for a dataset without rows (0 / 0),
        # so categorical inference is skipped in that case instead of raising.
        if not category or n_rows == 0:
            continue

        unique_vals_ratio = data[col].nunique(dropna=False) / n_rows
        if unique_vals_ratio < cat_threshold and col not in excluded:
            data[col] = data[col].astype('category')

    return data
46
47
48
def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
    '''
    Removes rows and columns that are entirely empty and, optionally, those whose share of NA-values exceeds a threshold.

    Entirely empty rows and columns are always removed first. Afterwards the column threshold is applied, and the row threshold is evaluated on the columns that remain.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 1
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 1
        Drop rows with NA-ratio above the specified threshold.

    Returns
    -------
    Pandas DataFrame.

    '''

    frame = pd.DataFrame(data)
    frame = frame.dropna(axis=0, how='all').dropna(axis=1, how='all')

    # NOTE(review): element 3 of _missing_vals() is presumably the per-column NA-ratio - confirm against .describe.
    cols_over_limit = _missing_vals(frame)[3] > drop_threshold_cols
    frame = frame.drop(columns=frame.loc[:, cols_over_limit].columns)

    # The metrics are recomputed so that rows are judged on the remaining columns only.
    # NOTE(review): element 4 of _missing_vals() is presumably the per-row NA-ratio - confirm against .describe.
    rows_over_limit = _missing_vals(frame)[4] > drop_threshold_rows
    return frame.drop(index=frame.loc[rows_over_limit, :].index)
75
76
77
def data_cleaning(data, drop_threshold_cols=0.9, drop_threshold_rows=0.9, category=True, cat_threshold=0.05, cat_exclude=None, show='all'):
    '''
    Perform initial data cleaning tasks on a dataset, such as dropping empty rows and columns and optimizing the datatypes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 0.9
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with NA-ratio above the specified threshold.

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion. None is treated as an empty list.

    show: {'all', 'changes', None} default 'all'
        Specify verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: no information about the data is printed.

    Returns
    -------
    Pandas DataFrame.

    See Also
    --------
    convert_datatypes: Converts columns to best possible dtypes.
    drop_missing : Flexibly drops columns and rows.
    _memory_usage: Gives the total memory usage in kilobytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.

    '''

    # Normalise here instead of using a mutable default argument in the signature.
    cat_exclude = [] if cat_exclude is None else cat_exclude

    data = pd.DataFrame(data)
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)
    # Forward the caller's settings; these were previously hard-coded to True / 0.05.
    data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold, cat_exclude=cat_exclude)

    if show in ['changes', 'all']:
        # Compute each metric once and reuse it in the report below.
        na_before = _missing_vals(data)[0]
        na_after = _missing_vals(data_cleaned)[0]
        mem_before = _memory_usage(data)
        mem_after = _memory_usage(data_cleaned)

        if show == 'all':
            print('Before data cleaning:\n')
            print(f'dtypes:\n{data.dtypes.value_counts()}')
            print(f'\nNumber of rows: {data.shape[0]}')
            print(f'Number of cols: {data.shape[1]}')
            print(f'Missing values: {na_before}')
            print(f'Memory usage: {mem_before} KB')
            print('_______________________________________________________\n')
            print('After data cleaning:\n')
            print(f'dtypes:\n{data_cleaned.dtypes.value_counts()}')
            print(f'\nNumber of rows: {data_cleaned.shape[0]}')
            print(f'Number of cols: {data_cleaned.shape[1]}')
            print(f'Missing values: {na_after}')
            print(f'Memory usage: {mem_after} KB')
            print('_______________________________________________________\n')

        print(f'Shape of cleaned dataset: {data_cleaned.shape} - Remaining NAs: {na_after}')
        print('\nChanges:')
        print(f'Dropped rows: {data.shape[0]-data_cleaned.shape[0]}')
        print(f'Dropped columns: {data.shape[1]-data_cleaned.shape[1]}')
        print(f'Dropped missing values: {na_before-na_after}')
        mem_change = mem_before - mem_after
        # A reported memory usage of 0 would otherwise raise ZeroDivisionError.
        mem_change_pct = round(100*mem_change/mem_before, 1) if mem_before else 0.0
        print(f'Reduced memory by: {mem_change} KB (-{mem_change_pct}%)')

    return data_cleaned
154