GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( b2514e...e55ee5 )
by Andreas
05:22
created

klib.clean.DataCleaner.transform()   A

Complexity

Conditions 1

Size

Total Lines 6
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 6
nop 3
dl 0
loc 6
rs 10
c 0
b 0
f 0
1
'''
2
Functions for data cleaning.
3
4
:author: Andreas Kanz
5
6
'''
7
8
# Imports
9
import pandas as pd
10
from sklearn.base import BaseEstimator, TransformerMixin
11
12
# from .preprocess import mv_col_handler
13
from .utils import _diff_report
14
from .utils import _drop_duplicates
15
from .utils import _missing_vals
16
from .utils import _validate_input_range
17
from .utils import _validate_input_bool
18
19
20
def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=None):
    '''
    Converts columns to best possible dtypes using dtypes supporting pd.NA. Temporarily not converting integers.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    category: bool, default True
        Change dtypes of columns with dtype "object" to "category". Set threshold using cat_threshold or exclude \
        columns using cat_exclude.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    Returns
    -------
    data: Pandas DataFrame
        A converted copy; the input object is not modified.

    Notes
    -----
    For a dataset without rows the ratio of unique values is undefined, so no column is converted to "category". \
    The remaining dtype conversion is still applied.
    '''

    # Validate Inputs
    _validate_input_bool(category, 'Category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    cat_exclude = [] if cat_exclude is None else cat_exclude.copy()

    data = pd.DataFrame(data).copy()
    n_rows = data.shape[0]

    # Category inference needs at least one row, otherwise the ratio below would divide by zero.
    infer_categories = category and n_rows > 0

    for col in data.columns:
        # The cheap checks come first so the unique count is only computed for columns that can be converted.
        if (infer_categories and
                col not in cat_exclude and
                data[col].dtype == 'object' and
                data[col].nunique(dropna=False) / n_rows < cat_threshold):
            data[col] = data[col].astype('category')
        data[col] = data[col].convert_dtypes(infer_objects=True, convert_string=True,
                                             convert_integer=False, convert_boolean=True)

    return data
61
62
63
def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
    '''
    Removes rows and columns that consist only of NA-values and, if the thresholds are lowered, additionally \
    removes columns and rows whose ratio of NA-values exceeds the respective threshold.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 1
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 1
        Drop rows with NA-ratio above the specified threshold.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    Notes
    -----
    Completely empty rows and columns are removed first. Afterwards columns above the column threshold are \
    dropped, and the row ratios are evaluated on the data that remains.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)

    frame = pd.DataFrame(data).copy()
    frame = frame.dropna(axis=0, how='all')
    frame = frame.dropna(axis=1, how='all')

    # Columns whose NA-ratio exceeds the column threshold
    col_mask = _missing_vals(frame)['mv_cols_ratio'] > drop_threshold_cols
    sparse_cols = frame.loc[:, col_mask].columns
    frame = frame.drop(columns=sparse_cols)

    # Row ratios are recomputed on the reduced frame
    row_mask = _missing_vals(frame)['mv_rows_ratio'] > drop_threshold_rows
    sparse_rows = frame.loc[row_mask, :].index

    return frame.drop(index=sparse_rows)
97
98
99
def data_cleaning(data, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True,
                  convert_dtypes=True, category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
    '''
    Perform initial data cleaning tasks on a dataset, such as dropping empty rows and columns, single valued \
        columns and duplicate rows, as well as optimizing the datatypes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 0.9
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.convert_dtypes().

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold. Requires convert_dtypes=True

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None} default 'changes'
        Specify verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    See Also
    --------
    convert_datatypes: Convert columns to best possible dtypes.
    drop_missing : Flexibly drop columns and rows.
    _memory_usage: Gives the total memory usage in kilobytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)
    _validate_input_bool(drop_duplicates, 'drop_duplicates')
    _validate_input_bool(convert_dtypes, 'convert_dtypes')
    _validate_input_bool(category, 'category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)

    # Columns holding a single distinct value (NA counted as a value) are dropped
    single_val_cols = data_cleaned.columns[data_cleaned.nunique(dropna=False) == 1].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    # Must exist even when duplicates are kept, because it is always passed to the report below.
    dupl_rows = []
    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)

    if convert_dtypes:
        data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,
                                         cat_exclude=cat_exclude)

    _diff_report(data, data_cleaned, dupl_rows=dupl_rows, single_val_cols=single_val_cols, show=show)

    return data_cleaned
175
176
177
class DataCleaner(BaseEstimator, TransformerMixin):
    '''
    Transformer wrapping data_cleaning() so it can be used as a step of a cleaning pipeline.

    Parameters
    ----------
    drop_threshold_cols: float, default 0.9
    drop_threshold_rows: float, default 0.9
    drop_duplicates: bool, default True
    convert_dtypes: bool, default True
    category: bool, default True
    cat_threshold: float, default 0.03
    cat_exclude: list, default None
    show: {'all', 'changes', None} default 'changes'

    All parameters are stored unchanged and handed to data_cleaning() when transform() is called.
    '''

    def __init__(
        self,
        drop_threshold_cols=0.9,
        drop_threshold_rows=0.9,
        drop_duplicates=True,
        convert_dtypes=True,
        category=True,
        cat_threshold=0.03,
        cat_exclude=None,
        show='changes',
    ):
        # Plain attribute storage only; nothing is validated or computed here.
        self.drop_threshold_cols = drop_threshold_cols
        self.drop_threshold_rows = drop_threshold_rows
        self.drop_duplicates = drop_duplicates
        self.convert_dtypes = convert_dtypes
        self.category = category
        self.cat_threshold = cat_threshold
        self.cat_exclude = cat_exclude
        self.show = show

    def fit(self, data, target=None):
        '''Nothing is learned from the data; returns the instance itself.'''
        return self

    def transform(self, data, target=None):
        '''Run data_cleaning() on data with the stored parameters and return its result. target is unused.'''
        options = {
            'drop_threshold_cols': self.drop_threshold_cols,
            'drop_threshold_rows': self.drop_threshold_rows,
            'drop_duplicates': self.drop_duplicates,
            'convert_dtypes': self.convert_dtypes,
            'category': self.category,
            'cat_threshold': self.cat_threshold,
            'cat_exclude': self.cat_exclude,
            'show': self.show,
        }
        return data_cleaning(data, **options)
201