GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( bfbbb5...96b70c )
by Andreas
01:50
created

klib.clean.DataCleaner.fit()   A

Complexity

Conditions 1

Size

Total Lines 2
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 3
dl 0
loc 2
rs 10
c 0
b 0
f 0
1
'''
Functions for data cleaning.

:author: Andreas Kanz

'''

# Imports
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

from .describe import corr_mat
from .utils import (_diff_report,
                    _drop_duplicates,
                    _missing_vals,
                    _validate_input_bool,
                    _validate_input_range)

__all__ = ['convert_datatypes',
           'data_cleaning',
           'drop_missing',
           'mv_col_handling']

def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=None):
    '''
    Converts columns to best possible dtypes using dtypes supporting pd.NA. Temporarily not converting integers.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    category: bool, default True
        Change dtypes of columns with dtype "object" to "category". Set threshold using cat_threshold or exclude \
        columns using cat_exclude.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    Returns
    -------
    data: Pandas DataFrame
    '''

    # Validate Inputs
    _validate_input_bool(category, 'Category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    # Work on a copy of the exclusion list so the caller's list is never mutated.
    excluded = [] if cat_exclude is None else cat_exclude.copy()

    data = pd.DataFrame(data).copy()
    n_rows = data.shape[0]
    for col in data.columns:
        # Cast low-cardinality object columns to "category" before the generic conversion.
        if category and col not in excluded and data[col].dtype == 'object':
            unique_ratio = data[col].nunique(dropna=False) / n_rows
            if unique_ratio < cat_threshold:
                data[col] = data[col].astype('category')
        # Integer conversion is intentionally disabled (see docstring).
        data[col] = data[col].convert_dtypes(infer_objects=True,
                                             convert_string=True,
                                             convert_integer=False,
                                             convert_boolean=True)

    return data

def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
    '''
    Drops completely empty columns and rows by default and optionally provides flexibility to loosen restrictions to \
    drop additional columns and rows based on the fraction of remaining NA-values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 1
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 1
        Drop rows with NA-ratio above the specified threshold.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    Notes
    -----
    Columns are dropped first. Rows are dropped based on the remaining data.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)

    frame = pd.DataFrame(data).copy()

    # Remove rows/columns that contain no data at all.
    frame = frame.dropna(axis=0, how='all')
    frame = frame.dropna(axis=1, how='all')

    # Threshold-based drops: columns first, then rows on the remaining data.
    cols_above_thresh = frame.loc[:, _missing_vals(frame)['mv_cols_ratio'] > drop_threshold_cols].columns
    frame = frame.drop(columns=cols_above_thresh)

    rows_above_thresh = frame.loc[_missing_vals(frame)['mv_rows_ratio'] > drop_threshold_rows, :].index
    data_cleaned = frame.drop(index=rows_above_thresh)

    return data_cleaned

def data_cleaning(data, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True,
                  convert_dtypes=True, category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
    '''
    Perform initial data cleaning tasks on a dataset, such as dropping single valued and empty rows, empty \
        columns as well as optimizing the datatypes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 0.9
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.convert_dtypes().

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold. Requires convert_dtypes=True

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None} default 'changes'
        Specify verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    See Also
    --------
    convert_datatypes: Convert columns to best possible dtypes.
    drop_missing : Flexibly drop columns and rows.
    _memory_usage: Gives the total memory usage in kilobytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)
    _validate_input_bool(drop_duplicates, 'drop_duplicates')
    _validate_input_bool(convert_dtypes, 'convert_datatypes')
    _validate_input_bool(category, 'category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)

    # Drop columns that hold a single value only (incl. all-NA treated as one value).
    single_val_cols = data_cleaned.columns[data_cleaned.nunique(dropna=False) == 1].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    # BUGFIX: dupl_rows was previously unbound when drop_duplicates=False,
    # causing a NameError in the _diff_report call below.
    dupl_rows = None
    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)
    if convert_dtypes:
        data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,
                                         cat_exclude=cat_exclude)

    _diff_report(data, data_cleaned, dupl_rows=dupl_rows, single_val_cols=single_val_cols, show=show)

    return data_cleaned

class DataCleaner(BaseEstimator, TransformerMixin):
    '''
    Scikit-learn compatible transformer wrapping data_cleaning().

    Possible component of a cleaning pipeline, e.g. followed by MVColHandler.
    All constructor arguments are stored unchanged and forwarded to
    data_cleaning() in transform(); see that function for their meaning.
    '''
    # NOTE: the original class carried two placeholder string literals here;
    # the second was a dead no-op statement and has been removed.

    def __init__(self, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True, convert_dtypes=True,
                 category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
        self.drop_threshold_cols = drop_threshold_cols
        self.drop_threshold_rows = drop_threshold_rows
        self.drop_duplicates = drop_duplicates
        self.convert_dtypes = convert_dtypes
        self.category = category
        self.cat_threshold = cat_threshold
        self.cat_exclude = cat_exclude
        self.show = show

    def fit(self, data, target=None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, data, target=None):
        # Delegate the full cleaning pipeline to data_cleaning().
        data_cleaned = data_cleaning(data,
                                     drop_threshold_cols=self.drop_threshold_cols,
                                     drop_threshold_rows=self.drop_threshold_rows,
                                     drop_duplicates=self.drop_duplicates,
                                     convert_dtypes=self.convert_dtypes,
                                     category=self.category,
                                     cat_threshold=self.cat_threshold,
                                     cat_exclude=self.cat_exclude,
                                     show=self.show)
        return data_cleaned

def mv_col_handling(data, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3):
    '''
    Converts columns with a high ratio of missing values into binary features and eventually drops them based on \
    their correlation with other features and the target variable. This function follows a three step process:
    - 1) Identify features with a high ratio of missing values
    - 2) Identify high correlations of these features among themselves and with other features in the dataset.
    - 3) Features with high ratio of missing values and high correlation among each other are dropped unless \
         they correlate reasonably well with the target variable.

    Note: If no target is provided, the process exits after step two and drops columns identified up to this point.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. E.g. label column to generate only the correlations between each feature \
        and the label.

    mv_threshold: float, default 0.1
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are candidates \
        for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified features with a high mv-ratio is\
         allowed to have with another feature. If this threshold is overstepped, the feature undergoes further analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. feature with a \
        high mv-ratio and high correlation to another existing feature) with the target. If this threshold is not met \
        the feature is ultimately dropped.

    Returns
    -------
    data: Updated Pandas DataFrame
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    '''

    # Validate Inputs
    _validate_input_range(mv_threshold, 'mv_threshold', 0, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1)

    data = pd.DataFrame(data).copy()

    # Step 1: binarize candidate columns (non-missing -> 1, missing -> 0)
    # on a working copy so the original data stays intact.
    binary_view = data.copy()
    mv_ratios = _missing_vals(binary_view)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    binary_view[cols_mv] = binary_view[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0)

    # Step 2: flag candidates whose strongest correlation with any other
    # feature exceeds the threshold, removing each flagged column before
    # evaluating the next candidate.
    high_corr_features = []
    remaining = binary_view.copy()
    for col in cols_mv:
        corrmat = corr_mat(remaining, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            remaining = remaining.drop(columns=[col])

    # Step 3: without a target, drop all flagged columns; with a target,
    # keep those that correlate well enough with it.
    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        for col in high_corr_features:
            if pd.DataFrame(binary_view[col]).corrwith(target)[0] < corr_thresh_target:
                drop_cols.append(col)
                data = data.drop(columns=[col])

    return data, cols_mv, drop_cols

class MVColHandler(BaseEstimator, TransformerMixin):
    '''
    Scikit-learn compatible transformer wrapping mv_col_handling().

    Possible component of a cleaning pipeline --> follows DataCleaning.
    '''

    def __init__(self, target=None, mch_mv_thresh=0.1, mch_feature_thresh=0.6, mch_target_thresh=0.3):
        self.target = target
        self.mch_mv_thresh = mch_mv_thresh
        self.mch_feature_thresh = mch_feature_thresh
        self.mch_target_thresh = mch_target_thresh

    def fit(self, data, target=None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, data, target=None):
        # Delegate to mv_col_handling() and report what was dropped.
        cleaned, cols_mv, dropped_cols = mv_col_handling(
            data,
            target=self.target,
            mv_threshold=self.mch_mv_thresh,
            corr_thresh_features=self.mch_feature_thresh,
            corr_thresh_target=self.mch_target_thresh)

        print(f'\nFeatures with MV-ratio > {self.mch_mv_thresh}: {len(cols_mv)}')
        print('Features dropped:', len(dropped_cols), dropped_cols)

        return cleaned
299