Passed: Push to master ( deb24b...b3e4ef ) by Andreas, created 01:25

klib.clean.mv_col_handling() (rated B)

Complexity
    Conditions: 7

Size
    Total Lines: 77
    Code Lines: 26

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0
Metric   Value
cc       7
eloc     26
nop      6
dl       0
loc      77
rs       7.856
c        0
b        0
f        0

How to fix: Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.
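A minimal sketch of the Extract Method refactoring (the function names below are invented for this illustration and do not come from the repository):

    # Before: one method that needs a comment to explain a sub-step
    def print_report(rows):
        # print the header
        title = f'Report ({len(rows)} rows)'
        print(title)
        print('-' * len(title))
        for row in rows:
            print(row)

    # After: the commented block becomes a small, well-named helper
    def print_header(rows):
        title = f'Report ({len(rows)} rows)'
        print(title)
        print('-' * len(title))

    def print_report(rows):
        print_header(rows)
        for row in rows:
            print(row)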

'''
Functions for data cleaning.

:author: Andreas Kanz

'''

# Imports
import itertools
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

from .describe import corr_mat
from .utils import (_diff_report,
                    _drop_duplicates,
                    _missing_vals,
                    _validate_input_bool,
                    _validate_input_range)


__all__ = ['convert_datatypes',
           'data_cleaning',
           'drop_missing',
           'mv_col_handling']

def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=None):
    '''
    Converts columns to best possible dtypes using dtypes supporting pd.NA. Temporarily not converting integers.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    category: bool, default True
        Change dtypes of columns with dtype "object" to "category". Set threshold using cat_threshold or exclude \
        columns using cat_exclude.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    Returns
    -------
    data: Pandas DataFrame
    '''

    # Validate Inputs
    _validate_input_bool(category, 'Category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    cat_exclude = [] if cat_exclude is None else cat_exclude.copy()

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if (category and
            unique_vals_ratio < cat_threshold and
            col not in cat_exclude and
                data[col].dtype == 'object'):
            data[col] = data[col].astype('category')
        data[col] = data[col].convert_dtypes(infer_objects=True, convert_string=True,
                                             convert_integer=False, convert_boolean=True)

    return data

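# Illustrative usage sketch (hypothetical data, not part of the library source):
#   df = pd.DataFrame({'city': ['Berlin'] * 95 + ['Paris'] * 5,
#                      'name': [f'person_{i}' for i in range(100)]})
#   df = convert_datatypes(df)
# 'city' (unique-value ratio 0.02 < cat_threshold) would become 'category', while
# 'name' (ratio 1.0) would simply be converted to the nullable 'string' dtype.
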
def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
    '''
    Drops completely empty columns and rows by default and optionally provides flexibility to loosen restrictions to \
    drop additional columns and rows based on the fraction of remaining NA-values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 1
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 1
        Drop rows with NA-ratio above the specified threshold.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    Notes
    -----
    Columns are dropped first. Rows are dropped based on the remaining data.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)

    data = pd.DataFrame(data).copy()
    data = data.dropna(axis=0, how='all').dropna(axis=1, how='all')
    data = data.drop(columns=data.loc[:, _missing_vals(data)['mv_cols_ratio'] > drop_threshold_cols].columns)
    data_cleaned = data.drop(index=data.loc[_missing_vals(data)['mv_rows_ratio'] > drop_threshold_rows, :].index)

    return data_cleaned

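# Illustrative usage sketch (hypothetical data, not part of the library source):
#   df = pd.DataFrame({'a': [1, 2, 3], 'b': [None, None, None]})
#   drop_missing(df)
# With the default thresholds only completely empty columns and rows are removed, so 'b'
# is dropped and all rows of 'a' are kept; lowering drop_threshold_cols/_rows would also
# remove partially filled columns/rows whose NA-ratio exceeds the chosen threshold.
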
def data_cleaning(data, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True,
                  convert_dtypes=True, category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
    '''
    Perform initial data cleaning tasks on a dataset, such as dropping single-valued and empty rows, empty \
    columns as well as optimizing the datatypes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 0.9
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.DataFrame.convert_dtypes().

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold. Requires convert_dtypes=True.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None}, default 'changes'
        Specify verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    See Also
    --------
    convert_datatypes: Convert columns to best possible dtypes.
    drop_missing: Flexibly drop columns and rows.
    _memory_usage: Gives the total memory usage in kilobytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)
    _validate_input_bool(drop_duplicates, 'drop_duplicates')
    _validate_input_bool(convert_dtypes, 'convert_dtypes')
    _validate_input_bool(category, 'category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)

    single_val_cols = data_cleaned.columns[data_cleaned.nunique(dropna=False) == 1].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    dupl_rows = None

    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)
    if convert_dtypes:
        data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,
                                         cat_exclude=cat_exclude)

    _diff_report(data, data_cleaned, dupl_rows=dupl_rows, single_val_cols=single_val_cols, show=show)

    return data_cleaned

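# Illustrative usage sketch (hypothetical call, not part of the library source):
#   df_clean = data_cleaning(df)
# With the defaults this drops columns and rows that are more than 90 % NA, removes
# single-valued columns and duplicate rows, converts the remaining dtypes and prints
# a summary of the changes (show='changes').
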
class DataCleaner(BaseEstimator, TransformerMixin):
    '''
    Wrapper for data_cleaning(). Allows data_cleaning() to be put into a pipeline with similar
    functions (e.g. using MVColHandler() or SubsetPooler()).

    Parameters
    ----------
    drop_threshold_cols: float, default 0.9
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.DataFrame.convert_dtypes().

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold. Requires convert_dtypes=True.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None}, default 'changes'
        Specify verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame
    '''

    def __init__(self, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True, convert_dtypes=True,
                 category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
        self.drop_threshold_cols = drop_threshold_cols
        self.drop_threshold_rows = drop_threshold_rows
        self.drop_duplicates = drop_duplicates
        self.convert_dtypes = convert_dtypes
        self.category = category
        self.cat_threshold = cat_threshold
        self.cat_exclude = cat_exclude
        self.show = show

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data_cleaned = data_cleaning(data, drop_threshold_cols=self.drop_threshold_cols,
                                     drop_threshold_rows=self.drop_threshold_rows,
                                     drop_duplicates=self.drop_duplicates, convert_dtypes=self.convert_dtypes,
                                     category=self.category, cat_threshold=self.cat_threshold,
                                     cat_exclude=self.cat_exclude, show=self.show)
        return data_cleaned

def mv_col_handling(data, target=None, mv_threshold=0.1, corr_thresh_features=0.5, corr_thresh_target=0.3,
                    return_details=False):
    '''
    Converts columns with a high ratio of missing values into binary features and eventually drops them based on
    their correlation with other features and the target variable. This function follows a three-step process:
    - 1) Identify features with a high ratio of missing values.
    - 2) Identify high correlations of these features among themselves and with other features in the dataset.
    - 3) Features with a high ratio of missing values and high correlation among each other are dropped unless
         they correlate reasonably well with the target variable.

    Note: If no target is provided, the process exits after step two and drops columns identified up to this point.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Specify target for correlation, i.e. the label column to generate only the correlations between each feature
        and the label.

    mv_threshold: float, default 0.1
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are candidates
        for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.5
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified feature (with a high mv-ratio)
        is allowed to have with another feature. If this threshold is overstepped, the feature undergoes further
        analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. a feature with a
        high mv-ratio and high correlation to another existing feature) with the target. If this threshold is not met,
        the feature is ultimately dropped.

    return_details: bool, default False
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: Updated Pandas DataFrame

    optional:
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    '''

    # Validate Inputs
    _validate_input_range(mv_threshold, 'mv_threshold', 0, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()
    mv_ratios = _missing_vals(data_local)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    data_local[cols_mv] = data_local[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0)

    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        corrs = corr_mat(data_local, target=target, colored=False).loc[high_corr_features]
        drop_cols = corrs.loc[abs(corrs.iloc[:, 0]) < corr_thresh_target].index.tolist()
        data = data.drop(columns=drop_cols)

    if return_details:
        return data, cols_mv, drop_cols

    return data

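# Illustrative usage sketch (hypothetical call, 'label' is a placeholder column name):
#   reduced, cols_mv, dropped = mv_col_handling(df, target='label', return_details=True)
# Columns with more than 10 % missing values are binarized, columns among them that
# correlate strongly (> 0.5) with another feature are checked against the target, and
# those whose correlation with 'label' stays below 0.3 are dropped.
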
class MVColHandler(BaseEstimator, TransformerMixin):
    '''
    Wrapper for mv_col_handling(). Allows mv_col_handling() to be put into a pipeline with similar
    functions (e.g. using DataCleaner() or SubsetPooler()).

    Parameters
    ----------
    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. E.g. the label column to generate only the correlations between each feature \
        and the label.

    mv_threshold: float, default 0.1
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are candidates \
        for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified feature with a high mv-ratio is \
        allowed to have with another feature. If this threshold is overstepped, the feature undergoes further analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. a feature with a \
        high mv-ratio and high correlation to another existing feature) with the target. If this threshold is not met, \
        the feature is ultimately dropped.

    return_details: bool, default True
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: Updated Pandas DataFrame
    '''

    def __init__(self, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3,
                 return_details=True):
        self.target = target
        self.mv_threshold = mv_threshold
        self.corr_thresh_features = corr_thresh_features
        self.corr_thresh_target = corr_thresh_target
        self.return_details = return_details

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data, cols_mv, dropped_cols = mv_col_handling(data, target=self.target, mv_threshold=self.mv_threshold,
                                                      corr_thresh_features=self.corr_thresh_features,
                                                      corr_thresh_target=self.corr_thresh_target,
                                                      return_details=self.return_details)

        print(f'\nFeatures with MV-ratio > {self.mv_threshold}: {len(cols_mv)}')
        print('Features dropped:', len(dropped_cols), dropped_cols)

        return data

def pool_duplicate_subsets(data, col_dupl_thresh=0.2, subset_thresh=0.2, min_col_pool=3, return_details=False):
    '''
    Checks for duplicates in subsets of columns and pools them. This reduces the number of columns in the data without \
    losing any information. Suitable columns are combined into subsets and tested for duplicates. In case sufficient \
    duplicates can be found, the respective columns are aggregated into a 'pooled_vars' column. Identical numbers in \
    the 'pooled_vars' column indicate identical information in the respective rows.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    col_dupl_thresh: float, default 0.2
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
        Columns with a lower ratio are not considered for pooling.

    subset_thresh: float, default 0.2
        The first subset with a duplicate ratio higher than 'subset_thresh' is chosen and aggregated. If no subset \
        reaches the threshold, the algorithm continues with continuously smaller subsets until 'min_col_pool' is \
        reached.

    min_col_pool: integer, default 3
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible to suitable \
        subsets and stops when 'min_col_pool' is reached.

    return_details: bool, default False
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: pd.DataFrame

    optional:
    subset_cols: List of columns used as subset.
    '''

    # Input validation
    _validate_input_range(col_dupl_thresh, 'col_dupl_thresh', 0, 1)
    _validate_input_range(subset_thresh, 'subset_thresh', 0, 1)
    _validate_input_range(min_col_pool, 'min_col_pool', 0, data.shape[1])

    subset_cols = []
    for i in range(data.shape[1]+1-min_col_pool):
        check_list = []
        for col in data.columns:
            cdr = data.duplicated(subset=col).mean()
            if cdr > col_dupl_thresh:
                check_list.append(col)

        if len(check_list) > 0:
            combinations = itertools.combinations(check_list, len(check_list)-i)
        else:
            continue

        ratios = []
        for comb in combinations:
            ratios.append(data[list(comb)].duplicated().mean())

        max_ratio = max(ratios)
        max_idx = np.argmax(ratios)

        if max_ratio > subset_thresh:
            best_subset = itertools.islice(itertools.combinations(
                check_list, len(check_list)-i), max_idx, max_idx+1)
            best_subset = data[list(list(best_subset)[0])]
            subset_cols = best_subset.columns.tolist()

            unique_subset = best_subset.drop_duplicates().reset_index().rename(columns={'index': 'pooled_vars'})
            data = data.merge(unique_subset, how='inner', on=best_subset.columns.tolist()
                              ).drop(columns=best_subset.columns.tolist())
            data.index = pd.RangeIndex(len(data))
            break

    # Only return the intermediary subset_cols when explicitly requested, as documented above.
    if return_details:
        return data, subset_cols

    return data

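# Illustrative usage sketch (hypothetical call, not part of the library source):
#   pooled, subset_cols = pool_duplicate_subsets(df, return_details=True)
# Starting with the largest candidate subset, columns whose duplicate ratio exceeds
# col_dupl_thresh are tested together; the first subset whose rows are sufficiently
# duplicated is replaced by a single 'pooled_vars' key column.
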
class SubsetPooler(BaseEstimator, TransformerMixin):
    '''
    Wrapper for pool_duplicate_subsets(). Allows pool_duplicate_subsets() to be put into a pipeline with similar
    functions (e.g. using DataCleaner() or MVColHandler()).

    Parameters
    ----------
    col_dupl_thresh: float, default 0.2
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
        Columns with a lower ratio are not considered for pooling.

    subset_thresh: float, default 0.2
        The first subset with a duplicate ratio higher than 'subset_thresh' is chosen and aggregated. If no subset \
        reaches the threshold, the algorithm continues with continuously smaller subsets until 'min_col_pool' is \
        reached.

    min_col_pool: integer, default 3
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible to suitable \
        subsets and stops when 'min_col_pool' is reached.

    return_details: bool, default True
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: pd.DataFrame
    '''

    def __init__(self, col_dupl_thresh=0.2, subset_thresh=0.2, min_col_pool=3, return_details=True):
        self.col_dupl_thresh = col_dupl_thresh
        self.subset_thresh = subset_thresh
        self.min_col_pool = min_col_pool
        self.return_details = return_details

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        # Use the parameters stored on the instance instead of hard-coded values; details are
        # always requested here because the printed summary below needs subset_cols.
        data, subset_cols = pool_duplicate_subsets(
            data, col_dupl_thresh=self.col_dupl_thresh, subset_thresh=self.subset_thresh,
            min_col_pool=self.min_col_pool, return_details=True)

        print('Combined columns:', len(subset_cols), subset_cols)

        return data
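
Since all three wrapper classes implement fit() and transform(), they can be chained in a scikit-learn pipeline, which is the use case their docstrings describe. A minimal usage sketch, assuming an existing DataFrame df and a label column named 'label' (both placeholders, not part of this module):

    from sklearn.pipeline import Pipeline

    pipeline = Pipeline([
        ('cleaning', DataCleaner()),
        ('mv_handling', MVColHandler(target='label')),
        ('pooling', SubsetPooler()),
    ])

    df_processed = pipeline.fit_transform(df)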