Passed: Push to master (758151...a39973) by Andreas, created 01:26

klib.clean.pool_duplicate_subsets() (rated C)

Complexity
    Conditions: 9

Size
    Total Lines: 87
    Code Lines: 38

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric    Value
cc        9
eloc      38
nop       6
dl        0
loc       87
rs        6.6346
c         0
b         0
f         0

How to fix: Long Method

Small methods make your code easier to understand, particularly when combined with a good name. Conversely, when a method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a good sign that the commented part should be extracted into a new method, with the comment serving as a starting point for the new method's name.

Commonly applied refactorings include the Extract Method refactoring, sketched below.
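To make the advice concrete, here is a minimal before/after sketch of Extract Method in Python; the function and names are invented for illustration and do not come from klib:

# Before: one long function whose sections need comments to stay readable
def clean(df):
    # normalize column names
    df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]
    # drop completely empty rows
    return df.dropna(how='all')

# After: each commented section becomes a small, well-named helper
def normalize_column_names(df):
    df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]
    return df

def drop_empty_rows(df):
    return df.dropna(how='all')

def clean(df):
    return drop_empty_rows(normalize_column_names(df))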

'''
Functions for data cleaning.

:author: Andreas Kanz

'''

# Imports
import itertools
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

from .describe import corr_mat
from .utils import (_diff_report,
                    _drop_duplicates,
                    _missing_vals,
                    _validate_input_bool,
                    _validate_input_range)


__all__ = ['convert_datatypes',
           'data_cleaning',
           'drop_missing',
           'mv_col_handling']

def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=None):
    '''
    Converts columns to best possible dtypes using dtypes supporting pd.NA. Temporarily not converting integers.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    category: bool, default True
        Change dtypes of columns with dtype "object" to "category". Set threshold using cat_threshold or exclude \
        columns using cat_exclude.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    Returns
    -------
    data: Pandas DataFrame
    '''

    # Validate Inputs
    _validate_input_bool(category, 'Category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    cat_exclude = [] if cat_exclude is None else cat_exclude.copy()

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if (category and
            unique_vals_ratio < cat_threshold and
            col not in cat_exclude and
                data[col].dtype == 'object'):
            data[col] = data[col].astype('category')
        data[col] = data[col].convert_dtypes(infer_objects=True, convert_string=True,
                                             convert_integer=False, convert_boolean=True)

    return data

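A minimal usage sketch for convert_datatypes(); the example DataFrame is invented for illustration:

import pandas as pd
from klib.clean import convert_datatypes

df = pd.DataFrame({'fruit': ['apple', 'banana'] * 50,   # 2 unique values in 100 rows
                   'price': [0.5, 0.25] * 50,
                   'in_stock': [True, False] * 50})
converted = convert_datatypes(df, cat_threshold=0.05)
print(converted.dtypes)  # 'fruit' falls below the 0.05 unique-value ratio and becomes 'category'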
def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
    '''
    Drops completely empty columns and rows by default and optionally provides flexibility to loosen restrictions to \
    drop additional columns and rows based on the fraction of remaining NA-values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 1
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 1
        Drop rows with NA-ratio above the specified threshold.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    Notes
    -----
    Columns are dropped first. Rows are dropped based on the remaining data.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)

    data = pd.DataFrame(data).copy()
    data = data.dropna(axis=0, how='all').dropna(axis=1, how='all')
    data = data.drop(columns=data.loc[:, _missing_vals(data)['mv_cols_ratio'] > drop_threshold_cols].columns)
    data_cleaned = data.drop(index=data.loc[_missing_vals(data)['mv_rows_ratio'] > drop_threshold_rows, :].index)

    return data_cleaned

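A short example of drop_missing() on invented data:

import numpy as np
import pandas as pd
from klib.clean import drop_missing

df = pd.DataFrame({'a': [1, 2, np.nan, 4],
                   'b': [np.nan] * 4,                  # completely empty -> always dropped
                   'c': [1, np.nan, np.nan, np.nan]})
cleaned = drop_missing(df, drop_threshold_cols=0.5)    # 'c' exceeds the 50 % NA threshold
print(cleaned.columns.tolist())                        # ['a']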
def data_cleaning(data, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True,
                  convert_dtypes=True, category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
    '''
    Perform initial data cleaning tasks on a dataset, such as dropping single-valued and empty rows, empty \
    columns as well as optimizing the datatypes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 0.9
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.convert_dtypes().

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold. Requires convert_dtypes=True.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None}, default 'changes'
        Specify verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    See Also
    --------
    convert_datatypes: Convert columns to best possible dtypes.
    drop_missing: Flexibly drop columns and rows.
    _memory_usage: Gives the total memory usage in kilobytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)
    _validate_input_bool(drop_duplicates, 'drop_duplicates')
    _validate_input_bool(convert_dtypes, 'convert_datatypes')
    _validate_input_bool(category, 'category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)

    single_val_cols = data_cleaned.columns[data_cleaned.nunique(dropna=False) == 1].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    dupl_rows = None

    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)
    if convert_dtypes:
        data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,
                                         cat_exclude=cat_exclude)

    _diff_report(data, data_cleaned, dupl_rows=dupl_rows, single_val_cols=single_val_cols, show=show)

    return data_cleaned

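A usage sketch of data_cleaning(); the DataFrame below is invented and chosen so that each cleaning step has something to do:

import numpy as np
import pandas as pd
from klib.clean import data_cleaning

df = pd.DataFrame({'id': [1, 1, 2, 3],                 # rows 0 and 1 are exact duplicates
                   'const': ['x'] * 4,                 # single-valued -> dropped
                   'empty': [np.nan] * 4,              # completely empty -> dropped
                   'value': [10.0, 10.0, 20.0, np.nan]})
cleaned = data_cleaning(df, show=None)                 # show=None suppresses the diff report
print(cleaned)                                         # 3 rows, columns 'id' and 'value'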
class DataCleaner(BaseEstimator, TransformerMixin):
    '''
    Wrapper for data_cleaning(). Allows data_cleaning() to be put into a pipeline with similar \
    functions (e.g. using MVColHandler() or SubsetPooler()).

    Parameters
    ----------
    drop_threshold_cols: float, default 0.9
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.convert_dtypes().

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold. Requires convert_dtypes=True.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None}, default 'changes'
        Specify verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame
    '''

    def __init__(self, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True, convert_dtypes=True,
                 category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
        self.drop_threshold_cols = drop_threshold_cols
        self.drop_threshold_rows = drop_threshold_rows
        self.drop_duplicates = drop_duplicates
        self.convert_dtypes = convert_dtypes
        self.category = category
        self.cat_threshold = cat_threshold
        self.cat_exclude = cat_exclude
        self.show = show

    def fit(self, data, target=None):
        # Stateless transformer: nothing to fit
        return self

    def transform(self, data, target=None):
        data_cleaned = data_cleaning(data, drop_threshold_cols=self.drop_threshold_cols,
                                     drop_threshold_rows=self.drop_threshold_rows,
                                     drop_duplicates=self.drop_duplicates, convert_dtypes=self.convert_dtypes,
                                     category=self.category, cat_threshold=self.cat_threshold,
                                     cat_exclude=self.cat_exclude, show=self.show)
        return data_cleaned

def mv_col_handling(data, target=None, mv_threshold=0.1, corr_thresh_features=0.5, corr_thresh_target=0.3,
                    return_details=False):
    '''
    Converts columns with a high ratio of missing values into binary features and eventually drops them based on \
    their correlation with other features and the target variable. This function follows a three-step process:
    - 1) Identify features with a high ratio of missing values.
    - 2) Identify high correlations of these features among themselves and with other features in the dataset.
    - 3) Features with a high ratio of missing values and high correlation among each other are dropped unless \
         they correlate reasonably well with the target variable.

    Note: If no target is provided, the process exits after step two and drops columns identified up to this point.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. I.e. label column to generate only the correlations between each feature \
        and the label.

    mv_threshold: float, default 0.1
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are candidates \
        for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.5
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified feature (with a high mv-ratio) \
        is allowed to have with another feature. If this threshold is overstepped, the feature undergoes further \
        analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. feature with a \
        high mv-ratio and high correlation to another existing feature) with the target. If this threshold is not met, \
        the feature is ultimately dropped.

    return_details: bool, default False
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: Updated Pandas DataFrame

    optional:
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    '''

    # Validate Inputs
    _validate_input_range(mv_threshold, 'mv_threshold', 0, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()
    mv_ratios = _missing_vals(data_local)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    # Convert high-MV columns into binary indicators (1 = value present, 0 = missing)
    data_local[cols_mv] = data_local[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0)

    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        corrs = corr_mat(data_local, target=target, colored=False).loc[high_corr_features]
        drop_cols = corrs.loc[abs(corrs.iloc[:, 0]) < corr_thresh_target].index.tolist()
        data = data.drop(columns=drop_cols)

    if return_details:
        return data, cols_mv, drop_cols

    return data

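A sketch of mv_col_handling() on synthetic data where the missingness pattern itself carries signal; all names and values are invented for illustration:

import numpy as np
import pandas as pd
from klib.clean import mv_col_handling

rng = np.random.default_rng(42)
base = rng.normal(size=200)
df = pd.DataFrame({'base': base,
                   'sparse': np.where(base > 0, base, np.nan),  # missing whenever 'base' <= 0
                   'label': 2 * base + rng.normal(scale=0.1, size=200)})
data, cols_mv, drop_cols = mv_col_handling(df, target='label', return_details=True)
print(cols_mv)    # ['sparse']: high MV-ratio, so it enters the analysis
print(drop_cols)  # []: its binary indicator still correlates with the target, so it is kept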
class MVColHandler(BaseEstimator, TransformerMixin):
    '''
    Wrapper for mv_col_handling(). Allows mv_col_handling() to be put into a pipeline with similar \
    functions (e.g. using DataCleaner() or SubsetPooler()).

    Parameters
    ----------
    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. E.g. label column to generate only the correlations between each feature \
        and the label.

    mv_threshold: float, default 0.1
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are candidates \
        for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified feature with a high mv-ratio is \
        allowed to have with another feature. If this threshold is overstepped, the feature undergoes further analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. feature with a \
        high mv-ratio and high correlation to another existing feature) with the target. If this threshold is not met, \
        the feature is ultimately dropped.

    return_details: bool, default True
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: Updated Pandas DataFrame
    '''

    def __init__(self, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3,
                 return_details=True):
        self.target = target
        self.mv_threshold = mv_threshold
        self.corr_thresh_features = corr_thresh_features
        self.corr_thresh_target = corr_thresh_target
        self.return_details = return_details

    def fit(self, data, target=None):
        # Stateless transformer: nothing to fit
        return self

    def transform(self, data, target=None):
        data, cols_mv, dropped_cols = mv_col_handling(data, target=self.target, mv_threshold=self.mv_threshold,
                                                      corr_thresh_features=self.corr_thresh_features,
                                                      corr_thresh_target=self.corr_thresh_target,
                                                      return_details=self.return_details)

        print(f'\nFeatures with MV-ratio > {self.mv_threshold}: {len(cols_mv)}')
        print('Features dropped:', len(dropped_cols), dropped_cols)

        return data

def pool_duplicate_subsets(data, col_dupl_thresh=0.2, subset_thresh=0.2, min_col_pool=3, exclude=None,
                           return_details=False):
    '''
    Checks for duplicates in subsets of columns and pools them. This reduces the number of columns in the data without \
    losing any information. Suitable columns are combined to subsets and tested for duplicates. If sufficient \
    duplicates are found, the respective columns are aggregated into a 'pooled_vars' column. Identical numbers in \
    the 'pooled_vars' column indicate identical information in the respective rows.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    col_dupl_thresh: float, default 0.2
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
        Columns with a lower ratio are not considered for pooling.

    subset_thresh: float, default 0.2
        The first subset with a duplicate threshold higher than 'subset_thresh' is chosen and aggregated. If no subset \
        reaches the threshold, the algorithm continues with successively smaller subsets until 'min_col_pool' is \
        reached.

    min_col_pool: integer, default 3
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible to suitable \
        subsets and stops when 'min_col_pool' is reached.

    exclude: list, default None
        List of column names to be excluded from the analysis. These columns are passed through without modification.

    return_details: bool, default False
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: pd.DataFrame

    optional:
    subset_cols: List of columns used as subset.
    '''

    # Input validation
    _validate_input_range(col_dupl_thresh, 'col_dupl_thresh', 0, 1)
    _validate_input_range(subset_thresh, 'subset_thresh', 0, 1)
    _validate_input_range(min_col_pool, 'min_col_pool', 0, data.shape[1])

    excluded_cols = []
    if exclude is not None:
        excluded_cols = data[exclude]
        data = data.drop(columns=exclude)

    subset_cols = []
    for i in range(data.shape[1] + 1 - min_col_pool):
        # Consider only columns with a sufficient ratio of duplicate values
        check_list = []
        for col in data.columns:
            cdr = data.duplicated(subset=col).mean()
            if cdr > col_dupl_thresh:
                check_list.append(col)

        if len(check_list) > 0:
            combinations = itertools.combinations(check_list, len(check_list) - i)
        else:
            continue

        # Duplicate ratio for each candidate subset of the current size
        ratios = []
        for comb in combinations:
            ratios.append(data[list(comb)].duplicated().mean())

        max_ratio = max(ratios)
        max_idx = np.argmax(ratios)

        if max_ratio > subset_thresh:
            # Aggregate the best subset into a single 'pooled_vars' column
            best_subset = itertools.islice(itertools.combinations(
                check_list, len(check_list) - i), max_idx, max_idx + 1)
            best_subset = data[list(list(best_subset)[0])]
            subset_cols = best_subset.columns.tolist()

            unique_subset = best_subset.drop_duplicates().reset_index().rename(columns={'index': 'pooled_vars'})
            data = data.merge(unique_subset, how='left', on=best_subset.columns.tolist()
                              ).drop(columns=best_subset.columns.tolist())
            data.index = pd.RangeIndex(len(data))
            break

    data = pd.concat([data, pd.DataFrame(excluded_cols)], axis=1)

    if return_details:
        return data, subset_cols

    return data

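A minimal sketch of pool_duplicate_subsets(); the columns are invented so that 'city', 'state' and 'zip' carry redundant information:

import pandas as pd
from klib.clean import pool_duplicate_subsets

df = pd.DataFrame({'city': ['Berlin', 'Hamburg'] * 50,
                   'state': ['BE', 'HH'] * 50,
                   'zip': ['10115', '20095'] * 50,
                   'order_id': range(100)})
pooled, subset_cols = pool_duplicate_subsets(df, return_details=True)
print(subset_cols)                # ['city', 'state', 'zip']
print(pooled.columns.tolist())    # ['order_id', 'pooled_vars']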
class SubsetPooler(BaseEstimator, TransformerMixin):
    '''
    Wrapper for pool_duplicate_subsets(). Allows pool_duplicate_subsets() to be put into a pipeline with similar \
    functions (e.g. using DataCleaner() or MVColHandler()).

    Parameters
    ----------
    col_dupl_thresh: float, default 0.2
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
        Columns with a lower ratio are not considered for pooling.

    subset_thresh: float, default 0.2
        The first subset with a duplicate threshold higher than 'subset_thresh' is chosen and aggregated. If no subset \
        reaches the threshold, the algorithm continues with successively smaller subsets until 'min_col_pool' is \
        reached.

    min_col_pool: integer, default 3
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible to suitable \
        subsets and stops when 'min_col_pool' is reached.

    return_details: bool, default True
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: pd.DataFrame
    '''

    def __init__(self, col_dupl_thresh=0.2, subset_thresh=0.2, min_col_pool=3, return_details=True):
        self.col_dupl_thresh = col_dupl_thresh
        self.subset_thresh = subset_thresh
        self.min_col_pool = min_col_pool
        self.return_details = return_details

    def fit(self, data, target=None):
        # Stateless transformer: nothing to fit
        return self

    def transform(self, data, target=None):
        # Pass the instance attributes through instead of hard-coded values,
        # so parameters set in __init__ actually take effect. return_details
        # stays True because the tuple is unpacked below.
        data, subset_cols = pool_duplicate_subsets(
            data, col_dupl_thresh=self.col_dupl_thresh, subset_thresh=self.subset_thresh,
            min_col_pool=self.min_col_pool, return_details=True)

        print('Combined columns:', len(subset_cols), subset_cols)

        return data
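Since DataCleaner, MVColHandler and SubsetPooler all implement fit/transform, they can be chained in a scikit-learn Pipeline. An end-to-end sketch on invented data:

import pandas as pd
from sklearn.pipeline import Pipeline
from klib.clean import DataCleaner, MVColHandler, SubsetPooler

pipe = Pipeline([('clean', DataCleaner(show=None)),
                 ('mv_cols', MVColHandler()),
                 ('pool', SubsetPooler())])

df = pd.DataFrame({'a': [1, 1, 2, 2] * 25,
                   'b': ['x', 'x', 'y', 'y'] * 25,
                   'c': range(100)})
df_processed = pipe.fit_transform(df)   # 'a' and 'b' end up pooled into 'pooled_vars'
print(df_processed.head())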