
klib.clean (Rating: A)

Complexity
    Total Complexity: 38

Size/Duplication
    Total Lines: 534
    Duplicated Lines: 0%

Importance
    Changes: 0
Metric   Value
eloc     185
dl       0
loc      534
rs       9.36
c        0
b        0
f        0
wmc      38

9 Methods

Rating  Name                      Duplication  Size  Complexity
A       MVColHandler.__init__()   0            7     1
A       DataCleaner.__init__()    0            10    1
A       DataCleaner.transform()   0            6     1
A       MVColHandler.fit()        0            2     1
A       MVColHandler.transform()  0            10    1
A       SubsetPooler.transform()  0            7     1
A       SubsetPooler.fit()        0            2     1
A       SubsetPooler.__init__()   0            5     1
A       DataCleaner.fit()         0            2     1

7 Functions

Rating  Name                      Duplication  Size  Complexity
C       pool_duplicate_subsets()  0            87    9
A       optimize_ints()           0            5     1
A       drop_missing()            0            34    1
A       optimize_floats()         0            5     1
B       convert_datatypes()       0            46    7
A       data_cleaning()           0            78    3
B       mv_col_handling()         0            77    7
'''
Functions for data cleaning.

:author: Andreas Kanz
'''

# Imports
import itertools
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

from .describe import corr_mat
from .utils import (_diff_report,
                    _drop_duplicates,
                    _missing_vals,
                    _validate_input_bool,
                    _validate_input_range)


__all__ = ['convert_datatypes',
           'data_cleaning',
           'drop_missing',
           'mv_col_handling']


def optimize_ints(data):
    # Downcast int64 columns to the smallest integer dtype that fits the values.
    data = pd.DataFrame(data).copy()
    ints = data.select_dtypes(include=['int64']).columns.tolist()
    data[ints] = data[ints].apply(pd.to_numeric, downcast='integer')
    return data


def optimize_floats(data):
    # Downcast float64 columns to float32 where the values survive the cast unchanged.
    data = pd.DataFrame(data).copy()
    floats = data.select_dtypes(include=['float64']).columns.tolist()
    data[floats] = data[floats].apply(pd.to_numeric, downcast='float')
    return data


def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=None):
    '''
    Converts columns to the best possible dtypes using dtypes that support pd.NA. Temporarily not converting to \
        integers due to an issue in pandas. This is expected to be fixed in pandas 1.1. \
        See https://github.com/pandas-dev/pandas/issues/33803

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    category: bool, default True
        Change dtypes of columns with dtype "object" to "category". Set the threshold using cat_threshold or exclude \
        columns using cat_exclude.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and the column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    Returns
    -------
    data: Pandas DataFrame
    '''

    # Validate Inputs
    _validate_input_bool(category, 'category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    cat_exclude = [] if cat_exclude is None else cat_exclude.copy()

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if (category and
            unique_vals_ratio < cat_threshold and
            col not in cat_exclude and
                data[col].dtype == 'object'):
            data[col] = data[col].astype('category')
        data[col] = data[col].convert_dtypes(infer_objects=True, convert_string=True,
                                             convert_integer=False, convert_boolean=True)

    data = optimize_ints(data)
    data = optimize_floats(data)

    return data
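
# Usage sketch (illustrative only; `df` is a hypothetical DataFrame and the resulting
# dtypes depend on the installed pandas version):
# df = pd.DataFrame({'city': ['Berlin', 'Hamburg'] * 50, 'price': [1.5, 2.5] * 50})
# df = convert_datatypes(df)
# df.dtypes  # 'city' -> category (unique ratio 0.02 < 0.05), 'price' -> downcast float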


def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
    '''
    Drops completely empty columns and rows by default and optionally provides flexibility to loosen restrictions to \
    drop additional columns and rows based on the fraction of remaining NA-values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    drop_threshold_cols: float, default 1
        Drop columns with an NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 1
        Drop rows with an NA-ratio above the specified threshold.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    Notes
    -----
    Columns are dropped first. Rows are dropped based on the remaining data.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)

    data = pd.DataFrame(data).copy()
    data = data.dropna(axis=0, how='all').dropna(axis=1, how='all')
    data = data.drop(columns=data.loc[:, _missing_vals(data)['mv_cols_ratio'] > drop_threshold_cols].columns)
    data_cleaned = data.drop(index=data.loc[_missing_vals(data)['mv_rows_ratio'] > drop_threshold_rows, :].index)

    return data_cleaned
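
# Usage sketch (illustrative; values chosen to show the thresholds):
# df = pd.DataFrame({'a': [1, 2, np.nan, 4], 'b': [1, np.nan, np.nan, np.nan]})
# drop_missing(df)                           # drops only the all-NaN row (index 2)
# drop_missing(df, drop_threshold_cols=0.5)  # additionally drops 'b' (2/3 NaN > 0.5)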


def data_cleaning(data, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True,
                  convert_dtypes=True, category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
    '''
    Performs initial data cleaning tasks on a dataset, such as dropping empty rows and columns as well as \
        single-valued columns, dropping duplicate rows and optimizing the datatypes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    drop_threshold_cols: float, default 0.9
        Drop columns with an NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with an NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.convert_dtypes().

    category: bool, default True
        Change dtypes of columns to "category". Set the threshold using cat_threshold. Requires convert_dtypes=True.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and the column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None}, default 'changes'
        Specify the verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about the changes.
        * 'changes': Print out the differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    See Also
    --------
    convert_datatypes: Convert columns to the best possible dtypes.
    drop_missing: Flexibly drop columns and rows.
    _memory_usage: Gives the total memory usage in kilobytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)
    _validate_input_bool(drop_duplicates, 'drop_duplicates')
    _validate_input_bool(convert_dtypes, 'convert_dtypes')
    _validate_input_bool(category, 'category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)

    # Drop columns with only a single unique value (they carry no information)
    single_val_cols = data_cleaned.columns[data_cleaned.nunique(dropna=False) == 1].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    dupl_rows = None

    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)
    if convert_dtypes:
        data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,
                                         cat_exclude=cat_exclude)

    _diff_report(data, data_cleaned, dupl_rows=dupl_rows, single_val_cols=single_val_cols, show=show)

    return data_cleaned
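
# Usage sketch (illustrative; `df` is any messy, hypothetical DataFrame):
# df_cleaned = data_cleaning(df)             # drop >90% NA columns/rows, dedupe, convert dtypes
# df_cleaned = data_cleaning(df, show=None)  # same, without the printed diff report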


class DataCleaner(BaseEstimator, TransformerMixin):
    '''
    Wrapper for data_cleaning(). Allows data_cleaning() to be put into a pipeline with similar \
    functions (e.g. using MVColHandler() or SubsetPooler()).

    Parameters
    ----------
    drop_threshold_cols: float, default 0.9
        Drop columns with an NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with an NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.convert_dtypes().

    category: bool, default True
        Change dtypes of columns to "category". Set the threshold using cat_threshold. Requires convert_dtypes=True.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and the column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None}, default 'changes'
        Specify the verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about the changes.
        * 'changes': Print out the differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame
    '''

    def __init__(self, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True, convert_dtypes=True,
                 category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
        self.drop_threshold_cols = drop_threshold_cols
        self.drop_threshold_rows = drop_threshold_rows
        self.drop_duplicates = drop_duplicates
        self.convert_dtypes = convert_dtypes
        self.category = category
        self.cat_threshold = cat_threshold
        self.cat_exclude = cat_exclude
        self.show = show

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data_cleaned = data_cleaning(data, drop_threshold_cols=self.drop_threshold_cols,
                                     drop_threshold_rows=self.drop_threshold_rows,
                                     drop_duplicates=self.drop_duplicates, convert_dtypes=self.convert_dtypes,
                                     category=self.category, cat_threshold=self.cat_threshold,
                                     cat_exclude=self.cat_exclude, show=self.show)
        return data_cleaned
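
# Pipeline sketch (assumes scikit-learn; `df` is a hypothetical DataFrame):
# from sklearn.pipeline import Pipeline
# pipe = Pipeline([('cleaning', DataCleaner(drop_threshold_cols=0.85))])
# df_cleaned = pipe.fit_transform(df)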


def mv_col_handling(data, target=None, mv_threshold=0.1, corr_thresh_features=0.5, corr_thresh_target=0.3,
                    return_details=False):
    '''
    Converts columns with a high ratio of missing values into binary features and potentially drops them, based on \
    their correlation with other features and the target variable. This function follows a three-step process:
    - 1) Identify features with a high ratio of missing values.
    - 2) Identify high correlations of these features among themselves and with other features in the dataset.
    - 3) Features with a high ratio of missing values and high correlation among each other are dropped unless \
         they correlate reasonably well with the target variable.

    Note: If no target is provided, the process exits after step two and drops columns identified up to this point.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Specify a target for correlation, i.e. a label column to generate only the correlations between each feature \
        and the label.

    mv_threshold: float, default 0.1
        Value in the range 0 <= threshold <= 1. Features with a missing-value ratio larger than mv_threshold are \
        candidates for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.5
        Value in the range 0 <= threshold <= 1. Maximum correlation a previously identified feature (with a high \
        mv-ratio) is allowed to have with another feature. If this threshold is exceeded, the feature undergoes \
        further analysis.

    corr_thresh_target: float, default 0.3
        Value in the range 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. a feature \
        with a high mv-ratio and high correlation with another existing feature) with the target. If this threshold \
        is not met, the feature is ultimately dropped.

    return_details: bool, default False
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: Updated Pandas DataFrame

    optional:
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    '''

    # Validate Inputs
    _validate_input_range(mv_threshold, 'mv_threshold', 0, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()

    # Step 1: binarize columns with a high missing-value ratio (1 = value present, 0 = missing)
    mv_ratios = _missing_vals(data_local)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    data_local[cols_mv] = data_local[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0)

    # Step 2: flag binarized columns that correlate strongly with another feature
    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    # Step 3: keep flagged columns only if they correlate reasonably well with the target
    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        corrs = corr_mat(data_local, target=target, colored=False).loc[high_corr_features]
        drop_cols = corrs.loc[abs(corrs.iloc[:, 0]) < corr_thresh_target].index.tolist()
        data = data.drop(columns=drop_cols)

    if return_details:
        return data, cols_mv, drop_cols

    return data
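
# Usage sketch (illustrative; `df` and the 'label' column are hypothetical):
# df_reduced, cols_mv, dropped = mv_col_handling(df, target='label', return_details=True)
# cols_mv  # columns with an MV-ratio above mv_threshold that entered the analysis
# dropped  # the subset of those columns that was ultimately dropped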


class MVColHandler(BaseEstimator, TransformerMixin):
    '''
    Wrapper for mv_col_handling(). Allows mv_col_handling() to be put into a pipeline with similar \
    functions (e.g. using DataCleaner() or SubsetPooler()).

    Parameters
    ----------
    target: string, list, np.array or pd.Series, default None
        Specify a target for correlation, e.g. a label column to generate only the correlations between each feature \
        and the label.

    mv_threshold: float, default 0.1
        Value in the range 0 <= threshold <= 1. Features with a missing-value ratio larger than mv_threshold are \
        candidates for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value in the range 0 <= threshold <= 1. Maximum correlation a previously identified feature with a high \
        mv-ratio is allowed to have with another feature. If this threshold is exceeded, the feature undergoes \
        further analysis.

    corr_thresh_target: float, default 0.3
        Value in the range 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. a feature \
        with a high mv-ratio and high correlation with another existing feature) with the target. If this threshold \
        is not met, the feature is ultimately dropped.

    return_details: bool, default True
        Provides flexibility to return intermediary results. Must remain True for transform() to unpack the \
        detailed results.

    Returns
    -------
    data: Updated Pandas DataFrame
    '''

    def __init__(self, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3,
                 return_details=True):
        self.target = target
        self.mv_threshold = mv_threshold
        self.corr_thresh_features = corr_thresh_features
        self.corr_thresh_target = corr_thresh_target
        self.return_details = return_details

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data, cols_mv, dropped_cols = mv_col_handling(data, target=self.target, mv_threshold=self.mv_threshold,
                                                      corr_thresh_features=self.corr_thresh_features,
                                                      corr_thresh_target=self.corr_thresh_target,
                                                      return_details=self.return_details)

        print(f'\nFeatures with MV-ratio > {self.mv_threshold}: {len(cols_mv)}')
        print('Features dropped:', len(dropped_cols), dropped_cols)

        return data
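
# Pipeline sketch (assumes scikit-learn; the 'label' target column is hypothetical):
# from sklearn.pipeline import Pipeline
# pipe = Pipeline([('mv_handling', MVColHandler(target='label'))])
# df_reduced = pipe.fit_transform(df)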


def pool_duplicate_subsets(data, col_dupl_thresh=0.2, subset_thresh=0.2, min_col_pool=3, exclude=None,
                           return_details=False):
    '''
    Checks for duplicates in subsets of columns and pools them. This reduces the number of columns in the data \
    without losing any information. Suitable columns are combined into subsets and tested for duplicates. If \
    sufficient duplicates are found, the respective columns are aggregated into a 'pooled_vars' column. Identical \
    numbers in the 'pooled_vars' column indicate identical information in the respective rows.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    col_dupl_thresh: float, default 0.2
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
        Columns with a lower ratio are not considered for pooling.

    subset_thresh: float, default 0.2
        The first subset with a duplicate ratio higher than 'subset_thresh' is chosen and aggregated. If no subset \
        reaches the threshold, the algorithm continues with continuously smaller subsets until 'min_col_pool' is \
        reached.

    min_col_pool: integer, default 3
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible into \
        suitable subsets and stops when 'min_col_pool' is reached.

    exclude: list, default None
        List of column names to be excluded from the analysis. These columns are passed through without modification.

    return_details: bool, default False
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: pd.DataFrame

    optional:
    subset_cols: List of columns used as subset.
    '''

    # Input validation
    _validate_input_range(col_dupl_thresh, 'col_dupl_thresh', 0, 1)
    _validate_input_range(subset_thresh, 'subset_thresh', 0, 1)
    _validate_input_range(min_col_pool, 'min_col_pool', 0, data.shape[1])

    excluded_cols = []
    if exclude is not None:
        excluded_cols = data[exclude]
        data = data.drop(columns=exclude)

    subset_cols = []
    # Try the largest possible subsets first, then shrink them until 'min_col_pool' is reached
    for i in range(data.shape[1] + 1 - min_col_pool):
        # Candidate columns: those with a sufficiently high ratio of duplicated values
        check_list = []
        for col in data.columns:
            cdr = data.duplicated(subset=col).mean()
            if cdr > col_dupl_thresh:
                check_list.append(col)

        if len(check_list) > 0:
            combinations = itertools.combinations(check_list, len(check_list) - i)
        else:
            continue

        # Duplicate ratio for every candidate subset of the current size
        ratios = []
        for comb in combinations:
            ratios.append(data[list(comb)].duplicated().mean())

        max_ratio = max(ratios)
        max_idx = np.argmax(ratios)

        if max_ratio > subset_thresh:
            # Pool the best subset into a single 'pooled_vars' column
            best_subset = itertools.islice(itertools.combinations(
                check_list, len(check_list) - i), max_idx, max_idx + 1)
            best_subset = data[list(list(best_subset)[0])]
            subset_cols = best_subset.columns.tolist()

            unique_subset = best_subset.drop_duplicates().reset_index().rename(columns={'index': 'pooled_vars'})
            data = data.merge(unique_subset, how='left', on=best_subset.columns.tolist()
                              ).drop(columns=best_subset.columns.tolist())
            data.index = pd.RangeIndex(len(data))
            break

    data = pd.concat([data, pd.DataFrame(excluded_cols)], axis=1)

    if return_details:
        return data, subset_cols

    return data
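
# Usage sketch (illustrative; a tiny frame where 'a', 'b' and 'c' carry redundant information):
# df = pd.DataFrame({'a': [1, 1, 2, 2], 'b': [3, 3, 4, 4], 'c': [5, 5, 6, 6], 'd': range(4)})
# pooled, subset_cols = pool_duplicate_subsets(df, return_details=True)
# subset_cols  # ['a', 'b', 'c'], replaced by a single 'pooled_vars' column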


class SubsetPooler(BaseEstimator, TransformerMixin):
    '''
    Wrapper for pool_duplicate_subsets(). Allows pool_duplicate_subsets() to be put into a pipeline with similar \
    functions (e.g. using DataCleaner() or MVColHandler()).

    Parameters
    ----------
    col_dupl_thresh: float, default 0.2
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
        Columns with a lower ratio are not considered for pooling.

    subset_thresh: float, default 0.2
        The first subset with a duplicate ratio higher than 'subset_thresh' is chosen and aggregated. If no subset \
        reaches the threshold, the algorithm continues with continuously smaller subsets until 'min_col_pool' is \
        reached.

    min_col_pool: integer, default 3
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible into \
        suitable subsets and stops when 'min_col_pool' is reached.

    return_details: bool, default True
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: pd.DataFrame
    '''

    def __init__(self, col_dupl_thresh=0.2, subset_thresh=0.2, min_col_pool=3, return_details=True):
        self.col_dupl_thresh = col_dupl_thresh
        self.subset_thresh = subset_thresh
        self.min_col_pool = min_col_pool
        self.return_details = return_details

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        # Pass the stored thresholds through instead of hard-coded values; return_details
        # must be True here so subset_cols can be unpacked below.
        data, subset_cols = pool_duplicate_subsets(data, col_dupl_thresh=self.col_dupl_thresh,
                                                   subset_thresh=self.subset_thresh,
                                                   min_col_pool=self.min_col_pool, return_details=True)

        print('Combined columns:', len(subset_cols), subset_cols)

        return data
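
# Pipeline sketch chaining all three transformers (assumes scikit-learn; `df` and 'label' are hypothetical):
# from sklearn.pipeline import Pipeline
# pipe = Pipeline([('cleaning', DataCleaner()),
#                  ('mv_handling', MVColHandler(target='label')),
#                  ('pooling', SubsetPooler())])
# df_transformed = pipe.fit_transform(df)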