klib.clean   A

Complexity

Total Complexity: 36

Size/Duplication

Total Lines: 530
Duplicated Lines: 0 %

Importance

Changes: 0

Metric   Value
eloc     179    (effective lines of code)
dl       0      (duplicated lines)
loc      530    (lines of code)
rs       9.52
c        0
b        0
f        0
wmc      36     (weighted method count)

7 Functions

Rating  Name                      Duplication  Size  Complexity
A       optimize_ints()           0            5     1
A       drop_missing()            0            34    1
A       optimize_floats()         0            5     1
B       convert_datatypes()       0            46    7
A       data_cleaning()           0            79    3
B       mv_col_handling()         0            78    7
B       pool_duplicate_subsets()  0            81    7

9 Methods

Rating  Name                      Duplication  Size  Complexity
A       MVColHandler.__init__()   0            7     1
A       DataCleaner.__init__()    0            10    1
A       DataCleaner.transform()   0            6     1
A       MVColHandler.fit()        0            2     1
A       MVColHandler.transform()  0            10    1
A       DataCleaner.fit()         0            2     1
A       SubsetPooler.transform()  0            7     1
A       SubsetPooler.fit()        0            2     1
A       SubsetPooler.__init__()   0            5     1
'''
Functions for data cleaning.

:author: Andreas Kanz
'''

# Imports
import itertools
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

from .describe import corr_mat
from .utils import (_diff_report,
                    _drop_duplicates,
                    _missing_vals,
                    _validate_input_bool,
                    _validate_input_range)


__all__ = ['convert_datatypes',
           'data_cleaning',
           'drop_missing',
           'mv_col_handling']


def optimize_ints(data):
    # Downcast int64 columns to the smallest integer dtype that holds the data
    data = pd.DataFrame(data).copy()
    ints = data.select_dtypes(include=['int64']).columns.tolist()
    data[ints] = data[ints].apply(pd.to_numeric, downcast='integer')
    return data


def optimize_floats(data):
    # Downcast float64 columns to the smallest float dtype that holds the data
    data = pd.DataFrame(data).copy()
    floats = data.select_dtypes(include=['float64']).columns.tolist()
    data[floats] = data[floats].apply(pd.to_numeric, downcast='float')
    return data


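# Usage sketch (illustrative, not part of the original module): downcasting
# shrinks memory usage without changing any values. "df" is a hypothetical
# input frame.
#
#   df = pd.DataFrame({'a': [1, 2, 3], 'b': [0.25, 0.5, 0.75]})
#   optimize_ints(df)['a'].dtype    # int8 instead of int64
#   optimize_floats(df)['b'].dtype  # float32 instead of float64
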
def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=None):
    '''
    Converts columns to the best possible dtypes using dtypes supporting pd.NA. Temporarily not converting to \
        integers due to an issue in pandas. This is expected to be fixed in pandas 1.1. \
        See https://github.com/pandas-dev/pandas/issues/33803

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    category: bool, default True
        Change dtypes of columns with dtype "object" to "category". Set the threshold using cat_threshold or \
        exclude columns using cat_exclude.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and the column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    Returns
    -------
    data: Pandas DataFrame
    '''

    # Validate Inputs
    _validate_input_bool(category, 'category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    cat_exclude = [] if cat_exclude is None else cat_exclude.copy()

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if (category and
            unique_vals_ratio < cat_threshold and
            col not in cat_exclude and
                data[col].dtype == 'object'):
            data[col] = data[col].astype('category')
        data[col] = data[col].convert_dtypes(infer_objects=True, convert_string=True,
                                             convert_integer=False, convert_boolean=True)

    data = optimize_ints(data)
    data = optimize_floats(data)

    return data


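# Usage sketch (illustrative): low-cardinality object columns become
# "category", strings become the nullable "string" dtype, and numeric
# columns are downcast. "df" is a hypothetical input frame.
#
#   df = pd.DataFrame({'sex': ['m', 'f'] * 500, 'age': range(1000)})
#   convert_datatypes(df).dtypes  # sex: category, age: downcast integer
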
def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
    '''
    Drops completely empty columns and rows by default and optionally provides flexibility to loosen restrictions \
    to drop additional columns and rows based on the fraction of remaining NA-values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    drop_threshold_cols: float, default 1
        Drop columns with an NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 1
        Drop rows with an NA-ratio above the specified threshold.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    Notes
    -----
    Columns are dropped first. Rows are dropped based on the remaining data.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)

    data = pd.DataFrame(data).copy()
    data = data.dropna(axis=0, how='all').dropna(axis=1, how='all')
    data = data.drop(columns=data.loc[:, _missing_vals(data)['mv_cols_ratio'] > drop_threshold_cols].columns)
    data_cleaned = data.drop(index=data.loc[_missing_vals(data)['mv_rows_ratio'] > drop_threshold_rows, :].index)

    return data_cleaned


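# Usage sketch (illustrative): with the default thresholds, only rows and
# columns that are entirely NA are dropped. "df" is a hypothetical frame.
#
#   df = pd.DataFrame({'a': [1, 2, None],
#                      'b': [1, None, None],
#                      'c': [None, None, None]})
#   drop_missing(df)                           # drops the all-NA column 'c' and the all-NA last row
#   drop_missing(df, drop_threshold_cols=0.4)  # additionally drops 'b' (50% NA after the empties are gone)
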
def data_cleaning(data, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True,
                  convert_dtypes=True, category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
    '''
    Perform initial data cleaning tasks on a dataset, such as dropping empty rows and columns, dropping \
        single-valued columns and optimizing the datatypes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    drop_threshold_cols: float, default 0.9
        Drop columns with an NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with an NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.DataFrame.convert_dtypes().

    category: bool, default True
        Enable changing dtypes of 'object' columns to "category". Set the threshold using cat_threshold. Requires \
        convert_dtypes=True.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and the column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None}, default 'changes'
        Specify the verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    See Also
    --------
    convert_datatypes: Convert columns to the best possible dtypes.
    drop_missing: Flexibly drop columns and rows.
    _memory_usage: Gives the total memory usage in kilobytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)
    _validate_input_bool(drop_duplicates, 'drop_duplicates')
    _validate_input_bool(convert_dtypes, 'convert_dtypes')
    _validate_input_bool(category, 'category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)

    # Drop columns that contain only a single value
    single_val_cols = data_cleaned.columns[data_cleaned.nunique(dropna=False) == 1].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    dupl_rows = None

    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)
    if convert_dtypes:
        data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,
                                         cat_exclude=cat_exclude)

    _diff_report(data, data_cleaned, dupl_rows=dupl_rows, single_val_cols=single_val_cols, show=show)

    return data_cleaned


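# Usage sketch (illustrative): one call chains drop_missing(), duplicate
# removal and convert_datatypes(), and prints a diff report. "df" is a
# hypothetical raw DataFrame.
#
#   df_clean = data_cleaning(df)                      # prints a report of the changes
#   df_clean = data_cleaning(df, show=None)           # silent
#   df_clean = data_cleaning(df, cat_exclude=['id'])  # keep 'id' non-categorical
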
class DataCleaner(BaseEstimator, TransformerMixin):
    '''
    Wrapper for data_cleaning(). Allows data_cleaning() to be put into a pipeline with similar \
    functions (e.g. using MVColHandler() or SubsetPooler()).

    Parameters
    ----------
    drop_threshold_cols: float, default 0.9
        Drop columns with an NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with an NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.DataFrame.convert_dtypes().

    category: bool, default True
        Change dtypes of columns to "category". Set the threshold using cat_threshold. Requires convert_dtypes=True.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and the column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None}, default 'changes'
        Specify the verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame
    '''

    def __init__(self, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True, convert_dtypes=True,
                 category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
        self.drop_threshold_cols = drop_threshold_cols
        self.drop_threshold_rows = drop_threshold_rows
        self.drop_duplicates = drop_duplicates
        self.convert_dtypes = convert_dtypes
        self.category = category
        self.cat_threshold = cat_threshold
        self.cat_exclude = cat_exclude
        self.show = show

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data_cleaned = data_cleaning(data, drop_threshold_cols=self.drop_threshold_cols,
                                     drop_threshold_rows=self.drop_threshold_rows,
                                     drop_duplicates=self.drop_duplicates, convert_dtypes=self.convert_dtypes,
                                     category=self.category, cat_threshold=self.cat_threshold,
                                     cat_exclude=self.cat_exclude, show=self.show)
        return data_cleaned


def mv_col_handling(data, target=None, mv_threshold=0.1, corr_thresh_features=0.5, corr_thresh_target=0.3,
                    return_details=False):
    '''
    Converts columns with a high ratio of missing values into binary features and potentially drops them based on \
    their correlation with other features and the target variable. This function follows a three-step process:
    - 1) Identify features with a high ratio of missing values (above 'mv_threshold').
    - 2) Identify high correlations of these features among themselves and with other features in the dataset \
         (above 'corr_thresh_features').
    - 3) Features with a high ratio of missing values and high correlation among each other are dropped unless \
         they correlate reasonably well with the target variable (above 'corr_thresh_target').

    Note: If no target is provided, the process exits after step two and drops the columns identified up to this point.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Specify the target for correlation, e.g. the label column, to generate only the correlations between each \
        feature and the label.

    mv_threshold: float, default 0.1
        Value between 0 <= threshold <= 1. Features with a missing-value ratio larger than mv_threshold are \
        candidates for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.5
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified feature (with a high \
        mv-ratio) is allowed to have with another feature. If this threshold is exceeded, the feature undergoes \
        further analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. a feature \
        with a high mv-ratio and a high correlation to another existing feature) with the target. If this \
        threshold is not met, the feature is ultimately dropped.

    return_details: bool, default False
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: Updated Pandas DataFrame

    optional:
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    '''

    # Validate Inputs
    _validate_input_range(mv_threshold, 'mv_threshold', 0, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()
    mv_ratios = _missing_vals(data_local)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()

    # Step 1: binarize the high-mv columns (1 = value present, 0 = missing)
    data_local[cols_mv] = data_local[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0)

    # Step 2: collect binarized columns that correlate strongly with another feature
    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    # Step 3: drop the identified features unless they correlate sufficiently with the target
    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        corrs = corr_mat(data_local, target=target, colored=False).loc[high_corr_features]
        drop_cols = corrs.loc[abs(corrs.iloc[:, 0]) < corr_thresh_target].index.tolist()
        data = data.drop(columns=drop_cols)

    if return_details:
        return data, cols_mv, drop_cols

    return data


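# Usage sketch (illustrative): "df" and the label column name 'label' are
# assumptions for the example.
#
#   reduced = mv_col_handling(df, target='label')
#   reduced, cols_mv, drop_cols = mv_col_handling(df, target='label',
#                                                 return_details=True)
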
class MVColHandler(BaseEstimator, TransformerMixin):
    '''
    Wrapper for mv_col_handling(). Allows mv_col_handling() to be put into a pipeline with similar \
    functions (e.g. using DataCleaner() or SubsetPooler()).

    Parameters
    ----------
    target: string, list, np.array or pd.Series, default None
        Specify the target for correlation, e.g. the label column, to generate only the correlations between each \
        feature and the label.

    mv_threshold: float, default 0.1
        Value between 0 <= threshold <= 1. Features with a missing-value ratio larger than mv_threshold are \
        candidates for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified feature (with a high \
        mv-ratio) is allowed to have with another feature. If this threshold is exceeded, the feature undergoes \
        further analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. a feature \
        with a high mv-ratio and a high correlation to another existing feature) with the target. If this \
        threshold is not met, the feature is ultimately dropped.

    return_details: bool, default True
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: Updated Pandas DataFrame
    '''

    def __init__(self, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3,
                 return_details=True):
        self.target = target
        self.mv_threshold = mv_threshold
        self.corr_thresh_features = corr_thresh_features
        self.corr_thresh_target = corr_thresh_target
        self.return_details = return_details

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data, cols_mv, dropped_cols = mv_col_handling(data, target=self.target, mv_threshold=self.mv_threshold,
                                                      corr_thresh_features=self.corr_thresh_features,
                                                      corr_thresh_target=self.corr_thresh_target,
                                                      return_details=self.return_details)

        print(f'\nFeatures with MV-ratio > {self.mv_threshold}: {len(cols_mv)}')
        print('Features dropped:', len(dropped_cols), dropped_cols)

        return data


def pool_duplicate_subsets(data, col_dupl_thresh=0.2, subset_thresh=0.2, min_col_pool=3, exclude=None,
                           return_details=False):
    '''
    Checks for duplicates in subsets of columns and pools them. This can reduce the number of columns in the data \
    without losing much information. Suitable columns are combined to subsets and tested for duplicates. If \
    sufficient duplicates are found, the respective columns are aggregated into a 'pooled_var' column. Identical \
    numbers in the 'pooled_var' column indicate identical information in the respective rows.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    col_dupl_thresh: float, default 0.2
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
        Columns with a lower ratio are not considered for pooling.

    subset_thresh: float, default 0.2
        The first subset with a duplicate ratio higher than 'subset_thresh' is chosen and aggregated. If no subset \
        reaches the threshold, the algorithm continues with progressively smaller subsets until 'min_col_pool' is \
        reached.

    min_col_pool: integer, default 3
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible into \
        suitable subsets and stops when 'min_col_pool' is reached.

    exclude: list, default None
        List of column names to be excluded from the analysis. These columns are passed through without modification.

    return_details: bool, default False
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: pd.DataFrame

    optional:
    subset_cols: List of columns used as subset.
    '''

    # Input validation
    _validate_input_range(col_dupl_thresh, 'col_dupl_thresh', 0, 1)
    _validate_input_range(subset_thresh, 'subset_thresh', 0, 1)
    _validate_input_range(min_col_pool, 'min_col_pool', 0, data.shape[1])

    excluded_cols = []
    if exclude is not None:
        excluded_cols = data[exclude]
        data = data.drop(columns=exclude)

    subset_cols = []
    # Start with the largest possible subset and shrink it by one column per iteration
    for i in range(data.shape[1] + 1 - min_col_pool):
        # Candidate columns: those with a sufficiently high ratio of duplicated values
        check_list = [col for col in data.columns if data.duplicated(subset=col).mean() > col_dupl_thresh]

        if len(check_list) > 0:
            combinations = itertools.combinations(check_list, len(check_list) - i)
        else:
            continue

        # Duplicate ratio for every column combination of the current subset size
        ratios = [*map(lambda comb: data.duplicated(subset=list(comb)).mean(), combinations)]

        max_ratio = max(ratios)
        max_idx = np.argmax(ratios)

        if max_ratio > subset_thresh:
            # Pick the best combination and replace it with a single 'pooled_vars' column
            best_subset = itertools.islice(itertools.combinations(
                check_list, len(check_list) - i), max_idx, max_idx + 1)
            best_subset = data[list(list(best_subset)[0])]
            subset_cols = best_subset.columns.tolist()

            unique_subset = best_subset.drop_duplicates().reset_index().rename(columns={'index': 'pooled_vars'})
            data = data.merge(unique_subset, how='left', on=best_subset.columns.tolist()
                              ).drop(columns=best_subset.columns.tolist())
            data.index = pd.RangeIndex(len(data))
            break

    data = pd.concat([data, pd.DataFrame(excluded_cols)], axis=1)

    if return_details:
        return data, subset_cols

    return data


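# Usage sketch (illustrative): "df" is a hypothetical wide frame with
# redundant columns; 'label' is an assumed column to pass through untouched.
#
#   pooled = pool_duplicate_subsets(df, exclude=['label'])
#   pooled, subset_cols = pool_duplicate_subsets(df, return_details=True)
#   # subset_cols lists the columns replaced by the 'pooled_vars' column
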
class SubsetPooler(BaseEstimator, TransformerMixin):
    '''
    Wrapper for pool_duplicate_subsets(). Allows pool_duplicate_subsets() to be put into a pipeline with similar \
    functions (e.g. using DataCleaner() or MVColHandler()).

    Parameters
    ----------
    col_dupl_thresh: float, default 0.2
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
        Columns with a lower ratio are not considered for pooling.

    subset_thresh: float, default 0.2
        The first subset with a duplicate ratio higher than 'subset_thresh' is chosen and aggregated. If no subset \
        reaches the threshold, the algorithm continues with progressively smaller subsets until 'min_col_pool' is \
        reached.

    min_col_pool: integer, default 3
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible into \
        suitable subsets and stops when 'min_col_pool' is reached.

    return_details: bool, default True
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: pd.DataFrame
    '''

    def __init__(self, col_dupl_thresh=0.2, subset_thresh=0.2, min_col_pool=3, return_details=True):
        self.col_dupl_thresh = col_dupl_thresh
        self.subset_thresh = subset_thresh
        self.min_col_pool = min_col_pool
        self.return_details = return_details

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        # Use the instance parameters instead of hard-coded values; details are
        # always requested because the combined columns are reported below.
        data, subset_cols = pool_duplicate_subsets(
            data, col_dupl_thresh=self.col_dupl_thresh, subset_thresh=self.subset_thresh,
            min_col_pool=self.min_col_pool, return_details=True)

        print('Combined columns:', len(subset_cols), subset_cols)

        return data
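
# Usage sketch (illustrative): the wrappers are scikit-learn compatible
# transformers, so they can be chained. "df" and the label column name
# 'label' are assumptions for the example.
#
#   from sklearn.pipeline import Pipeline
#
#   pipe = Pipeline([('clean', DataCleaner()),
#                    ('mv_handling', MVColHandler(target='label')),
#                    ('pooling', SubsetPooler())])
#   df_processed = pipe.fit_transform(df)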
530