Passed — Push to master ( 125b89...3ddec6 ) by Andreas, created 03:38

klib.clean.SubsetPooler.fit()    Rating: A

Complexity:    Conditions 1
Size:          Total Lines 2, Code Lines 2
Duplication:   Lines 0, Ratio 0 %
Importance:    Changes 0

Metric   Value
cc       1
eloc     2
nop      3
dl       0
loc      2
rs       10
c        0
b        0
f        0
'''
Functions for data cleaning.

:author: Andreas Kanz

'''

# Imports
import itertools
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

from .describe import corr_mat
from .utils import (_diff_report,
                    _drop_duplicates,
                    _missing_vals,
                    _validate_input_bool,
                    _validate_input_range)


__all__ = ['convert_datatypes',
           'data_cleaning',
           'drop_missing',
           'mv_col_handling']


def convert_datatypes(data, category=True, cat_threshold=0.05, cat_exclude=None):
    '''
    Converts columns to the best possible dtypes using dtypes that support pd.NA. Integers are temporarily \
    not converted.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    category: bool, default True
        Change dtypes of columns with dtype "object" to "category". Set the threshold using cat_threshold or exclude \
        columns using cat_exclude.

    cat_threshold: float, default 0.05
        Ratio of unique values below which categories are inferred and the column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    Returns
    -------
    data: Pandas DataFrame
    '''

    # Validate Inputs
    _validate_input_bool(category, 'category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    cat_exclude = [] if cat_exclude is None else cat_exclude.copy()

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if (category and
            unique_vals_ratio < cat_threshold and
            col not in cat_exclude and
                data[col].dtype == 'object'):
            data[col] = data[col].astype('category')
        data[col] = data[col].convert_dtypes(infer_objects=True, convert_string=True,
                                             convert_integer=False, convert_boolean=True)

    return data

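# Usage sketch (illustrative, not part of the module): with only two distinct
# values across 300 rows, 'grade' falls below cat_threshold and is converted
# to a categorical dtype, while the float column is handled by
# convert_dtypes(). Column names and data are made up for the example.
#
#   df = pd.DataFrame({'grade': ['a', 'b', 'a'] * 100,
#                      'score': [1.0, 2.5, 3.0] * 100})
#   convert_datatypes(df).dtypes  # 'grade' -> category
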
def drop_missing(data, drop_threshold_cols=1, drop_threshold_rows=1):
    '''
    Drops completely empty columns and rows by default and optionally provides flexibility to loosen restrictions to \
    drop additional columns and rows based on the fraction of remaining NA-values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    drop_threshold_cols: float, default 1
        Drop columns with an NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 1
        Drop rows with an NA-ratio above the specified threshold.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    Notes
    -----
    Columns are dropped first. Rows are dropped based on the remaining data.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)

    data = pd.DataFrame(data).copy()
    data = data.dropna(axis=0, how='all').dropna(axis=1, how='all')
    data = data.drop(columns=data.loc[:, _missing_vals(data)['mv_cols_ratio'] > drop_threshold_cols].columns)
    data_cleaned = data.drop(index=data.loc[_missing_vals(data)['mv_rows_ratio'] > drop_threshold_rows, :].index)

    return data_cleaned

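# Usage sketch (illustrative): completely empty rows and columns are always
# dropped; lowering the thresholds also drops 'a', which is still two-thirds
# missing after the empty row and column are removed. The toy data below is
# made up for the example.
#
#   df = pd.DataFrame({'a': [1, None, None, None],
#                      'b': [1, 2, 3, None],
#                      'c': [None] * 4})
#   drop_missing(df)  # drops empty column 'c' and the all-NA last row
#   drop_missing(df, drop_threshold_cols=0.5, drop_threshold_rows=0.5)  # also drops 'a'
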
def data_cleaning(data, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True,
                  convert_dtypes=True, category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
    '''
    Perform initial data cleaning tasks on a dataset, such as dropping single-valued and empty columns, \
    empty rows, as well as optimizing the datatypes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    drop_threshold_cols: float, default 0.9
        Drop columns with an NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with an NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.DataFrame.convert_dtypes().

    category: bool, default True
        Change dtypes of columns to "category". Set the threshold using cat_threshold. Requires convert_dtypes=True.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and the column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None}, default 'changes'
        Specify the verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame

    See Also
    --------
    convert_datatypes: Convert columns to the best possible dtypes.
    drop_missing: Flexibly drop columns and rows.
    _memory_usage: Gives the total memory usage in kilobytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.
    '''

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, 'drop_threshold_cols', 0, 1)
    _validate_input_range(drop_threshold_rows, 'drop_threshold_rows', 0, 1)
    _validate_input_bool(drop_duplicates, 'drop_duplicates')
    _validate_input_bool(convert_dtypes, 'convert_dtypes')
    _validate_input_bool(category, 'category')
    _validate_input_range(cat_threshold, 'cat_threshold', 0, 1)

    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)

    # Drop columns that contain only a single value
    single_val_cols = data_cleaned.columns[data_cleaned.nunique(dropna=False) == 1].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    dupl_rows = None

    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)
    if convert_dtypes:
        data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,
                                         cat_exclude=cat_exclude)

    _diff_report(data, data_cleaned, dupl_rows=dupl_rows, single_val_cols=single_val_cols, show=show)

    return data_cleaned

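# Usage sketch (illustrative): a single call chains drop_missing(), duplicate
# removal and convert_datatypes(), and prints a before/after report depending
# on 'show'.
#
#   df_clean = data_cleaning(df, drop_threshold_cols=0.9, drop_threshold_rows=0.9,
#                            show='changes')
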
class DataCleaner(BaseEstimator, TransformerMixin):
    '''
    Wrapper for data_cleaning(). Allows data_cleaning() to be put into a pipeline with similar
    functions (e.g. MVColHandler()).

    Parameters
    ----------
    drop_threshold_cols: float, default 0.9
        Drop columns with an NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with an NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.DataFrame.convert_dtypes().

    category: bool, default True
        Change dtypes of columns to "category". Set the threshold using cat_threshold. Requires convert_dtypes=True.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and the column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: {'all', 'changes', None}, default 'changes'
        Specify the verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame
    '''

    def __init__(self, drop_threshold_cols=0.9, drop_threshold_rows=0.9, drop_duplicates=True, convert_dtypes=True,
                 category=True, cat_threshold=0.03, cat_exclude=None, show='changes'):
        self.drop_threshold_cols = drop_threshold_cols
        self.drop_threshold_rows = drop_threshold_rows
        self.drop_duplicates = drop_duplicates
        self.convert_dtypes = convert_dtypes
        self.category = category
        self.cat_threshold = cat_threshold
        self.cat_exclude = cat_exclude
        self.show = show

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data_cleaned = data_cleaning(data, drop_threshold_cols=self.drop_threshold_cols,
                                     drop_threshold_rows=self.drop_threshold_rows,
                                     drop_duplicates=self.drop_duplicates, convert_dtypes=self.convert_dtypes,
                                     category=self.category, cat_threshold=self.cat_threshold,
                                     cat_exclude=self.cat_exclude, show=self.show)
        return data_cleaned

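# Usage sketch (illustrative): since DataCleaner implements fit() and
# transform(), it works standalone or as a step in a scikit-learn Pipeline
# (see the combined example at the end of the module).
#
#   cleaner = DataCleaner(drop_threshold_cols=0.85, show=None)
#   df_clean = cleaner.fit_transform(df)
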
def mv_col_handling(data, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3):
    '''
    Converts columns with a high ratio of missing values into binary features and drops them based on \
    their correlation with other features and the target variable. This function follows a three-step process:
    - 1) Identify features with a high ratio of missing values.
    - 2) Identify high correlations of these features among themselves and with other features in the dataset.
    - 3) Features with a high ratio of missing values and high correlation among each other are dropped unless \
         they correlate reasonably well with the target variable.

    Note: If no target is provided, the process exits after step two and drops the columns identified up to this point.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Specify the target for correlation. E.g. a label column to generate only the correlations between each \
        feature and the label.

    mv_threshold: float, default 0.1
        Value between 0 and 1. Features with a missing-value-ratio larger than mv_threshold are candidates \
        for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value between 0 and 1. Maximum correlation a previously identified feature with a high mv-ratio is \
        allowed to have with another feature. If this threshold is exceeded, the feature undergoes further analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 and 1. Minimum required correlation of a remaining feature (i.e. a feature with a \
        high mv-ratio and high correlation with another existing feature) with the target. If this threshold is not \
        met, the feature is ultimately dropped.

    Returns
    -------
    data: Updated Pandas DataFrame
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    '''

    # Validate Inputs
    _validate_input_range(mv_threshold, 'mv_threshold', 0, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()
    mv_ratios = _missing_vals(data_local)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    # Binarize the candidate columns: 1 where a value is present, 0 where it is missing
    data_local[cols_mv] = data_local[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0)

    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        for col in high_corr_features:
            # Keep the feature only if it correlates sufficiently with the target
            if pd.DataFrame(data_local[col]).corrwith(target)[0] < corr_thresh_target:
                drop_cols.append(col)
                data = data.drop(columns=[col])

    return data, cols_mv, drop_cols

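# Usage sketch (illustrative): 'label' stands in for a target column and is
# made up for the example. Columns with more than 10% missing values are
# binarized and dropped if they correlate strongly (> 0.6) with another
# feature but only weakly (< 0.3) with the target.
#
#   data, cols_mv, drop_cols = mv_col_handling(df, target=df['label'],
#                                              mv_threshold=0.1,
#                                              corr_thresh_features=0.6,
#                                              corr_thresh_target=0.3)
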
class MVColHandler(BaseEstimator, TransformerMixin):
    '''
    Wrapper for mv_col_handling(). Allows mv_col_handling() to be put into a pipeline with similar
    functions (e.g. DataCleaner()).

    Parameters
    ----------
    target: string, list, np.array or pd.Series, default None
        Specify the target for correlation. E.g. a label column to generate only the correlations between each \
        feature and the label.

    mv_threshold: float, default 0.1
        Value between 0 and 1. Features with a missing-value-ratio larger than mv_threshold are candidates \
        for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value between 0 and 1. Maximum correlation a previously identified feature with a high mv-ratio is \
        allowed to have with another feature. If this threshold is exceeded, the feature undergoes further analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 and 1. Minimum required correlation of a remaining feature (i.e. a feature with a \
        high mv-ratio and high correlation with another existing feature) with the target. If this threshold is not \
        met, the feature is ultimately dropped.

    Returns
    -------
    data: Updated Pandas DataFrame
    '''

    def __init__(self, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3):
        self.target = target
        self.mv_threshold = mv_threshold
        self.corr_thresh_features = corr_thresh_features
        self.corr_thresh_target = corr_thresh_target

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data, cols_mv, dropped_cols = mv_col_handling(data, target=self.target, mv_threshold=self.mv_threshold,
                                                      corr_thresh_features=self.corr_thresh_features,
                                                      corr_thresh_target=self.corr_thresh_target)

        print(f'\nFeatures with MV-ratio > {self.mv_threshold}: {len(cols_mv)}')
        print('Features dropped:', len(dropped_cols), dropped_cols)

        return data

def pool_duplicate_subsets(data, col_dupl_thresh=0.2, subset_thresh=0.2, min_col_pool=3):
    '''
    Checks for duplicates in subsets of columns and pools them. This reduces the number of columns in the data \
    without losing any information. Suitable columns are combined into subsets and tested for duplicates. If \
    sufficient duplicates are found, the respective columns are aggregated into a 'pooled_vars' column. Identical \
    numbers in the 'pooled_vars' column indicate identical information in the respective rows.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    col_dupl_thresh: float, default 0.2
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
        Columns with a lower ratio are not considered for pooling.

    subset_thresh: float, default 0.2
        The first subset with a duplicate ratio higher than 'subset_thresh' is chosen and aggregated. If no subset \
        reaches the threshold, the algorithm continues with successively smaller subsets until 'min_col_pool' is \
        reached.

    min_col_pool: integer, default 3
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible into \
        suitable subsets and stops when 'min_col_pool' is reached.

    Returns
    -------
    data: pd.DataFrame
    subset_cols: List of pooled columns
    '''

    # Input validation
    _validate_input_range(col_dupl_thresh, 'col_dupl_thresh', 0, 1)
    _validate_input_range(subset_thresh, 'subset_thresh', 0, 1)
    _validate_input_range(min_col_pool, 'min_col_pool', 0, data.shape[1])

    subset_cols = []  # ensure a defined return value even if the loop below is never entered
    for i in range(data.shape[1]+1-min_col_pool):
        # Collect columns whose duplicate ratio exceeds the threshold
        check_list = []
        for col in data.columns:
            cdr = data.duplicated(subset=col).mean()
            if cdr > col_dupl_thresh:
                check_list.append(col)

        combinations = itertools.combinations(check_list, len(check_list)-i)

        # Duplicate ratio of each candidate subset
        ratios = []
        for comb in combinations:
            ratios.append(data[list(comb)].duplicated().mean())

        max_ratio = pd.DataFrame(ratios).max()
        max_idx = pd.DataFrame(ratios).idxmax()

        if max_ratio[0] > subset_thresh:
            best_subset = itertools.islice(itertools.combinations(
                check_list, len(check_list)-i), max_idx[0], max_idx[0]+1)
            best_subset = data[list(list(best_subset)[0])]
            subset_cols = best_subset.columns.tolist()

            # Replace the pooled columns with a single 'pooled_vars' identifier
            unique_subset = best_subset.drop_duplicates().reset_index().rename(columns={'index': 'pooled_vars'})
            data = data.merge(unique_subset, how='inner', on=best_subset.columns.tolist()
                              ).drop(columns=best_subset.columns.tolist())
            data.index = pd.RangeIndex(len(data))
            break

    return data, subset_cols

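# Usage sketch (illustrative): the pooled columns are replaced by a single
# 'pooled_vars' column; subset_cols lists which columns were combined.
#
#   pooled, subset_cols = pool_duplicate_subsets(df, col_dupl_thresh=0.2,
#                                                subset_thresh=0.2, min_col_pool=3)
#   print(subset_cols)
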
class SubsetPooler(BaseEstimator, TransformerMixin):
    '''
    Wrapper for pool_duplicate_subsets(). Checks for duplicates in subsets of columns and pools them. This reduces \
    the number of columns in the data without losing any information. Suitable columns are combined into subsets \
    and tested for duplicates. If sufficient duplicates are found, the respective columns are aggregated into a \
    'pooled_vars' column. Identical numbers in the 'pooled_vars' column indicate identical information in the \
    respective rows.

    Parameters
    ----------
    col_dupl_thresh: float, default 0.2
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
        Columns with a lower ratio are not considered for pooling.

    subset_thresh: float, default 0.2
        The first subset with a duplicate ratio higher than 'subset_thresh' is chosen and aggregated. If no subset \
        reaches the threshold, the algorithm continues with successively smaller subsets until 'min_col_pool' is \
        reached.

    min_col_pool: integer, default 3
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible into \
        suitable subsets and stops when 'min_col_pool' is reached.

    Returns
    -------
    data: pd.DataFrame
    '''

    def __init__(self, col_dupl_thresh=0.2, subset_thresh=0.2, min_col_pool=3):
        self.col_dupl_thresh = col_dupl_thresh
        self.subset_thresh = subset_thresh
        self.min_col_pool = min_col_pool

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        # Use the instance parameters instead of hard-coded values
        data, subset_cols = pool_duplicate_subsets(
            data, col_dupl_thresh=self.col_dupl_thresh, subset_thresh=self.subset_thresh,
            min_col_pool=self.min_col_pool)

        print('Combined columns:', len(subset_cols), subset_cols)

        return data
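
# Usage sketch (illustrative): the three wrapper classes share the
# fit/transform interface, so they can be chained in a scikit-learn Pipeline.
# The step names are arbitrary and chosen for the example.
#
#   from sklearn.pipeline import Pipeline
#
#   pipe = Pipeline([('cleaner', DataCleaner(show=None)),
#                    ('mv_handler', MVColHandler()),
#                    ('pooler', SubsetPooler())])
#   df_prepared = pipe.fit_transform(df)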