Passed — push to master ( c20d75...406f67 ) by Andreas, created 01:47

klib.clean.data_cleaning()   B

Complexity

Conditions 3

Size

Total Lines 103
Code Lines 40

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0

Metric   Value
cc       3       (matches "Conditions" above)
eloc     40      (matches "Code Lines" above)
nop      10      (number of parameters)
dl       0       (matches "Duplication: Lines" above)
loc      103     (matches "Total Lines" above)
rs       8.92
c        0
b        0
f        0

How to fix: Long Method, Many Parameters

Long Method

Small methods make your code easier to understand, particularly in combination with a good name. And if a method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a good sign that the commented part should be extracted into a new method, with the comment serving as a starting point for the new method's name.

The most commonly applied refactoring here is Extract Method: pull each coherent block out into its own, well-named method.
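For illustration, a minimal sketch of Extract Method in Python; the function and column names below are hypothetical and not taken from this repository:

import pandas as pd


# Before: one long function whose sections are only separated by comments.
def prepare(df: pd.DataFrame) -> pd.DataFrame:
    # drop completely empty rows and columns
    df = df.dropna(how="all").dropna(axis=1, how="all")
    # normalize column names
    df.columns = [str(c).strip().lower().replace(" ", "_") for c in df.columns]
    return df


# After: each commented block becomes a small, well-named helper,
# and the comment turns into the method name.
def drop_empty_rows_and_columns(df: pd.DataFrame) -> pd.DataFrame:
    return df.dropna(how="all").dropna(axis=1, how="all")


def normalize_column_names(df: pd.DataFrame) -> pd.DataFrame:
    df.columns = [str(c).strip().lower().replace(" ", "_") for c in df.columns]
    return df


def prepare(df: pd.DataFrame) -> pd.DataFrame:
    return normalize_column_names(drop_empty_rows_and_columns(df))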

Many Parameters

Methods with many parameters are not only hard to understand; their parameter lists also tend to become inconsistent as you need more, or different, data.

There are several approaches to avoiding long parameter lists: bundle values that belong together into one object (Introduce Parameter Object), pass the whole object instead of several of its attributes (Preserve Whole Object), or let the method compute a value itself instead of receiving it (Replace Parameter with Method).
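For illustration, a sketch of Introduce Parameter Object in Python. data_cleaning() in this module takes ten parameters (nop 10 above); one way to shrink such a signature is to bundle the options that always travel together. CleaningOptions and clean_data() below are hypothetical and not part of klib's API:

from dataclasses import dataclass
from typing import List, Optional

import pandas as pd


@dataclass
class CleaningOptions:
    # Hypothetical parameter object bundling the cleaning options.
    drop_threshold_cols: float = 0.9
    drop_threshold_rows: float = 0.9
    drop_duplicates: bool = True
    convert_dtypes: bool = True
    col_exclude: Optional[List[str]] = None
    category: bool = True
    cat_threshold: float = 0.03
    cat_exclude: Optional[List[str]] = None
    show: str = "changes"


def clean_data(data: pd.DataFrame, options: Optional[CleaningOptions] = None) -> pd.DataFrame:
    # The function keeps a short, stable signature; callers only set what they need,
    # e.g. clean_data(df, CleaningOptions(cat_threshold=0.05)).
    options = options or CleaningOptions()
    ...  # body would read options.drop_threshold_cols, options.category, etc.
    return data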

"""
Functions for data cleaning.

:author: Andreas Kanz
"""

# Imports
import itertools
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from typing import List, Optional, Union

from klib.describe import corr_mat
from klib.utils import (
    _diff_report,
    _drop_duplicates,
    _missing_vals,
    _validate_input_bool,
    _validate_input_range,
)


__all__ = ["convert_datatypes", "data_cleaning", "drop_missing", "mv_col_handling"]


def optimize_ints(data: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame:
    # Downcast int64 columns to the smallest integer dtype that can hold their values.
    data = pd.DataFrame(data).copy()
    ints = data.select_dtypes(include=["int64"]).columns.tolist()
    data[ints] = data[ints].apply(pd.to_numeric, downcast="integer")
    return data


def optimize_floats(data: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame:
    # Downcast float64 columns to the smallest float dtype that can hold their values.
    data = pd.DataFrame(data).copy()
    floats = data.select_dtypes(include=["float64"]).columns.tolist()
    data[floats] = data[floats].apply(pd.to_numeric, downcast="float")
    return data


def convert_datatypes(
    data: pd.DataFrame,
    category: bool = True,
    cat_threshold: float = 0.05,
    cat_exclude: Optional[List[Union[str, int]]] = None,
) -> pd.DataFrame:
    """ Converts columns to best possible dtypes using dtypes supporting pd.NA. Temporarily not converting to integers \
due to an issue in pandas. This is expected to be fixed in pandas 1.1. \
See https://github.com/pandas-dev/pandas/issues/33803

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    category : bool, optional
        Change dtypes of columns with dtype "object" to "category". Set threshold using cat_threshold or exclude \
columns using cat_exclude, by default True
    cat_threshold : float, optional
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical, \
by default 0.05
    cat_exclude : Optional[List[Union[str, int]]], optional
        List of columns to exclude from categorical conversion, by default None

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame with converted datatypes
    """

    # Validate Inputs
    _validate_input_bool(category, "Category")
    _validate_input_range(cat_threshold, "cat_threshold", 0, 1)

    cat_exclude = [] if cat_exclude is None else cat_exclude.copy()

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if (
            category
            and unique_vals_ratio < cat_threshold
            and col not in cat_exclude
            and data[col].dtype == "object"
        ):
            data[col] = data[col].astype("category")
        data[col] = data[col].convert_dtypes(
            infer_objects=True,
            convert_string=True,
            convert_integer=False,
            convert_boolean=True,
        )

    data = optimize_ints(data)
    data = optimize_floats(data)

    return data
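# Illustrative usage of convert_datatypes() (not part of the library source; the
# DataFrame below is made up):
#
#     df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": ["x", "x", "y"]})
#     df = convert_datatypes(df, cat_threshold=0.7)
#     # "b" (2 unique values in 3 rows, ratio ~0.67 < 0.7) becomes categorical;
#     # numeric columns are downcast where possible. The exact resulting dtypes
#     # depend on the installed pandas version.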


def drop_missing(
    data: pd.DataFrame,
    drop_threshold_cols: float = 1,
    drop_threshold_rows: float = 1,
    col_exclude: Optional[List[str]] = None,
) -> pd.DataFrame:
    """ Drops completely empty columns and rows by default and optionally provides flexibility to loosen restrictions \
to drop additional non-empty columns and rows based on the fraction of NA-values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    drop_threshold_cols : float, optional
        Drop columns with NA-ratio equal to or above the specified threshold, by default 1
    drop_threshold_rows : float, optional
        Drop rows with NA-ratio equal to or above the specified threshold, by default 1
    col_exclude : Optional[List[str]], optional
        Specify a list of columns to exclude from dropping. The excluded columns do not affect the drop thresholds, by \
default None

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame without any empty columns or rows

    Notes
    -----
    Columns are dropped first
    """

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
    _validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)

    col_exclude = [] if col_exclude is None else col_exclude.copy()
    data_exclude = data[col_exclude]

    data = pd.DataFrame(data).copy()

    data_dropped = data.drop(columns=col_exclude)
    data_dropped = data_dropped.drop(
        columns=data_dropped.loc[
            :, _missing_vals(data)["mv_cols_ratio"] > drop_threshold_cols
        ].columns
    ).dropna(axis=1, how="all")

    data = pd.concat([data_dropped, data_exclude], axis=1)

    data_cleaned = data.drop(
        index=data.loc[
            _missing_vals(data)["mv_rows_ratio"] > drop_threshold_rows, :
        ].index
    ).dropna(axis=0, how="all")
    return data_cleaned
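# Illustrative usage of drop_missing() (not part of the library source; values are
# made up):
#
#     df = pd.DataFrame({"a": [1, np.nan, 3], "b": [np.nan, np.nan, np.nan]})
#     drop_missing(df)                           # drops the entirely empty column "b"
#     drop_missing(df, drop_threshold_rows=0.5)  # additionally drops the second row,
#                                                # which is fully missing once "b" is gone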


def data_cleaning(
    data: pd.DataFrame,
    drop_threshold_cols: float = 0.9,
    drop_threshold_rows: float = 0.9,
    drop_duplicates: bool = True,
    convert_dtypes: bool = True,
    col_exclude: Optional[List[str]] = None,
    category: bool = True,
    cat_threshold: float = 0.03,
    cat_exclude: Optional[List[Union[str, int]]] = None,
    show: str = "changes",
) -> pd.DataFrame:
    """ Perform initial data cleaning tasks on a dataset, such as dropping single-valued and empty rows and empty \
columns, as well as optimizing the datatypes.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    drop_threshold_cols : float, optional
        Drop columns with NA-ratio equal to or above the specified threshold, by default 0.9
    drop_threshold_rows : float, optional
        Drop rows with NA-ratio equal to or above the specified threshold, by default 0.9
    drop_duplicates : bool, optional
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values, by \
default True
    convert_dtypes : bool, optional
        Convert dtypes using pd.convert_dtypes(), by default True
    col_exclude : Optional[List[str]], optional
        Specify a list of columns to exclude from dropping, by default None
    category : bool, optional
        Enable changing dtypes of 'object' columns to "category". Set threshold using cat_threshold. Requires \
convert_dtypes=True, by default True
    cat_threshold : float, optional
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical, by \
default 0.03
    cat_exclude : Optional[List[Union[str, int]]], optional
        List of columns to exclude from categorical conversion, by default None
    show : str, optional
        {'all', 'changes', None}, by default "changes"
        Specify verbosity of the output:

            * 'all': Print information about the data before and after cleaning as well as information about changes \
and memory usage (deep). Please be aware that this can slow down the function by quite a bit.
            * 'changes': Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    pd.DataFrame
        Cleaned Pandas DataFrame

    See also
    --------
    convert_datatypes: Convert columns to best possible dtypes.
    drop_missing: Flexibly drop columns and rows.
    _memory_usage: Gives the total memory usage in megabytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.
    """

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
    _validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)
    _validate_input_bool(drop_duplicates, "drop_duplicates")
    _validate_input_bool(convert_dtypes, "convert_dtypes")
    _validate_input_bool(category, "category")
    _validate_input_range(cat_threshold, "cat_threshold", 0, 1)

    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(
        data, drop_threshold_cols, drop_threshold_rows, col_exclude=col_exclude
    )

    single_val_cols = data_cleaned.columns[
        data_cleaned.nunique(dropna=False) == 1
    ].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    dupl_rows = None

    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)
    if convert_dtypes:
        data_cleaned = convert_datatypes(
            data_cleaned,
            category=category,
            cat_threshold=cat_threshold,
            cat_exclude=cat_exclude,
        )

    _diff_report(
        data,
        data_cleaned,
        dupl_rows=dupl_rows,
        single_val_cols=single_val_cols,
        show=show,
    )

    return data_cleaned
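# Illustrative usage of data_cleaning() (not part of the library source):
#
#     df_clean = data_cleaning(df, drop_threshold_cols=0.85, drop_threshold_rows=0.85)
#     # Drops near-empty rows and columns, single-valued columns and duplicate rows,
#     # then optimizes dtypes and prints a summary of the changes (show="changes").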


class DataCleaner(BaseEstimator, TransformerMixin):
    """ Wrapper for data_cleaning(). Allows data_cleaning() to be put into a pipeline with similar \
    functions (e.g. using MVColHandler() or SubsetPooler()).

    Parameters
    ----------
    drop_threshold_cols: float, default 0.9
        Drop columns with NA-ratio equal to or above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with NA-ratio equal to or above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.convert_dtypes().

    col_exclude: list, default None
        Specify a list of columns to exclude from dropping.

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold. Requires convert_dtypes=True

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: str, optional
        {'all', 'changes', None}, by default "changes"
        Specify verbosity of the output:
            * 'all': Print information about the data before and after cleaning as well as information about changes \
            and memory usage (deep). Please be aware that this can slow down the function by quite a bit.
            * 'changes': Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame
    """

    def __init__(
        self,
        drop_threshold_cols: float = 0.9,
        drop_threshold_rows: float = 0.9,
        drop_duplicates: bool = True,
        convert_dtypes: bool = True,
        col_exclude: Optional[List[str]] = None,
        category: bool = True,
        cat_threshold: float = 0.03,
        cat_exclude: Optional[List[Union[str, int]]] = None,
        show: str = "changes",
    ):
        self.drop_threshold_cols = drop_threshold_cols
        self.drop_threshold_rows = drop_threshold_rows
        self.drop_duplicates = drop_duplicates
        self.convert_dtypes = convert_dtypes
        self.col_exclude = col_exclude
        self.category = category
        self.cat_threshold = cat_threshold
        self.cat_exclude = cat_exclude
        self.show = show

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data_cleaned = data_cleaning(
            data,
            drop_threshold_cols=self.drop_threshold_cols,
            drop_threshold_rows=self.drop_threshold_rows,
            drop_duplicates=self.drop_duplicates,
            convert_dtypes=self.convert_dtypes,
            col_exclude=self.col_exclude,
            category=self.category,
            cat_threshold=self.cat_threshold,
            cat_exclude=self.cat_exclude,
            show=self.show,
        )
        return data_cleaned


def mv_col_handling(
    data: pd.DataFrame,
    target: Optional[Union[str, pd.Series, List]] = None,
    mv_threshold: float = 0.1,
    corr_thresh_features: float = 0.5,
    corr_thresh_target: float = 0.3,
    return_details: bool = False,
) -> pd.DataFrame:
    """ Converts columns with a high ratio of missing values into binary features and potentially drops them based on \
their correlation with other features and the target variable. This function follows a three-step process:
    - 1) Identify features with a high ratio of missing values (above 'mv_threshold').
    - 2) Identify high correlations of these features among themselves and with other features in the dataset (above \
'corr_thresh_features').
    - 3) Features with high ratio of missing values and high correlation among each other are dropped unless \
they correlate reasonably well with the target variable (above 'corr_thresh_target').

    Note: If no target is provided, the process exits after step two and drops columns identified up to this point.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    target : Optional[Union[str, pd.Series, List]], optional
        Specify target for correlation. I.e. label column to generate only the correlations between each feature \
and the label, by default None
    mv_threshold : float, optional
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are candidates \
for dropping and undergo further analysis, by default 0.1
    corr_thresh_features : float, optional
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified feature (with a high mv-ratio) \
is allowed to have with another feature. If this threshold is overstepped, the feature undergoes further \
analysis, by default 0.5
    corr_thresh_target : float, optional
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. feature with a \
high mv-ratio and high correlation to another existing feature) with the target. If this threshold is not met \
the feature is ultimately dropped, by default 0.3
    return_details : bool, optional
        Provides flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        Updated Pandas DataFrame

    optional:
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    """

    # Validate Inputs
    _validate_input_range(mv_threshold, "mv_threshold", 0, 1)
    _validate_input_range(corr_thresh_features, "corr_thresh_features", 0, 1)
    _validate_input_range(corr_thresh_target, "corr_thresh_target", 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()
    mv_ratios = _missing_vals(data_local)["mv_cols_ratio"]
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    data_local[cols_mv] = (
        data_local[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0)
    )

    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        corrs = corr_mat(data_local, target=target, colored=False).loc[
            high_corr_features
        ]
        drop_cols = corrs.loc[abs(corrs.iloc[:, 0]) < corr_thresh_target].index.tolist()
        data = data.drop(columns=drop_cols)

    if return_details:
        return data, cols_mv, drop_cols

    return data
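# Illustrative usage of mv_col_handling() (not part of the library source; "label"
# is a hypothetical target column in df):
#
#     df_reduced, cols_mv, dropped = mv_col_handling(
#         df, target="label", mv_threshold=0.2, return_details=True
#     )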


class MVColHandler(BaseEstimator, TransformerMixin):
    """ Wrapper for mv_col_handling(). Allows mv_col_handling() to be put into a pipeline with similar \
functions (e.g. using DataCleaner() or SubsetPooler()).

    Parameters
    ----------
    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. E.g. label column to generate only the correlations between each feature \
and the label.

    mv_threshold: float, default 0.1
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are candidates \
for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified feature with a high mv-ratio \
is allowed to have with another feature. If this threshold is overstepped, the feature undergoes further \
analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. feature with a \
high mv-ratio and high correlation to another existing feature) with the target. If this threshold is not met \
the feature is ultimately dropped.

    return_details: bool, default True
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: Updated Pandas DataFrame
    """

    def __init__(
        self,
        target: Optional[Union[str, pd.Series, List]] = None,
        mv_threshold: float = 0.1,
        corr_thresh_features: float = 0.6,
        corr_thresh_target: float = 0.3,
        return_details: bool = True,
    ):
        self.target = target
        self.mv_threshold = mv_threshold
        self.corr_thresh_features = corr_thresh_features
        self.corr_thresh_target = corr_thresh_target
        self.return_details = return_details

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        # Note: return_details must be True here, otherwise mv_col_handling() returns
        # only the DataFrame and the tuple unpacking below fails.
        data, cols_mv, dropped_cols = mv_col_handling(
            data,
            target=self.target,
            mv_threshold=self.mv_threshold,
            corr_thresh_features=self.corr_thresh_features,
            corr_thresh_target=self.corr_thresh_target,
            return_details=self.return_details,
        )

        print(f"\nFeatures with MV-ratio > {self.mv_threshold}: {len(cols_mv)}")
        print("Features dropped:", len(dropped_cols), dropped_cols)

        return data


def pool_duplicate_subsets(
    data: pd.DataFrame,
    col_dupl_thresh: float = 0.2,
    subset_thresh: float = 0.2,
    min_col_pool: int = 3,
    exclude: Optional[List[str]] = None,
    return_details=False,
) -> pd.DataFrame:
    """ Checks for duplicates in subsets of columns and pools them. This can reduce the number of columns in the data \
without losing much information. Suitable columns are combined into subsets and tested for duplicates. In case \
sufficient duplicates can be found, the respective columns are aggregated into a 'pooled_vars' column. Identical \
numbers in the 'pooled_vars' column indicate identical information in the respective rows.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    col_dupl_thresh : float, optional
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
Columns with a lower ratio are not considered for pooling, by default 0.2
    subset_thresh : float, optional
        The first subset with a duplicate threshold higher than 'subset_thresh' is chosen and aggregated. If no subset \
reaches the threshold, the algorithm continues with continuously smaller subsets until 'min_col_pool' is \
reached, by default 0.2
    min_col_pool : int, optional
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible to suitable \
subsets and stops when 'min_col_pool' is reached, by default 3
    exclude : Optional[List[str]], optional
        List of column names to be excluded from the analysis. These columns are passed through without modification, \
by default None
    return_details : bool, optional
        Provides flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        DataFrame with low cardinality columns pooled

    optional:
    subset_cols: List of columns used as subset
    """

    # Input validation
    _validate_input_range(col_dupl_thresh, "col_dupl_thresh", 0, 1)
    _validate_input_range(subset_thresh, "subset_thresh", 0, 1)
    _validate_input_range(min_col_pool, "min_col_pool", 0, data.shape[1])

    excluded_cols = []
    if exclude is not None:
        excluded_cols = data[exclude]
        data = data.drop(columns=exclude)

    subset_cols = []
    for i in range(data.shape[1] + 1 - min_col_pool):
        check_list = [
            col
            for col in data.columns
            if data.duplicated(subset=col).mean() > col_dupl_thresh
        ]

        if len(check_list) > 0:
            combinations = itertools.combinations(check_list, len(check_list) - i)
        else:
            continue

        ratios = [
            *map(lambda comb: data.duplicated(subset=list(comb)).mean(), combinations)
        ]

        max_ratio = max(ratios)
        max_idx = np.argmax(ratios)

        if max_ratio > subset_thresh:
            best_subset = itertools.islice(
                itertools.combinations(check_list, len(check_list) - i),
                max_idx,
                max_idx + 1,
            )
            best_subset = data[list(list(best_subset)[0])]
            subset_cols = best_subset.columns.tolist()

            unique_subset = (
                best_subset.drop_duplicates()
                .reset_index()
                .rename(columns={"index": "pooled_vars"})
            )
            data = data.merge(
                unique_subset, how="left", on=best_subset.columns.tolist()
            ).drop(columns=best_subset.columns.tolist())
            data.index = pd.RangeIndex(len(data))
            break

    data = pd.concat([data, pd.DataFrame(excluded_cols)], axis=1)

    if return_details:
        return data, subset_cols

    return data
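# Illustrative usage of pool_duplicate_subsets() (not part of the library source):
#
#     pooled, subset_cols = pool_duplicate_subsets(df, min_col_pool=4, return_details=True)
#     # The columns listed in "subset_cols" are replaced by a single "pooled_vars" column.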


class SubsetPooler(BaseEstimator, TransformerMixin):
    """ Wrapper for pool_duplicate_subsets(). Allows pool_duplicate_subsets() to be put into a pipeline with similar \
functions (e.g. using DataCleaner() or MVColHandler()).

    Parameters
    ----------
    col_dupl_thresh: float, default 0.2
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
Columns with a lower ratio are not considered for pooling.

    subset_thresh: float, default 0.2
        The first subset with a duplicate threshold higher than 'subset_thresh' is chosen and aggregated. If no subset \
reaches the threshold, the algorithm continues with continuously smaller subsets until 'min_col_pool' is \
reached.

    min_col_pool: integer, default 3
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible to suitable \
subsets and stops when 'min_col_pool' is reached.

    return_details: bool, default True
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: pd.DataFrame
    """

    def __init__(
        self,
        col_dupl_thresh=0.2,
        subset_thresh=0.2,
        min_col_pool=3,
        return_details=True,
    ):
        self.col_dupl_thresh = col_dupl_thresh
        self.subset_thresh = subset_thresh
        self.min_col_pool = min_col_pool
        self.return_details = return_details

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        # Pass the configured parameters through instead of hard-coded values.
        # return_details stays True because the two return values are unpacked below.
        data, subset_cols = pool_duplicate_subsets(
            data,
            col_dupl_thresh=self.col_dupl_thresh,
            subset_thresh=self.subset_thresh,
            min_col_pool=self.min_col_pool,
            return_details=True,
        )

        print("Combined columns:", len(subset_cols), subset_cols)

        return data
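The three wrapper classes exist so the cleaning steps can be chained with scikit-learn, as their docstrings note. A minimal, illustrative sketch of such a pipeline; the DataFrame df and its "label" column are assumed and not part of this repository:

from sklearn.pipeline import Pipeline

pipe = Pipeline(
    [
        ("cleaner", DataCleaner(drop_threshold_cols=0.85, drop_threshold_rows=0.85)),
        ("mv_handler", MVColHandler(target="label")),
        ("pooler", SubsetPooler(min_col_pool=3)),
    ]
)

df_processed = pipe.fit_transform(df)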