Passed
Push to master ( 012cfd...853c75 ) by Andreas, 01:13

klib.clean.data_cleaning() (rated A)

Complexity
    Conditions: 3

Size
    Total Lines: 92
    Code Lines: 30

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric    Value
cc        3
eloc      30
nop       10
dl        0
loc       92
rs        9.16
c         0
b         0
f         0

How to fix: Long Method, Many Parameters

Long Method

Small methods make your code easier to understand, especially in combination with a good name. And if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part into a new method and to use the comment as a starting point for naming it.

Commonly applied refactorings include Extract Method; a minimal sketch follows below.
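The following sketch is illustrative only (hypothetical code, not taken from this repository): a commented block inside a longer method becomes a small, well-named helper.

# Illustrative sketch of the Extract Method refactoring (hypothetical code,
# not from this repository).

# Before: the comment hints at a hidden responsibility inside a longer method.
def report(values):
    # compute summary statistics
    total = sum(values)
    mean = total / len(values)
    print(f"total={total}, mean={mean}")


# After: the commented block is extracted into a small, well-named method.
def summary_stats(values):
    total = sum(values)
    return total, total / len(values)


def report_refactored(values):
    total, mean = summary_stats(values)
    print(f"total={total}, mean={mean}")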

Many Parameters

Methods with many parameters are not only hard to understand; their parameters also tend to become inconsistent once you need more or different data.

There are several approaches to avoiding long parameter lists; one common option, bundling related parameters into a parameter object, is sketched below.
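The sketch below is illustrative only (hypothetical names, not taken from this report or from klib); it shows how a group of related keyword arguments could be bundled into a single configuration object.

# Illustrative sketch: introducing a parameter object (hypothetical names).
from dataclasses import dataclass
from typing import Optional


@dataclass
class CleaningConfig:
    drop_threshold_cols: float = 0.9
    drop_threshold_rows: float = 0.9
    drop_duplicates: bool = True
    convert_dtypes: bool = True


def clean(data, config: Optional[CleaningConfig] = None):
    # One config object replaces a long list of individual keyword arguments.
    config = config or CleaningConfig()
    print(config.drop_threshold_cols, config.drop_duplicates)
    return data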

"""
Functions for data cleaning.

:author: Andreas Kanz
"""

# Imports
import itertools
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from typing import List, Optional, Union

from klib.describe import corr_mat
from klib.utils import (
    _diff_report,
    _drop_duplicates,
    _missing_vals,
    _validate_input_bool,
    _validate_input_range,
)


__all__ = ["convert_datatypes", "data_cleaning", "drop_missing", "mv_col_handling"]


def optimize_ints(data: Union[pd.Series, pd.DataFrame]):
    data = pd.DataFrame(data).copy()
    ints = data.select_dtypes(include=["int64"]).columns.tolist()
    data[ints] = data[ints].apply(pd.to_numeric, downcast="integer")
    return data


def optimize_floats(data: Union[pd.Series, pd.DataFrame]):
    data = pd.DataFrame(data).copy()
    floats = data.select_dtypes(include=["float64"]).columns.tolist()
    data[floats] = data[floats].apply(pd.to_numeric, downcast="float")
    return data


def convert_datatypes(
    data: pd.DataFrame,
    category: bool = True,
    cat_threshold: float = 0.05,
    cat_exclude: Optional[List[Union[str, int]]] = None,
) -> pd.DataFrame:
    """ Converts columns to best possible dtypes using dtypes supporting pd.NA. Temporarily not converting to integers \
due to an issue in pandas. This is expected to be fixed in pandas 1.1. \
See https://github.com/pandas-dev/pandas/issues/33803

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    category : bool, optional
        Change dtypes of columns with dtype "object" to "category". Set threshold using cat_threshold or exclude \
columns using cat_exclude, by default True
    cat_threshold : float, optional
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical, \
by default 0.05
    cat_exclude : Optional[List[Union[str, int]]], optional
        List of columns to exclude from categorical conversion, by default None

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame with converted Datatypes
    """

    # Validate Inputs
    _validate_input_bool(category, "Category")
    _validate_input_range(cat_threshold, "cat_threshold", 0, 1)

    cat_exclude = [] if cat_exclude is None else cat_exclude.copy()

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if (
            category
            and unique_vals_ratio < cat_threshold
            and col not in cat_exclude
            and data[col].dtype == "object"
        ):
            data[col] = data[col].astype("category")
        data[col] = data[col].convert_dtypes(
            infer_objects=True, convert_string=True, convert_integer=False, convert_boolean=True,
        )

    data = optimize_ints(data)
    data = optimize_floats(data)

    return data


def drop_missing(
    data: pd.DataFrame,
    drop_threshold_cols: float = 1,
    drop_threshold_rows: float = 1,
    col_exclude: Optional[List[str]] = None,
) -> pd.DataFrame:
    """ Drops completely empty columns and rows by default and optionally provides flexibility to loosen restrictions \
to drop additional non-empty columns and rows based on the fraction of NA-values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    drop_threshold_cols : float, optional
        Drop columns with NA-ratio equal to or above the specified threshold, by default 1
    drop_threshold_rows : float, optional
        Drop rows with NA-ratio equal to or above the specified threshold, by default 1
    col_exclude : Optional[List[str]], optional
        Specify a list of columns to exclude from dropping. The excluded columns do not affect the drop thresholds, by \
default None

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame without any empty columns or rows

    Notes
    -----
    Columns are dropped first
    """

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
    _validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)

    col_exclude = [] if col_exclude is None else col_exclude.copy()
    data_exclude = data[col_exclude]

    data = pd.DataFrame(data).copy()

    data_dropped = data.drop(columns=col_exclude)
    data_dropped = data_dropped.drop(
        columns=data_dropped.loc[:, _missing_vals(data)["mv_cols_ratio"] > drop_threshold_cols].columns
    ).dropna(axis=1, how="all")

    data = pd.concat([data_dropped, data_exclude], axis=1)

    data_cleaned = data.drop(
        index=data.loc[_missing_vals(data)["mv_rows_ratio"] > drop_threshold_rows, :].index
    ).dropna(axis=0, how="all")
    return data_cleaned


def data_cleaning(
    data: pd.DataFrame,
    drop_threshold_cols: float = 0.9,
    drop_threshold_rows: float = 0.9,
    drop_duplicates: bool = True,
    convert_dtypes: bool = True,
    col_exclude: Optional[List[str]] = None,
    category: bool = True,
    cat_threshold: float = 0.03,
    cat_exclude: Optional[List[Union[str, int]]] = None,
    show: str = "changes",
) -> pd.DataFrame:
    """ Perform initial data cleaning tasks on a dataset, such as dropping single-valued and empty rows, \
empty columns as well as optimizing the datatypes.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    drop_threshold_cols : float, optional
        Drop columns with NA-ratio equal to or above the specified threshold, by default 0.9
    drop_threshold_rows : float, optional
        Drop rows with NA-ratio equal to or above the specified threshold, by default 0.9
    drop_duplicates : bool, optional
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values, by \
default True
    convert_dtypes : bool, optional
        Convert dtypes using pd.convert_dtypes(), by default True
    col_exclude : Optional[List[str]], optional
        Specify a list of columns to exclude from dropping, by default None
    category : bool, optional
        Enable changing dtypes of 'object' columns to "category". Set threshold using cat_threshold. Requires \
convert_dtypes=True, by default True
    cat_threshold : float, optional
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical, by \
default 0.03
    cat_exclude : Optional[List[str]], optional
        List of columns to exclude from categorical conversion, by default None
    show : str, optional
        {'all', 'changes', None}, by default "changes"
        Specify verbosity of the output:

            * 'all': Print information about the data before and after cleaning as well as information about changes \
and memory usage (deep). Please be aware that this can slow down the function by quite a bit.
            * 'changes': Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    pd.DataFrame
        Cleaned Pandas DataFrame

    See also
    --------
    convert_datatypes: Convert columns to best possible dtypes.
    drop_missing: Flexibly drop columns and rows.
    _memory_usage: Gives the total memory usage in megabytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.
    """

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
    _validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)
    _validate_input_bool(drop_duplicates, "drop_duplicates")
    _validate_input_bool(convert_dtypes, "convert_datatypes")
    _validate_input_bool(category, "category")
    _validate_input_range(cat_threshold, "cat_threshold", 0, 1)

    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows, col_exclude=col_exclude)

    single_val_cols = data_cleaned.columns[data_cleaned.nunique(dropna=False) == 1].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    dupl_rows = None

    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)
    if convert_dtypes:
        data_cleaned = convert_datatypes(
            data_cleaned, category=category, cat_threshold=cat_threshold, cat_exclude=cat_exclude,
        )

    _diff_report(
        data, data_cleaned, dupl_rows=dupl_rows, single_val_cols=single_val_cols, show=show,
    )

    return data_cleaned


class DataCleaner(BaseEstimator, TransformerMixin):
    """ Wrapper for data_cleaning(). Allows data_cleaning() to be put into a pipeline with similar \
    functions (e.g. using MVColHandler() or SubsetPooler()).

    Parameters
    ----------
    drop_threshold_cols: float, default 0.9
        Drop columns with NA-ratio equal to or above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with NA-ratio equal to or above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.convert_dtypes().

    col_exclude: list, default None
        Specify a list of columns to exclude from dropping.

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold. Requires convert_dtypes=True

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: str, optional
        {'all', 'changes', None}, by default "changes"
        Specify verbosity of the output:
            * 'all': Print information about the data before and after cleaning as well as information about changes \
            and memory usage (deep). Please be aware that this can slow down the function by quite a bit.
            * 'changes': Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame
    """

    def __init__(
        self,
        drop_threshold_cols: float = 0.9,
        drop_threshold_rows: float = 0.9,
        drop_duplicates: bool = True,
        convert_dtypes: bool = True,
        col_exclude: Optional[List[str]] = None,
        category: bool = True,
        cat_threshold: float = 0.03,
        cat_exclude: Optional[List[Union[str, int]]] = None,
        show: str = "changes",
    ):
        self.drop_threshold_cols = drop_threshold_cols
        self.drop_threshold_rows = drop_threshold_rows
        self.drop_duplicates = drop_duplicates
        self.convert_dtypes = convert_dtypes
        self.col_exclude = col_exclude
        self.category = category
        self.cat_threshold = cat_threshold
        self.cat_exclude = cat_exclude
        self.show = show

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data_cleaned = data_cleaning(
            data,
            drop_threshold_cols=self.drop_threshold_cols,
            drop_threshold_rows=self.drop_threshold_rows,
            drop_duplicates=self.drop_duplicates,
            convert_dtypes=self.convert_dtypes,
            col_exclude=self.col_exclude,
            category=self.category,
            cat_threshold=self.cat_threshold,
            cat_exclude=self.cat_exclude,
            show=self.show,
        )
        return data_cleaned


def mv_col_handling(
    data: pd.DataFrame,
    target: Optional[Union[str, pd.Series, List]] = None,
    mv_threshold: float = 0.1,
    corr_thresh_features: float = 0.5,
    corr_thresh_target: float = 0.3,
    return_details: bool = False,
) -> pd.DataFrame:
    """ Converts columns with a high ratio of missing values into binary features and eventually drops them based on \
their correlation with other features and the target variable. This function follows a three-step process:
    - 1) Identify features with a high ratio of missing values (above 'mv_threshold').
    - 2) Identify high correlations of these features among themselves and with other features in the dataset (above \
'corr_thresh_features').
    - 3) Features with a high ratio of missing values and high correlation among each other are dropped unless \
they correlate reasonably well with the target variable (above 'corr_thresh_target').

    Note: If no target is provided, the process exits after step two and drops columns identified up to this point.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    target : Optional[Union[str, pd.Series, List]], optional
        Specify target for correlation, i.e. the label column to generate only the correlations between each feature \
and the label, by default None
    mv_threshold : float, optional
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are candidates \
for dropping and undergo further analysis, by default 0.1
    corr_thresh_features : float, optional
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified feature (with a high mv-ratio) \
is allowed to have with another feature. If this threshold is overstepped, the feature undergoes further \
analysis, by default 0.5
    corr_thresh_target : float, optional
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. a feature with a \
high mv-ratio and high correlation to another existing feature) with the target. If this threshold is not met, \
the feature is ultimately dropped, by default 0.3
    return_details : bool, optional
        Provides flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        Updated Pandas DataFrame

    optional:
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    """

    # Validate Inputs
    _validate_input_range(mv_threshold, "mv_threshold", 0, 1)
    _validate_input_range(corr_thresh_features, "corr_thresh_features", 0, 1)
    _validate_input_range(corr_thresh_target, "corr_thresh_target", 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()
    mv_ratios = _missing_vals(data_local)["mv_cols_ratio"]
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    data_local[cols_mv] = data_local[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0)

    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        corrs = corr_mat(data_local, target=target, colored=False).loc[high_corr_features]
        drop_cols = corrs.loc[abs(corrs.iloc[:, 0]) < corr_thresh_target].index.tolist()
        data = data.drop(columns=drop_cols)

    if return_details:
        return data, cols_mv, drop_cols

    return data


class MVColHandler(BaseEstimator, TransformerMixin):
    """ Wrapper for mv_col_handling(). Allows mv_col_handling() to be put into a pipeline with similar \
functions (e.g. using DataCleaner() or SubsetPooler()).

    Parameters
    ----------
    target: string, list, np.array or pd.Series, default None
        Specify target for correlation, e.g. the label column to generate only the correlations between each feature \
and the label.

    mv_threshold: float, default 0.1
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are candidates \
for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified feature with a high mv-ratio \
is allowed to have with another feature. If this threshold is overstepped, the feature undergoes further \
analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. a feature with a \
high mv-ratio and high correlation to another existing feature) with the target. If this threshold is not met, \
the feature is ultimately dropped.

    return_details: bool, default True
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: Updated Pandas DataFrame
    """

    def __init__(
        self,
        target: Optional[Union[str, pd.Series, List]] = None,
        mv_threshold: float = 0.1,
        corr_thresh_features: float = 0.6,
        corr_thresh_target: float = 0.3,
        return_details: bool = True,
    ):
        self.target = target
        self.mv_threshold = mv_threshold
        self.corr_thresh_features = corr_thresh_features
        self.corr_thresh_target = corr_thresh_target
        self.return_details = return_details

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data, cols_mv, dropped_cols = mv_col_handling(
            data,
            target=self.target,
            mv_threshold=self.mv_threshold,
            corr_thresh_features=self.corr_thresh_features,
            corr_thresh_target=self.corr_thresh_target,
            return_details=self.return_details,
        )

        print(f"\nFeatures with MV-ratio > {self.mv_threshold}: {len(cols_mv)}")
        print("Features dropped:", len(dropped_cols), dropped_cols)

        return data


def pool_duplicate_subsets(
    data: pd.DataFrame,
    col_dupl_thresh: float = 0.2,
    subset_thresh: float = 0.2,
    min_col_pool: int = 3,
    exclude: Optional[List[str]] = None,
    return_details=False,
) -> pd.DataFrame:
    """ Checks for duplicates in subsets of columns and pools them. This can reduce the number of columns in the data \
without losing much information. Suitable columns are combined into subsets and tested for duplicates. In case \
sufficient duplicates can be found, the respective columns are aggregated into a 'pooled_var' column. Identical \
numbers in the 'pooled_var' column indicate identical information in the respective rows.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    col_dupl_thresh : float, optional
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
Columns with a lower ratio are not considered for pooling, by default 0.2
    subset_thresh : float, optional
        The first subset with a duplicate threshold higher than 'subset_thresh' is chosen and aggregated. If no subset \
reaches the threshold, the algorithm continues with continuously smaller subsets until 'min_col_pool' is \
reached, by default 0.2
    min_col_pool : int, optional
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible to suitable \
subsets and stops when 'min_col_pool' is reached, by default 3
    exclude : Optional[List[str]], optional
        List of column names to be excluded from the analysis. These columns are passed through without modification, \
by default None
    return_details : bool, optional
        Provides flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        DataFrame with low cardinality columns pooled

    optional:
    subset_cols: List of columns used as subset
    """

    # Input validation
    _validate_input_range(col_dupl_thresh, "col_dupl_thresh", 0, 1)
    _validate_input_range(subset_thresh, "subset_thresh", 0, 1)
    _validate_input_range(min_col_pool, "min_col_pool", 0, data.shape[1])

    excluded_cols = []
    if exclude is not None:
        excluded_cols = data[exclude]
        data = data.drop(columns=exclude)

    subset_cols = []
    for i in range(data.shape[1] + 1 - min_col_pool):
        check_list = [col for col in data.columns if data.duplicated(subset=col).mean() > col_dupl_thresh]

        if len(check_list) > 0:
            combinations = itertools.combinations(check_list, len(check_list) - i)
        else:
            continue

        ratios = [*map(lambda comb: data.duplicated(subset=list(comb)).mean(), combinations)]

        max_ratio = max(ratios)
        max_idx = np.argmax(ratios)

        if max_ratio > subset_thresh:
            best_subset = itertools.islice(
                itertools.combinations(check_list, len(check_list) - i), max_idx, max_idx + 1,
            )
            best_subset = data[list(list(best_subset)[0])]
            subset_cols = best_subset.columns.tolist()

            unique_subset = (
                best_subset.drop_duplicates().reset_index().rename(columns={"index": "pooled_vars"})
            )
            data = data.merge(unique_subset, how="left", on=best_subset.columns.tolist()).drop(
                columns=best_subset.columns.tolist()
            )
            data.index = pd.RangeIndex(len(data))
            break

    data = pd.concat([data, pd.DataFrame(excluded_cols)], axis=1)

    if return_details:
        return data, subset_cols

    return data


class SubsetPooler(BaseEstimator, TransformerMixin):
    """ Wrapper for pool_duplicate_subsets(). Allows pool_duplicate_subsets() to be put into a pipeline with similar \
functions (e.g. using DataCleaner() or MVColHandler()).

    Parameters
    ----------
    col_dupl_thresh: float, default 0.2
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
Columns with a lower ratio are not considered for pooling.

    subset_thresh: float, default 0.2
        The first subset with a duplicate threshold higher than 'subset_thresh' is chosen and aggregated. If no subset \
reaches the threshold, the algorithm continues with continuously smaller subsets until 'min_col_pool' is \
reached.

    min_col_pool: integer, default 3
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible to suitable \
subsets and stops when 'min_col_pool' is reached.

    return_details: bool, default True
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: pd.DataFrame
    """

    def __init__(
        self, col_dupl_thresh=0.2, subset_thresh=0.2, min_col_pool=3, return_details=True,
    ):
        self.col_dupl_thresh = col_dupl_thresh
        self.subset_thresh = subset_thresh
        self.min_col_pool = min_col_pool
        self.return_details = return_details

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        # Use the parameters configured in __init__ instead of hard-coded values.
        data, subset_cols = pool_duplicate_subsets(
            data,
            col_dupl_thresh=self.col_dupl_thresh,
            subset_thresh=self.subset_thresh,
            min_col_pool=self.min_col_pool,
            return_details=True,
        )

        print("Combined columns:", len(subset_cols), subset_cols)

        return data
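For context, a minimal usage sketch of the inspected module (not part of the file above; it assumes klib and scikit-learn are installed, and the example DataFrame with its column names is made up):

# Illustrative usage sketch (not part of the inspected file). Assumes klib is
# installed; the example DataFrame and its column names are made up.
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline

from klib.clean import DataCleaner, data_cleaning

df = pd.DataFrame({
    "age": [23, 41, 41, np.nan, 35, 35],
    "city": ["Berlin", "Hamburg", "Hamburg", None, "Berlin", "Berlin"],
    "empty": [np.nan] * 6,        # completely empty -> dropped by drop_missing
    "constant": ["same"] * 6,     # single-valued -> dropped by data_cleaning
    "score": [0.5, 0.7, 0.7, np.nan, 0.9, 0.9],
})

# Direct call: drops empty and single-valued columns, removes duplicate rows
# and optimizes dtypes.
cleaned = data_cleaning(df, show=None)

# The same step wrapped as a scikit-learn transformer, e.g. inside a Pipeline.
pipe = Pipeline([("clean", DataCleaner(show=None))])
cleaned_pipe = pipe.fit_transform(df)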