Passed
Push — master (bb0913...522ac3) by Andreas — 01:07 — created

klib.clean   B

Complexity

Total Complexity 48

Size/Duplication

Total Lines 695
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 293
dl 0
loc 695
rs 8.5599
c 0
b 0
f 0
wmc 48

8 Functions

Rating   Name   Duplication   Size   Complexity  
A optimize_ints() 0 5 1
A optimize_floats() 0 5 1
B pool_duplicate_subsets() 0 91 7
A drop_missing() 0 51 2
B convert_datatypes() 0 55 7
A data_cleaning() 0 95 4
B mv_col_handling() 0 80 7
C clean_column_names() 0 76 10

9 Methods

Rating   Name   Duplication   Size   Complexity  
A MVColHandler.__init__() 0 13 1
A DataCleaner.__init__() 0 21 1
A DataCleaner.transform() 0 14 1
A MVColHandler.fit() 0 2 1
A MVColHandler.transform() 0 14 1
A SubsetPooler.transform() 0 8 1
A SubsetPooler.fit() 0 2 1
A SubsetPooler.__init__() 0 5 1
A DataCleaner.fit() 0 2 1

How to fix: Complexity

Complex classes like klib.clean often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate and is often faster.
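For klib.clean, a minimal sketch of Extract Class (the class name and grouping below are hypothetical, chosen only to illustrate the refactoring) could pull the shared downcasting logic of optimize_ints() and optimize_floats() into one component:

import pandas as pd


class DtypeDowncaster:
    """Illustrative only: groups the numeric downcasting shared by optimize_ints() and optimize_floats()."""

    @staticmethod
    def downcast(data: pd.DataFrame, include: str, downcast: str) -> pd.DataFrame:
        # Downcast all columns of the given dtype, e.g. ("int64", "integer") or ("float64", "float").
        data = pd.DataFrame(data).copy()
        cols = data.select_dtypes(include=[include]).columns.tolist()
        data[cols] = data[cols].apply(pd.to_numeric, downcast=downcast)
        return data

optimize_ints(data) would then reduce to DtypeDowncaster.downcast(data, "int64", "integer"), and optimize_floats(data) to DtypeDowncaster.downcast(data, "float64", "float"). The inspected source of klib.clean follows.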

"""
Functions for data cleaning.

:author: Andreas Kanz
"""

# Imports
import itertools
import numpy as np
import pandas as pd
import re
from sklearn.base import BaseEstimator, TransformerMixin
from typing import List, Optional, Union

from klib.describe import corr_mat
from klib.utils import (
    _diff_report,
    _drop_duplicates,
    _missing_vals,
    _validate_input_bool,
    _validate_input_range,
)


__all__ = ["convert_datatypes", "data_cleaning", "drop_missing", "mv_col_handling"]

def optimize_ints(data: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame:
    """Downcast int64 columns to the smallest integer subtype that fits the data."""
    data = pd.DataFrame(data).copy()
    ints = data.select_dtypes(include=["int64"]).columns.tolist()
    data[ints] = data[ints].apply(pd.to_numeric, downcast="integer")
    return data


def optimize_floats(data: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame:
    """Downcast float64 columns to the smallest float subtype that fits the data."""
    data = pd.DataFrame(data).copy()
    floats = data.select_dtypes(include=["float64"]).columns.tolist()
    data[floats] = data[floats].apply(pd.to_numeric, downcast="float")
    return data

def clean_column_names(data: pd.DataFrame, hints: bool = True) -> pd.DataFrame:
    """Cleans the column names of the provided Pandas DataFrame and optionally provides hints on duplicate and long \
        column names.

    Parameters
    ----------
    data : pd.DataFrame
        Original DataFrame with columns to be cleaned
    hints : bool, optional
        Print out hints on column name duplication and column name length, by default True

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame with cleaned column names
    """

    # Split camelCase column names at the lowercase-uppercase boundary
    for i, col in enumerate(data.columns):
        matches = re.findall(re.compile("[a-z][A-Z]"), col)
        column = col
        for match in matches:
            column = column.replace(match, match[0] + "_" + match[1])
            data.rename(columns={data.columns[i]: column}, inplace=True)

    data.columns = (
        data.columns.str.replace("(", " ")
        .str.replace(")", " ")
        .str.replace("'", " ")
        .str.replace('"', " ")
        .str.replace("/", " ")
        .str.replace("-", "")
        .str.replace("+", " plus ")
        .str.replace("*", " times ")
        .str.replace("ä", "ae")
        .str.replace("ö", "oe")
        .str.replace("ü", "ue")
        .str.replace("ß", "ss")
        .str.replace("%", " percent ")
        .str.replace("$", " dollar ")
        .str.replace("€", " euro ")
        .str.replace("@", " at ")
        .str.replace("#", " number ")
        .str.replace("&", " and ")
        .str.lower()
        .str.strip()
        .str.replace("   ", " ")
        .str.replace("  ", " ")
        .str.replace(" ", "_")
    )

    dupl_idx = [i for i, x in enumerate(data.columns.duplicated()) if x]
    if len(dupl_idx) > 0:
        dupl_before = data.columns[dupl_idx].tolist()
        data.columns = [
            col if col not in data.columns[:i] else col + "_" + str(i) for i, col in enumerate(data.columns)
        ]
        if hints:
            print(
                f"- Duplicate column names detected! Columns with index {dupl_idx} and names {dupl_before} have \n"
                f"been renamed to {data.columns[dupl_idx].tolist()}."
            )

    long_col_names = [x for x in data.columns if len(x) > 25]
    if len(long_col_names) > 0 and hints:
        print(
            f"- Long column names detected (>25 characters)! Consider renaming the following columns {long_col_names}."
        )

    return data

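# Example usage of clean_column_names() (illustrative sketch with a hypothetical frame):
# camelCase is split, special characters are spelled out and names are snake_cased.
# >>> df = pd.DataFrame(columns=["Sales in €", "customerName", "Height/Width"])
# >>> clean_column_names(df, hints=False).columns.tolist()
# ['sales_in_euro', 'customer_name', 'height_width']
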
def convert_datatypes(
    data: pd.DataFrame,
    category: bool = True,
    cat_threshold: float = 0.05,
    cat_exclude: Optional[List[Union[str, int]]] = None,
) -> pd.DataFrame:
    """ Converts columns to best possible dtypes using dtypes supporting pd.NA. Temporarily not converting to integers \
        due to an issue in pandas. This is expected to be fixed in pandas 1.1. \
        See https://github.com/pandas-dev/pandas/issues/33803

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    category : bool, optional
        Change dtypes of columns with dtype "object" to "category". Set threshold using cat_threshold or exclude \
        columns using cat_exclude, by default True
    cat_threshold : float, optional
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical, \
        by default 0.05
    cat_exclude : Optional[List[Union[str, int]]], optional
        List of columns to exclude from categorical conversion, by default None

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame with converted datatypes
    """

    # Validate Inputs
    _validate_input_bool(category, "Category")
    _validate_input_range(cat_threshold, "cat_threshold", 0, 1)

    cat_exclude = [] if cat_exclude is None else cat_exclude.copy()

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if (
            category
            and unique_vals_ratio < cat_threshold
            and col not in cat_exclude
            and data[col].dtype == "object"
        ):
            data[col] = data[col].astype("category")

        # convert_ints = True if int(pd.__version__.replace(".", "")) >= 110 else False
        data[col] = data[col].convert_dtypes(
            infer_objects=True, convert_string=True, convert_integer=False, convert_boolean=True
        )

    data = optimize_ints(data)
    data = optimize_floats(data)

    return data

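# Example usage of convert_datatypes() (illustrative sketch, hypothetical DataFrame): a text
# column with few unique values relative to its length falls below cat_threshold and is stored
# as 'category'; the remaining columns go through convert_dtypes() and are downcast afterwards.
# >>> df = pd.DataFrame({"city": ["Berlin", "Paris"] * 500, "price": [9.99, 24.5] * 500})
# >>> str(convert_datatypes(df)["city"].dtype)
# 'category'
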
def drop_missing(
    data: pd.DataFrame,
    drop_threshold_cols: float = 1,
    drop_threshold_rows: float = 1,
    col_exclude: Optional[List[str]] = None,
) -> pd.DataFrame:
    """ Drops completely empty columns and rows by default and optionally provides flexibility to loosen restrictions \
        to drop additional non-empty columns and rows based on the fraction of NA-values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    drop_threshold_cols : float, optional
        Drop columns with an NA-ratio above the specified threshold, by default 1
    drop_threshold_rows : float, optional
        Drop rows with an NA-ratio above the specified threshold, by default 1
    col_exclude : Optional[List[str]], optional
        Specify a list of columns to exclude from dropping. The excluded columns do not affect the drop thresholds, \
        by default None

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame without any empty columns or rows

    Notes
    -----
    Columns are dropped first
    """

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
    _validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)

    col_exclude = [] if col_exclude is None else col_exclude.copy()
    data_exclude = data[col_exclude]

    data = pd.DataFrame(data).copy()

    data_dropped = data.drop(columns=col_exclude)
    data_dropped = data_dropped.drop(
        columns=data_dropped.loc[:, _missing_vals(data)["mv_cols_ratio"] > drop_threshold_cols].columns
    ).dropna(axis=1, how="all")

    data = pd.concat([data_dropped, data_exclude], axis=1)

    data_cleaned = data.drop(
        index=data.loc[_missing_vals(data)["mv_rows_ratio"] > drop_threshold_rows, :].index
    ).dropna(axis=0, how="all")
    return data_cleaned

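# Example usage of drop_missing() (illustrative sketch): with the default thresholds only
# completely empty rows and columns are removed; lower thresholds also drop mostly-empty ones.
# >>> df = pd.DataFrame({"a": [1, None, 3], "b": [None, None, None]})
# >>> drop_missing(df).columns.tolist()
# ['a']
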
def data_cleaning(
    data: pd.DataFrame,
    drop_threshold_cols: float = 0.9,
    drop_threshold_rows: float = 0.9,
    drop_duplicates: bool = True,
    convert_dtypes: bool = True,
    col_exclude: Optional[List[str]] = None,
    category: bool = True,
    cat_threshold: float = 0.03,
    cat_exclude: Optional[List[Union[str, int]]] = None,
    clean_col_names: bool = True,
    show: str = "changes",
) -> pd.DataFrame:
    """ Perform initial data cleaning tasks on a dataset, such as dropping single-valued and empty rows, \
        empty columns as well as optimizing the datatypes.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    drop_threshold_cols : float, optional
        Drop columns with an NA-ratio above the specified threshold, by default 0.9
    drop_threshold_rows : float, optional
        Drop rows with an NA-ratio above the specified threshold, by default 0.9
    drop_duplicates : bool, optional
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values, \
        by default True
    convert_dtypes : bool, optional
        Convert dtypes using pd.convert_dtypes(), by default True
    col_exclude : Optional[List[str]], optional
        Specify a list of columns to exclude from dropping, by default None
    category : bool, optional
        Enable changing dtypes of 'object' columns to "category". Set threshold using cat_threshold. Requires \
        convert_dtypes=True, by default True
    cat_threshold : float, optional
        Ratio of unique values below which categories are inferred and the column dtype is changed to categorical, \
        by default 0.03
    cat_exclude : Optional[List[Union[str, int]]], optional
        List of columns to exclude from categorical conversion, by default None
    clean_col_names : bool, optional
        Clean the column names and provide hints on duplicate and long names, by default True
    show : str, optional
        {'all', 'changes', None}, by default "changes"
        Specify verbosity of the output:

            * 'all': Print information about the data before and after cleaning as well as information about changes \
            and memory usage (deep). Please be aware that this can slow down the function by quite a bit.
            * 'changes': Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    pd.DataFrame
        Cleaned Pandas DataFrame

    See also
    --------
    convert_datatypes: Convert columns to best possible dtypes.
    drop_missing : Flexibly drop columns and rows.
    _memory_usage: Gives the total memory usage in megabytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.
    """

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
    _validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)
    _validate_input_bool(drop_duplicates, "drop_duplicates")
    _validate_input_bool(convert_dtypes, "convert_datatypes")
    _validate_input_bool(category, "category")
    _validate_input_range(cat_threshold, "cat_threshold", 0, 1)

    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows, col_exclude=col_exclude)

    single_val_cols = data_cleaned.columns[data_cleaned.nunique(dropna=False) == 1].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    dupl_rows = None

    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)
    if convert_dtypes:
        data_cleaned = convert_datatypes(
            data_cleaned, category=category, cat_threshold=cat_threshold, cat_exclude=cat_exclude
        )
    if clean_col_names:
        data_cleaned = clean_column_names(data_cleaned)

    _diff_report(data, data_cleaned, dupl_rows=dupl_rows, single_val_cols=single_val_cols, show=show)

    return data_cleaned

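# Example usage of data_cleaning() (illustrative sketch): one call chains drop_missing(),
# duplicate removal, convert_datatypes() and clean_column_names() and prints a change report.
# >>> df_clean = data_cleaning(df, drop_threshold_cols=0.85, show="changes")
# (`df` is any messy input DataFrame; the threshold shown is just an example value)
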
class DataCleaner(BaseEstimator, TransformerMixin):
    """ Wrapper for data_cleaning(). Allows data_cleaning() to be put into a pipeline with similar \
    functions (e.g. using MVColHandler() or SubsetPooler()).

    Parameters
    ----------
    drop_threshold_cols: float, default 0.9
        Drop columns with an NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.9
        Drop rows with an NA-ratio above the specified threshold.

    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the dropping of missing values.

    convert_dtypes: bool, default True
        Convert dtypes using pd.convert_dtypes().

    col_exclude: list, default None
        Specify a list of columns to exclude from dropping.

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold. Requires convert_dtypes=True.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and the column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.

    show: str, optional
        {'all', 'changes', None}, by default "changes"
        Specify verbosity of the output:
            * 'all': Print information about the data before and after cleaning as well as information about changes \
            and memory usage (deep). Please be aware that this can slow down the function by quite a bit.
            * 'changes': Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame
    """

    def __init__(
        self,
        drop_threshold_cols: float = 0.9,
        drop_threshold_rows: float = 0.9,
        drop_duplicates: bool = True,
        convert_dtypes: bool = True,
        col_exclude: Optional[List[str]] = None,
        category: bool = True,
        cat_threshold: float = 0.03,
        cat_exclude: Optional[List[Union[str, int]]] = None,
        show: str = "changes",
    ):
        self.drop_threshold_cols = drop_threshold_cols
        self.drop_threshold_rows = drop_threshold_rows
        self.drop_duplicates = drop_duplicates
        self.convert_dtypes = convert_dtypes
        self.col_exclude = col_exclude
        self.category = category
        self.cat_threshold = cat_threshold
        self.cat_exclude = cat_exclude
        self.show = show

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data_cleaned = data_cleaning(
            data,
            drop_threshold_cols=self.drop_threshold_cols,
            drop_threshold_rows=self.drop_threshold_rows,
            drop_duplicates=self.drop_duplicates,
            convert_dtypes=self.convert_dtypes,
            col_exclude=self.col_exclude,
            category=self.category,
            cat_threshold=self.cat_threshold,
            cat_exclude=self.cat_exclude,
            show=self.show,
        )
        return data_cleaned

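# Example usage of DataCleaner (illustrative sketch): the class mirrors data_cleaning() as a
# fit/transform transformer, so it works stand-alone or inside a pipeline (see the combined
# pipeline example at the end of the file).
# >>> cleaner = DataCleaner(drop_threshold_cols=0.9, show=None)
# >>> df_clean = cleaner.fit_transform(df)  # `df` is a hypothetical DataFrame
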
def mv_col_handling(
    data: pd.DataFrame,
    target: Optional[Union[str, pd.Series, List]] = None,
    mv_threshold: float = 0.1,
    corr_thresh_features: float = 0.5,
    corr_thresh_target: float = 0.3,
    return_details: bool = False,
) -> pd.DataFrame:
    """ Converts columns with a high ratio of missing values into binary features and eventually drops them based on \
        their correlation with other features and the target variable. This function follows a three-step process:
    - 1) Identify features with a high ratio of missing values (above 'mv_threshold').
    - 2) Identify high correlations of these features among themselves and with other features in the dataset (above \
        'corr_thresh_features').
    - 3) Features with a high ratio of missing values and high correlation among each other are dropped unless \
        they correlate reasonably well with the target variable (above 'corr_thresh_target').

    Note: If no target is provided, the process exits after step two and drops columns identified up to this point.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    target : Optional[Union[str, pd.Series, List]], optional
        Specify target for correlation, i.e. a label column to generate only the correlations between each feature \
        and the label, by default None
    mv_threshold : float, optional
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are candidates \
        for dropping and undergo further analysis, by default 0.1
    corr_thresh_features : float, optional
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified feature (with a high mv-ratio) \
        is allowed to have with another feature. If this threshold is overstepped, the feature undergoes further \
        analysis, by default 0.5
    corr_thresh_target : float, optional
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. feature with a \
        high mv-ratio and high correlation to another existing feature) with the target. If this threshold is not met, \
        the feature is ultimately dropped, by default 0.3
    return_details : bool, optional
        Provides flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        Updated Pandas DataFrame

    optional:
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    """

    # Validate Inputs
    _validate_input_range(mv_threshold, "mv_threshold", 0, 1)
    _validate_input_range(corr_thresh_features, "corr_thresh_features", 0, 1)
    _validate_input_range(corr_thresh_target, "corr_thresh_target", 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()
    mv_ratios = _missing_vals(data_local)["mv_cols_ratio"]
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    data_local[cols_mv] = data_local[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0)

    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        corrs = corr_mat(data_local, target=target, colored=False).loc[high_corr_features]
        drop_cols = corrs.loc[abs(corrs.iloc[:, 0]) < corr_thresh_target].index.tolist()
        data = data.drop(columns=drop_cols)

    if return_details:
        return data, cols_mv, drop_cols

    return data

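# Example usage of mv_col_handling() (illustrative sketch): with a target column given, sparse
# features are binarised (missing / not missing) and only dropped if they neither correlate
# strongly with the remaining features nor with the target.
# >>> reduced, cols_mv, dropped = mv_col_handling(df, target="label", return_details=True)
# (`df` and "label" are hypothetical; `cols_mv` lists the sparse candidates, `dropped` the removed columns)
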
class MVColHandler(BaseEstimator, TransformerMixin):
    """ Wrapper for mv_col_handling(). Allows mv_col_handling() to be put into a pipeline with similar \
        functions (e.g. using DataCleaner() or SubsetPooler()).

    Parameters
    ----------
    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. E.g. label column to generate only the correlations between each feature \
        and the label.

    mv_threshold: float, default 0.1
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are candidates \
        for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified feature with a high mv-ratio \
        is allowed to have with another feature. If this threshold is overstepped, the feature undergoes further \
        analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. feature with a \
        high mv-ratio and high correlation to another existing feature) with the target. If this threshold is not met, \
        the feature is ultimately dropped.

    return_details: bool, default True
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: Updated Pandas DataFrame
    """

    def __init__(
        self,
        target: Optional[Union[str, pd.Series, List]] = None,
        mv_threshold: float = 0.1,
        corr_thresh_features: float = 0.6,
        corr_thresh_target: float = 0.3,
        return_details: bool = True,
    ):
        self.target = target
        self.mv_threshold = mv_threshold
        self.corr_thresh_features = corr_thresh_features
        self.corr_thresh_target = corr_thresh_target
        self.return_details = return_details

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data, cols_mv, dropped_cols = mv_col_handling(
            data,
            target=self.target,
            mv_threshold=self.mv_threshold,
            corr_thresh_features=self.corr_thresh_features,
            corr_thresh_target=self.corr_thresh_target,
            return_details=self.return_details,
        )

        print(f"\nFeatures with MV-ratio > {self.mv_threshold}: {len(cols_mv)}")
        print("Features dropped:", len(dropped_cols), dropped_cols)

        return data

def pool_duplicate_subsets(
    data: pd.DataFrame,
    col_dupl_thresh: float = 0.2,
    subset_thresh: float = 0.2,
    min_col_pool: int = 3,
    exclude: Optional[List[str]] = None,
    return_details=False,
) -> pd.DataFrame:
    """ Checks for duplicates in subsets of columns and pools them. This can reduce the number of columns in the data \
        without losing much information. Suitable columns are combined to subsets and tested for duplicates. In case \
        sufficient duplicates can be found, the respective columns are aggregated into a 'pooled_vars' column. \
        Identical numbers in the 'pooled_vars' column indicate identical information in the respective rows.

        Note: It is advised to exclude features that provide sufficient informational content by themselves as well \
        as the target column by using the "exclude" setting.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    col_dupl_thresh : float, optional
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
        Columns with a lower ratio are not considered for pooling, by default 0.2
    subset_thresh : float, optional
        The first subset with a duplicate threshold higher than 'subset_thresh' is chosen and aggregated. If no subset \
        reaches the threshold, the algorithm continues with continuously smaller subsets until 'min_col_pool' is \
        reached, by default 0.2
    min_col_pool : int, optional
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible to suitable \
        subsets and stops when 'min_col_pool' is reached, by default 3
    exclude : Optional[List[str]], optional
        List of column names to be excluded from the analysis. These columns are passed through without modification, \
        by default None
    return_details : bool, optional
        Provides flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        DataFrame with low cardinality columns pooled

    optional:
    subset_cols: List of columns used as subset
    """

    # Input validation
    _validate_input_range(col_dupl_thresh, "col_dupl_thresh", 0, 1)
    _validate_input_range(subset_thresh, "subset_thresh", 0, 1)
    _validate_input_range(min_col_pool, "min_col_pool", 0, data.shape[1])

    excluded_cols = []
    if exclude is not None:
        excluded_cols = data[exclude]
        data = data.drop(columns=exclude)

    subset_cols = []
    for i in range(data.shape[1] + 1 - min_col_pool):
        check_list = [col for col in data.columns if data.duplicated(subset=col).mean() > col_dupl_thresh]

        if len(check_list) > 0:
            combinations = itertools.combinations(check_list, len(check_list) - i)
        else:
            continue

        ratios = [*map(lambda comb: data.duplicated(subset=list(comb)).mean(), combinations)]

        max_ratio = max(ratios)
        max_idx = np.argmax(ratios)

        if max_ratio > subset_thresh:
            best_subset = itertools.islice(
                itertools.combinations(check_list, len(check_list) - i), max_idx, max_idx + 1
            )
            best_subset = data[list(list(best_subset)[0])]
            subset_cols = best_subset.columns.tolist()

            unique_subset = (
                best_subset.drop_duplicates().reset_index().rename(columns={"index": "pooled_vars"})
            )
            data = data.merge(unique_subset, how="left", on=best_subset.columns.tolist()).drop(
                columns=best_subset.columns.tolist()
            )
            data.index = pd.RangeIndex(len(data))
            break

    data = pd.concat([data, pd.DataFrame(excluded_cols)], axis=1)

    if return_details:
        return data, subset_cols

    return data

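# Example usage of pool_duplicate_subsets() (illustrative sketch): columns that are largely
# duplicated as a group are replaced by a single 'pooled_vars' id column; excluded columns
# pass through untouched.
# >>> pooled, subset_cols = pool_duplicate_subsets(df, exclude=["target"], return_details=True)
# (`df` and "target" are hypothetical; `subset_cols` names the columns that were pooled)
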
class SubsetPooler(BaseEstimator, TransformerMixin):
    """ Wrapper for pool_duplicate_subsets(). Allows pool_duplicate_subsets() to be put into a pipeline with similar \
        functions (e.g. using DataCleaner() or MVColHandler()).

    Parameters
    ----------
    col_dupl_thresh: float, default 0.2
        Columns with a ratio of duplicates higher than 'col_dupl_thresh' are considered in the further analysis. \
        Columns with a lower ratio are not considered for pooling.

    subset_thresh: float, default 0.2
        The first subset with a duplicate threshold higher than 'subset_thresh' is chosen and aggregated. If no subset \
        reaches the threshold, the algorithm continues with continuously smaller subsets until 'min_col_pool' is \
        reached.

    min_col_pool: integer, default 3
        Minimum number of columns to pool. The algorithm attempts to combine as many columns as possible to suitable \
        subsets and stops when 'min_col_pool' is reached.

    return_details: bool, default True
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: pd.DataFrame
    """

    def __init__(self, col_dupl_thresh=0.2, subset_thresh=0.2, min_col_pool=3, return_details=True):
        self.col_dupl_thresh = col_dupl_thresh
        self.subset_thresh = subset_thresh
        self.min_col_pool = min_col_pool
        self.return_details = return_details

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data, subset_cols = pool_duplicate_subsets(
            data,
            col_dupl_thresh=self.col_dupl_thresh,
            subset_thresh=self.subset_thresh,
            min_col_pool=self.min_col_pool,
            return_details=True,
        )

        print("Combined columns:", len(subset_cols), subset_cols)

        return data

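# Example (illustrative sketch): the three wrappers can be chained into one scikit-learn
# Pipeline, mirroring the order data cleaning -> missing-value handling -> subset pooling.
# The pipeline layout, column name and parameters below are assumptions, not part of this module.
# >>> from sklearn.pipeline import Pipeline
# >>> pipe = Pipeline([
# ...     ("cleaning", DataCleaner()),
# ...     ("mv_handling", MVColHandler(target="label")),
# ...     ("pooling", SubsetPooler(min_col_pool=3)),
# ... ])
# >>> df_processed = pipe.fit_transform(df)  # `df` and "label" are hypothetical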