"""
Functions for data cleaning.

:author: Andreas Kanz
"""

# Imports
import itertools
import numpy as np
import pandas as pd
import re
from sklearn.base import BaseEstimator, TransformerMixin
from typing import List, Optional, Union

from klib.describe import corr_mat
from klib.utils import (
    _diff_report,
    _drop_duplicates,
    _missing_vals,
    _validate_input_bool,
    _validate_input_range,
)


__all__ = [
    "clean_column_names",
    "convert_datatypes",
    "data_cleaning",
    "drop_missing",
    "mv_col_handling",
]


def optimize_ints(data: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame:
    data = pd.DataFrame(data).copy()
    ints = data.select_dtypes(include=["int64"]).columns.tolist()
    data[ints] = data[ints].apply(pd.to_numeric, downcast="integer")
    return data


def optimize_floats(data: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame:
    data = pd.DataFrame(data).copy()
    floats = data.select_dtypes(include=["float64"]).columns.tolist()
    data[floats] = data[floats].apply(pd.to_numeric, downcast="float")
    return data

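# The two helpers above downcast 64-bit numeric columns to the smallest dtype that
# holds the values. A minimal, illustrative sketch (the frame below is made up;
# resulting dtypes depend on the value ranges):
#
#     df = pd.DataFrame({"a": [1, 2, 3], "b": [0.5, 1.5, 2.5]})
#     optimize_ints(df)["a"].dtype    # int8 (downcast from int64)
#     optimize_floats(df)["b"].dtype  # float32 (downcast from float64)
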
def clean_column_names(data: pd.DataFrame, hints: bool = True) -> pd.DataFrame:
    """ Cleans the column names of the provided Pandas DataFrame and optionally \
        provides hints on duplicate and long column names.

    Parameters
    ----------
    data : pd.DataFrame
        Original DataFrame with columns to be cleaned
    hints : bool, optional
        Print out hints on column name duplication and column name length, by default \
        True

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame with cleaned column names
    """

    _validate_input_bool(hints, "hints")

    # Split camelCase column names at the lower/upper case boundary
    for i, col in enumerate(data.columns):
        matches = re.findall(re.compile("[a-z][A-Z]"), col)
        column = col
        for match in matches:
            column = column.replace(match, match[0] + "_" + match[1])
            data.rename(columns={data.columns[i]: column}, inplace=True)

    data.columns = (
        data.columns.str.replace("\n", "_")
        .str.replace("(", "_")
        .str.replace(")", "_")
        .str.replace("'", "_")
        .str.replace('"', "_")
        .str.replace(".", "_")
        .str.replace("!", "_")
        .str.replace("?", "_")
        .str.replace(":", "_")
        .str.replace(";", "_")
        .str.replace("-", "_")
        .str.replace("/", "_")
        .str.replace("+", "_plus_")
        .str.replace("*", "_times_")
        .str.replace("ä", "ae")
        .str.replace("ö", "oe")
        .str.replace("ü", "ue")
        .str.replace("ß", "ss")
        .str.replace("%", "_percent_")
        .str.replace("$", "_dollar_")
        .str.replace("€", "_euro_")
        .str.replace("@", "_at_")
        .str.replace("#", "_number_")
        .str.replace("&", "_and_")
        .str.lower()
        .str.replace("   ", " ")
        .str.replace("  ", " ")
        .str.replace(" ", "_")
        .str.replace("___", "_")
        .str.replace("__", "_")
        .str.strip("_")
    )

    dupl_idx = [i for i, x in enumerate(data.columns.duplicated()) if x]
    if len(dupl_idx) > 0:
        dupl_before = data.columns[dupl_idx].tolist()
        data.columns = [
            col if col not in data.columns[:i] else col + "_" + str(i)
            for i, col in enumerate(data.columns)
        ]
        if hints:
            print(
                f"Duplicate column names detected! Columns with index {dupl_idx} and "
                f"names {dupl_before} have been renamed to "
                f"{data.columns[dupl_idx].tolist()}."
            )

    long_col_names = [x for x in data.columns if len(x) > 25]
    if len(long_col_names) > 0 and hints:
        print(
            "- Long column names detected (>25 characters)! Consider renaming the "
            f"following columns {long_col_names}."
        )

    return data

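# A minimal, illustrative example of clean_column_names (the column names below are
# made up for demonstration):
#
#     df = pd.DataFrame(columns=["Size (cm)", "playerHeight", "Price in $"])
#     clean_column_names(df, hints=False).columns.tolist()
#     # ['size_cm', 'player_height', 'price_in_dollar']
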
def convert_datatypes(
    data: pd.DataFrame,
    category: bool = True,
    cat_threshold: float = 0.05,
    cat_exclude: Optional[List[Union[str, int]]] = None,
) -> pd.DataFrame:
    """ Converts columns to best possible dtypes using dtypes supporting pd.NA.
    Temporarily not converting to integers due to an issue in pandas. This is expected \
        to be fixed in pandas 1.1. See https://github.com/pandas-dev/pandas/issues/33803

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    category : bool, optional
        Change dtypes of columns with dtype "object" to "category". Set threshold \
        using cat_threshold or exclude columns using cat_exclude, by default True
    cat_threshold : float, optional
        Ratio of unique values below which categories are inferred and column dtype is \
        changed to categorical, by default 0.05
    cat_exclude : Optional[List[Union[str, int]]], optional
        List of columns to exclude from categorical conversion, by default None

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame with converted datatypes
    """

    # Validate Inputs
    _validate_input_bool(category, "category")
    _validate_input_range(cat_threshold, "cat_threshold", 0, 1)

    cat_exclude = [] if cat_exclude is None else cat_exclude.copy()

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if (
            category
            and unique_vals_ratio < cat_threshold
            and col not in cat_exclude
            and data[col].dtype == "object"
        ):
            data[col] = data[col].astype("category")

        # convert_ints = True if int(pd.__version__.replace(".", "")) >= 110 else False
        # convert_integer does not work as expected until pandas 1.1.0 while
        # convert_string is still experimental
        data[col] = data[col].convert_dtypes(
            infer_objects=True,
            convert_string=True,
            convert_integer=False,
            convert_boolean=True,
        )

    data = optimize_ints(data)
    data = optimize_floats(data)

    return data

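# Sketch of convert_datatypes on a small, made-up frame (illustrative only; the exact
# resulting dtypes depend on the installed pandas version):
#
#     df = pd.DataFrame({"label": ["a", "b", "a"] * 50, "value": [1.0, 2.0, 3.0] * 50})
#     convert_datatypes(df).dtypes
#     # label -> category (unique-value ratio below cat_threshold)
#     # value -> float32 (downcast via optimize_floats)
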
def drop_missing(
    data: pd.DataFrame,
    drop_threshold_cols: float = 1,
    drop_threshold_rows: float = 1,
    col_exclude: Optional[List[str]] = None,
) -> pd.DataFrame:
    """ Drops completely empty columns and rows by default and optionally provides \
        flexibility to loosen restrictions to drop additional non-empty columns and \
        rows based on the fraction of NA-values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    drop_threshold_cols : float, optional
        Drop columns with NA-ratio equal to or above the specified threshold, by \
        default 1
    drop_threshold_rows : float, optional
        Drop rows with NA-ratio equal to or above the specified threshold, by default 1
    col_exclude : Optional[List[str]], optional
        Specify a list of columns to exclude from dropping. The excluded columns do \
        not affect the drop thresholds, by default None

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame without any empty columns or rows

    Notes
    -----
    Columns are dropped first
    """

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
    _validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)

    col_exclude = [] if col_exclude is None else col_exclude.copy()
    data_exclude = data[col_exclude]

    data = pd.DataFrame(data).copy()

    data_dropped = data.drop(columns=col_exclude, errors="ignore")
    data_dropped = data_dropped.drop(
        columns=data_dropped.loc[
            :, _missing_vals(data)["mv_cols_ratio"] > drop_threshold_cols
        ].columns
    ).dropna(axis=1, how="all")

    data = pd.concat([data_dropped, data_exclude], axis=1)

    data_cleaned = data.drop(
        index=data.loc[
            _missing_vals(data)["mv_rows_ratio"] > drop_threshold_rows, :
        ].index
    ).dropna(axis=0, how="all")
    return data_cleaned

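# Illustrative call on a made-up frame: with the default thresholds of 1, only columns
# and rows that consist entirely of missing values are removed.
#
#     df = pd.DataFrame({"a": [1, 2, 3], "b": [np.nan] * 3, "c": [4, np.nan, 6]})
#     drop_missing(df).shape
#     # (3, 2) -> the all-NaN column "b" is dropped; no row is completely empty
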
def data_cleaning(
    data: pd.DataFrame,
    drop_threshold_cols: float = 0.9,
    drop_threshold_rows: float = 0.9,
    drop_duplicates: bool = True,
    convert_dtypes: bool = True,
    col_exclude: Optional[List[str]] = None,
    category: bool = True,
    cat_threshold: float = 0.03,
    cat_exclude: Optional[List[Union[str, int]]] = None,
    clean_col_names: bool = True,
    show: str = "changes",
) -> pd.DataFrame:
    """ Perform initial data cleaning tasks on a dataset, such as dropping \
        single-valued and empty columns, empty rows, as well as optimizing the datatypes.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    drop_threshold_cols : float, optional
        Drop columns with NA-ratio equal to or above the specified threshold, by \
        default 0.9
    drop_threshold_rows : float, optional
        Drop rows with NA-ratio equal to or above the specified threshold, by \
        default 0.9
    drop_duplicates : bool, optional
        Drop duplicate rows, keeping the first occurrence. This step comes after the \
        dropping of missing values, by default True
    convert_dtypes : bool, optional
        Convert dtypes using pd.convert_dtypes(), by default True
    col_exclude : Optional[List[str]], optional
        Specify a list of columns to exclude from dropping, by default None
    category : bool, optional
        Enable changing dtypes of "object" columns to "category". Set threshold using \
        cat_threshold. Requires convert_dtypes=True, by default True
    cat_threshold : float, optional
        Ratio of unique values below which categories are inferred and column dtype is \
        changed to categorical, by default 0.03
    cat_exclude : Optional[List[Union[str, int]]], optional
        List of columns to exclude from categorical conversion, by default None
    clean_col_names : bool, optional
        Cleans the column names and provides hints on duplicate and long names, by \
        default True
    show : str, optional
        {"all", "changes", None}, by default "changes"
        Specify verbosity of the output:

            * "all": Print information about the data before and after cleaning as \
            well as information about changes and memory usage (deep). Please be \
            aware that this can slow down the function by quite a bit.
            * "changes": Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    pd.DataFrame
        Cleaned Pandas DataFrame

    See also
    --------
    convert_datatypes: Convert columns to best possible dtypes.
    drop_missing : Flexibly drop columns and rows.
    _memory_usage: Gives the total memory usage in megabytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the \
    same categories.
    """

    # Validate Inputs
    _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
    _validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)
    _validate_input_bool(drop_duplicates, "drop_duplicates")
    _validate_input_bool(convert_dtypes, "convert_dtypes")
    _validate_input_bool(category, "category")
    _validate_input_range(cat_threshold, "cat_threshold", 0, 1)

    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(
        data, drop_threshold_cols, drop_threshold_rows, col_exclude=col_exclude
    )

    if clean_col_names:
        data_cleaned = clean_column_names(data_cleaned)

    single_val_cols = data_cleaned.columns[
        data_cleaned.nunique(dropna=False) == 1
    ].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    dupl_rows = None

    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)
    if convert_dtypes:
        data_cleaned = convert_datatypes(
            data_cleaned,
            category=category,
            cat_threshold=cat_threshold,
            cat_exclude=cat_exclude,
        )

    _diff_report(
        data,
        data_cleaned,
        dupl_rows=dupl_rows,
        single_val_cols=single_val_cols,
        show=show,
    )

    return data_cleaned

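# A hedged usage sketch for data_cleaning (made-up DataFrame; the printed report
# depends on klib's _diff_report and the installed pandas version):
#
#     messy = pd.DataFrame({
#         "First Name": ["anna", "bob", "anna", None],
#         "Constant": [1, 1, 1, 1],        # single-valued -> dropped
#         "Empty": [np.nan] * 4,           # completely empty -> dropped
#     })
#     cleaned = data_cleaning(messy, show=None)
#     cleaned.columns.tolist()  # ['first_name']
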
class DataCleaner(BaseEstimator, TransformerMixin):
    """ Wrapper for data_cleaning(). Allows data_cleaning() to be put into a pipeline \
    with similar functions (e.g. using MVColHandler() or SubsetPooler()).

    Parameters
    ----------
    drop_threshold_cols: float, default 0.9
        Drop columns with NA-ratio equal to or above the specified threshold.
    drop_threshold_rows: float, default 0.9
        Drop rows with NA-ratio equal to or above the specified threshold.
    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the \
        dropping of missing values.
    convert_dtypes: bool, default True
        Convert dtypes using pd.convert_dtypes().
    col_exclude: list, default None
        Specify a list of columns to exclude from dropping.
    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold. \
        Requires convert_dtypes=True
    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and column dtype is \
        changed to categorical.
    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.
    clean_col_names: bool, default True
        Cleans the column names and provides hints on duplicate and long names.
    show: str, optional
        {"all", "changes", None}, by default "changes"
        Specify verbosity of the output:
            * "all": Print information about the data before and after cleaning as \
            well as information about changes and memory usage (deep). Please be \
            aware that this can slow down the function by quite a bit.
            * "changes": Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame
    """

    def __init__(
        self,
        drop_threshold_cols: float = 0.9,
        drop_threshold_rows: float = 0.9,
        drop_duplicates: bool = True,
        convert_dtypes: bool = True,
        col_exclude: Optional[List[str]] = None,
        category: bool = True,
        cat_threshold: float = 0.03,
        cat_exclude: Optional[List[Union[str, int]]] = None,
        clean_col_names: bool = True,
        show: str = "changes",
    ):
        self.drop_threshold_cols = drop_threshold_cols
        self.drop_threshold_rows = drop_threshold_rows
        self.drop_duplicates = drop_duplicates
        self.convert_dtypes = convert_dtypes
        self.col_exclude = col_exclude
        self.category = category
        self.cat_threshold = cat_threshold
        self.cat_exclude = cat_exclude
        self.clean_col_names = clean_col_names
        self.show = show

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data_cleaned = data_cleaning(
            data,
            drop_threshold_cols=self.drop_threshold_cols,
            drop_threshold_rows=self.drop_threshold_rows,
            drop_duplicates=self.drop_duplicates,
            convert_dtypes=self.convert_dtypes,
            col_exclude=self.col_exclude,
            category=self.category,
            cat_threshold=self.cat_threshold,
            cat_exclude=self.cat_exclude,
            clean_col_names=self.clean_col_names,
            show=self.show,
        )
        return data_cleaned

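# A hedged sketch of using the wrapper in a scikit-learn Pipeline (the step name and
# the input frame "messy_df" are made up):
#
#     from sklearn.pipeline import Pipeline
#
#     pipe = Pipeline([("cleaning", DataCleaner(drop_threshold_cols=0.85, show=None))])
#     df_clean = pipe.fit_transform(messy_df)
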
def mv_col_handling(
    data: pd.DataFrame,
    target: Optional[Union[str, pd.Series, List]] = None,
    mv_threshold: float = 0.1,
    corr_thresh_features: float = 0.5,
    corr_thresh_target: float = 0.3,
    return_details: bool = False,
) -> pd.DataFrame:
    """ Converts columns with a high ratio of missing values into binary features and \
    eventually drops them based on their correlation with other features and the \
    target variable. This function follows a three-step process:
    - 1) Identify features with a high ratio of missing values (above 'mv_threshold').
    - 2) Identify high correlations of these features among themselves and with \
        other features in the dataset (above 'corr_thresh_features').
    - 3) Features with high ratio of missing values and high correlation among each \
        other are dropped unless they correlate reasonably well with the target \
        variable (above 'corr_thresh_target').

    Note: If no target is provided, the process exits after step two and drops columns \
    identified up to this point.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    target : Optional[Union[str, pd.Series, List]], optional
        Specify target for correlation, i.e. the label column to generate only the \
        correlations between each feature and the label, by default None
    mv_threshold : float, optional
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger \
        than mv_threshold are candidates for dropping and undergo further analysis, by \
        default 0.1
    corr_thresh_features : float, optional
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified \
        feature (with a high mv-ratio) is allowed to have with another feature. If \
        this threshold is overstepped, the feature undergoes further analysis, by \
        default 0.5
    corr_thresh_target : float, optional
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining \
        feature (i.e. feature with a high mv-ratio and high correlation to another \
        existing feature) with the target. If this threshold is not met the feature is \
        ultimately dropped, by default 0.3
    return_details : bool, optional
        Provides flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        Updated Pandas DataFrame

    optional:
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    """

    # Validate Inputs
    _validate_input_range(mv_threshold, "mv_threshold", 0, 1)
    _validate_input_range(corr_thresh_features, "corr_thresh_features", 0, 1)
    _validate_input_range(corr_thresh_target, "corr_thresh_target", 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()
    mv_ratios = _missing_vals(data_local)["mv_cols_ratio"]
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()

    # Recode candidate columns as binary missing (0) / not missing (1) indicators
    data_local[cols_mv] = (
        data_local[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0)
    )

    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        corrs = corr_mat(data_local, target=target, colored=False).loc[
            high_corr_features
        ]
        drop_cols = corrs.loc[abs(corrs.iloc[:, 0]) < corr_thresh_target].index.tolist()
        data = data.drop(columns=drop_cols)

    if return_details:
        return data, cols_mv, drop_cols

    return data

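# Hedged sketch of mv_col_handling: a feature with many missing values is first
# recoded as a missing/not-missing indicator and then kept or dropped depending on
# its correlations. The frame below is made up and the outcome depends on
# klib.describe.corr_mat:
#
#     df = pd.DataFrame({
#         "sparse": [np.nan, 1.0, np.nan, np.nan, 2.0, np.nan],   # > 10 % missing
#         "dense": [1, 2, 3, 4, 5, 6],
#         "label": [0, 1, 0, 0, 1, 0],
#     })
#     reduced, cols_mv, dropped = mv_col_handling(df, target="label", return_details=True)
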
class MVColHandler(BaseEstimator, TransformerMixin):
    """ Wrapper for mv_col_handling(). Allows mv_col_handling() to be put into a \
        pipeline with similar functions (e.g. using DataCleaner() or SubsetPooler()).

    Parameters
    ----------
    target: string, list, np.array or pd.Series, default None
        Specify target for correlation, e.g. the label column to generate only the \
        correlations between each feature and the label.
    mv_threshold: float, default 0.1
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger \
        than mv_threshold are candidates for dropping and undergo further analysis.
    corr_thresh_features: float, default 0.6
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified \
        feature with a high mv-ratio is allowed to have with another feature. If this \
        threshold is overstepped, the feature undergoes further analysis.
    corr_thresh_target: float, default 0.3
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining \
        feature (i.e. feature with a high mv-ratio and high correlation to another \
        existing feature) with the target. If this threshold is not met the feature is \
        ultimately dropped.
    return_details: bool, default True
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: Updated Pandas DataFrame
    """

    def __init__(
        self,
        target: Optional[Union[str, pd.Series, List]] = None,
        mv_threshold: float = 0.1,
        corr_thresh_features: float = 0.6,
        corr_thresh_target: float = 0.3,
        return_details: bool = True,
    ):
        self.target = target
        self.mv_threshold = mv_threshold
        self.corr_thresh_features = corr_thresh_features
        self.corr_thresh_target = corr_thresh_target
        self.return_details = return_details

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data, cols_mv, dropped_cols = mv_col_handling(
            data,
            target=self.target,
            mv_threshold=self.mv_threshold,
            corr_thresh_features=self.corr_thresh_features,
            corr_thresh_target=self.corr_thresh_target,
            return_details=self.return_details,
        )

        print(f"\nFeatures with MV-ratio > {self.mv_threshold}: {len(cols_mv)}")
        print("Features dropped:", len(dropped_cols), dropped_cols)

        return data

def pool_duplicate_subsets(
    data: pd.DataFrame,
    col_dupl_thresh: float = 0.2,
    subset_thresh: float = 0.2,
    min_col_pool: int = 3,
    exclude: Optional[List[str]] = None,
    return_details=False,
) -> pd.DataFrame:
    """ Checks for duplicates in subsets of columns and pools them. This can reduce \
        the number of columns in the data without losing much information. Suitable \
        columns are combined into subsets and tested for duplicates. In case sufficient \
        duplicates can be found, the respective columns are aggregated into a \
        "pooled_vars" column. Identical numbers in the "pooled_vars" column indicate \
        identical information in the respective rows.

        Note: It is advised to exclude features that provide sufficient informational \
        content by themselves as well as the target column by using the "exclude" \
        setting.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    col_dupl_thresh : float, optional
        Columns with a ratio of duplicates higher than "col_dupl_thresh" are \
        considered in the further analysis. Columns with a lower ratio are not \
        considered for pooling, by default 0.2
    subset_thresh : float, optional
        The first subset with a duplicate ratio higher than "subset_thresh" is \
        chosen and aggregated. If no subset reaches the threshold, the algorithm \
        continues with successively smaller subsets until "min_col_pool" is reached, \
        by default 0.2
    min_col_pool : int, optional
        Minimum number of columns to pool. The algorithm attempts to combine as many \
        columns as possible into suitable subsets and stops when "min_col_pool" is \
        reached, by default 3
    exclude : Optional[List[str]], optional
        List of column names to be excluded from the analysis. These columns are \
        passed through without modification, by default None
    return_details : bool, optional
        Provides flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        DataFrame with low-cardinality columns pooled

    optional:
    subset_cols: List of columns used as subset
    """

    # Input validation
    _validate_input_range(col_dupl_thresh, "col_dupl_thresh", 0, 1)
    _validate_input_range(subset_thresh, "subset_thresh", 0, 1)
    _validate_input_range(min_col_pool, "min_col_pool", 0, data.shape[1])

    excluded_cols = []
    if exclude is not None:
        excluded_cols = data[exclude]
        data = data.drop(columns=exclude)

    subset_cols = []
    # Try the largest candidate subsets first and shrink them step by step until a
    # sufficiently duplicated subset is found or "min_col_pool" is reached
    for i in range(data.shape[1] + 1 - min_col_pool):
        check_list = [
            col
            for col in data.columns
            if data.duplicated(subset=col).mean() > col_dupl_thresh
        ]

        if len(check_list) > 0:
            combinations = itertools.combinations(check_list, len(check_list) - i)
        else:
            continue

        ratios = [
            *map(lambda comb: data.duplicated(subset=list(comb)).mean(), combinations)
        ]

        max_ratio = max(ratios)
        max_idx = np.argmax(ratios)

        if max_ratio > subset_thresh:
            best_subset = itertools.islice(
                itertools.combinations(check_list, len(check_list) - i),
                max_idx,
                max_idx + 1,
            )
            best_subset = data[list(list(best_subset)[0])]
            subset_cols = best_subset.columns.tolist()

            unique_subset = (
                best_subset.drop_duplicates()
                .reset_index()
                .rename(columns={"index": "pooled_vars"})
            )
            data = data.merge(
                unique_subset, how="left", on=best_subset.columns.tolist()
            ).drop(columns=best_subset.columns.tolist())
            data.index = pd.RangeIndex(len(data))
            break

    data = pd.concat([data, pd.DataFrame(excluded_cols)], axis=1)

    if return_details:
        return data, subset_cols

    return data

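# Hedged sketch of pool_duplicate_subsets on a tiny made-up frame: the three heavily
# duplicated flag columns are replaced by a single "pooled_vars" column whose integer
# value identifies each distinct combination.
#
#     df = pd.DataFrame({
#         "flag_a": [0, 0, 1, 0, 0, 1] * 5,
#         "flag_b": [1, 1, 0, 1, 1, 0] * 5,
#         "flag_c": [0, 0, 0, 0, 0, 0] * 5,
#         "value": range(30),
#     })
#     pooled, subset_cols = pool_duplicate_subsets(
#         df, exclude=["value"], return_details=True
#     )
#     # subset_cols -> ['flag_a', 'flag_b', 'flag_c']; "pooled_vars" replaces them
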
class SubsetPooler(BaseEstimator, TransformerMixin):
    """ Wrapper for pool_duplicate_subsets(). Allows pool_duplicate_subsets() to be \
        put into a pipeline with similar functions (e.g. using DataCleaner() or \
        MVColHandler()).

    Parameters
    ----------
    col_dupl_thresh: float, default 0.2
        Columns with a ratio of duplicates higher than "col_dupl_thresh" are considered \
        in the further analysis. Columns with a lower ratio are not considered for \
        pooling.
    subset_thresh: float, default 0.2
        The first subset with a duplicate ratio higher than "subset_thresh" is \
        chosen and aggregated. If no subset reaches the threshold, the algorithm \
        continues with successively smaller subsets until "min_col_pool" is reached.
    min_col_pool: integer, default 3
        Minimum number of columns to pool. The algorithm attempts to combine as many \
        columns as possible into suitable subsets and stops when "min_col_pool" is \
        reached.
    return_details: bool, default True
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: pd.DataFrame
    """

    def __init__(
        self,
        col_dupl_thresh=0.2,
        subset_thresh=0.2,
        min_col_pool=3,
        return_details=True,
    ):
        self.col_dupl_thresh = col_dupl_thresh
        self.subset_thresh = subset_thresh
        self.min_col_pool = min_col_pool
        self.return_details = return_details

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data, subset_cols = pool_duplicate_subsets(
            data,
            col_dupl_thresh=self.col_dupl_thresh,
            subset_thresh=self.subset_thresh,
            min_col_pool=self.min_col_pool,
            return_details=True,  # details are required for the report below
        )

        print("Combined columns:", len(subset_cols), subset_cols)

        return data

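# Hedged end-to-end sketch combining the three wrappers in a scikit-learn Pipeline
# (step names and the input frame "raw_df" are made up; each step prints its own
# report):
#
#     from sklearn.pipeline import Pipeline
#
#     pipe = Pipeline([
#         ("cleaning", DataCleaner(show=None)),
#         ("mv_handling", MVColHandler(target="label")),
#         ("pooling", SubsetPooler(min_col_pool=3)),
#     ])
#     processed = pipe.fit_transform(raw_df)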