Passed: Push — main ( 476ed9...09d8d6 ) by Andreas, created 02:41

klib.clean.mv_col_handling()   B

Complexity

Conditions 7

Size

Total Lines 91
Code Lines 33

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 1
CRAP Score 50.3519

Importance

Changes 0
Metric Value
cc 7
eloc 33
nop 6
dl 0
loc 91
ccs 1
cts 25
cp 0.04
crap 50.3519
rs 7.688
c 0
b 0
f 0
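For reference, the CRAP value above can be reproduced from cc and cp, assuming cp is the coverage fraction (4 %) and the usual CRAP formula (complexity squared, scaled by the cube of the uncovered fraction, plus complexity):

# Recompute the CRAP score from the cc and cp values in the table above.
# Assumes cp (0.04) is the coverage fraction; the tool likely uses a slightly
# more precise coverage figure, hence the tiny difference to the reported 50.3519.
cc = 7       # cyclomatic complexity
cp = 0.04    # coverage fraction
crap = cc ** 2 * (1 - cp) ** 3 + cc
print(crap)  # 50.352064..., in line with the reported value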

How to fix: Long Method

Small methods make your code easier to understand, particularly when combined with a good name. And if a method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a good sign to extract the commented part into a new method and to use the comment as a starting point for naming it.

Commonly applied refactorings include: Extract Method (sketched below).
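To connect this advice to the flagged method: the body of klib.clean.mv_col_handling() (shown in the listing below) already has its three steps separated by blank lines, so the natural move is to extract them into small helpers. The sketch below is illustrative only; the helper names are hypothetical and not part of klib, and the helpers reuse _missing_vals() and corr_mat() exactly as mv_col_handling() does.

# Hypothetical refactoring sketch -- helper names are not part of the klib API.
from klib.describe import corr_mat
from klib.utils import _missing_vals


def _binarize_high_mv_cols(data, mv_threshold):
    # Step 1: flag columns whose missing-value ratio exceeds mv_threshold and
    # encode them as 1 (value present) / 0 (value missing), equivalent to the
    # original applymap/fillna step.
    mv_ratios = _missing_vals(data)["mv_cols_ratio"]
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    data[cols_mv] = data[cols_mv].notna().astype(int)
    return data, cols_mv


def _high_corr_features(data, cols_mv, corr_thresh_features):
    # Step 2: collect flagged columns that correlate strongly with any other feature.
    high_corr, data_temp = [], data.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr.append(col)
            data_temp = data_temp.drop(columns=[col])
    return high_corr

With these two helpers (plus a third for the target-correlation step), mv_col_handling() shrinks to input validation and a handful of readable calls, which is exactly the Extract Method move described above.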

"""
Functions for data cleaning.

:author: Andreas Kanz
"""

# Imports
import itertools
import numpy as np
import pandas as pd
import re
from sklearn.base import BaseEstimator, TransformerMixin
from typing import List, Optional, Union

from klib.describe import corr_mat
from klib.utils import (
    _diff_report,
    _drop_duplicates,
    _missing_vals,
    _validate_input_bool,
    _validate_input_range,
)

__all__ = [
    "clean_column_names",
    "convert_datatypes",
    "data_cleaning",
    "drop_missing",
    "mv_col_handling",
]


def optimize_ints(data: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame:
    """Downcast int64 columns to the smallest possible integer dtype."""
    data = pd.DataFrame(data).copy()
    ints = data.select_dtypes(include=["int64"]).columns.tolist()
    data[ints] = data[ints].apply(pd.to_numeric, downcast="integer")
    return data


def optimize_floats(data: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame:
    """Downcast float64 columns to the smallest possible float dtype."""
    data = pd.DataFrame(data).copy()
    floats = data.select_dtypes(include=["float64"]).columns.tolist()
    data[floats] = data[floats].apply(pd.to_numeric, downcast="float")
    return data


def clean_column_names(data: pd.DataFrame, hints: bool = True) -> pd.DataFrame:
    """Clean the column names of the provided Pandas DataFrame and optionally \
        provide hints on duplicate and long column names.

    Parameters
    ----------
    data : pd.DataFrame
        Original DataFrame with columns to be cleaned
    hints : bool, optional
        Print out hints on column name duplication and column name length, by default \
        True

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame with cleaned column names
    """
    _validate_input_bool(hints, "hints")

    # Handle CamelCase
    for i, col in enumerate(data.columns):
        matches = re.findall(re.compile("[a-z][A-Z]"), col)
        column = col
        for match in matches:
            column = column.replace(match, match[0] + "_" + match[1])
            data.rename(columns={data.columns[i]: column}, inplace=True)

    data.columns = (
        data.columns.str.replace("\n", "_", regex=False)
        .str.replace("(", "_", regex=False)
        .str.replace(")", "_", regex=False)
        .str.replace("'", "_", regex=False)
        .str.replace('"', "_", regex=False)
        .str.replace(".", "_", regex=False)
        .str.replace("-", "_", regex=False)
        .str.replace(r"[!?:;/]", "_", regex=True)
        .str.replace("+", "_plus_", regex=False)
        .str.replace("*", "_times_", regex=False)
        .str.replace("<", "_smaller_", regex=False)
        .str.replace(">", "_larger_", regex=False)
        .str.replace("=", "_equal_", regex=False)
        .str.replace("ä", "ae", regex=False)
        .str.replace("ö", "oe", regex=False)
        .str.replace("ü", "ue", regex=False)
        .str.replace("ß", "ss", regex=False)
        .str.replace("%", "_percent_", regex=False)
        .str.replace("$", "_dollar_", regex=False)
        .str.replace("€", "_euro_", regex=False)
        .str.replace("@", "_at_", regex=False)
        .str.replace("#", "_hash_", regex=False)
        .str.replace("&", "_and_", regex=False)
        .str.replace(r"\s+", "_", regex=True)
        .str.replace(r"_+", "_", regex=True)
        .str.strip("_")
        .str.lower()
    )

    dupl_idx = [i for i, x in enumerate(data.columns.duplicated()) if x]
    if dupl_idx:
        dupl_before = data.columns[dupl_idx].tolist()
        data.columns = [
            col if col not in data.columns[:i] else col + "_" + str(i)
            for i, col in enumerate(data.columns)
        ]
        if hints:
            print(
                f"Duplicate column names detected! Columns with index {dupl_idx} and "
                f"names {dupl_before} have been renamed to "
                f"{data.columns[dupl_idx].tolist()}."
            )

    long_col_names = [x for x in data.columns if len(x) > 25]
    if long_col_names and hints:
        print(
            "Long column names detected (>25 characters). Consider renaming the "
            f"following columns {long_col_names}."
        )

    return data


def convert_datatypes(
    data: pd.DataFrame,
    category: bool = True,
    cat_threshold: float = 0.05,
    cat_exclude: Optional[List[Union[str, int]]] = None,
) -> pd.DataFrame:
    """Convert columns to best possible dtypes using dtypes supporting pd.NA.

    Temporarily not converting to integers due to an issue in pandas. This is expected \
        to be fixed in pandas 1.1. See https://github.com/pandas-dev/pandas/issues/33803

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    category : bool, optional
        Change dtypes of columns with dtype "object" to "category". Set threshold \
        using cat_threshold or exclude columns using cat_exclude, by default True
    cat_threshold : float, optional
        Ratio of unique values below which categories are inferred and column dtype is \
        changed to categorical, by default 0.05
    cat_exclude : Optional[List[Union[str, int]]], optional
        List of columns to exclude from categorical conversion, by default None

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame with converted datatypes
    """
    # Validate Inputs
    _validate_input_bool(category, "Category")
    _validate_input_range(cat_threshold, "cat_threshold", 0, 1)

    cat_exclude = [] if cat_exclude is None else cat_exclude.copy()

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if (
            category
            and unique_vals_ratio < cat_threshold
            and col not in cat_exclude
            and data[col].dtype == "object"
        ):
            data[col] = data[col].astype("category")

        data[col] = data[col].convert_dtypes(
            infer_objects=True,
            convert_string=True,
            convert_integer=False,
            convert_boolean=True,
        )

    data = optimize_ints(data)
    data = optimize_floats(data)

    return data


def drop_missing(
    data: pd.DataFrame,
    drop_threshold_cols: float = 1,
    drop_threshold_rows: float = 1,
    col_exclude: Optional[List[str]] = None,
) -> pd.DataFrame:
    """Drop completely empty columns and rows by default and optionally provide \
        flexibility to loosen restrictions to drop additional non-empty columns and \
        rows based on the fraction of NA-values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    drop_threshold_cols : float, optional
        Drop columns with NA-ratio equal to or above the specified threshold, by \
        default 1
    drop_threshold_rows : float, optional
        Drop rows with NA-ratio equal to or above the specified threshold, by default 1
    col_exclude : Optional[List[str]], optional
        Specify a list of columns to exclude from dropping. The excluded columns do \
        not affect the drop thresholds, by default None

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame without any empty columns or rows

    Notes
    -----
    Columns are dropped first.
    """
    # Validate Inputs
    _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
    _validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)

    col_exclude = [] if col_exclude is None else col_exclude.copy()
    data_exclude = data[col_exclude]

    data = pd.DataFrame(data).copy()

    data_dropped = data.drop(columns=col_exclude, errors="ignore")
    data_dropped = data_dropped.drop(
        columns=data_dropped.loc[
            :, _missing_vals(data)["mv_cols_ratio"] > drop_threshold_cols
        ].columns
    ).dropna(axis=1, how="all")

    data = pd.concat([data_dropped, data_exclude], axis=1)

    return data.drop(
        index=data.loc[
            _missing_vals(data)["mv_rows_ratio"] > drop_threshold_rows, :
        ].index
    ).dropna(axis=0, how="all")


def data_cleaning(
    data: pd.DataFrame,
    drop_threshold_cols: float = 0.9,
    drop_threshold_rows: float = 0.9,
    drop_duplicates: bool = True,
    convert_dtypes: bool = True,
    col_exclude: Optional[List[str]] = None,
    category: bool = True,
    cat_threshold: float = 0.03,
    cat_exclude: Optional[List[Union[str, int]]] = None,
    clean_col_names: bool = True,
    show: str = "changes",
) -> pd.DataFrame:
    """Perform initial data cleaning tasks on a dataset, such as dropping \
        single-valued and empty columns as well as empty rows, and optimizing the \
        datatypes.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    drop_threshold_cols : float, optional
        Drop columns with NA-ratio equal to or above the specified threshold, by \
        default 0.9
    drop_threshold_rows : float, optional
        Drop rows with NA-ratio equal to or above the specified threshold, by \
        default 0.9
    drop_duplicates : bool, optional
        Drop duplicate rows, keeping the first occurrence. This step comes after the \
        dropping of missing values, by default True
    convert_dtypes : bool, optional
        Convert dtypes using pd.convert_dtypes(), by default True
    col_exclude : Optional[List[str]], optional
        Specify a list of columns to exclude from dropping, by default None
    category : bool, optional
        Enable changing dtypes of "object" columns to "category". Set threshold using \
        cat_threshold. Requires convert_dtypes=True, by default True
    cat_threshold : float, optional
        Ratio of unique values below which categories are inferred and column dtype is \
        changed to categorical, by default 0.03
    cat_exclude : Optional[List[Union[str, int]]], optional
        List of columns to exclude from categorical conversion, by default None
    clean_col_names : bool, optional
        Clean the column names and provide hints on duplicate and long names, by \
        default True
    show : str, optional
        {"all", "changes", None}, by default "changes"
        Specify verbosity of the output:

            * "all": Print information about the data before and after cleaning as \
            well as information about changes and memory usage (deep). Please be \
            aware that this can slow down the function by quite a bit.
            * "changes": Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    pd.DataFrame
        Cleaned Pandas DataFrame

    See Also
    --------
    convert_datatypes: Convert columns to best possible dtypes.
    drop_missing : Flexibly drop columns and rows.
    _memory_usage: Gives the total memory usage in megabytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the \
    same categories.
    """
    # Validate Inputs
    _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
    _validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)
    _validate_input_bool(drop_duplicates, "drop_duplicates")
    _validate_input_bool(convert_dtypes, "convert_dtypes")
    _validate_input_bool(category, "category")
    _validate_input_range(cat_threshold, "cat_threshold", 0, 1)

    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(
        data, drop_threshold_cols, drop_threshold_rows, col_exclude=col_exclude
    )

    if clean_col_names:
        data_cleaned = clean_column_names(data_cleaned)

    single_val_cols = data_cleaned.columns[
        data_cleaned.nunique(dropna=False) == 1
    ].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    dupl_rows = None

    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)
    if convert_dtypes:
        data_cleaned = convert_datatypes(
            data_cleaned,
            category=category,
            cat_threshold=cat_threshold,
            cat_exclude=cat_exclude,
        )

    _diff_report(
        data,
        data_cleaned,
        dupl_rows=dupl_rows,
        single_val_cols=single_val_cols,
        show=show,
    )

    return data_cleaned


class DataCleaner(BaseEstimator, TransformerMixin):
    """Wrapper for data_cleaning(). Allows data_cleaning() to be put into a pipeline \
    with similar functions (e.g. using MVColHandler() or SubsetPooler()).

    Parameters
    ----------
    drop_threshold_cols: float, default 0.9
        Drop columns with NA-ratio equal to or above the specified threshold.
    drop_threshold_rows: float, default 0.9
        Drop rows with NA-ratio equal to or above the specified threshold.
    drop_duplicates: bool, default True
        Drop duplicate rows, keeping the first occurrence. This step comes after the \
        dropping of missing values.
    convert_dtypes: bool, default True
        Convert dtypes using pd.convert_dtypes().
    col_exclude: list, default None
        Specify a list of columns to exclude from dropping.
    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold. \
        Requires convert_dtypes=True
    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and column dtype is \
        changed to categorical.
    cat_exclude: list, default None
        List of columns to exclude from categorical conversion.
    clean_col_names: bool, default True
        Clean the column names and provide hints on duplicate and long names.
    show: str, optional
        {"all", "changes", None}, by default "changes"
        Specify verbosity of the output:
            * "all": Print information about the data before and after cleaning as \
            well as information about changes and memory usage (deep). Please be \
            aware that this can slow down the function by quite a bit.
            * "changes": Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    data_cleaned: Pandas DataFrame
    """

    def __init__(
        self,
        drop_threshold_cols: float = 0.9,
        drop_threshold_rows: float = 0.9,
        drop_duplicates: bool = True,
        convert_dtypes: bool = True,
        col_exclude: Optional[List[str]] = None,
        category: bool = True,
        cat_threshold: float = 0.03,
        cat_exclude: Optional[List[Union[str, int]]] = None,
        clean_col_names: bool = True,
        show: str = "changes",
    ):
        self.drop_threshold_cols = drop_threshold_cols
        self.drop_threshold_rows = drop_threshold_rows
        self.drop_duplicates = drop_duplicates
        self.convert_dtypes = convert_dtypes
        self.col_exclude = col_exclude
        self.category = category
        self.cat_threshold = cat_threshold
        self.cat_exclude = cat_exclude
        self.clean_col_names = clean_col_names
        self.show = show

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        return data_cleaning(
            data,
            drop_threshold_cols=self.drop_threshold_cols,
            drop_threshold_rows=self.drop_threshold_rows,
            drop_duplicates=self.drop_duplicates,
            convert_dtypes=self.convert_dtypes,
            col_exclude=self.col_exclude,
            category=self.category,
            cat_threshold=self.cat_threshold,
            cat_exclude=self.cat_exclude,
            clean_col_names=self.clean_col_names,
            show=self.show,
        )


def mv_col_handling(
    data: pd.DataFrame,
    target: Optional[Union[str, pd.Series, List]] = None,
    mv_threshold: float = 0.1,
    corr_thresh_features: float = 0.5,
    corr_thresh_target: float = 0.3,
    return_details: bool = False,
) -> pd.DataFrame:
    """Convert columns with a high ratio of missing values into binary features and \
    potentially drop them based on their correlation with other features and the \
    target variable.

    This function follows a three step process:
    - 1) Identify features with a high ratio of missing values (above 'mv_threshold').
    - 2) Identify high correlations of these features among themselves and with \
        other features in the dataset (above 'corr_thresh_features').
    - 3) Features with high ratio of missing values and high correlation among each \
        other are dropped unless they correlate reasonably well with the target \
        variable (above 'corr_thresh_target').

    Note: If no target is provided, the process exits after step two and drops columns \
    identified up to this point.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    target : Optional[Union[str, pd.Series, List]], optional
        Specify target for correlation. I.e. label column to generate only the \
        correlations between each feature and the label, by default None
    mv_threshold : float, optional
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger \
        than mv_threshold are candidates for dropping and undergo further analysis, by \
        default 0.1
    corr_thresh_features : float, optional
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified \
        feature (with a high mv-ratio) is allowed to have with another feature. If \
        this threshold is overstepped, the feature undergoes further analysis, by \
        default 0.5
    corr_thresh_target : float, optional
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining \
        feature (i.e. feature with a high mv-ratio and high correlation to another \
        existing feature) with the target. If this threshold is not met the feature is \
        ultimately dropped, by default 0.3
    return_details : bool, optional
        Provides flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        Updated Pandas DataFrame

    optional:
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    """
    # Validate Inputs
    _validate_input_range(mv_threshold, "mv_threshold", 0, 1)
    _validate_input_range(corr_thresh_features, "corr_thresh_features", 0, 1)
    _validate_input_range(corr_thresh_target, "corr_thresh_target", 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()
    mv_ratios = _missing_vals(data_local)["mv_cols_ratio"]
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    data_local[cols_mv] = (
        data_local[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0)
    )

    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        corrs = corr_mat(data_local, target=target, colored=False).loc[
            high_corr_features
        ]
        drop_cols = corrs.loc[abs(corrs.iloc[:, 0]) < corr_thresh_target].index.tolist()
        data = data.drop(columns=drop_cols)

    if return_details:
        return data, cols_mv, drop_cols

    return data


class MVColHandler(BaseEstimator, TransformerMixin):
    """Wrapper for mv_col_handling(). Allows mv_col_handling() to be put into a \
        pipeline with similar functions (e.g. using DataCleaner() or SubsetPooler()).

    Parameters
    ----------
    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. E.g. label column to generate only the \
        correlations between each feature and the label.
    mv_threshold: float, default 0.1
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger \
        than mv_threshold are candidates for dropping and undergo further analysis.
    corr_thresh_features: float, default 0.6
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified \
        feature with a high mv-ratio is allowed to have with another feature. If this \
        threshold is overstepped, the feature undergoes further analysis.
    corr_thresh_target: float, default 0.3
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining \
        feature (i.e. feature with a high mv-ratio and high correlation to another \
        existing feature) with the target. If this threshold is not met the feature is \
        ultimately dropped.
    return_details: bool, default True
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: Updated Pandas DataFrame
    """

    def __init__(
        self,
        target: Optional[Union[str, pd.Series, List]] = None,
        mv_threshold: float = 0.1,
        corr_thresh_features: float = 0.6,
        corr_thresh_target: float = 0.3,
        return_details: bool = True,
    ):
        self.target = target
        self.mv_threshold = mv_threshold
        self.corr_thresh_features = corr_thresh_features
        self.corr_thresh_target = corr_thresh_target
        self.return_details = return_details

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data, cols_mv, dropped_cols = mv_col_handling(
            data,
            target=self.target,
            mv_threshold=self.mv_threshold,
            corr_thresh_features=self.corr_thresh_features,
            corr_thresh_target=self.corr_thresh_target,
            return_details=self.return_details,
        )

        print(f"\nFeatures with MV-ratio > {self.mv_threshold}: {len(cols_mv)}")
        print("Features dropped:", len(dropped_cols), dropped_cols)

        return data


def pool_duplicate_subsets(
    data: pd.DataFrame,
    col_dupl_thresh: float = 0.2,
    subset_thresh: float = 0.2,
    min_col_pool: int = 3,
    exclude: Optional[List[str]] = None,
    return_details: bool = False,
) -> pd.DataFrame:
    """Check for duplicates in subsets of columns and pool them. This can reduce \
        the number of columns in the data without losing much information. Suitable \
        columns are combined into subsets and tested for duplicates. In case sufficient \
        duplicates can be found, the respective columns are aggregated into a \
        "pooled_vars" column. Identical numbers in the "pooled_vars" column indicate \
        identical information in the respective rows.

        Note: It is advised to exclude features that provide sufficient informational \
        content by themselves as well as the target column by using the "exclude" \
        setting.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    col_dupl_thresh : float, optional
        Columns with a ratio of duplicates higher than "col_dupl_thresh" are \
        considered in the further analysis. Columns with a lower ratio are not \
        considered for pooling, by default 0.2
    subset_thresh : float, optional
        The first subset with a duplicate ratio higher than "subset_thresh" is \
        chosen and aggregated. If no subset reaches the threshold, the algorithm \
        continues with continuously smaller subsets until "min_col_pool" is reached, \
        by default 0.2
    min_col_pool : int, optional
        Minimum number of columns to pool. The algorithm attempts to combine as many \
        columns as possible to suitable subsets and stops when "min_col_pool" is \
        reached, by default 3
    exclude : Optional[List[str]], optional
        List of column names to be excluded from the analysis. These columns are \
        passed through without modification, by default None
    return_details : bool, optional
        Provides flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        DataFrame with low cardinality columns pooled

    optional:
    subset_cols: List of columns used as subset
    """
    # Input validation
    _validate_input_range(col_dupl_thresh, "col_dupl_thresh", 0, 1)
    _validate_input_range(subset_thresh, "subset_thresh", 0, 1)
    _validate_input_range(min_col_pool, "min_col_pool", 0, data.shape[1])

    excluded_cols = []
    if exclude is not None:
        excluded_cols = data[exclude]
        data = data.drop(columns=exclude)

    subset_cols = []
    for i in range(data.shape[1] + 1 - min_col_pool):
        # Consider only columns with lots of duplicates
        check_list = [
            col
            for col in data.columns
            if data.duplicated(subset=col).mean() > col_dupl_thresh
        ]

        # Identify all possible combinations for the current iteration
        if check_list:
            combinations = itertools.combinations(check_list, len(check_list) - i)
        else:
            continue

        # Check subsets for all possible combinations
        ratios = [
            *map(lambda comb: data.duplicated(subset=list(comb)).mean(), combinations)
        ]
        max_idx = np.argmax(ratios)

        if max(ratios) > subset_thresh:
            # Get the best possible iterator and process the data
            best_subset = itertools.islice(
                itertools.combinations(check_list, len(check_list) - i),
                max_idx,
                max_idx + 1,
            )

            best_subset = data[list(list(best_subset)[0])]
            subset_cols = best_subset.columns.tolist()

            unique_subset = (
                best_subset.drop_duplicates()
                .reset_index()
                .rename(columns={"index": "pooled_vars"})
            )
            data = data.merge(unique_subset, how="left", on=subset_cols).drop(
                columns=subset_cols
            )
            data.index = pd.RangeIndex(len(data))
            break

    data = pd.concat([data, pd.DataFrame(excluded_cols)], axis=1)

    if return_details:
        return data, subset_cols

    return data


class SubsetPooler(BaseEstimator, TransformerMixin):
    """Wrapper for pool_duplicate_subsets(). Allows pool_duplicate_subsets() to be \
        put into a pipeline with similar functions (e.g. using DataCleaner() or \
        MVColHandler()).

    Parameters
    ----------
    col_dupl_thresh: float, default 0.2
        Columns with a ratio of duplicates higher than "col_dupl_thresh" are considered \
        in the further analysis. Columns with a lower ratio are not considered for \
        pooling.
    subset_thresh: float, default 0.2
        The first subset with a duplicate ratio higher than "subset_thresh" is \
        chosen and aggregated. If no subset reaches the threshold, the algorithm \
        continues with continuously smaller subsets until "min_col_pool" is reached.
    min_col_pool: int, default 3
        Minimum number of columns to pool. The algorithm attempts to combine as many \
        columns as possible to suitable subsets and stops when "min_col_pool" is \
        reached.
    return_details: bool, default True
        Provides flexibility to return intermediary results.

    Returns
    -------
    data: pd.DataFrame
    """

    def __init__(
        self,
        col_dupl_thresh=0.2,
        subset_thresh=0.2,
        min_col_pool=3,
        return_details=True,
    ):
        self.col_dupl_thresh = col_dupl_thresh
        self.subset_thresh = subset_thresh
        self.min_col_pool = min_col_pool
        self.return_details = return_details

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        data, subset_cols = pool_duplicate_subsets(
            data,
            col_dupl_thresh=self.col_dupl_thresh,
            subset_thresh=self.subset_thresh,
            min_col_pool=self.min_col_pool,
            return_details=True,
        )

        print("Combined columns:", len(subset_cols), subset_cols)

        return data
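Because the three wrapper classes in this file are scikit-learn transformers, they can be chained in a Pipeline. Below is a minimal usage sketch, not part of the module: the toy DataFrame and its values are made up purely to exercise each step, and the printed hints depend on the data.

import pandas as pd
from sklearn.pipeline import Pipeline

from klib.clean import DataCleaner, MVColHandler, SubsetPooler

# Toy frame: one fully empty column and one column with scattered missing values.
df = pd.DataFrame(
    {
        "feat_a": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
        "feat_b": [3.0, 1.0, 4.0, 1.5, 9.0, 2.6, 5.3, 5.8, 9.7, 9.3],
        "feat_c": [2.0, 7.0, 1.0, 8.0, 2.8, 1.8, 2.9, 4.5, 9.1, 6.4],
        "with_gaps": [0.5, None, 1.5, 2.0, None, 3.0, 3.5, 4.0, 4.5, 5.0],
        "empty": [None] * 10,
    }
)

pipe = Pipeline(
    [
        ("clean", DataCleaner(show=None)),  # drops "empty", tidies dtypes
        ("mv", MVColHandler()),             # analyses the sparse "with_gaps" column
        ("pool", SubsetPooler()),           # pools duplicate column subsets, if any
    ]
)
df_out = pipe.fit_transform(df)
print(df_out.dtypes)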