Test Failed: push to main ( 7b7d02...0ccad7 ) by Andreas, created 06:54

klib.clean._optimize_ints()   A

Complexity:   Conditions 1
Size:         Total Lines 5, Code Lines 5
Duplication:  Lines 0, Ratio 0 %
Importance:   Changes 0

Metric   Value
cc       1
eloc     5
nop      1
dl       0
loc      5
rs       10
c        0
b        0
f        0

"""
Functions for data cleaning.

:author: Andreas Kanz
"""

from __future__ import annotations

import itertools
import re
from typing import Literal
from typing import Optional

import numpy as np
import pandas as pd

from klib.describe import corr_mat
from klib.utils import _diff_report
from klib.utils import _drop_duplicates
from klib.utils import _missing_vals
from klib.utils import _validate_input_bool
from klib.utils import _validate_input_range

__all__ = [
    "clean_column_names",
    "convert_datatypes",
    "data_cleaning",
    "drop_missing",
    "mv_col_handling",
]

def _optimize_ints(data: pd.Series | pd.DataFrame) -> pd.DataFrame:
    df = pd.DataFrame(data).copy()
    ints = df.select_dtypes(include=["int64"]).columns.tolist()
    df[ints] = df[ints].apply(pd.to_numeric, downcast="integer")
    return df

def _optimize_floats(data: pd.Series | pd.DataFrame) -> pd.DataFrame:
    data = pd.DataFrame(data).copy()
    floats = data.select_dtypes(include=["float64"]).columns.tolist()
    data[floats] = data[floats].apply(pd.to_numeric, downcast="float")
    return data

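# Illustrative usage sketch (not part of the klib source): both helpers rely on
# pd.to_numeric(..., downcast=...), so int64/float64 columns are shrunk to the
# smallest dtype that still holds their values. The frame below is made up.
#
#   import pandas as pd
#   from klib.clean import _optimize_floats, _optimize_ints
#
#   df = pd.DataFrame({"a": [1, 2, 3], "b": [0.5, 1.5, 2.5]})
#   df = _optimize_floats(_optimize_ints(df))
#   print(df.dtypes)  # "a" is expected to become int8, "b" float32
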
def clean_column_names(data: pd.DataFrame, hints: bool = True) -> pd.DataFrame:
    """Clean the column names of the provided Pandas DataFrame and optionally \
        provide hints on duplicate and long column names.

    Parameters
    ----------
    data : pd.DataFrame
        Original DataFrame with columns to be cleaned
    hints : bool, optional
        Print out hints on column name duplication and column name length, by default \
        True

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame with cleaned column names
    """
    _validate_input_bool(hints, "hints")

    # Handle CamelCase
    for i, col in enumerate(data.columns):
        matches = re.findall(re.compile("[a-z][A-Z]"), col)
        column = col
        for match in matches:
            column = column.replace(match, f"{match[0]}_{match[1]}")
            data.rename(columns={data.columns[i]: column}, inplace=True)

    data.columns = (
        data.columns.str.replace("\n", "_", regex=False)
        .str.replace("(", "_", regex=False)
        .str.replace(")", "_", regex=False)
        .str.replace("'", "_", regex=False)
        .str.replace('"', "_", regex=False)
        .str.replace(".", "_", regex=False)
        .str.replace("-", "_", regex=False)
        .str.replace(r"[!?:;/]", "_", regex=True)
        .str.replace("+", "_plus_", regex=False)
        .str.replace("*", "_times_", regex=False)
        .str.replace("<", "_smaller_", regex=False)
        .str.replace(">", "_larger_", regex=False)
        .str.replace("=", "_equal_", regex=False)
        .str.replace("ä", "ae", regex=False)
        .str.replace("ö", "oe", regex=False)
        .str.replace("ü", "ue", regex=False)
        .str.replace("ß", "ss", regex=False)
        .str.replace("%", "_percent_", regex=False)
        .str.replace("$", "_dollar_", regex=False)
        .str.replace("€", "_euro_", regex=False)
        .str.replace("@", "_at_", regex=False)
        .str.replace("#", "_hash_", regex=False)
        .str.replace("&", "_and_", regex=False)
        .str.replace(r"\s+", "_", regex=True)
        .str.replace(r"_+", "_", regex=True)
        .str.strip("_")
        .str.lower()
    )

    if dupl_idx := [i for i, x in enumerate(data.columns.duplicated()) if x]:
        dupl_before = data.columns[dupl_idx].tolist()
        data.columns = [
            col if col not in data.columns[:i] else f"{col}_{str(i)}"
            for i, col in enumerate(data.columns)
        ]

        if hints:
            print(
                f"Duplicate column names detected! Columns with index {dupl_idx} and "
                f"names {dupl_before} have been renamed to "
                f"{data.columns[dupl_idx].tolist()}."
            )

    long_col_names = [x for x in data.columns if len(x) > 25]
    if long_col_names and hints:
        print(
            "Long column names detected (>25 characters). Consider renaming the "
            f"following columns {long_col_names}."
        )

    return data

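# Illustrative usage sketch (not part of the klib source), using made-up column
# names: CamelCase is split, special characters are replaced, and duplicate
# names receive an index suffix.
#
#   import pandas as pd
#   from klib.clean import clean_column_names
#
#   df = pd.DataFrame(columns=["FirstName", "Last Name!", "Last Name?"])
#   print(clean_column_names(df, hints=False).columns.tolist())
#   # expected: ['first_name', 'last_name', 'last_name_2']
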
def convert_datatypes(
    data: pd.DataFrame,
    category: bool = True,
    cat_threshold: float = 0.05,
    cat_exclude: Optional[list[str | int]] = None,
) -> pd.DataFrame:
    """Convert columns to best possible dtypes using dtypes supporting pd.NA.

    Temporarily not converting to integers due to an issue in pandas. This is expected \
        to be fixed in pandas 1.1. See https://github.com/pandas-dev/pandas/issues/33803

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    category : bool, optional
        Change dtypes of columns with dtype "object" to "category". Set threshold \
        using cat_threshold or exclude columns using cat_exclude, by default True
    cat_threshold : float, optional
        Ratio of unique values below which categories are inferred and column dtype is \
        changed to categorical, by default 0.05
    cat_exclude : Optional[list[str | int]], optional
        List of columns to exclude from categorical conversion, by default None

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame with converted Datatypes
    """
    # Validate Inputs
    _validate_input_bool(category, "Category")
    _validate_input_range(cat_threshold, "cat_threshold", 0, 1)

    cat_exclude = [] if cat_exclude is None else cat_exclude.copy()

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if (
            category
            and unique_vals_ratio < cat_threshold
            and col not in cat_exclude
            and data[col].dtype == "object"
        ):
            data[col] = data[col].astype("category")

        data[col] = data[col].convert_dtypes(
            infer_objects=True,
            convert_string=True,
            convert_integer=False,
            convert_boolean=True,
        )

    data = _optimize_ints(data)
    data = _optimize_floats(data)

    return data

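# Illustrative usage sketch (not part of the klib source), with a made-up frame:
# low-cardinality object columns become "category", strings and booleans get the
# pd.NA-backed dtypes, and numeric columns are downcast afterwards.
#
#   import pandas as pd
#   from klib.clean import convert_datatypes
#
#   df = pd.DataFrame({"city": ["Berlin", "Paris"] * 60, "n": range(120)})
#   print(convert_datatypes(df).dtypes)
#   # "city" is expected to become category, "n" a downcast integer dtype
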
def drop_missing(
    data: pd.DataFrame,
    drop_threshold_cols: float = 1,
    drop_threshold_rows: float = 1,
    col_exclude: Optional[list[str]] = None,
) -> pd.DataFrame:
    """Drop completely empty columns and rows by default, and optionally loosen \
        these restrictions to also drop non-empty columns and rows based on their \
        fraction of NA-values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    drop_threshold_cols : float, optional
        Drop columns with NA-ratio equal to or above the specified threshold, by \
        default 1
    drop_threshold_rows : float, optional
        Drop rows with NA-ratio equal to or above the specified threshold, by default 1
    col_exclude : Optional[list[str]], optional
        Specify a list of columns to exclude from dropping. The excluded columns do \
        not affect the drop thresholds, by default None

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame without any empty columns or rows

    Notes
    -----
    Columns are dropped first
    """
    # Validate Inputs
    _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
    _validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)

    col_exclude = [] if col_exclude is None else col_exclude.copy()
    data_exclude = data[col_exclude]

    data = pd.DataFrame(data).copy()

    data_dropped = data.drop(columns=col_exclude, errors="ignore")
    data_dropped = data_dropped.drop(
        columns=data_dropped.loc[
            :, _missing_vals(data)["mv_cols_ratio"] > drop_threshold_cols
        ].columns
    ).dropna(axis=1, how="all")

    data = pd.concat([data_dropped, data_exclude], axis=1)

    return data.drop(
        index=data.loc[
            _missing_vals(data)["mv_rows_ratio"] > drop_threshold_rows, :
        ].index
    ).dropna(axis=0, how="all")

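# Illustrative usage sketch (not part of the klib source), with a made-up frame:
# fully empty columns and rows are always dropped; lowering the thresholds also
# drops columns/rows whose NA-ratio exceeds the given value.
#
#   import numpy as np
#   import pandas as pd
#   from klib.clean import drop_missing
#
#   df = pd.DataFrame(
#       {"a": [1, 2, np.nan], "b": [np.nan] * 3, "c": [1, np.nan, np.nan]}
#   )
#   print(drop_missing(df).shape)  # (2, 2): "b" and the all-NA last row are dropped
#   print(drop_missing(df, drop_threshold_cols=0.5).shape)  # (2, 1): "c" goes too
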
def data_cleaning(
    data: pd.DataFrame,
    drop_threshold_cols: float = 0.9,
    drop_threshold_rows: float = 0.9,
    drop_duplicates: bool = True,
    convert_dtypes: bool = True,
    col_exclude: Optional[list[str]] = None,
    category: bool = True,
    cat_threshold: float = 0.03,
    cat_exclude: Optional[list[str | int]] = None,
    clean_col_names: bool = True,
    show: Optional[Literal["all", "changes"]] = "changes",
) -> pd.DataFrame:
    """Perform initial data cleaning tasks on a dataset, such as dropping empty \
        rows and columns, single-valued columns and duplicate rows, as well as \
        optimizing the datatypes.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    drop_threshold_cols : float, optional
        Drop columns with NA-ratio equal to or above the specified threshold, by \
        default 0.9
    drop_threshold_rows : float, optional
        Drop rows with NA-ratio equal to or above the specified threshold, by \
        default 0.9
    drop_duplicates : bool, optional
        Drop duplicate rows, keeping the first occurrence. This step comes after the \
        dropping of missing values, by default True
    convert_dtypes : bool, optional
        Convert dtypes using pd.convert_dtypes(), by default True
    col_exclude : Optional[list[str]], optional
        Specify a list of columns to exclude from dropping, by default None
    category : bool, optional
        Enable changing dtypes of "object" columns to "category". Set threshold using \
        cat_threshold. Requires convert_dtypes=True, by default True
    cat_threshold : float, optional
        Ratio of unique values below which categories are inferred and column dtype is \
        changed to categorical, by default 0.03
    cat_exclude : Optional[list[str | int]], optional
        List of columns to exclude from categorical conversion, by default None
    clean_col_names : bool, optional
        Clean the column names and provide hints on duplicate and long names, by \
        default True
    show : Optional[Literal["all", "changes"]], optional
        {"all", "changes", None}, by default "changes"
        Specify verbosity of the output:

            * "all": Print information about the data before and after cleaning as \
            well as information about changes and memory usage (deep). Please be \
            aware that this can slow down the function by quite a bit.
            * "changes": Print out differences in the data before and after cleaning.
            * None: No information about the data or the data cleaning is printed.

    Returns
    -------
    pd.DataFrame
        Cleaned Pandas DataFrame

    See Also
    --------
    convert_datatypes : Convert columns to best possible dtypes.
    drop_missing : Flexibly drop columns and rows.
    _memory_usage : Gives the total memory usage in megabytes.
    _missing_vals : Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the \
    same categories.
    """
    # Validate Inputs
    _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
    _validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)
    _validate_input_bool(drop_duplicates, "drop_duplicates")
    _validate_input_bool(convert_dtypes, "convert_dtypes")
    _validate_input_bool(category, "category")
    _validate_input_range(cat_threshold, "cat_threshold", 0, 1)

    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(
        data, drop_threshold_cols, drop_threshold_rows, col_exclude=col_exclude
    )

    if clean_col_names:
        data_cleaned = clean_column_names(data_cleaned)

    single_val_cols = data_cleaned.columns[
        data_cleaned.nunique(dropna=False) == 1
    ].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    dupl_rows = None

    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)
    if convert_dtypes:
        data_cleaned = convert_datatypes(
            data_cleaned,
            category=category,
            cat_threshold=cat_threshold,
            cat_exclude=cat_exclude,
        )

    _diff_report(
        data,
        data_cleaned,
        dupl_rows=dupl_rows,
        single_val_cols=single_val_cols,
        show=show,
    )

    return data_cleaned

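# Illustrative usage sketch (not part of the klib source): data_cleaning chains
# drop_missing, clean_column_names, single-value column removal, duplicate
# dropping and convert_datatypes; the titanic dataset is only an example.
#
#   import klib
#   import seaborn as sns
#
#   df = sns.load_dataset("titanic")
#   df_clean = klib.data_cleaning(df, show="changes")
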
def mv_col_handling(
    data: pd.DataFrame,
    target: Optional[str | pd.Series | list[str]] = None,
    mv_threshold: float = 0.1,
    corr_thresh_features: float = 0.5,
    corr_thresh_target: float = 0.3,
    return_details: bool = False,
) -> pd.DataFrame | tuple[pd.DataFrame, list[str], list[str]]:
    """Convert columns with a high ratio of missing values into binary features and \
    potentially drop them based on their correlation with other features and the \
    target variable.

    This function follows a three-step process:
    - 1) Identify features with a high ratio of missing values (above 'mv_threshold').
    - 2) Identify high correlations of these features among themselves and with \
        other features in the dataset (above 'corr_thresh_features').
    - 3) Features with high ratio of missing values and high correlation among each \
        other are dropped unless they correlate reasonably well with the target \
        variable (above 'corr_thresh_target').

    Note: If no target is provided, the process exits after step two and drops columns \
    identified up to this point.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    target : Optional[str | pd.Series | list[str]], optional
        Specify target for correlation, i.e. the label column to generate only the \
        correlations between each feature and the label, by default None
    mv_threshold : float, optional
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger \
        than mv_threshold are candidates for dropping and undergo further analysis, by \
        default 0.1
    corr_thresh_features : float, optional
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified \
        feature (with a high mv-ratio) is allowed to have with another feature. If \
        this threshold is overstepped, the feature undergoes further analysis, by \
        default 0.5
    corr_thresh_target : float, optional
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining \
        feature (i.e. feature with a high mv-ratio and high correlation to another \
        existing feature) with the target. If this threshold is not met the feature is \
        ultimately dropped, by default 0.3
    return_details : bool, optional
        Provides flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        Updated Pandas DataFrame

    optional:
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    """
    # Validate Inputs
    _validate_input_range(mv_threshold, "mv_threshold", 0, 1)
    _validate_input_range(corr_thresh_features, "corr_thresh_features", 0, 1)
    _validate_input_range(corr_thresh_target, "corr_thresh_target", 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()
    mv_ratios = _missing_vals(data_local)["mv_cols_ratio"]
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    data_local[cols_mv] = (
        data_local[cols_mv].applymap(lambda x: x if pd.isnull(x) else 1).fillna(0)
    )

    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        corrs = corr_mat(data_local, target=target, colored=False).loc[
            high_corr_features
        ]
        drop_cols = corrs.loc[abs(corrs.iloc[:, 0]) < corr_thresh_target].index.tolist()
        data = data.drop(columns=drop_cols)

    return (data, cols_mv, drop_cols) if return_details else data

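# Illustrative usage sketch (not part of the klib source): columns with many
# missing values are binarised (1 = value present, 0 = missing) and dropped
# unless they still correlate with the target. The names below are made up.
#
#   from klib.clean import mv_col_handling
#
#   # assuming a DataFrame `df` that contains a label column named "label"
#   df_reduced, cols_mv, dropped = mv_col_handling(
#       df, target="label", mv_threshold=0.2, return_details=True
#   )
#   print(cols_mv, dropped)
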
def pool_duplicate_subsets(
    data: pd.DataFrame,
    col_dupl_thresh: float = 0.2,
    subset_thresh: float = 0.2,
    min_col_pool: int = 3,
    exclude: Optional[list[str]] = None,
    return_details: bool = False,
) -> pd.DataFrame | tuple[pd.DataFrame, list[str]]:
    """Check for duplicates in subsets of columns and pool them. This can reduce \
        the number of columns in the data without losing much information. Suitable \
        columns are combined into subsets and tested for duplicates. If sufficient \
        duplicates are found, the respective columns are aggregated into a \
        "pooled_vars" column. Identical numbers in the "pooled_vars" column indicate \
        identical information in the respective rows.

        Note: It is advised to exclude features that provide sufficient informational \
        content by themselves as well as the target column by using the "exclude" \
        setting.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    col_dupl_thresh : float, optional
        Columns with a ratio of duplicates higher than "col_dupl_thresh" are \
        considered in the further analysis. Columns with a lower ratio are not \
        considered for pooling, by default 0.2
    subset_thresh : float, optional
        The first subset with a duplicate ratio higher than "subset_thresh" is \
        chosen and aggregated. If no subset reaches the threshold, the algorithm \
        continues with continuously smaller subsets until "min_col_pool" is reached, \
        by default 0.2
    min_col_pool : int, optional
        Minimum number of columns to pool. The algorithm attempts to combine as many \
        columns as possible to suitable subsets and stops when "min_col_pool" is \
        reached, by default 3
    exclude : Optional[list[str]], optional
        List of column names to be excluded from the analysis. These columns are \
        passed through without modification, by default None
    return_details : bool, optional
        Provides flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        DataFrame with low cardinality columns pooled

    optional:
    subset_cols: List of columns used as subset
    """
    # Input validation
    _validate_input_range(col_dupl_thresh, "col_dupl_thresh", 0, 1)
    _validate_input_range(subset_thresh, "subset_thresh", 0, 1)
    _validate_input_range(min_col_pool, "min_col_pool", 0, data.shape[1])

    excluded_cols = []
    if exclude is not None:
        excluded_cols = data[exclude]
        data = data.drop(columns=exclude)

    subset_cols = []
    for i in range(data.shape[1] + 1 - min_col_pool):
        # Consider only columns with lots of duplicates
        check_list = [
            col
            for col in data.columns
            if data.duplicated(subset=col).mean() > col_dupl_thresh
        ]

        # Identify all possible combinations for the current iteration
        if check_list:
            combinations = itertools.combinations(check_list, len(check_list) - i)
        else:
            continue

        # Check subsets for all possible combinations
        ratios = [
            *map(lambda comb: data.duplicated(subset=list(comb)).mean(), combinations)
        ]
        max_idx = np.argmax(ratios)

        if max(ratios) > subset_thresh:
            # Get the best possible iterator and process the data
            best_subset = itertools.islice(
                itertools.combinations(check_list, len(check_list) - i),
                max_idx,
                max_idx + 1,
            )

            best_subset = data[list(list(best_subset)[0])]
            subset_cols = best_subset.columns.tolist()

            unique_subset = (
                best_subset.drop_duplicates()
                .reset_index()
                .rename(columns={"index": "pooled_vars"})
            )
            data = data.merge(unique_subset, how="left", on=subset_cols).drop(
                columns=subset_cols
            )
            data.index = pd.RangeIndex(len(data))
            break

    data = pd.concat([data, pd.DataFrame(excluded_cols)], axis=1)

    return (data, subset_cols) if return_details else data

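# Illustrative usage sketch (not part of the klib source): the function searches
# progressively smaller column subsets for one whose rows are largely duplicated
# and replaces those columns with a single "pooled_vars" id column.
#
#   from klib.clean import pool_duplicate_subsets
#
#   # assuming a DataFrame `df` with a target column named "target"
#   df_pooled, subset_cols = pool_duplicate_subsets(
#       df, exclude=["target"], return_details=True
#   )
#   print(subset_cols)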