klib.clean.data_cleaning() - Code Metrics - Inspection of "types and updates (#27)" - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Test Failed

Push — main ( 7b7d02...0ccad7 )

by Andreas

created 2022-07-31 08:20 UTC

klib.clean.data_cleaning() B

↳ Parent: Project

Complexity

Conditions

Size

Total Lines	113
Code Lines	43

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	4
eloc	43
nop	11
dl	0
loc	113
rs	8.8478
c	0
b	0
f	0

How to fix Long Method Many Parameters

"""
Functions for data cleaning.

:author: Andreas Kanz
"""

from __future__ import annotations

import itertools
import re
from typing import Literal
from typing import Optional

import numpy as np
import pandas as pd

from klib.describe import corr_mat
from klib.utils import _diff_report
from klib.utils import _drop_duplicates
from klib.utils import _missing_vals
from klib.utils import _validate_input_bool
from klib.utils import _validate_input_range

__all__ = [
    "clean_column_names",
    "convert_datatypes",
    "data_cleaning",
    "drop_missing",
    "mv_col_handling",
]


def _optimize_ints(data: pd.Series | pd.DataFrame) -> pd.DataFrame:
    df = pd.DataFrame(data).copy()
    ints = df.select_dtypes(include=["int64"]).columns.tolist()
    df[ints] = df[ints].apply(pd.to_numeric, downcast="integer")
    return df


def _optimize_floats(data: pd.Series | pd.DataFrame) -> pd.DataFrame:
    data = pd.DataFrame(data).copy()
    floats = data.select_dtypes(include=["float64"]).columns.tolist()
    data[floats] = data[floats].apply(pd.to_numeric, downcast="float")
    return data


def clean_column_names(data: pd.DataFrame, hints: bool = True) -> pd.DataFrame:
    """Clean the column names of the provided Pandas Dataframe and optionally \
        provides hints on duplicate and long column names.

    Parameters
    ----------
    data : pd.DataFrame
        Original Dataframe with columns to be cleaned
    hints : bool, optional
        Print out hints on column name duplication and colum name length, by default \
        True

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame with cleaned column names
    """
    _validate_input_bool(hints, "hints")

    # Handle CamelCase
    for i, col in enumerate(data.columns):
        matches = re.findall(re.compile("[a-z][A-Z]"), col)
        column = col
        for match in matches:
            column = column.replace(match, f"{match[0]}_{match[1]}")
            data.rename(columns={data.columns[i]: column}, inplace=True)

    data.columns = (
        data.columns.str.replace("\n", "_", regex=False)
        .str.replace("(", "_", regex=False)
        .str.replace(")", "_", regex=False)
        .str.replace("'", "_", regex=False)
        .str.replace('"', "_", regex=False)
        .str.replace(".", "_", regex=False)
        .str.replace("-", "_", regex=False)
        .str.replace(r"[!?:;/]", "_", regex=True)
        .str.replace("+", "_plus_", regex=False)
        .str.replace("*", "_times_", regex=False)
        .str.replace("<", "_smaller", regex=False)
        .str.replace(">", "_larger_", regex=False)
        .str.replace("=", "_equal_", regex=False)
        .str.replace("ä", "ae", regex=False)
        .str.replace("ö", "oe", regex=False)
        .str.replace("ü", "ue", regex=False)
        .str.replace("ß", "ss", regex=False)
        .str.replace("%", "_percent_", regex=False)
        .str.replace("$", "_dollar_", regex=False)
        .str.replace("€", "_euro_", regex=False)
        .str.replace("@", "_at_", regex=False)
        .str.replace("#", "_hash_", regex=False)
        .str.replace("&", "_and_", regex=False)
        .str.replace(r"\s+", "_", regex=True)
        .str.replace(r"_+", "_", regex=True)
        .str.strip("_")
        .str.lower()
    )

    if dupl_idx := [i for i, x in enumerate(data.columns.duplicated()) if x]:
        dupl_before = data.columns[dupl_idx].tolist()
        data.columns = [
            col if col not in data.columns[:i] else f"{col}_{str(i)}"
            for i, col in enumerate(data.columns)
        ]

        if hints:
            print(
                f"Duplicate column names detected! Columns with index {dupl_idx} and "
                f"names {dupl_before}) have been renamed to "
                f"{data.columns[dupl_idx].tolist()}."
            )

    long_col_names = [x for x in data.columns if len(x) > 25]
    if long_col_names and hints:
        print(
            "Long column names detected (>25 characters). Consider renaming the "
            f"following columns {long_col_names}."
        )

    return data


def convert_datatypes(
    data: pd.DataFrame,
    category: bool = True,
    cat_threshold: float = 0.05,
    cat_exclude: Optional[list[str | int]] = None,
) -> pd.DataFrame:
    """Convert columns to best possible dtypes using dtypes supporting pd.NA.

    Temporarily not converting to integers due to an issue in pandas. This is expected \
        to be fixed in pandas 1.1. See https://github.com/pandas-dev/pandas/issues/33803

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    category : bool, optional
        Change dtypes of columns with dtype "object" to "category". Set threshold \
        using cat_threshold or exclude columns using cat_exclude, by default True
    cat_threshold : float, optional
        Ratio of unique values below which categories are inferred and column dtype is \
        changed to categorical, by default 0.05
    cat_exclude : Optional[list[str | int]], optional
        List of columns to exclude from categorical conversion, by default None

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame with converted Datatypes
    """
    # Validate Inputs
    _validate_input_bool(category, "Category")
    _validate_input_range(cat_threshold, "cat_threshold", 0, 1)

    cat_exclude = [] if cat_exclude is None else cat_exclude.copy()

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if (
            category
            and unique_vals_ratio < cat_threshold
            and col not in cat_exclude
            and data[col].dtype == "object"
        ):
            data[col] = data[col].astype("category")

        data[col] = data[col].convert_dtypes(
            infer_objects=True,
            convert_string=True,
            convert_integer=False,
            convert_boolean=True,
        )

    data = _optimize_ints(data)
    data = _optimize_floats(data)

    return data


def drop_missing(
    data: pd.DataFrame,
    drop_threshold_cols: float = 1,
    drop_threshold_rows: float = 1,
    col_exclude: Optional[list[str]] = None,
) -> pd.DataFrame:
    """Drop completely empty columns and rows by default and optionally provides \
        flexibility to loosen restrictions to drop additional non-empty columns and \
        rows based on the fraction of NA-values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    drop_threshold_cols : float, optional
        Drop columns with NA-ratio equal to or above the specified threshold, by \
        default 1
    drop_threshold_rows : float, optional
        Drop rows with NA-ratio equal to or above the specified threshold, by default 1
    col_exclude : Optional[list[str]], optional
        Specify a list of columns to exclude from dropping. The excluded columns do \
        not affect the drop thresholds, by default None

    Returns
    -------
    pd.DataFrame
        Pandas DataFrame without any empty columns or rows

    Notes
    -----
    Columns are dropped first
    """
    # Validate Inputs
    _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
    _validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)

    col_exclude = [] if col_exclude is None else col_exclude.copy()
    data_exclude = data[col_exclude]

    data = pd.DataFrame(data).copy()

    data_dropped = data.drop(columns=col_exclude, errors="ignore")
    data_dropped = data_dropped.drop(
        columns=data_dropped.loc[
            :, _missing_vals(data)["mv_cols_ratio"] > drop_threshold_cols
        ].columns
    ).dropna(axis=1, how="all")

    data = pd.concat([data_dropped, data_exclude], axis=1)

    return data.drop(
        index=data.loc[
            _missing_vals(data)["mv_rows_ratio"] > drop_threshold_rows, :
        ].index
    ).dropna(axis=0, how="all")


def data_cleaning(
    data: pd.DataFrame,
    drop_threshold_cols: float = 0.9,
    drop_threshold_rows: float = 0.9,
    drop_duplicates: bool = True,
    convert_dtypes: bool = True,
    col_exclude: Optional[list[str]] = None,
    category: bool = True,
    cat_threshold: float = 0.03,
    cat_exclude: Optional[list[str | int]] = None,
    clean_col_names: bool = True,
    show: Optional[Literal["all", "changes"]] = "changes",
) -> pd.DataFrame:
    """Perform initial data cleaning tasks on a dataset, such as dropping single \
        valued and empty rows, empty columns as well as optimizing the datatypes.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    drop_threshold_cols : float, optional
        Drop columns with NA-ratio equal to or above the specified threshold, by \
        default 0.9
    drop_threshold_rows : float, optional
        Drop rows with NA-ratio equal to or above the specified threshold, by \
        default 0.9
    drop_duplicates : bool, optional
        Drop duplicate rows, keeping the first occurence. This step comes after the \
        dropping of missing values, by default True
    convert_dtypes : bool, optional
        Convert dtypes using pd.convert_dtypes(), by default True
    col_exclude : Optional[list[str]], optional
        Specify a list of columns to exclude from dropping, by default None
    category : bool, optional
        Enable changing dtypes of "object" columns to "category". Set threshold using \
        cat_threshold. Requires convert_dtypes=True, by default True
    cat_threshold : float, optional
        Ratio of unique values below which categories are inferred and column dtype is \
        changed to categorical, by default 0.03
    cat_exclude : Optional[list[str]], optional
        List of columns to exclude from categorical conversion, by default None
    clean_column_names: bool, optional
        Cleans the column names and provides hints on duplicate and long names, by \
        default True
    show : Optional[Literal["all", "changes"]], optional
        {"all", "changes", None}, by default "changes"
        Specify verbosity of the output:

            * "all": Print information about the data before and after cleaning as \
            well as information about  changes and memory usage (deep). Please be \
            aware, that this can slow down the function by quite a bit.
            * "changes": Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    pd.DataFrame
        Cleaned Pandas DataFrame

    See Also
    --------
    convert_datatypes: Convert columns to best possible dtypes.
    drop_missing : Flexibly drop columns and rows.
    _memory_usage: Gives the total memory usage in megabytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the \
    same categories.
    """
    # Validate Inputs
    _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
    _validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)
    _validate_input_bool(drop_duplicates, "drop_duplicates")
    _validate_input_bool(convert_dtypes, "convert_datatypes")
    _validate_input_bool(category, "category")
    _validate_input_range(cat_threshold, "cat_threshold", 0, 1)

    data = pd.DataFrame(data).copy()
    data_cleaned = drop_missing(
        data, drop_threshold_cols, drop_threshold_rows, col_exclude=col_exclude
    )

    if clean_col_names:
        data_cleaned = clean_column_names(data_cleaned)

    single_val_cols = data_cleaned.columns[
        data_cleaned.nunique(dropna=False) == 1
    ].tolist()
    data_cleaned = data_cleaned.drop(columns=single_val_cols)

    dupl_rows = None

    if drop_duplicates:
        data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)
    if convert_dtypes:
        data_cleaned = convert_datatypes(
            data_cleaned,
            category=category,
            cat_threshold=cat_threshold,
            cat_exclude=cat_exclude,
        )

    _diff_report(
        data,
        data_cleaned,
        dupl_rows=dupl_rows,
        single_val_cols=single_val_cols,
        show=show,
    )

    return data_cleaned


def mv_col_handling(
    data: pd.DataFrame,
    target: Optional[str | pd.Series | list[str]] = None,
    mv_threshold: float = 0.1,
    corr_thresh_features: float = 0.5,
    corr_thresh_target: float = 0.3,
    return_details: bool = False,
) -> pd.DataFrame | tuple[pd.DataFrame, list[str], list[str]]:
    """Convert columns with a high ratio of missing values into binary features and \
    eventually drops them based on their correlation with other features and the \
    target variable.

    This function follows a three step process:
    - 1) Identify features with a high ratio of missing values (above 'mv_threshold').
    - 2) Identify high correlations of these features among themselves and with \
        other features in the dataset (above 'corr_thresh_features').
    - 3) Features with high ratio of missing values and high correlation among each \
        other are dropped unless they correlate reasonably well with the target \
        variable (above 'corr_thresh_target').

    Note: If no target is provided, the process exits after step two and drops columns \
    identified up to this point.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    target : Optional[str | pd.Series | list]], optional
        Specify target for correlation. I.e. label column to generate only the \
        correlations between each feature and the label, by default None
    mv_threshold : float, optional
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger \
        than mv_threshold are candidates for dropping and undergo further analysis, by \
        default 0.1
    corr_thresh_features : float, optional
        Value between 0 <= threshold <= 1. Maximum correlation a previously identified \
        features (with a high mv-ratio) is allowed to have with another feature. If \
        this threshold is overstepped, the feature undergoes further analysis, by \
        default 0.5
    corr_thresh_target : float, optional
        Value between 0 <= threshold <= 1. Minimum required correlation of a remaining \
        feature (i.e. feature with a high mv-ratio and high correlation to another \
        existing feature) with the target. If this threshold is not met the feature is \
        ultimately dropped, by default 0.3
    return_details : bool, optional
        Provdies flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        Updated Pandas DataFrame

    optional:
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    """
    # Validate Inputs
    _validate_input_range(mv_threshold, "mv_threshold", 0, 1)
    _validate_input_range(corr_thresh_features, "corr_thresh_features", 0, 1)
    _validate_input_range(corr_thresh_target, "corr_thresh_target", 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()
    mv_ratios = _missing_vals(data_local)["mv_cols_ratio"]
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    data_local[cols_mv] = (
        data_local[cols_mv].applymap(lambda x: x if pd.isnull(x) else 1).fillna(0)
    )

    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
    else:
        corrs = corr_mat(data_local, target=target, colored=False).loc[
            high_corr_features
        ]
        drop_cols = corrs.loc[abs(corrs.iloc[:, 0]) < corr_thresh_target].index.tolist()
        data = data.drop(columns=drop_cols)

    return (data, cols_mv, drop_cols) if return_details else data


def pool_duplicate_subsets(
    data: pd.DataFrame,
    col_dupl_thresh: float = 0.2,
    subset_thresh: float = 0.2,
    min_col_pool: int = 3,
    exclude: Optional[list[str]] = None,
    return_details=False,
) -> pd.DataFrame | tuple[pd.DataFrame, list[str]]:
    """Check for duplicates in subsets of columns and pools them. This can reduce \
        the number of columns in the data without loosing much information. Suitable \
        columns are combined to subsets and tested for duplicates. In case sufficient \
        duplicates can be found, the respective columns are aggregated into a \
        "pooled_var" column. Identical numbers in the "pooled_var" column indicate \
        identical information in the respective rows.

        Note:  It is advised to exclude features that provide sufficient informational \
        content by themselves as well as the target column by using the "exclude" \
        setting.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    col_dupl_thresh : float, optional
        Columns with a ratio of duplicates higher than "col_dupl_thresh" are \
        considered in the further analysis. Columns with a lower ratio are not \
        considered for pooling, by default 0.2
    subset_thresh : float, optional
        The first subset with a duplicate threshold higher than "subset_thresh" is \
        chosen and aggregated. If no subset reaches the threshold, the algorithm \
        continues with continuously smaller subsets until "min_col_pool" is reached, \
        by default 0.2
    min_col_pool : int, optional
        Minimum number of columns to pool. The algorithm attempts to combine as many \
        columns as possible to suitable subsets and stops when "min_col_pool" is \
        reached, by default 3
    exclude : Optional[list[str]], optional
        List of column names to be excluded from the analysis. These columns are \
        passed through without modification, by default None
    return_details : bool, optional
        Provdies flexibility to return intermediary results, by default False

    Returns
    -------
    pd.DataFrame
        DataFrame with low cardinality columns pooled

    optional:
    subset_cols: List of columns used as subset
    """
    # Input validation
    _validate_input_range(col_dupl_thresh, "col_dupl_thresh", 0, 1)
    _validate_input_range(subset_thresh, "subset_thresh", 0, 1)
    _validate_input_range(min_col_pool, "min_col_pool", 0, data.shape[1])

    excluded_cols = []
    if exclude is not None:
        excluded_cols = data[exclude]
        data = data.drop(columns=exclude)

    subset_cols = []
    for i in range(data.shape[1] + 1 - min_col_pool):
        # Consider only columns with lots of duplicates
        check_list = [
            col
            for col in data.columns
            if data.duplicated(subset=col).mean() > col_dupl_thresh
        ]

        # Identify all possible combinations for the current interation
        if check_list:
            combinations = itertools.combinations(check_list, len(check_list) - i)
        else:
            continue

        # Check subsets for all possible combinations
        ratios = [
            *map(lambda comb: data.duplicated(subset=list(comb)).mean(), combinations)
        ]
        max_idx = np.argmax(ratios)

        if max(ratios) > subset_thresh:
            # Get the best possible iterator and process the data
            best_subset = itertools.islice(
                itertools.combinations(check_list, len(check_list) - i),
                max_idx,
                max_idx + 1,
            )

            best_subset = data[list(list(best_subset)[0])]
            subset_cols = best_subset.columns.tolist()

            unique_subset = (
                best_subset.drop_duplicates()
                .reset_index()
                .rename(columns={"index": "pooled_vars"})
            )
            data = data.merge(unique_subset, how="left", on=subset_cols).drop(
                columns=subset_cols
            )
            data.index = pd.RangeIndex(len(data))
            break

    data = pd.concat([data, pd.DataFrame(excluded_cols)], axis=1)

    return (data, subset_cols) if return_details else data


1			"""
2			Functions for data cleaning.
3
4			:author: Andreas Kanz
5			"""
6
7			from __future__ import annotations
8
9			import itertools
10			import re
11			from typing import Literal
12			from typing import Optional
13
14			import numpy as np
15			import pandas as pd
16
17			from klib.describe import corr_mat
18			from klib.utils import _diff_report
19			from klib.utils import _drop_duplicates
20			from klib.utils import _missing_vals
21			from klib.utils import _validate_input_bool
22			from klib.utils import _validate_input_range
23
24			__all__ = [
25			"clean_column_names",
26			"convert_datatypes",
27			"data_cleaning",
28			"drop_missing",
29			"mv_col_handling",
30			]
31
32
33			def _optimize_ints(data: pd.Series \| pd.DataFrame) -> pd.DataFrame:
34			df = pd.DataFrame(data).copy()
35			ints = df.select_dtypes(include=["int64"]).columns.tolist()
36			df[ints] = df[ints].apply(pd.to_numeric, downcast="integer")
37			return df
38
39
40			def _optimize_floats(data: pd.Series \| pd.DataFrame) -> pd.DataFrame:
41			data = pd.DataFrame(data).copy()
42			floats = data.select_dtypes(include=["float64"]).columns.tolist()
43			data[floats] = data[floats].apply(pd.to_numeric, downcast="float")
44			return data
45
46
47			def clean_column_names(data: pd.DataFrame, hints: bool = True) -> pd.DataFrame:
48			"""Clean the column names of the provided Pandas Dataframe and optionally \
49			provides hints on duplicate and long column names.
50
51			Parameters
52			----------
53			data : pd.DataFrame
54			Original Dataframe with columns to be cleaned
55			hints : bool, optional
56			Print out hints on column name duplication and colum name length, by default \
57			True
58
59			Returns
60			-------
61			pd.DataFrame
62			Pandas DataFrame with cleaned column names
63			"""
64			_validate_input_bool(hints, "hints")
65
66			# Handle CamelCase
67			for i, col in enumerate(data.columns):
68			matches = re.findall(re.compile("[a-z][A-Z]"), col)
69			column = col
70			for match in matches:
71			column = column.replace(match, f"{match[0]}_{match[1]}")
72			data.rename(columns={data.columns[i]: column}, inplace=True)
73
74			data.columns = (
75			data.columns.str.replace("\n", "_", regex=False)
76			.str.replace("(", "_", regex=False)
77			.str.replace(")", "_", regex=False)
78			.str.replace("'", "_", regex=False)
79			.str.replace('"', "_", regex=False)
80			.str.replace(".", "_", regex=False)
81			.str.replace("-", "_", regex=False)
82			.str.replace(r"[!?:;/]", "_", regex=True)
83			.str.replace("+", "_plus_", regex=False)
84			.str.replace("*", "_times_", regex=False)
85			.str.replace("<", "_smaller", regex=False)
86			.str.replace(">", "_larger_", regex=False)
87			.str.replace("=", "_equal_", regex=False)
88			.str.replace("ä", "ae", regex=False)
89			.str.replace("ö", "oe", regex=False)
90			.str.replace("ü", "ue", regex=False)
91			.str.replace("ß", "ss", regex=False)
92			.str.replace("%", "_percent_", regex=False)
93			.str.replace("$", "_dollar_", regex=False)
94			.str.replace("€", "_euro_", regex=False)
95			.str.replace("@", "_at_", regex=False)
96			.str.replace("#", "_hash_", regex=False)
97			.str.replace("&", "_and_", regex=False)
98			.str.replace(r"\s+", "_", regex=True)
99			.str.replace(r"_+", "_", regex=True)
100			.str.strip("_")
101			.str.lower()
102			)
103
104			if dupl_idx := [i for i, x in enumerate(data.columns.duplicated()) if x]:
105			dupl_before = data.columns[dupl_idx].tolist()
106			data.columns = [
107			col if col not in data.columns[:i] else f"{col}_{str(i)}"
108			for i, col in enumerate(data.columns)
109			]
110
111			if hints:
112			print(
113			f"Duplicate column names detected! Columns with index {dupl_idx} and "
114			f"names {dupl_before}) have been renamed to "
115			f"{data.columns[dupl_idx].tolist()}."
116			)
117
118			long_col_names = [x for x in data.columns if len(x) > 25]
119			if long_col_names and hints:
120			print(
121			"Long column names detected (>25 characters). Consider renaming the "
122			f"following columns {long_col_names}."
123			)
124
125			return data
126
127
128			def convert_datatypes(
129			data: pd.DataFrame,
130			category: bool = True,
131			cat_threshold: float = 0.05,
132			cat_exclude: Optional[list[str \| int]] = None,
133			) -> pd.DataFrame:
134			"""Convert columns to best possible dtypes using dtypes supporting pd.NA.
135
136			Temporarily not converting to integers due to an issue in pandas. This is expected \
137			to be fixed in pandas 1.1. See https://github.com/pandas-dev/pandas/issues/33803
138
139			Parameters
140			----------
141			data : pd.DataFrame
142			2D dataset that can be coerced into Pandas DataFrame
143			category : bool, optional
144			Change dtypes of columns with dtype "object" to "category". Set threshold \
145			using cat_threshold or exclude columns using cat_exclude, by default True
146			cat_threshold : float, optional
147			Ratio of unique values below which categories are inferred and column dtype is \
148			changed to categorical, by default 0.05
149			cat_exclude : Optional[list[str \| int]], optional
150			List of columns to exclude from categorical conversion, by default None
151
152			Returns
153			-------
154			pd.DataFrame
155			Pandas DataFrame with converted Datatypes
156			"""
157			# Validate Inputs
158			_validate_input_bool(category, "Category")
159			_validate_input_range(cat_threshold, "cat_threshold", 0, 1)
160
161			cat_exclude = [] if cat_exclude is None else cat_exclude.copy()
162
163			data = pd.DataFrame(data).copy()
164			for col in data.columns:
165			unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
166			if (
167			category
168			and unique_vals_ratio < cat_threshold
169			and col not in cat_exclude
170			and data[col].dtype == "object"
171			):
172			data[col] = data[col].astype("category")
173
174			data[col] = data[col].convert_dtypes(
175			infer_objects=True,
176			convert_string=True,
177			convert_integer=False,
178			convert_boolean=True,
179			)
180
181			data = _optimize_ints(data)
182			data = _optimize_floats(data)
183
184			return data
185
186
187			def drop_missing(
188			data: pd.DataFrame,
189			drop_threshold_cols: float = 1,
190			drop_threshold_rows: float = 1,
191			col_exclude: Optional[list[str]] = None,
192			) -> pd.DataFrame:
193			"""Drop completely empty columns and rows by default and optionally provides \
194			flexibility to loosen restrictions to drop additional non-empty columns and \
195			rows based on the fraction of NA-values.
196
197			Parameters
198			----------
199			data : pd.DataFrame
200			2D dataset that can be coerced into Pandas DataFrame
201			drop_threshold_cols : float, optional
202			Drop columns with NA-ratio equal to or above the specified threshold, by \
203			default 1
204			drop_threshold_rows : float, optional
205			Drop rows with NA-ratio equal to or above the specified threshold, by default 1
206			col_exclude : Optional[list[str]], optional
207			Specify a list of columns to exclude from dropping. The excluded columns do \
208			not affect the drop thresholds, by default None
209
210			Returns
211			-------
212			pd.DataFrame
213			Pandas DataFrame without any empty columns or rows
214
215			Notes
216			-----
217			Columns are dropped first
218			"""
219			# Validate Inputs
220			_validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
221			_validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)
222
223			col_exclude = [] if col_exclude is None else col_exclude.copy()
224			data_exclude = data[col_exclude]
225
226			data = pd.DataFrame(data).copy()
227
228			data_dropped = data.drop(columns=col_exclude, errors="ignore")
229			data_dropped = data_dropped.drop(
230			columns=data_dropped.loc[
231			:, _missing_vals(data)["mv_cols_ratio"] > drop_threshold_cols
232			].columns
233			).dropna(axis=1, how="all")
234
235			data = pd.concat([data_dropped, data_exclude], axis=1)
236
237			return data.drop(
238			index=data.loc[
239			_missing_vals(data)["mv_rows_ratio"] > drop_threshold_rows, :
240			].index
241			).dropna(axis=0, how="all")
242
243
244			def data_cleaning(
245			data: pd.DataFrame,
246			drop_threshold_cols: float = 0.9,
247			drop_threshold_rows: float = 0.9,
248			drop_duplicates: bool = True,
249			convert_dtypes: bool = True,
250			col_exclude: Optional[list[str]] = None,
251			category: bool = True,
252			cat_threshold: float = 0.03,
253			cat_exclude: Optional[list[str \| int]] = None,
254			clean_col_names: bool = True,
255			show: Optional[Literal["all", "changes"]] = "changes",
256			) -> pd.DataFrame:
257			"""Perform initial data cleaning tasks on a dataset, such as dropping single \
258			valued and empty rows, empty columns as well as optimizing the datatypes.
259
260			Parameters
261			----------
262			data : pd.DataFrame
263			2D dataset that can be coerced into Pandas DataFrame
264			drop_threshold_cols : float, optional
265			Drop columns with NA-ratio equal to or above the specified threshold, by \
266			default 0.9
267			drop_threshold_rows : float, optional
268			Drop rows with NA-ratio equal to or above the specified threshold, by \
269			default 0.9
270			drop_duplicates : bool, optional
271			Drop duplicate rows, keeping the first occurence. This step comes after the \
272			dropping of missing values, by default True
273			convert_dtypes : bool, optional
274			Convert dtypes using pd.convert_dtypes(), by default True
275			col_exclude : Optional[list[str]], optional
276			Specify a list of columns to exclude from dropping, by default None
277			category : bool, optional
278			Enable changing dtypes of "object" columns to "category". Set threshold using \
279			cat_threshold. Requires convert_dtypes=True, by default True
280			cat_threshold : float, optional
281			Ratio of unique values below which categories are inferred and column dtype is \
282			changed to categorical, by default 0.03
283			cat_exclude : Optional[list[str]], optional
284			List of columns to exclude from categorical conversion, by default None
285			clean_column_names: bool, optional
286			Cleans the column names and provides hints on duplicate and long names, by \
287			default True
288			show : Optional[Literal["all", "changes"]], optional
289			{"all", "changes", None}, by default "changes"
290			Specify verbosity of the output:
291
292			* "all": Print information about the data before and after cleaning as \
293			well as information about changes and memory usage (deep). Please be \
294			aware, that this can slow down the function by quite a bit.
295			* "changes": Print out differences in the data before and after cleaning.
296			* None: No information about the data and the data cleaning is printed.
297
298			Returns
299			-------
300			pd.DataFrame
301			Cleaned Pandas DataFrame
302
303			See Also
304			--------
305			convert_datatypes: Convert columns to best possible dtypes.
306			drop_missing : Flexibly drop columns and rows.
307			_memory_usage: Gives the total memory usage in megabytes.
308			_missing_vals: Metrics about missing values in the dataset.
309
310			Notes
311			-----
312			The category dtype is not grouped in the summary, unless it contains exactly the \
313			same categories.
314			"""
315			# Validate Inputs
316			_validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
317			_validate_input_range(drop_threshold_rows, "drop_threshold_rows", 0, 1)
318			_validate_input_bool(drop_duplicates, "drop_duplicates")
319			_validate_input_bool(convert_dtypes, "convert_datatypes")
320			_validate_input_bool(category, "category")
321			_validate_input_range(cat_threshold, "cat_threshold", 0, 1)
322
323			data = pd.DataFrame(data).copy()
324			data_cleaned = drop_missing(
325			data, drop_threshold_cols, drop_threshold_rows, col_exclude=col_exclude
326			)
327
328			if clean_col_names:
329			data_cleaned = clean_column_names(data_cleaned)
330
331			single_val_cols = data_cleaned.columns[
332			data_cleaned.nunique(dropna=False) == 1
333			].tolist()
334			data_cleaned = data_cleaned.drop(columns=single_val_cols)
335
336			dupl_rows = None
337
338			if drop_duplicates:
339			data_cleaned, dupl_rows = _drop_duplicates(data_cleaned)
340			if convert_dtypes:
341			data_cleaned = convert_datatypes(
342			data_cleaned,
343			category=category,
344			cat_threshold=cat_threshold,
345			cat_exclude=cat_exclude,
346			)
347
348			_diff_report(
349			data,
350			data_cleaned,
351			dupl_rows=dupl_rows,
352			single_val_cols=single_val_cols,
353			show=show,
354			)
355
356			return data_cleaned
357
358
359			def mv_col_handling(
360			data: pd.DataFrame,
361			target: Optional[str \| pd.Series \| list[str]] = None,
362			mv_threshold: float = 0.1,
363			corr_thresh_features: float = 0.5,
364			corr_thresh_target: float = 0.3,
365			return_details: bool = False,
366			) -> pd.DataFrame \| tuple[pd.DataFrame, list[str], list[str]]:
367			"""Convert columns with a high ratio of missing values into binary features and \
368			eventually drops them based on their correlation with other features and the \
369			target variable.
370
371			This function follows a three step process:
372			- 1) Identify features with a high ratio of missing values (above 'mv_threshold').
373			- 2) Identify high correlations of these features among themselves and with \
374			other features in the dataset (above 'corr_thresh_features').
375			- 3) Features with high ratio of missing values and high correlation among each \
376			other are dropped unless they correlate reasonably well with the target \
377			variable (above 'corr_thresh_target').
378
379			Note: If no target is provided, the process exits after step two and drops columns \
380			identified up to this point.
381
382			Parameters
383			----------
384			data : pd.DataFrame
385			2D dataset that can be coerced into Pandas DataFrame
386			target : Optional[str \| pd.Series \| list]], optional
387			Specify target for correlation. I.e. label column to generate only the \
388			correlations between each feature and the label, by default None
389			mv_threshold : float, optional
390			Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger \
391			than mv_threshold are candidates for dropping and undergo further analysis, by \
392			default 0.1
393			corr_thresh_features : float, optional
394			Value between 0 <= threshold <= 1. Maximum correlation a previously identified \
395			features (with a high mv-ratio) is allowed to have with another feature. If \
396			this threshold is overstepped, the feature undergoes further analysis, by \
397			default 0.5
398			corr_thresh_target : float, optional
399			Value between 0 <= threshold <= 1. Minimum required correlation of a remaining \
400			feature (i.e. feature with a high mv-ratio and high correlation to another \
401			existing feature) with the target. If this threshold is not met the feature is \
402			ultimately dropped, by default 0.3
403			return_details : bool, optional
404			Provdies flexibility to return intermediary results, by default False
405
406			Returns
407			-------
408			pd.DataFrame
409			Updated Pandas DataFrame
410
411			optional:
412			cols_mv: Columns with missing values included in the analysis
413			drop_cols: List of dropped columns
414			"""
415			# Validate Inputs
416			_validate_input_range(mv_threshold, "mv_threshold", 0, 1)
417			_validate_input_range(corr_thresh_features, "corr_thresh_features", 0, 1)
418			_validate_input_range(corr_thresh_target, "corr_thresh_target", 0, 1)
419
420			data = pd.DataFrame(data).copy()
421			data_local = data.copy()
422			mv_ratios = _missing_vals(data_local)["mv_cols_ratio"]
423			cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
424			data_local[cols_mv] = (
425			data_local[cols_mv].applymap(lambda x: x if pd.isnull(x) else 1).fillna(0)
426			)
427
428			high_corr_features = []
429			data_temp = data_local.copy()
430			for col in cols_mv:
431			corrmat = corr_mat(data_temp, colored=False)
432			if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
433			high_corr_features.append(col)
434			data_temp = data_temp.drop(columns=[col])
435
436			drop_cols = []
437			if target is None:
438			data = data.drop(columns=high_corr_features)
439			else:
440			corrs = corr_mat(data_local, target=target, colored=False).loc[
441			high_corr_features
442			]
443			drop_cols = corrs.loc[abs(corrs.iloc[:, 0]) < corr_thresh_target].index.tolist()
444			data = data.drop(columns=drop_cols)
445
446			return (data, cols_mv, drop_cols) if return_details else data
447
448
449			def pool_duplicate_subsets(
450			data: pd.DataFrame,
451			col_dupl_thresh: float = 0.2,
452			subset_thresh: float = 0.2,
453			min_col_pool: int = 3,
454			exclude: Optional[list[str]] = None,
455			return_details=False,
456			) -> pd.DataFrame \| tuple[pd.DataFrame, list[str]]:
457			"""Check for duplicates in subsets of columns and pools them. This can reduce \
458			the number of columns in the data without loosing much information. Suitable \
459			columns are combined to subsets and tested for duplicates. In case sufficient \
460			duplicates can be found, the respective columns are aggregated into a \
461			"pooled_var" column. Identical numbers in the "pooled_var" column indicate \
462			identical information in the respective rows.
463
464			Note: It is advised to exclude features that provide sufficient informational \
465			content by themselves as well as the target column by using the "exclude" \
466			setting.
467
468			Parameters
469			----------
470			data : pd.DataFrame
471			2D dataset that can be coerced into Pandas DataFrame
472			col_dupl_thresh : float, optional
473			Columns with a ratio of duplicates higher than "col_dupl_thresh" are \
474			considered in the further analysis. Columns with a lower ratio are not \
475			considered for pooling, by default 0.2
476			subset_thresh : float, optional
477			The first subset with a duplicate threshold higher than "subset_thresh" is \
478			chosen and aggregated. If no subset reaches the threshold, the algorithm \
479			continues with continuously smaller subsets until "min_col_pool" is reached, \
480			by default 0.2
481			min_col_pool : int, optional
482			Minimum number of columns to pool. The algorithm attempts to combine as many \
483			columns as possible to suitable subsets and stops when "min_col_pool" is \
484			reached, by default 3
485			exclude : Optional[list[str]], optional
486			List of column names to be excluded from the analysis. These columns are \
487			passed through without modification, by default None
488			return_details : bool, optional
489			Provdies flexibility to return intermediary results, by default False
490
491			Returns
492			-------
493			pd.DataFrame
494			DataFrame with low cardinality columns pooled
495
496			optional:
497			subset_cols: List of columns used as subset
498			"""
499			# Input validation
500			_validate_input_range(col_dupl_thresh, "col_dupl_thresh", 0, 1)
501			_validate_input_range(subset_thresh, "subset_thresh", 0, 1)
502			_validate_input_range(min_col_pool, "min_col_pool", 0, data.shape[1])
503
504			excluded_cols = []
505			if exclude is not None:
506			excluded_cols = data[exclude]
507			data = data.drop(columns=exclude)
508
509			subset_cols = []
510			for i in range(data.shape[1] + 1 - min_col_pool):
511			# Consider only columns with lots of duplicates
512			check_list = [
513			col
514			for col in data.columns
515			if data.duplicated(subset=col).mean() > col_dupl_thresh
516			]
517
518			# Identify all possible combinations for the current interation
519			if check_list:
520			combinations = itertools.combinations(check_list, len(check_list) - i)
521			else:
522			continue
523
524			# Check subsets for all possible combinations
525			ratios = [
526			*map(lambda comb: data.duplicated(subset=list(comb)).mean(), combinations)
527			]
528			max_idx = np.argmax(ratios)
529
530			if max(ratios) > subset_thresh:
531			# Get the best possible iterator and process the data
532			best_subset = itertools.islice(
533			itertools.combinations(check_list, len(check_list) - i),
534			max_idx,
535			max_idx + 1,
536			)
537
538			best_subset = data[list(list(best_subset)[0])]
539			subset_cols = best_subset.columns.tolist()
540
541			unique_subset = (
542			best_subset.drop_duplicates()
543			.reset_index()
544			.rename(columns={"index": "pooled_vars"})
545			)
546			data = data.merge(unique_subset, how="left", on=subset_cols).drop(
547			columns=subset_cols
548			)
549			data.index = pd.RangeIndex(len(data))
550			break
551
552			data = pd.concat([data, pd.DataFrame(excluded_cols)], axis=1)
553
554			return (data, subset_cols) if return_details else data
555

akanz1 / klib

GitHub Access Token became invalid

Push — main ( 7b7d02...0ccad7 )

klib.clean.data_cleaning() B

Complexity

Size

Duplication

Importance

How to fix Long Method Many Parameters

Long Method

Many Parameters

Duplication Side-by-Side

Filter issues like