klib.utils._diff_report() - Code Metrics - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

klib.utils._diff_report() B
last analyzed 2025-11-06 11:08 UTC

↳ Parent: klib.utils

Complexity

Conditions

Size

Total Lines	75
Code Lines	35

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	5
eloc	35
nop	5
dl	0
loc	75
rs	8.5733
c	0
b	0
f	0

How to fix Long Method

"""Utilities and auxiliary functions.

:author: Andreas Kanz

"""

from __future__ import annotations

from typing import Literal
from typing import TypedDict

import numpy as np
import pandas as pd


def _corr_selector(
    corr: pd.Series | pd.DataFrame,
    split: Literal["pos", "neg", "high", "low"] | None = None,
    threshold: float = 0,
) -> pd.Series | pd.DataFrame:
    """Filter correlations by sign or by absolute magnitude.

    Parameters
    ----------
    corr : pd.Series | pd.DataFrame
        Correlation values to filter
    split : Optional[str], optional
        Which subset to keep, by default None
            * None (or any unrecognised value): ``corr`` is returned unchanged
            * "pos": values that are > 0 and >= ``threshold``
            * "neg": values that are < 0 and <= ``threshold``
            * "high": values whose absolute value is >= ``threshold``
            * "low": values whose absolute value is <= ``threshold``
    threshold : float, optional
        Cut-off applied to the correlations, by default 0. For split = "high"
        and split = "low" any threshold <= 0 is replaced by 0.3

    Returns
    -------
    pd.Series | pd.DataFrame
        Result of ``corr.where(condition)`` for the selected split, i.e. the
        entries that fail the condition are masked. A message describing the
        applied filter is printed for every recognised split.

    """
    if split == "pos":
        selected = corr.where((corr >= threshold) & (corr > 0))
        print(
            'Displaying positive correlations. Specify a positive "threshold" to '
            "limit the results further.",
        )
        return selected

    if split == "neg":
        selected = corr.where((corr <= threshold) & (corr < 0))
        print(
            'Displaying negative correlations. Specify a negative "threshold" to '
            "limit the results further.",
        )
        return selected

    if split in ("high", "low"):
        # Both magnitude-based splits share the same fallback threshold.
        if threshold <= 0:
            threshold = 0.3

        if split == "high":
            selected = corr.where(np.abs(corr) >= threshold)
            print(
                f"Displaying absolute correlations above the threshold ({threshold}). "
                'Specify a positive "threshold" to limit the results further.',
            )
        else:
            selected = corr.where(np.abs(corr) <= threshold)
            print(
                f"Displaying absolute correlations below the threshold ({threshold}). "
                'Specify a positive "threshold" to limit the results further.',
            )
        return selected

    return corr


def _diff_report(
    data: pd.DataFrame,
    data_cleaned: pd.DataFrame,
    dupl_rows: list[str | int] | None = None,
    single_val_cols: list[str] | None = None,
    show: Literal["all", "changes"] | None = "changes",
) -> None:
    """Provide information about changes between two datasets.

    This includes dropped rows and columns, memory usage and missing values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. Input the initial
        dataset here
    data_cleaned : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. Input the cleaned /
        updated dataset here
    dupl_rows : Optional[list[str | int]], optional
        List of duplicate row indices, by default None (reported as an empty list)
    single_val_cols : Optional[List[str]], optional
        List of single-valued column indices. I.e. columns where all cells contain
        the same value. NaNs count as a separate value, by default None (reported
        as an empty list)
    show : str, optional
        {"all", "changes", None}, by default "changes"
        Specify verbosity of the output:
            * "all": Print information about the data before and after cleaning as
                well as information about changes and memory usage (deep). Please be
                aware, that this can slow down the function by quite a bit.
            * "changes": Print out differences in the data before and after cleaning.
            * None (or any other value): Nothing is printed.

    Returns
    -------
    None
        Print statement highlighting the datasets or changes between the two datasets.

    """
    if show not in ["changes", "all"]:
        return

    dupl_rows = [] if dupl_rows is None else dupl_rows.copy()
    single_val_cols = [] if single_val_cols is None else single_val_cols.copy()

    # Measure memory exactly once per frame: deep introspection only for "all".
    # (Previously the shallow figures were computed and then thrown away.)
    verbose = show == "all"
    data_mem = _memory_usage(data, deep=verbose)
    data_cl_mem = _memory_usage(data_cleaned, deep=verbose)
    data_mv_tot = _missing_vals(data)["mv_total"]
    data_cl_mv_tot = _missing_vals(data_cleaned)["mv_total"]

    if verbose:
        _print_cleaning_details("Before data cleaning:\n", data, data_mv_tot, data_mem)
        _print_cleaning_details(
            "After data cleaning:\n",
            data_cleaned,
            data_cl_mv_tot,
            data_cl_mem,
        )

    print(
        f"Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {data_cl_mv_tot}\n\n",
    )
    print(f"Dropped rows: {data.shape[0]-data_cleaned.shape[0]}")
    print(
        f"     of which {len(dupl_rows)} duplicates. (Rows (first 150 shown): {dupl_rows[:150]})\n",
    )
    print(f"Dropped columns: {data.shape[1]-data_cleaned.shape[1]}")
    print(
        f"     of which {len(single_val_cols)} single valued.     Columns: {single_val_cols}",
    )
    print(f"Dropped missing values: {data_mv_tot-data_cl_mv_tot}")
    mem_change = data_mem - data_cl_mem
    # The baseline is reported in MB rounded to two decimals, so small frames
    # yield 0.0; dividing by it would print "nan%" (and emit a NumPy warning).
    mem_perc = round(100 * mem_change / data_mem, 2) if data_mem else 0.0
    print(f"Reduced memory by at least: {round(mem_change,3)} MB (-{mem_perc}%)\n")


def _print_cleaning_details(
    header: str,
    data: pd.DataFrame | pd.Series,
    missing_vals: int,
    mem_usage: float,
) -> None:
    """Print a summary block (dtypes, shape, NAs, memory) for one dataset.

    Parameters
    ----------
    header : str
        Text printed first, before the summary lines
    data : pd.DataFrame | pd.Series
        Dataset whose ``dtypes`` and ``shape`` are reported
    missing_vals : int
        Number of missing values to report
    mem_usage : float
        Memory usage to report; printed with the unit "MB"

    Returns
    -------
    None
        The summary is written to stdout.

    """
    print(header)

    dtype_counts = data.dtypes.value_counts()
    print(f"dtypes:\n{dtype_counts}")

    # Figures are right-aligned so that the values line up underneath each other.
    n_rows = str(data.shape[0])
    print(f"\nNumber of rows: {n_rows:>8}")
    n_cols = str(data.shape[1])
    print(f"Number of cols: {n_cols:>8}")
    print(f"Missing values: {missing_vals!s:>8}")
    print(f"Memory usage: {mem_usage!s:>7} MB")
    print("_______________________________________________________\n")


def _drop_duplicates(data: pd.DataFrame) -> tuple[pd.DataFrame, list[str | int]]:
    """Drop duplicate rows and report which rows were removed.

    The first occurrence of every row is kept (default behaviour of
    ``DataFrame.duplicated``). The input is not modified.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame

    Returns
    -------
    Tuple[pd.DataFrame, List]
        Deduplicated Pandas DataFrame with a fresh 0..n-1 index, and the list of
        original index labels of the rows that were dropped

    """
    data = pd.DataFrame(data)
    is_duplicate = data.duplicated().to_numpy()
    dupl_rows = data.index[is_duplicate].tolist()

    # Filter by position instead of dropping by label: with repeated index
    # labels, a label-based drop would also remove non-duplicate rows that
    # happen to share a label with a duplicate. Boolean selection returns a new
    # object, so no explicit copy of the input is needed.
    data = data[~is_duplicate].reset_index(drop=True)

    return data, dupl_rows


def _memory_usage(data: pd.DataFrame, deep: bool = True) -> float:
    """Return the memory footprint of a dataset in megabytes.

    Parameters
    ----------
    data : pd.DataFrame
        Dataset exposing ``memory_usage``; the index is included in the total
    deep : bool, optional
        Forwarded to ``memory_usage`` as its ``deep`` argument, by default True

    Returns
    -------
    float
        Total memory usage in megabytes (bytes / 1024**2), rounded to two decimals

    """
    bytes_per_column = data.memory_usage(index=True, deep=deep)
    total_bytes = bytes_per_column.sum()
    megabytes = total_bytes / (1024**2)
    return round(megabytes, 2)


class MVResult(TypedDict):
    """TypedDict for the return value of _missing_vals.

    Only ``mv_total`` is a scalar; the remaining entries are computed per row
    or per column and are therefore pandas Series.
    """

    # Number of missing values in the entire dataset.
    mv_total: int
    # Number of missing values in each row.
    mv_rows: pd.Series
    # Number of missing values in each column.
    mv_cols: pd.Series
    # mv_rows divided by the number of columns.
    mv_rows_ratio: pd.Series
    # mv_cols divided by the number of rows.
    mv_cols_ratio: pd.Series


def _missing_vals(data: pd.DataFrame) -> MVResult:
    """Give metrics of missing values in the dataset.

    The input is only read, never modified.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame

    Returns
    -------
    MVResult
        mv_total: int, number of missing values in the entire dataset
        mv_rows: pd.Series, number of missing values in each row
        mv_cols: pd.Series, number of missing values in each column
        mv_rows_ratio: pd.Series, mv_rows divided by the number of columns
        mv_cols_ratio: pd.Series, mv_cols divided by the number of rows

    """
    # Build the NaN mask once and derive every metric from it; the mask is a
    # new object, so the input does not have to be copied first.
    na_mask = pd.DataFrame(data).isna()
    n_rows, n_cols = na_mask.shape

    mv_rows: pd.Series = na_mask.sum(axis=1)
    mv_cols: pd.Series = na_mask.sum(axis=0)
    # Summing the per-column counts equals isna().sum().sum(); convert the
    # NumPy scalar so the value matches the declared ``int``.
    mv_total: int = int(mv_cols.sum())
    mv_rows_ratio: pd.Series = mv_rows / n_cols
    mv_cols_ratio: pd.Series = mv_cols / n_rows

    return {
        "mv_total": mv_total,
        "mv_rows": mv_rows,
        "mv_cols": mv_cols,
        "mv_rows_ratio": mv_rows_ratio,
        "mv_cols_ratio": mv_cols_ratio,
    }


def _validate_input_bool(value: bool, desc: str) -> None:
    """Ensure ``value`` is a bool.

    Parameters
    ----------
    value : bool
        Object to check
    desc : str
        Name of the checked input, used in the error message

    Raises
    ------
    TypeError
        If ``value`` is not an instance of ``bool``

    """
    if isinstance(value, bool):
        return

    raise TypeError(
        f"Input value for '{desc}' is {type(value)} but should be a boolean.",
    )


def _validate_input_int(value: int, desc: str) -> None:
    """Ensure ``value`` is an int.

    Parameters
    ----------
    value : int
        Object to check. The check is ``isinstance(value, int)``, so bool values
        pass as well because ``bool`` is a subclass of ``int``
    desc : str
        Name of the checked input, used in the error message

    Raises
    ------
    TypeError
        If ``value`` is not an instance of ``int``

    """
    is_integer = isinstance(value, int)
    if is_integer:
        return

    raise TypeError(
        f"Input value for '{desc}' is {type(value)} but should be an integer.",
    )


def _validate_input_range(value: float, desc: str, lower: float, upper: float) -> None:
    """Ensure ``value`` lies within the closed interval [lower, upper].

    Parameters
    ----------
    value : float
        Number to check
    desc : str
        Name of the checked input, used in the error message
    lower : float
        Smallest accepted value (inclusive)
    upper : float
        Largest accepted value (inclusive)

    Raises
    ------
    ValueError
        If ``value < lower`` or ``value > upper``

    """
    too_small = value < lower
    if too_small or value > upper:
        raise ValueError(
            f"'{desc}' = {value} but should be {lower} <= '{desc}' <= {upper}.",
        )


def _validate_input_smaller(value1: int, value2: int, desc: str) -> None:
    """Ensure the first value does not exceed the second.

    Parameters
    ----------
    value1 : int
        Value expected to be the smaller (or equal) one
    value2 : int
        Value expected to be the larger (or equal) one
    desc : str
        Name of the checked input, used in the error message

    Raises
    ------
    ValueError
        If ``value1 > value2``

    """
    wrong_order = value1 > value2
    if wrong_order:
        raise ValueError(
            f"The first input for '{desc}' should be smaller or equal to the second.",
        )


def _validate_input_sum_smaller(limit: float, desc: str, *args) -> None:  # noqa: ANN002
    """Ensure the sum of the given values does not exceed ``limit``.

    Parameters
    ----------
    limit : float
        Largest accepted sum (inclusive)
    desc : str
        Name of the checked inputs, used in the error message
    *args
        Values that are added up with ``sum``

    Raises
    ------
    ValueError
        If the sum of ``args`` is greater than ``limit``

    """
    total = sum(args)
    if total > limit:
        raise ValueError(
            f"The sum of input values for '{desc}' should be less or equal to {limit}.",
        )


def _validate_input_sum_larger(limit: float, desc: str, *args) -> None:  # noqa: ANN002
    """Ensure the sum of the given values is not below ``limit``.

    Parameters
    ----------
    limit : float
        Smallest accepted sum (inclusive)
    desc : str
        Name of the checked inputs, used in the error message
    *args
        Values that are added up with ``sum``

    Raises
    ------
    ValueError
        If the sum of ``args`` is less than ``limit``

    """
    total = sum(args)
    if total < limit:
        raise ValueError(
            f"The sum of input values for '{desc}' should be larger/equal to {limit}.",
        )


def _validate_input_num_data(value: pd.DataFrame, desc: str) -> None:
    """Ensure the dataset has at least some numerical content.

    Parameters
    ----------
    value : pd.DataFrame
        Dataset to check
    desc : str
        Name of the checked input, used in the error message

    Raises
    ------
    TypeError
        If the selection of "number" dtype columns of ``value`` is empty

    """
    numeric_part = value.select_dtypes(include=["number"])
    if numeric_part.empty:
        raise TypeError(
            f"Input value for '{desc}' should contain at least one numerical column.",
        )


1			"""Utilities and auxiliary functions.
2
3			:author: Andreas Kanz
4
5			"""
6
7			from __future__ import annotations
8
9			from typing import Literal
10			from typing import TypedDict
11
12			import numpy as np
13			import pandas as pd
14
15
16			def _corr_selector(
17			corr: pd.Series \| pd.DataFrame,
18			split: Literal["pos", "neg", "high", "low"] \| None = None,
19			threshold: float = 0,
20			) -> pd.Series \| pd.DataFrame:
21			"""Select the desired correlations using this utility function.
22
23			Parameters
24			----------
25			corr : pd.Series \| pd.DataFrame
26			pd.Series or pd.DataFrame of correlations
27			split : Optional[str], optional
28			Type of split performed, by default None
29			* {None, "pos", "neg", "high", "low"}
30			threshold : float, optional
31			Value between 0 and 1 to set the correlation threshold, by default 0 unless \
32			split = "high" or split = "low", in which case default is 0.3
33
34			Returns
35			-------
36			pd.DataFrame
37			List or matrix of (filtered) correlations
38
39			"""
40			if split == "pos":
41			corr = corr.where((corr >= threshold) & (corr > 0))
42			print(
43			'Displaying positive correlations. Specify a positive "threshold" to '
44			"limit the results further.",
45			)
46			elif split == "neg":
47			corr = corr.where((corr <= threshold) & (corr < 0))
48			print(
49			'Displaying negative correlations. Specify a negative "threshold" to '
50			"limit the results further.",
51			)
52			elif split == "high":
53			threshold = 0.3 if threshold <= 0 else threshold
54			corr = corr.where(np.abs(corr) >= threshold)
55			print(
56			f"Displaying absolute correlations above the threshold ({threshold}). "
57			'Specify a positive "threshold" to limit the results further.',
58			)
59			elif split == "low":
60			threshold = 0.3 if threshold <= 0 else threshold
61			corr = corr.where(np.abs(corr) <= threshold)
62			print(
63			f"Displaying absolute correlations below the threshold ({threshold}). "
64			'Specify a positive "threshold" to limit the results further.',
65			)
66
67			return corr
68
69
70			def _diff_report(
71			data: pd.DataFrame,
72			data_cleaned: pd.DataFrame,
73			dupl_rows: list[str \| int] \| None = None,
74			single_val_cols: list[str] \| None = None,
75			show: Literal["all", "changes"] \| None = "changes",
76			) -> None:
77			"""Provide information about changes between two datasets.
78
79			This includes dropped rows and columns, memory usage and missing values.
80
81			Parameters
82			----------
83			data : pd.DataFrame
84			2D dataset that can be coerced into Pandas DataFrame. Input the initial \
85			dataset here
86			data_cleaned : pd.DataFrame
87			2D dataset that can be coerced into Pandas DataFrame. Input the cleaned / \
88			updated dataset here
89			dupl_rows : Optional[list[str \| int]], optional
90			List of duplicate row indices, by default None
91			single_val_cols : Optional[List[str]], optional
92			List of single-valued column indices. I.e. columns where all cells contain \
93			the same value. NaNs count as a separate value, by default None
94			show : str, optional
95			{"all", "changes", None}, by default "changes"
96			Specify verbosity of the output:
97			* "all": Print information about the data before and after cleaning as \
98			well as information about changes and memory usage (deep). Please be \
99			aware, that this can slow down the function by quite a bit.
100			* "changes": Print out differences in the data before and after cleaning.
101			* None: No information about the data and the data cleaning is printed.
102
103			Returns
104			-------
105			None
106			Print statement highlighting the datasets or changes between the two datasets.
107
108			"""
109			if show not in ["changes", "all"]:
110			return
111
112			dupl_rows = [] if dupl_rows is None else dupl_rows.copy()
113			single_val_cols = [] if single_val_cols is None else single_val_cols.copy()
114			data_mem = _memory_usage(data, deep=False)
115			data_cl_mem = _memory_usage(data_cleaned, deep=False)
116			data_mv_tot = _missing_vals(data)["mv_total"]
117			data_cl_mv_tot = _missing_vals(data_cleaned)["mv_total"]
118
119			if show == "all":
120			data_mem = _memory_usage(data, deep=True)
121			data_cl_mem = _memory_usage(data_cleaned, deep=True)
122			_print_cleaning_details("Before data cleaning:\n", data, data_mv_tot, data_mem)
123			_print_cleaning_details(
124			"After data cleaning:\n",
125			data_cleaned,
126			data_cl_mv_tot,
127			data_cl_mem,
128			)
129
130			print(
131			f"Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {data_cl_mv_tot}\n\n",
132			)
133			print(f"Dropped rows: {data.shape[0]-data_cleaned.shape[0]}")
134			print(
135			f" of which {len(dupl_rows)} duplicates. (Rows (first 150 shown): {dupl_rows[:150]})\n",
136			)
137			print(f"Dropped columns: {data.shape[1]-data_cleaned.shape[1]}")
138			print(
139			f" of which {len(single_val_cols)} single valued. Columns: {single_val_cols}",
140			)
141			print(f"Dropped missing values: {data_mv_tot-data_cl_mv_tot}")
142			mem_change = data_mem - data_cl_mem
143			mem_perc = round(100 * mem_change / data_mem, 2)
144			print(f"Reduced memory by at least: {round(mem_change,3)} MB (-{mem_perc}%)\n")
145
146
147			def _print_cleaning_details(
148			header: str,
149			data: pd.DataFrame \| pd.Series,
150			missing_vals: int,
151			mem_usage: float,
152			) -> None:
153			print(header)
154			print(f"dtypes:\n{data.dtypes.value_counts()}")
155			print(f"\nNumber of rows: {str(data.shape[0]).rjust(8)}")
156			print(f"Number of cols: {str(data.shape[1]).rjust(8)}")
157			print(f"Missing values: {str(missing_vals).rjust(8)}")
158			print(f"Memory usage: {str(mem_usage).rjust(7)} MB")
159			print("_______________________________________________________\n")
160
161
162			def _drop_duplicates(data: pd.DataFrame) -> tuple[pd.DataFrame, list[str \| int]]:
163			"""Provide information on and drops duplicate rows.
164
165			Parameters
166			----------
167			data : pd.DataFrame
168			2D dataset that can be coerced into Pandas DataFrame
169
170			Returns
171			-------
172			Tuple[pd.DataFrame, List]
173			Deduplicated Pandas DataFrame and Index Object of rows dropped
174
175			"""
176			data = pd.DataFrame(data).copy()
177			dupl_rows = data[data.duplicated()].index.tolist()
178			data = data.drop(dupl_rows, axis="index").reset_index(drop=True)
179
180			return data, dupl_rows
181
182
183			def _memory_usage(data: pd.DataFrame, deep: bool = True) -> float:
184			"""Give the total memory usage in megabytes.
185
186			Parameters
187			----------
188			data : pd.DataFrame
189			2D dataset that can be coerced into Pandas DataFrame
190			deep : bool, optional
191			Runs a deep analysis of the memory usage, by default True
192
193			Returns
194			-------
195			float
196			Memory usage in megabytes
197
198			"""
199			return round(data.memory_usage(index=True, deep=deep).sum() / (1024**2), 2)
200
201
202			class MVResult(TypedDict):
203			"""TypedDict for the return value of _missing_vals."""
204
205			mv_total: int
206			mv_rows: int
207			mv_cols: int
208			mv_rows_ratio: float
209			mv_cols_ratio: float
210
211
212			def _missing_vals(data: pd.DataFrame) -> MVResult:
213			"""Give metrics of missing values in the dataset.
214
215			Parameters
216			----------
217			data : pd.DataFrame
218			2D dataset that can be coerced into Pandas DataFrame
219
220			Returns
221			-------
222			Dict[str, float]
223			mv_total: float, number of missing values in the entire dataset
224			mv_rows: float, number of missing values in each row
225			mv_cols: float, number of missing values in each column
226			mv_rows_ratio: float, ratio of missing values for each row
227			mv_cols_ratio: float, ratio of missing values for each column
228
229			"""
230			data = pd.DataFrame(data).copy()
231			mv_total: int = data.isna().sum().sum()
232			mv_rows: int = data.isna().sum(axis=1)
233			mv_cols: int = data.isna().sum(axis=0)
234			mv_rows_ratio: float = mv_rows / data.shape[1]
235			mv_cols_ratio: float = mv_cols / data.shape[0]
236
237			return {
238			"mv_total": mv_total,
239			"mv_rows": mv_rows,
240			"mv_cols": mv_cols,
241			"mv_rows_ratio": mv_rows_ratio,
242			"mv_cols_ratio": mv_cols_ratio,
243			}
244
245
246			def _validate_input_bool(value: bool, desc: str) -> None:
247			if not isinstance(value, bool):
248			msg = f"Input value for '{desc}' is {type(value)} but should be a boolean."
249			raise TypeError(msg)
250
251
252			def _validate_input_int(value: int, desc: str) -> None:
253			if not isinstance(value, int):
254			msg = f"Input value for '{desc}' is {type(value)} but should be an integer."
255			raise TypeError(msg)
256
257
258			def _validate_input_range(value: float, desc: str, lower: float, upper: float) -> None:
259			if value < lower or value > upper:
260			msg = f"'{desc}' = {value} but should be {lower} <= '{desc}' <= {upper}."
261			raise ValueError(msg)
262
263
264			def _validate_input_smaller(value1: int, value2: int, desc: str) -> None:
265			if value1 > value2:
266			msg = f"The first input for '{desc}' should be smaller or equal to the second."
267			raise ValueError(msg)
268
269
270			def _validate_input_sum_smaller(limit: float, desc: str, *args) -> None: # noqa: ANN002
271			if sum(args) > limit:
272			msg = f"The sum of input values for '{desc}' should be less or equal to {limit}."
273			raise ValueError(msg)
274
275
276			def _validate_input_sum_larger(limit: float, desc: str, *args) -> None: # noqa: ANN002
277			if sum(args) < limit:
278			msg = f"The sum of input values for '{desc}' should be larger/equal to {limit}."
279			raise ValueError(msg)
280
281
282			def _validate_input_num_data(value: pd.DataFrame, desc: str) -> None:
283			if value.select_dtypes(include=["number"]).empty:
284			msg = f"Input value for '{desc}' should contain at least one numerical column."
285			raise TypeError(msg)
286

akanz1 / klib

GitHub Access Token became invalid

klib.utils._diff_report() B last analyzed 2025-11-06 11:08 UTC

Complexity

Size

Duplication

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like

klib.utils._diff_report() B
last analyzed 2025-11-06 11:08 UTC