klib.utils._print_cleaning_details() - Code Metrics - Inspection of "types and updates (#27)" - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Test Failed

Push — main ( 7b7d02...0ccad7 )

by Andreas

created 2022-07-31 08:20 UTC

klib.utils._print_cleaning_details() A

↳ Parent: klib.utils

Complexity

Conditions

Size

Total Lines	8
Code Lines	8

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	8
nop	4
dl	0
loc	8
rs	10
c	0
b	0
f	0

"""
Utilities and auxiliary functions.

:author: Andreas Kanz

"""

from __future__ import annotations

from typing import Literal
from typing import Optional
from typing import TypedDict

import numpy as np
import pandas as pd


def _corr_selector(
    corr: pd.Series | pd.DataFrame,
    split: Optional[Literal["pos", "neg", "high", "low"]] = None,
    threshold: float = 0,
) -> pd.Series | pd.DataFrame:
    """Select the desired correlations using this utility function.

    Parameters
    ----------
    corr : pd.Series | pd.DataFrame
        pd.Series or pd.DataFrame of correlations
    split : Optional[str], optional
        Type of split performed, by default None
            * {None, "pos", "neg", "high", "low"}
    threshold : float, optional
        Value between 0 and 1 to set the correlation threshold, by default 0 unless \
        split = "high" or split = "low", in which case default is 0.3

    Returns
    -------
    pd.DataFrame
        List or matrix of (filtered) correlations
    """
    if split == "pos":
        corr = corr.where((corr >= threshold) & (corr > 0))
        print(
            'Displaying positive correlations. Specify a positive "threshold" to '
            "limit the results further."
        )
    elif split == "neg":
        corr = corr.where((corr <= threshold) & (corr < 0))
        print(
            'Displaying negative correlations. Specify a negative "threshold" to '
            "limit the results further."
        )
    elif split == "high":
        threshold = 0.3 if threshold <= 0 else threshold
        corr = corr.where(np.abs(corr) >= threshold)
        print(
            f"Displaying absolute correlations above the threshold ({threshold}). "
            'Specify a positive "threshold" to limit the results further.'
        )
    elif split == "low":
        threshold = 0.3 if threshold <= 0 else threshold
        corr = corr.where(np.abs(corr) <= threshold)
        print(
            f"Displaying absolute correlations below the threshold ({threshold}). "
            'Specify a positive "threshold" to limit the results further.'
        )

    return corr


def _diff_report(
    data: pd.DataFrame,
    data_cleaned: pd.DataFrame,
    dupl_rows: Optional[list[str | int]] = None,
    single_val_cols: Optional[list[str]] = None,
    show: Optional[Literal["all", "changes"]] = "changes",
) -> None:
    """Provide information about changes between two datasets, such as dropped rows \
        and columns, memory usage and missing values.

    Parameters
    ----------
    data : pd.DataFrame
        The initial dataset
    data_cleaned : pd.DataFrame
        The cleaned / updated dataset
    dupl_rows : Optional[list[str | int]], optional
        List of duplicate row indices, by default None
    single_val_cols : Optional[List[str]], optional
        List of single-valued column indices. I.e. columns where all cells contain \
        the same value. NaNs count as a separate value, by default None
    show : str, optional
        {"all", "changes", None}, by default "changes"
        Specify verbosity of the output:
            * "all": Print information about the data before and after cleaning as \
                well as information about changes and memory usage (deep). Please be \
                aware, that this can slow down the function by quite a bit.
            * "changes": Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    None
        Print statement highlighting the datasets or changes between the two datasets.
    """
    if show not in ["changes", "all"]:
        return

    # list(...) accepts any iterable (tuple, pd.Index, ...) and never aliases
    # the caller's object.
    dupl_rows = [] if dupl_rows is None else list(dupl_rows)
    single_val_cols = [] if single_val_cols is None else list(single_val_cols)

    # Deep memory introspection is only used for the verbose report; measure
    # once with the right setting instead of measuring twice.
    deep = show == "all"
    data_mem = _memory_usage(data, deep=deep)
    data_cl_mem = _memory_usage(data_cleaned, deep=deep)
    data_mv_tot = _missing_vals(data)["mv_total"]
    data_cl_mv_tot = _missing_vals(data_cleaned)["mv_total"]

    if show == "all":
        _print_cleaning_details("Before data cleaning:\n", data, data_mv_tot, data_mem)
        _print_cleaning_details(
            "After data cleaning:\n", data_cleaned, data_cl_mv_tot, data_cl_mem
        )

    print(
        f"Shape of cleaned data: {data_cleaned.shape} - "
        f"Remaining NAs: {data_cl_mv_tot}\n"
    )
    print(f"Dropped rows: {data.shape[0]-data_cleaned.shape[0]}")
    print(
        f"     of which {len(dupl_rows)} duplicates. (Rows (first 150 shown): {dupl_rows[:150]})\n"
    )
    print(f"Dropped columns: {data.shape[1]-data_cleaned.shape[1]}")
    print(
        f"     of which {len(single_val_cols)} single valued."
        f"     Columns: {single_val_cols}"
    )
    print(f"Dropped missing values: {data_mv_tot-data_cl_mv_tot}")
    mem_change = data_mem - data_cl_mem
    # Small datasets round to 0.0 MB; dividing by that would print "nan"/"inf"
    # and trigger a NumPy RuntimeWarning, so report 0.0 % in that case.
    mem_perc = round(100 * mem_change / data_mem, 2) if data_mem else 0.0
    print(f"Reduced memory by at least: {round(mem_change,3)} MB (-{mem_perc}%)\n")


def _print_cleaning_details(arg0, arg1, arg2, arg3):
    print(arg0)
    print(f"dtypes:\n{arg1.dtypes.value_counts()}")
    print(f"\nNumber of rows: {str(arg1.shape[0]).rjust(8)}")
    print(f"Number of cols: {str(arg1.shape[1]).rjust(8)}")
    print(f"Missing values: {str(arg2).rjust(8)}")
    print(f"Memory usage: {str(arg3).rjust(7)} MB")
    print("_______________________________________________________\n")


def _drop_duplicates(data: pd.DataFrame) -> tuple[pd.DataFrame, list[str | int]]:
    """Provide information on and drops duplicate rows.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame

    Returns
    -------
    Tuple[pd.DataFrame, List]
        Deduplicated Pandas DataFrame and Index Object of rows dropped
    """
    data = pd.DataFrame(data).copy()
    dupl_rows = data[data.duplicated()].index.tolist()
    data = data.drop(dupl_rows, axis="index").reset_index(drop=True)

    return data, dupl_rows


def _memory_usage(data: pd.DataFrame, deep: bool = True) -> float:
    """Give the total memory usage in megabytes.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    deep : bool, optional
        Runs a deep analysis of the memory usage, by default True

    Returns
    -------
    float
        Memory usage in megabytes
    """
    return round(data.memory_usage(index=True, deep=deep).sum() / (1024**2), 2)


class MVResult(TypedDict):
    """TypedDict for the return value of _missing_vals."""

    # Number of missing values in the entire dataset.
    mv_total: int
    # Number of missing values in each row.
    mv_rows: pd.Series
    # Number of missing values in each column.
    mv_cols: pd.Series
    # Share of missing values in each row (count / number of columns).
    mv_rows_ratio: pd.Series
    # Share of missing values in each column (count / number of rows).
    mv_cols_ratio: pd.Series


def _missing_vals(data: pd.DataFrame) -> MVResult:
    """Give metrics of missing values in the dataset.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame

    Returns
    -------
    Dict[str, float]
        mv_total: float, number of missing values in the entire dataset
        mv_rows: float, number of missing values in each row
        mv_cols: float, number of missing values in each column
        mv_rows_ratio: float, ratio of missing values for each row
        mv_cols_ratio: float, ratio of missing values for each column
    """
    data = pd.DataFrame(data).copy()
    mv_total: int = data.isna().sum().sum()
    mv_rows: int = data.isna().sum(axis=1)
    mv_cols: int = data.isna().sum(axis=0)
    mv_rows_ratio: float = mv_rows / data.shape[1]
    mv_cols_ratio: float = mv_cols / data.shape[0]

    return {
        "mv_total": mv_total,
        "mv_rows": mv_rows,
        "mv_cols": mv_cols,
        "mv_rows_ratio": mv_rows_ratio,
        "mv_cols_ratio": mv_cols_ratio,
    }


def _validate_input_bool(value: bool, desc):
    if not isinstance(value, bool):
        raise TypeError(
            f"Input value for '{desc}' is {type(value)} but should be a boolean."
        )


def _validate_input_int(value: int, desc):
    if not isinstance(value, int):
        raise TypeError(
            f"Input value for '{desc}' is {type(value)} but should be an integer."
        )


def _validate_input_range(value, desc, lower, upper):
    if value < lower or value > upper:
        raise ValueError(
            f"'{desc}' = {value} but should be {lower} <= '{desc}' <= {upper}."
        )


def _validate_input_smaller(value1, value2, desc):
    if value1 > value2:
        raise ValueError(
            f"The first input for '{desc}' should be smaller or equal to the second."
        )


def _validate_input_sum_smaller(limit, desc, *args):
    if sum(args) > limit:
        raise ValueError(
            f"The sum of input values for '{desc}' should be less or equal to {limit}."
        )


def _validate_input_sum_larger(limit, desc, *args):
    if sum(args) < limit:
        raise ValueError(
            f"The sum of input values for '{desc}' should be larger/equal to {limit}."
        )


1			"""
2			Utilities and auxiliary functions.
3
4			:author: Andreas Kanz
5
6			"""
7
8			from __future__ import annotations
9
10			from typing import Literal
11			from typing import Optional
12			from typing import TypedDict
13
14			import numpy as np
15			import pandas as pd
16
17
18			def _corr_selector(
19			corr: pd.Series \| pd.DataFrame,
20			split: Optional[Literal["pos", "neg", "high", "low"]] = None,
21			threshold: float = 0,
22			) -> pd.Series \| pd.DataFrame:
23			"""Select the desired correlations using this utility function.
24
25			Parameters
26			----------
27			corr : pd.Series \| pd.DataFrame
28			pd.Series or pd.DataFrame of correlations
29			split : Optional[str], optional
30			Type of split performed, by default None
31			* {None, "pos", "neg", "high", "low"}
32			threshold : float, optional
33			Value between 0 and 1 to set the correlation threshold, by default 0 unless \
34			split = "high" or split = "low", in which case default is 0.3
35
36			Returns
37			-------
38			pd.DataFrame
39			List or matrix of (filtered) correlations
40			"""
41			if split == "pos":
42			corr = corr.where((corr >= threshold) & (corr > 0))
43			print(
44			'Displaying positive correlations. Specify a positive "threshold" to '
45			"limit the results further."
46			)
47			elif split == "neg":
48			corr = corr.where((corr <= threshold) & (corr < 0))
49			print(
50			'Displaying negative correlations. Specify a negative "threshold" to '
51			"limit the results further."
52			)
53			elif split == "high":
54			threshold = 0.3 if threshold <= 0 else threshold
55			corr = corr.where(np.abs(corr) >= threshold)
56			print(
57			f"Displaying absolute correlations above the threshold ({threshold}). "
58			'Specify a positive "threshold" to limit the results further.'
59			)
60			elif split == "low":
61			threshold = 0.3 if threshold <= 0 else threshold
62			corr = corr.where(np.abs(corr) <= threshold)
63			print(
64			f"Displaying absolute correlations below the threshold ({threshold}). "
65			'Specify a positive "threshold" to limit the results further.'
66			)
67
68			return corr
69
70
71			def _diff_report(
72			data: pd.DataFrame,
73			data_cleaned: pd.DataFrame,
74			dupl_rows: Optional[list[str \| int]] = None,
75			single_val_cols: Optional[list[str]] = None,
76			show: Optional[Literal["all", "changes"]] = "changes",
77			) -> None:
78			"""Provide information about changes between two datasets, such as dropped rows \
79			and columns, memory usage and missing values.
80
81			Parameters
82			----------
83			data : pd.DataFrame
84			2D dataset that can be coerced into Pandas DataFrame. Input the initial \
85			dataset here
86			data_cleaned : pd.DataFrame
87			2D dataset that can be coerced into Pandas DataFrame. Input the cleaned / \
88			updated dataset here
89			dupl_rows : Optional[list[str \| int]], optional
90			List of duplicate row indices, by default None
91			single_val_cols : Optional[List[str]], optional
92			List of single-valued column indices. I.e. columns where all cells contain \
93			the same value. NaNs count as a separate value, by default None
94			show : str, optional
95			{"all", "changes", None}, by default "changes"
96			Specify verbosity of the output:
97			* "all": Print information about the data before and after cleaning as \
98			well as information about changes and memory usage (deep). Please be \
99			aware, that this can slow down the function by quite a bit.
100			* "changes": Print out differences in the data before and after cleaning.
101			* None: No information about the data and the data cleaning is printed.
102
103			Returns
104			-------
105			None
106			Print statement highlighting the datasets or changes between the two datasets.
107			"""
108			if show not in ["changes", "all"]:
109			return
110
111			dupl_rows = [] if dupl_rows is None else dupl_rows.copy()
112			single_val_cols = [] if single_val_cols is None else single_val_cols.copy()
113			data_mem = _memory_usage(data, deep=False)
114			data_cl_mem = _memory_usage(data_cleaned, deep=False)
115			data_mv_tot = _missing_vals(data)["mv_total"]
116			data_cl_mv_tot = _missing_vals(data_cleaned)["mv_total"]
117
118			if show == "all":
119			data_mem = _memory_usage(data, deep=True)
120			data_cl_mem = _memory_usage(data_cleaned, deep=True)
121			_print_cleaning_details("Before data cleaning:\n", data, data_mv_tot, data_mem)
122			_print_cleaning_details(
123			"After data cleaning:\n", data_cleaned, data_cl_mv_tot, data_cl_mem
124			)
125
126			print(
127			f"Shape of cleaned data: {data_cleaned.shape} - "
128			f"Remaining NAs: {data_cl_mv_tot}\n"
129			)
130			print(f"Dropped rows: {data.shape[0]-data_cleaned.shape[0]}")
131			print(
132			f" of which {len(dupl_rows)} duplicates. (Rows (first 150 shown): {dupl_rows[:150]})\n"
133			)
134			print(f"Dropped columns: {data.shape[1]-data_cleaned.shape[1]}")
135			print(
136			f" of which {len(single_val_cols)} single valued."
137			f" Columns: {single_val_cols}"
138			)
139			print(f"Dropped missing values: {data_mv_tot-data_cl_mv_tot}")
140			mem_change = data_mem - data_cl_mem
141			mem_perc = round(100 * mem_change / data_mem, 2)
142			print(f"Reduced memory by at least: {round(mem_change,3)} MB (-{mem_perc}%)\n")
143
144
145			def _print_cleaning_details(arg0, arg1, arg2, arg3):
146			print(arg0)
147			print(f"dtypes:\n{arg1.dtypes.value_counts()}")
148			print(f"\nNumber of rows: {str(arg1.shape[0]).rjust(8)}")
149			print(f"Number of cols: {str(arg1.shape[1]).rjust(8)}")
150			print(f"Missing values: {str(arg2).rjust(8)}")
151			print(f"Memory usage: {str(arg3).rjust(7)} MB")
152			print("_______________________________________________________\n")
153
154
155			def _drop_duplicates(data: pd.DataFrame) -> tuple[pd.DataFrame, list[str \| int]]:
156			"""Provide information on and drops duplicate rows.
157
158			Parameters
159			----------
160			data : pd.DataFrame
161			2D dataset that can be coerced into Pandas DataFrame
162
163			Returns
164			-------
165			Tuple[pd.DataFrame, List]
166			Deduplicated Pandas DataFrame and Index Object of rows dropped
167			"""
168			data = pd.DataFrame(data).copy()
169			dupl_rows = data[data.duplicated()].index.tolist()
170			data = data.drop(dupl_rows, axis="index").reset_index(drop=True)
171
172			return data, dupl_rows
173
174
175			def _memory_usage(data: pd.DataFrame, deep: bool = True) -> float:
176			"""Give the total memory usage in megabytes.
177
178			Parameters
179			----------
180			data : pd.DataFrame
181			2D dataset that can be coerced into Pandas DataFrame
182			deep : bool, optional
183			Runs a deep analysis of the memory usage, by default True
184
185			Returns
186			-------
187			float
188			Memory usage in megabytes
189			"""
190			return round(data.memory_usage(index=True, deep=deep).sum() / (1024**2), 2)
191
192
193			class MVResult(TypedDict):
194			"""TypedDict for the return value of _missing_vals."""
195
196			mv_total: int
197			mv_rows: int
198			mv_cols: int
199			mv_rows_ratio: float
200			mv_cols_ratio: float
201
202
203			def _missing_vals(data: pd.DataFrame) -> MVResult:
204			"""Give metrics of missing values in the dataset.
205
206			Parameters
207			----------
208			data : pd.DataFrame
209			2D dataset that can be coerced into Pandas DataFrame
210
211			Returns
212			-------
213			Dict[str, float]
214			mv_total: float, number of missing values in the entire dataset
215			mv_rows: float, number of missing values in each row
216			mv_cols: float, number of missing values in each column
217			mv_rows_ratio: float, ratio of missing values for each row
218			mv_cols_ratio: float, ratio of missing values for each column
219			"""
220			data = pd.DataFrame(data).copy()
221			mv_total: int = data.isna().sum().sum()
222			mv_rows: int = data.isna().sum(axis=1)
223			mv_cols: int = data.isna().sum(axis=0)
224			mv_rows_ratio: float = mv_rows / data.shape[1]
225			mv_cols_ratio: float = mv_cols / data.shape[0]
226
227			return {
228			"mv_total": mv_total,
229			"mv_rows": mv_rows,
230			"mv_cols": mv_cols,
231			"mv_rows_ratio": mv_rows_ratio,
232			"mv_cols_ratio": mv_cols_ratio,
233			}
234
235
236			def _validate_input_bool(value: bool, desc):
237			if not isinstance(value, bool):
238			raise TypeError(
239			f"Input value for '{desc}' is {type(value)} but should be a boolean."
240			)
241
242
243			def _validate_input_int(value: int, desc):
244			if not isinstance(value, int):
245			raise TypeError(
246			f"Input value for '{desc}' is {type(value)} but should be an integer."
247			)
248
249
250			def _validate_input_range(value, desc, lower, upper):
251			if value < lower or value > upper:
252			raise ValueError(
253			f"'{desc}' = {value} but should be {lower} <= '{desc}' <= {upper}."
254			)
255
256
257			def _validate_input_smaller(value1, value2, desc):
258			if value1 > value2:
259			raise ValueError(
260			f"The first input for '{desc}' should be smaller or equal to the second."
261			)
262
263
264			def _validate_input_sum_smaller(limit, desc, *args):
265			if sum(args) > limit:
266			raise ValueError(
267			f"The sum of input values for '{desc}' should be less or equal to {limit}."
268			)
269
270
271			def _validate_input_sum_larger(limit, desc, *args):
272			if sum(args) < limit:
273			raise ValueError(
274			f"The sum of input values for '{desc}' should be larger/equal to {limit}."
275			)
276

akanz1 / klib

GitHub Access Token became invalid

Push — main ( 7b7d02...0ccad7 )

klib.utils._print_cleaning_details() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like