klib.utils._validate_input_num_data() - Code Metrics - Inspection of "add validation to prevent passing non-numerical da..." - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Test Failed

Push — main ( 477a06...fee6f5 )

by Andreas

created 2022-09-18 15:07 UTC

klib.utils._validate_input_num_data() A

↳ Parent: klib.utils

Complexity

Conditions

Size

Total Lines	4
Code Lines	4

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	2
eloc	4
nop	2
dl	0
loc	4
rs	10
c	0
b	0
f	0

"""
Utilities and auxiliary functions.

:author: Andreas Kanz

"""
from __future__ import annotations

from typing import Literal
from typing import Optional
from typing import TypedDict

import numpy as np
import pandas as pd


def _corr_selector(
    corr: pd.Series | pd.DataFrame,
    split: Optional[Literal["pos", "neg", "high", "low"]] = None,
    threshold: float = 0,
) -> pd.Series | pd.DataFrame:
    """Filter correlations by sign or by absolute magnitude.

    Entries that do not satisfy the selected criterion are replaced by NaN
    (via ``where``), so the shape of the input is preserved. A short hint
    describing the applied filter is printed whenever a split is performed.

    Parameters
    ----------
    corr : pd.Series | pd.DataFrame
        Correlation values to filter
    split : Optional[str], optional
        Kind of filter to apply, by default None (input is returned untouched)
            * "pos": keep values that are > 0 and >= threshold
            * "neg": keep values that are < 0 and <= threshold
            * "high": keep values whose absolute value is >= threshold
            * "low": keep values whose absolute value is <= threshold
    threshold : float, optional
        Correlation threshold, by default 0. For "high" and "low" any
        threshold <= 0 is replaced by 0.3

    Returns
    -------
    pd.Series | pd.DataFrame
        The (filtered) correlations
    """
    if split == "pos":
        keep = (corr >= threshold) & (corr > 0)
        hint = (
            'Displaying positive correlations. Specify a positive "threshold" to '
            "limit the results further."
        )
    elif split == "neg":
        keep = (corr <= threshold) & (corr < 0)
        hint = (
            'Displaying negative correlations. Specify a negative "threshold" to '
            "limit the results further."
        )
    elif split == "high":
        if threshold <= 0:
            threshold = 0.3
        keep = np.abs(corr) >= threshold
        hint = (
            f"Displaying absolute correlations above the threshold ({threshold}). "
            'Specify a positive "threshold" to limit the results further.'
        )
    elif split == "low":
        if threshold <= 0:
            threshold = 0.3
        keep = np.abs(corr) <= threshold
        hint = (
            f"Displaying absolute correlations below the threshold ({threshold}). "
            'Specify a positive "threshold" to limit the results further.'
        )
    else:
        # No (known) split requested: hand the input back unchanged.
        return corr

    selected = corr.where(keep)
    print(hint)
    return selected


def _diff_report(
    data: pd.DataFrame,
    data_cleaned: pd.DataFrame,
    dupl_rows: Optional[list[str | int]] = None,
    single_val_cols: Optional[list[str]] = None,
    show: Optional[Literal["all", "changes"]] = "changes",
) -> None:
    """Print a summary of the differences between a dataset and its cleaned version.

    The report covers dropped rows and columns, dropped missing values and the
    change in memory usage.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. Input the initial
        dataset here
    data_cleaned : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. Input the cleaned /
        updated dataset here
    dupl_rows : Optional[list[str | int]], optional
        List of duplicate row indices, by default None
    single_val_cols : Optional[list[str]], optional
        List of single-valued column indices. I.e. columns where all cells contain
        the same value. NaNs count as a separate value, by default None
    show : str, optional
        {"all", "changes", None}, by default "changes"
        Specify verbosity of the output:
            * "all": Print information about the data before and after cleaning as
                well as information about changes and memory usage (deep). Please be
                aware, that this can slow down the function by quite a bit.
            * "changes": Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    None
        Print statement highlighting the datasets or changes between the two datasets.
    """
    if show not in ["changes", "all"]:
        return

    dupl_rows = [] if dupl_rows is None else dupl_rows.copy()
    single_val_cols = [] if single_val_cols is None else single_val_cols.copy()

    # Only the verbose mode pays for a deep memory inspection; measure once
    # with the depth that is actually going to be reported.
    deep = show == "all"
    data_mem = _memory_usage(data, deep=deep)
    data_cl_mem = _memory_usage(data_cleaned, deep=deep)
    data_mv_tot = _missing_vals(data)["mv_total"]
    data_cl_mv_tot = _missing_vals(data_cleaned)["mv_total"]

    if show == "all":
        _print_cleaning_details("Before data cleaning:\n", data, data_mv_tot, data_mem)
        _print_cleaning_details(
            "After data cleaning:\n", data_cleaned, data_cl_mv_tot, data_cl_mem
        )

    print(
        f"Shape of cleaned data: {data_cleaned.shape} - "
        f"Remaining NAs: {data_cl_mv_tot}\n\n"
    )
    print(f"Dropped rows: {data.shape[0]-data_cleaned.shape[0]}")
    print(
        f"     of which {len(dupl_rows)} duplicates. (Rows (first 150 shown): {dupl_rows[:150]})\n"  # noqa
    )
    print(f"Dropped columns: {data.shape[1]-data_cleaned.shape[1]}")
    print(
        f"     of which {len(single_val_cols)} single valued."
        f"     Columns: {single_val_cols}"
    )
    print(f"Dropped missing values: {data_mv_tot-data_cl_mv_tot}")
    mem_change = data_mem - data_cl_mem
    # _memory_usage rounds to two decimals, so small datasets report 0.0 MB.
    # Guard the division so the report shows 0.0% instead of nan/inf.
    mem_perc = round(100 * mem_change / data_mem, 2) if data_mem else 0.0
    print(f"Reduced memory by at least: {round(mem_change,3)} MB (-{mem_perc}%)\n")


def _print_cleaning_details(arg0, arg1, arg2, arg3):
    """Print a summary block for one dataset.

    Parameters
    ----------
    arg0
        Heading printed before the summary
    arg1
        Dataset whose ``dtypes`` and ``shape`` are reported
    arg2
        Value shown as the number of missing values
    arg3
        Value shown as the memory usage in MB
    """
    print(arg0)

    dtype_counts = arg1.dtypes.value_counts()
    print(f"dtypes:\n{dtype_counts}")

    # Values are stringified first and then right-aligned to a fixed width.
    print(f"\nNumber of rows: {arg1.shape[0]!s:>8}")
    print(f"Number of cols: {arg1.shape[1]!s:>8}")
    print(f"Missing values: {arg2!s:>8}")
    print(f"Memory usage: {arg3!s:>7} MB")
    print("_______________________________________________________\n")


def _drop_duplicates(data: pd.DataFrame) -> tuple[pd.DataFrame, list[str | int]]:
    """Drop duplicate rows and report which rows were removed.

    The first occurrence of every row is kept; later identical rows are dropped.
    The input is not modified.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame

    Returns
    -------
    tuple[pd.DataFrame, list]
        Deduplicated Pandas DataFrame (with a fresh 0..n-1 index) and the list
        of index labels of the rows that were dropped
    """
    data = pd.DataFrame(data)
    is_duplicate = data.duplicated()
    dupl_rows = data.index[is_duplicate.to_numpy()].tolist()
    # Filter with the positional mask rather than dropping by label: with a
    # non-unique index, dropping by label would also remove the first
    # (non-duplicate) occurrence that shares the label.
    data = data.loc[~is_duplicate.to_numpy()].reset_index(drop=True)

    return data, dupl_rows


def _memory_usage(data: pd.DataFrame, deep: bool = True) -> float:
    """Report how much memory a DataFrame occupies, in megabytes.

    Parameters
    ----------
    data : pd.DataFrame
        Dataset whose memory footprint (including its index) is measured
    deep : bool, optional
        Forwarded to ``DataFrame.memory_usage`` to request a deep
        inspection, by default True

    Returns
    -------
    float
        Memory usage in megabytes, rounded to two decimals
    """
    bytes_per_mb = 1024**2
    total_bytes = data.memory_usage(index=True, deep=deep).sum()
    return round(total_bytes / bytes_per_mb, 2)


class MVResult(TypedDict):
    """TypedDict for the return value of _missing_vals."""

    # Number of missing values in the entire dataset.
    mv_total: int
    # Number of missing values per row (indexed like the dataset's rows).
    mv_rows: pd.Series
    # Number of missing values per column (indexed by column label).
    mv_cols: pd.Series
    # Share of missing values per row (count divided by number of columns).
    mv_rows_ratio: pd.Series
    # Share of missing values per column (count divided by number of rows).
    mv_cols_ratio: pd.Series


def _missing_vals(data: pd.DataFrame) -> MVResult:
    """Give metrics of missing values in the dataset.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame

    Returns
    -------
    MVResult
        mv_total: int, number of missing values in the entire dataset
        mv_rows: pd.Series, number of missing values in each row
        mv_cols: pd.Series, number of missing values in each column
        mv_rows_ratio: pd.Series, ratio of missing values for each row
        mv_cols_ratio: pd.Series, ratio of missing values for each column
    """
    # isna() already produces a new frame, so no defensive copy of the input
    # is needed; the mask is computed once and reused for every metric.
    na_mask = pd.DataFrame(data).isna()
    n_rows, n_cols = na_mask.shape

    mv_rows: pd.Series = na_mask.sum(axis=1)
    mv_cols: pd.Series = na_mask.sum(axis=0)
    # Cast to a plain int so the value matches the declared MVResult type.
    mv_total: int = int(mv_cols.sum())
    mv_rows_ratio: pd.Series = mv_rows / n_cols
    mv_cols_ratio: pd.Series = mv_cols / n_rows

    return {
        "mv_total": mv_total,
        "mv_rows": mv_rows,
        "mv_cols": mv_cols,
        "mv_rows_ratio": mv_rows_ratio,
        "mv_cols_ratio": mv_cols_ratio,
    }


def _validate_input_bool(value: bool, desc):
    """Raise a TypeError unless ``value`` is a bool.

    ``desc`` is the name of the validated input, used in the error message.
    """
    if isinstance(value, bool):
        return

    found = type(value)
    raise TypeError(f"Input value for '{desc}' is {found} but should be a boolean.")


def _validate_input_int(value: int, desc):
    """Raise a TypeError unless ``value`` is an instance of int.

    ``desc`` is the name of the validated input, used in the error message.
    """
    if isinstance(value, int):
        return

    found = type(value)
    raise TypeError(f"Input value for '{desc}' is {found} but should be an integer.")


def _validate_input_range(value, desc, lower, upper):
    """Raise a ValueError if ``value`` lies outside ``[lower, upper]``.

    Both bounds are inclusive. ``desc`` is the name of the validated input,
    used in the error message.
    """
    out_of_bounds = value < lower or value > upper
    if not out_of_bounds:
        return

    raise ValueError(
        f"'{desc}' = {value} but should be {lower} <= '{desc}' <= {upper}."
    )


def _validate_input_smaller(value1, value2, desc):
    """Raise a ValueError if ``value1`` is greater than ``value2``.

    ``desc`` is the name of the validated input, used in the error message.
    """
    wrong_order = value1 > value2
    if wrong_order:
        message = (
            f"The first input for '{desc}' should be smaller or equal to the second."
        )
        raise ValueError(message)


def _validate_input_sum_smaller(limit, desc, *args):
    """Raise a ValueError if the sum of ``args`` exceeds ``limit``.

    ``desc`` is the name of the validated inputs, used in the error message.
    """
    total = sum(args)
    if total > limit:
        message = (
            f"The sum of input values for '{desc}' should be less or equal to {limit}."
        )
        raise ValueError(message)


def _validate_input_sum_larger(limit, desc, *args):
    """Raise a ValueError if the sum of ``args`` falls below ``limit``.

    ``desc`` is the name of the validated inputs, used in the error message.
    """
    total = sum(args)
    if total < limit:
        message = (
            f"The sum of input values for '{desc}' should be larger/equal to {limit}."
        )
        raise ValueError(message)


def _validate_input_num_data(value: pd.DataFrame, desc):
    """Raise a TypeError if the numeric part of ``value`` is empty.

    Columns are selected with ``select_dtypes(include=["number"])``; the
    check fails when that selection is empty. ``desc`` is the name of the
    validated input, used in the error message.
    """
    numeric_part = value.select_dtypes(include=["number"])
    if not numeric_part.empty:
        return

    raise TypeError(
        f"Input value for '{desc}' should contain at least one numerical column."
    )


1			"""
2			Utilities and auxiliary functions.
3
4			:author: Andreas Kanz
5
6			"""
7			from __future__ import annotations
8
9			from typing import Literal
10			from typing import Optional
11			from typing import TypedDict
12
13			import numpy as np
14			import pandas as pd
15
16
17			def _corr_selector(
18			corr: pd.Series \| pd.DataFrame,
19			split: Optional[Literal["pos", "neg", "high", "low"]] = None,
20			threshold: float = 0,
21			) -> pd.Series \| pd.DataFrame:
22			"""Select the desired correlations using this utility function.
23
24			Parameters
25			----------
26			corr : pd.Series \| pd.DataFrame
27			pd.Series or pd.DataFrame of correlations
28			split : Optional[str], optional
29			Type of split performed, by default None
30			* {None, "pos", "neg", "high", "low"}
31			threshold : float, optional
32			Value between 0 and 1 to set the correlation threshold, by default 0 unless \
33			split = "high" or split = "low", in which case default is 0.3
34
35			Returns
36			-------
37			pd.DataFrame
38			List or matrix of (filtered) correlations
39			"""
40			if split == "pos":
41			corr = corr.where((corr >= threshold) & (corr > 0))
42			print(
43			'Displaying positive correlations. Specify a positive "threshold" to '
44			"limit the results further."
45			)
46			elif split == "neg":
47			corr = corr.where((corr <= threshold) & (corr < 0))
48			print(
49			'Displaying negative correlations. Specify a negative "threshold" to '
50			"limit the results further."
51			)
52			elif split == "high":
53			threshold = 0.3 if threshold <= 0 else threshold
54			corr = corr.where(np.abs(corr) >= threshold)
55			print(
56			f"Displaying absolute correlations above the threshold ({threshold}). "
57			'Specify a positive "threshold" to limit the results further.'
58			)
59			elif split == "low":
60			threshold = 0.3 if threshold <= 0 else threshold
61			corr = corr.where(np.abs(corr) <= threshold)
62			print(
63			f"Displaying absolute correlations below the threshold ({threshold}). "
64			'Specify a positive "threshold" to limit the results further.'
65			)
66
67			return corr
68
69
70			def _diff_report(
71			data: pd.DataFrame,
72			data_cleaned: pd.DataFrame,
73			dupl_rows: Optional[list[str \| int]] = None,
74			single_val_cols: Optional[list[str]] = None,
75			show: Optional[Literal["all", "changes"]] = "changes",
76			) -> None:
77			"""Provide information about changes between two datasets, such as dropped rows \
78			and columns, memory usage and missing values.
79
80			Parameters
81			----------
82			data : pd.DataFrame
83			2D dataset that can be coerced into Pandas DataFrame. Input the initial \
84			dataset here
85			data_cleaned : pd.DataFrame
86			2D dataset that can be coerced into Pandas DataFrame. Input the cleaned / \
87			updated dataset here
88			dupl_rows : Optional[list[str \| int]], optional
89			List of duplicate row indices, by default None
90			single_val_cols : Optional[List[str]], optional
91			List of single-valued column indices. I.e. columns where all cells contain \
92			the same value. NaNs count as a separate value, by default None
93			show : str, optional
94			{"all", "changes", None}, by default "changes"
95			Specify verbosity of the output:
96			* "all": Print information about the data before and after cleaning as \
97			well as information about changes and memory usage (deep). Please be \
98			aware, that this can slow down the function by quite a bit.
99			* "changes": Print out differences in the data before and after cleaning.
100			* None: No information about the data and the data cleaning is printed.
101
102			Returns
103			-------
104			None
105			Print statement highlighting the datasets or changes between the two datasets.
106			"""
107			if show not in ["changes", "all"]:
108			return
109
110			dupl_rows = [] if dupl_rows is None else dupl_rows.copy()
111			single_val_cols = [] if single_val_cols is None else single_val_cols.copy()
112			data_mem = _memory_usage(data, deep=False)
113			data_cl_mem = _memory_usage(data_cleaned, deep=False)
114			data_mv_tot = _missing_vals(data)["mv_total"]
115			data_cl_mv_tot = _missing_vals(data_cleaned)["mv_total"]
116
117			if show == "all":
118			data_mem = _memory_usage(data, deep=True)
119			data_cl_mem = _memory_usage(data_cleaned, deep=True)
120			_print_cleaning_details("Before data cleaning:\n", data, data_mv_tot, data_mem)
121			_print_cleaning_details(
122			"After data cleaning:\n", data_cleaned, data_cl_mv_tot, data_cl_mem
123			)
124
125			print(
126			f"Shape of cleaned data: {data_cleaned.shape} - "
127			f"Remaining NAs: {data_cl_mv_tot}\n\n"
128			)
129			print(f"Dropped rows: {data.shape[0]-data_cleaned.shape[0]}")
130			print(
131			f" of which {len(dupl_rows)} duplicates. (Rows (first 150 shown): {dupl_rows[:150]})\n" # noqa
132			)
133			print(f"Dropped columns: {data.shape[1]-data_cleaned.shape[1]}")
134			print(
135			f" of which {len(single_val_cols)} single valued."
136			f" Columns: {single_val_cols}"
137			)
138			print(f"Dropped missing values: {data_mv_tot-data_cl_mv_tot}")
139			mem_change = data_mem - data_cl_mem
140			mem_perc = round(100 * mem_change / data_mem, 2)
141			print(f"Reduced memory by at least: {round(mem_change,3)} MB (-{mem_perc}%)\n")
142
143
144			def _print_cleaning_details(arg0, arg1, arg2, arg3):
145			print(arg0)
146			print(f"dtypes:\n{arg1.dtypes.value_counts()}")
147			print(f"\nNumber of rows: {str(arg1.shape[0]).rjust(8)}")
148			print(f"Number of cols: {str(arg1.shape[1]).rjust(8)}")
149			print(f"Missing values: {str(arg2).rjust(8)}")
150			print(f"Memory usage: {str(arg3).rjust(7)} MB")
151			print("_______________________________________________________\n")
152
153
154			def _drop_duplicates(data: pd.DataFrame) -> tuple[pd.DataFrame, list[str \| int]]:
155			"""Provide information on and drops duplicate rows.
156
157			Parameters
158			----------
159			data : pd.DataFrame
160			2D dataset that can be coerced into Pandas DataFrame
161
162			Returns
163			-------
164			Tuple[pd.DataFrame, List]
165			Deduplicated Pandas DataFrame and Index Object of rows dropped
166			"""
167			data = pd.DataFrame(data).copy()
168			dupl_rows = data[data.duplicated()].index.tolist()
169			data = data.drop(dupl_rows, axis="index").reset_index(drop=True)
170
171			return data, dupl_rows
172
173
174			def _memory_usage(data: pd.DataFrame, deep: bool = True) -> float:
175			"""Give the total memory usage in megabytes.
176
177			Parameters
178			----------
179			data : pd.DataFrame
180			2D dataset that can be coerced into Pandas DataFrame
181			deep : bool, optional
182			Runs a deep analysis of the memory usage, by default True
183
184			Returns
185			-------
186			float
187			Memory usage in megabytes
188			"""
189			return round(data.memory_usage(index=True, deep=deep).sum() / (1024**2), 2)
190
191
192			class MVResult(TypedDict):
193			"""TypedDict for the return value of _missing_vals."""
194
195			mv_total: int
196			mv_rows: int
197			mv_cols: int
198			mv_rows_ratio: float
199			mv_cols_ratio: float
200
201
202			def _missing_vals(data: pd.DataFrame) -> MVResult:
203			"""Give metrics of missing values in the dataset.
204
205			Parameters
206			----------
207			data : pd.DataFrame
208			2D dataset that can be coerced into Pandas DataFrame
209
210			Returns
211			-------
212			Dict[str, float]
213			mv_total: float, number of missing values in the entire dataset
214			mv_rows: float, number of missing values in each row
215			mv_cols: float, number of missing values in each column
216			mv_rows_ratio: float, ratio of missing values for each row
217			mv_cols_ratio: float, ratio of missing values for each column
218			"""
219			data = pd.DataFrame(data).copy()
220			mv_total: int = data.isna().sum().sum()
221			mv_rows: int = data.isna().sum(axis=1)
222			mv_cols: int = data.isna().sum(axis=0)
223			mv_rows_ratio: float = mv_rows / data.shape[1]
224			mv_cols_ratio: float = mv_cols / data.shape[0]
225
226			return {
227			"mv_total": mv_total,
228			"mv_rows": mv_rows,
229			"mv_cols": mv_cols,
230			"mv_rows_ratio": mv_rows_ratio,
231			"mv_cols_ratio": mv_cols_ratio,
232			}
233
234
235			def _validate_input_bool(value: bool, desc):
236			if not isinstance(value, bool):
237			raise TypeError(
238			f"Input value for '{desc}' is {type(value)} but should be a boolean."
239			)
240
241
242			def _validate_input_int(value: int, desc):
243			if not isinstance(value, int):
244			raise TypeError(
245			f"Input value for '{desc}' is {type(value)} but should be an integer."
246			)
247
248
249			def _validate_input_range(value, desc, lower, upper):
250			if value < lower or value > upper:
251			raise ValueError(
252			f"'{desc}' = {value} but should be {lower} <= '{desc}' <= {upper}."
253			)
254
255
256			def _validate_input_smaller(value1, value2, desc):
257			if value1 > value2:
258			raise ValueError(
259			f"The first input for '{desc}' should be smaller or equal to the second."
260			)
261
262
263			def _validate_input_sum_smaller(limit, desc, *args):
264			if sum(args) > limit:
265			raise ValueError(
266			f"The sum of input values for '{desc}' should be less or equal to {limit}."
267			)
268
269
270			def _validate_input_sum_larger(limit, desc, *args):
271			if sum(args) < limit:
272			raise ValueError(
273			f"The sum of input values for '{desc}' should be larger/equal to {limit}."
274			)
275
276
277			def _validate_input_num_data(value: pd.DataFrame, desc):
278			if value.select_dtypes(include=["number"]).empty:
279			raise TypeError(
280			f"Input value for '{desc}' should contain at least one numerical column."
281			)
282

akanz1 / klib

GitHub Access Token became invalid

Push — main ( 477a06...fee6f5 )

klib.utils._validate_input_num_data() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like