GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( 5a4fe9...c92c0e )
by Andreas
01:16
created

klib.utils._validate_input_int()   A

Complexity

Conditions 2

Size

Total Lines 3
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 3
nop 2
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
"""
2
Utilities and auxiliary functions.
3
4
:author: Andreas Kanz
5
6
"""
7
8
# Imports
9
import numpy as np
10
import pandas as pd
11
from typing import Any, Dict, List, Optional, Tuple, Union
12
13
14
def _corr_selector(
15
    corr: Union[pd.Series, pd.DataFrame],
16
    split: Optional[str] = None,  # Optional[Literal["pos", "neg", "above", "below"]] = None,
17
    threshold: float = 0,
18
) -> Union[pd.Series, pd.DataFrame]:
19
    """ Utility funciton to select the desired correlations.
20
21
    Parameters
22
    ----------
23
    corr : Union[pd.Series, pd.DataFrame]
24
        pd.Series or pd.DataFrame of correlations
25
    split : Optional[str], optional
26
        Type of split performed, by default None
27
           * {None, 'pos', 'neg', 'high', 'low'}
28
    threshold : float, optional
29
        Value between 0 and 1 to set the correlation threshold, by default 0
30
31
    Returns
32
    -------
33
    pd.DataFrame
34
        List or matrix of (filtered) correlations
35
    """
36
37
    if split == "pos":
38
        corr = corr.where((corr >= threshold) & (corr > 0))
39
        print(
40
            'Displaying positive correlations. Specify a positive "threshold" to further limit the results.'
41
        )
42
    elif split == "neg":
43
        corr = corr.where((corr <= threshold) & (corr < 0))
44
        print(
45
            'Displaying negative correlations. Specify a negative "threshold" to further limit the results.'
46
        )
47
    elif split == "high":
48
        threshold = 0.3 if threshold <= 0 else threshold
49
        corr = corr.where(np.abs(corr) >= threshold)
50
        print(
51
            f"Displaying absolute correlations above the threshold ({threshold}). "
52
            'Specify a positive "threshold" to further limit the results.'
53
        )
54
    elif split == "low":
55
        threshold = 0.3 if threshold <= 0 else threshold
56
        corr = corr.where(np.abs(corr) <= threshold)
57
        print(
58
            f"Displaying absolute correlations below the threshold ({threshold}). "
59
            'Specify a positive "threshold" to further limit the results.'
60
        )
61
62
    return corr
63
64
65
def _diff_report(
    data: pd.DataFrame,
    data_cleaned: pd.DataFrame,
    dupl_rows: Optional[List[Union[str, int]]] = None,
    single_val_cols: Optional[List[str]] = None,
    show: Optional[str] = "changes",  # Optional[Literal["all", "changes"]] = "changes",
) -> None:
    """ Provides information about changes between two datasets, such as dropped rows and columns, memory usage and \
    missing values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. Input the initial dataset here
    data_cleaned : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. Input the cleaned / updated dataset here
    dupl_rows : Optional[List[Union[str, int]]], optional
        List of duplicate row indices, by default None
    single_val_cols : Optional[List[str]], optional
        List of single-valued column indices. I.e. columns where all cells contain the same value. \
        NaNs count as a separate value, by default None
    show : str, optional
        {'all', 'changes', None}, by default "changes"
        Specify verbosity of the output:
            * 'all': Print information about the data before and after cleaning as well as information about changes \
            and memory usage (deep). Please be aware, that this can slow down the function by quite a bit.
            * 'changes': Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    None
        Print statement highlighting the datasets or changes between the two datasets.
    """

    if show not in ["changes", "all"]:
        return

    # Copy the provided lists so the caller's objects are never mutated.
    dupl_rows = [] if dupl_rows is None else dupl_rows.copy()
    single_val_cols = [] if single_val_cols is None else single_val_cols.copy()
    data_mem = _memory_usage(data, deep=False)
    data_cl_mem = _memory_usage(data_cleaned, deep=False)
    data_mv_tot = _missing_vals(data)["mv_total"]
    data_cl_mv_tot = _missing_vals(data_cleaned)["mv_total"]

    if show == "all":
        # Deep memory analysis is more accurate but considerably slower.
        data_mem = _memory_usage(data, deep=True)
        data_cl_mem = _memory_usage(data_cleaned, deep=True)
        print("Before data cleaning:\n")
        print(f"dtypes:\n{data.dtypes.value_counts()}")
        print(f"\nNumber of rows: {data.shape[0]}")
        print(f"Number of cols: {data.shape[1]}")
        print(f"Missing values: {data_mv_tot}")
        print(f"Memory usage: {data_mem} MB")
        print("_______________________________________________________\n")
        print("After data cleaning:\n")
        print(f"dtypes:\n{data_cleaned.dtypes.value_counts()}")
        print(f"\nNumber of rows: {data_cleaned.shape[0]}")
        print(f"Number of cols: {data_cleaned.shape[1]}")
        print(f"Missing values: {data_cl_mv_tot}")
        print(f"Memory usage: {data_cl_mem} MB")
        print("_______________________________________________________\n")

    print(f"Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {data_cl_mv_tot}")
    print("\nChanges:")
    print(f"Dropped rows: {data.shape[0]-data_cleaned.shape[0]}")
    print(f"     of which {len(dupl_rows)} duplicates. (Rows: {dupl_rows})")
    print(f"Dropped columns: {data.shape[1]-data_cleaned.shape[1]}")
    print(f"     of which {len(single_val_cols)} single valued. (Columns: {single_val_cols})")
    print(f"Dropped missing values: {data_mv_tot-data_cl_mv_tot}")
    mem_change = data_mem - data_cl_mem
    # Guard against division by zero when the original data occupies (rounded) zero MB.
    mem_pct = round(100 * mem_change / data_mem, 2) if data_mem > 0 else 0
    print(f"Reduced memory by at least: {round(mem_change,3)} MB (-{mem_pct}%)")
135
136
137
def _drop_duplicates(data: pd.DataFrame) -> Tuple[pd.DataFrame, Any]:
138
    """ Provides information on and drops duplicate rows.
139
140
    Parameters
141
    ----------
142
    data : pd.DataFrame
143
        2D dataset that can be coerced into Pandas DataFrame
144
145
    Returns
146
    -------
147
    Tuple[pd.DataFrame, List]
148
        Deduplicated Pandas DataFrame and Index Object of rows dropped
149
    """
150
151
    data = pd.DataFrame(data).copy()
152
    dupl_rows = data[data.duplicated()].index.tolist()
153
    data = data.drop(dupl_rows, axis="index")
154
155
    return data, dupl_rows
156
157
158
def _memory_usage(data: pd.DataFrame, deep: bool = True) -> float:
159
    """ Gives the total memory usage in megabytes.
160
161
    Parameters
162
    ----------
163
    data : pd.DataFrame
164
        2D dataset that can be coerced into Pandas DataFrame
165
    deep : bool, optional
166
        Runs a deep analysis of the memory usage, by default True
167
168
    Returns
169
    -------
170
    float
171
        Memory usage in megabytes
172
    """
173
174
    data = pd.DataFrame(data).copy()
175
    memory_usage = round(data.memory_usage(index=True, deep=deep).sum() / (1024 ** 2), 2)
176
177
    return memory_usage
178
179
180
def _missing_vals(data: pd.DataFrame) -> Dict[str, Any]:
181
    """ Gives metrics of missing values in the dataset.
182
183
    Parameters
184
    ----------
185
    data : pd.DataFrame
186
        2D dataset that can be coerced into Pandas DataFrame
187
188
    Returns
189
    -------
190
    Dict[str, float]
191
        mv_total: float, number of missing values in the entire dataset
192
        mv_rows: float, number of missing values in each row
193
        mv_cols: float, number of missing values in each column
194
        mv_rows_ratio: float, ratio of missing values for each row
195
        mv_cols_ratio: float, ratio of missing values for each column
196
    """
197
198
    data = pd.DataFrame(data).copy()
199
    mv_rows = data.isna().sum(axis=1)
200
    mv_cols = data.isna().sum(axis=0)
201
    mv_total = data.isna().sum().sum()
202
    mv_rows_ratio = mv_rows / data.shape[1]
203
    mv_cols_ratio = mv_cols / data.shape[0]
204
205
    return {
206
        "mv_total": mv_total,
207
        "mv_rows": mv_rows,
208
        "mv_cols": mv_cols,
209
        "mv_rows_ratio": mv_rows_ratio,
210
        "mv_cols_ratio": mv_cols_ratio,
211
    }
212
213
214
def _validate_input_bool(value, desc):
215
    if not (isinstance(value, bool)):
216
        raise TypeError(f"Input value for '{desc}' is {type(value)} but should be a boolean.")
217
218
219
def _validate_input_int(value, desc):
220
    if not isinstance(value, int):
221
        raise TypeError(f"Input value for '{desc}' is {type(value)} but should be an integer.")
222
223
224
def _validate_input_range(value, desc, lower, upper):
225
    if value < lower or value > upper:
226
        raise ValueError(f"'{desc}' = {value} but should be within the range {lower} <= '{desc}' <= {upper}.")
227
228
229
def _validate_input_smaller(value1, value2, desc):
230
    if value1 > value2:
231
        raise ValueError(f"The first input for '{desc}' should be smaller or equal to the second input.")
232
233
234
def _validate_input_sum_smaller(limit, desc, *args):
235
    if sum(args) > limit:
236
        raise ValueError(f"The sum of imput values provided for '{desc}' should be less or equal to {limit}.")
237
238
239
def _validate_input_sum_larger(limit, desc, *args):
240
    if sum(args) < limit:
241
        raise ValueError(
242
            f"The sum of imput values provided for '{desc}' should be larger or equal to {limit}."
243
        )
244