"""
Utilities and auxiliary functions.

:author: Andreas Kanz

"""

# Imports
import numpy as np
import pandas as pd
from typing import Any, Dict, List, Optional, Tuple, Union


def _corr_selector(
    corr: Union[pd.Series, pd.DataFrame],
    split: Optional[str] = None,  # Optional[Literal["pos", "neg", "high", "low"]]
    threshold: float = 0,
) -> Union[pd.Series, pd.DataFrame]:
    """Select the desired correlations using this utility function.

    Parameters
    ----------
    corr : Union[pd.Series, pd.DataFrame]
        pd.Series or pd.DataFrame of correlations
    split : Optional[str], optional
        Type of split performed, by default None
        * {None, "pos", "neg", "high", "low"}
    threshold : float, optional
        Value between 0 and 1 to set the correlation threshold, by default 0 unless \
        split = "high" or split = "low", in which case the default is 0.3

    Returns
    -------
    Union[pd.Series, pd.DataFrame]
        Series or matrix of (filtered) correlations
    """
    if split == "pos":
        corr = corr.where((corr >= threshold) & (corr > 0))
        print(
            'Displaying positive correlations. Specify a positive "threshold" to '
            "limit the results further."
        )
    elif split == "neg":
        corr = corr.where((corr <= threshold) & (corr < 0))
        print(
            'Displaying negative correlations. Specify a negative "threshold" to '
            "limit the results further."
        )
    elif split == "high":
        threshold = 0.3 if threshold <= 0 else threshold
        corr = corr.where(np.abs(corr) >= threshold)
        print(
            f"Displaying absolute correlations above the threshold ({threshold}). "
            'Specify a positive "threshold" to limit the results further.'
        )
    elif split == "low":
        threshold = 0.3 if threshold <= 0 else threshold
        corr = corr.where(np.abs(corr) <= threshold)
        print(
            f"Displaying absolute correlations below the threshold ({threshold}). "
            'Specify a positive "threshold" to limit the results further.'
        )

    return corr


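# Illustrative usage sketch for _corr_selector (not part of the module; the
# demo DataFrame below is made up):
#
#   df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 4, 6, 8], "c": [4, 3, 2, 2]})
#   _corr_selector(df.corr(), split="high", threshold=0.5)
#   # -> correlation matrix in which cells with |corr| < 0.5 are masked to NaN

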
def _diff_report(
    data: pd.DataFrame,
    data_cleaned: pd.DataFrame,
    dupl_rows: Optional[List[Union[str, int]]] = None,
    single_val_cols: Optional[List[str]] = None,
    show: Optional[str] = "changes",  # Optional[Literal["all", "changes"]]
) -> None:
    """Provide information about changes between two datasets, such as dropped rows \
    and columns, memory usage and missing values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame. Input the initial \
        dataset here
    data_cleaned : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame. Input the cleaned / \
        updated dataset here
    dupl_rows : Optional[List[Union[str, int]]], optional
        List of duplicate row indices, by default None
    single_val_cols : Optional[List[str]], optional
        List of single-valued columns, i.e. columns where all cells contain the \
        same value. NaNs count as a separate value, by default None
    show : Optional[str], optional
        {"all", "changes", None}, by default "changes"
        Specify the verbosity of the output:
        * "all": Print information about the data before and after cleaning as \
        well as information about changes and memory usage (deep). Please be \
        aware that this can slow down the function considerably.
        * "changes": Print out differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    None
        Print statement highlighting the datasets or changes between the two datasets.
    """
    if show not in ["changes", "all"]:
        return

    dupl_rows = [] if dupl_rows is None else dupl_rows.copy()
    single_val_cols = [] if single_val_cols is None else single_val_cols.copy()
    data_mem = _memory_usage(data, deep=False)
    data_cl_mem = _memory_usage(data_cleaned, deep=False)
    data_mv_tot = _missing_vals(data)["mv_total"]
    data_cl_mv_tot = _missing_vals(data_cleaned)["mv_total"]

    if show == "all":
        data_mem = _memory_usage(data, deep=True)
        data_cl_mem = _memory_usage(data_cleaned, deep=True)
        print("Before data cleaning:\n")
        print(f"dtypes:\n{data.dtypes.value_counts()}")
        print(f"\nNumber of rows: {str(data.shape[0]).rjust(8)}")
        print(f"Number of cols: {str(data.shape[1]).rjust(8)}")
        print(f"Missing values: {str(data_mv_tot).rjust(8)}")
        print(f"Memory usage: {str(data_mem).rjust(7)} MB")
        print("_______________________________________________________\n")
        print("After data cleaning:\n")
        print(f"dtypes:\n{data_cleaned.dtypes.value_counts()}")
        print(f"\nNumber of rows: {str(data_cleaned.shape[0]).rjust(8)}")
        print(f"Number of cols: {str(data_cleaned.shape[1]).rjust(8)}")
        print(f"Missing values: {str(data_cl_mv_tot).rjust(8)}")
        print(f"Memory usage: {str(data_cl_mem).rjust(7)} MB")
        print("_______________________________________________________\n")

    print(
        f"Shape of cleaned data: {data_cleaned.shape} - "
        f"Remaining NAs: {data_cl_mv_tot}"
    )
    print("\nChanges:")
    print(f"Dropped rows: {data.shape[0] - data_cleaned.shape[0]}")
    print(f"     of which {len(dupl_rows)} duplicates. (Rows: {dupl_rows[:200]})")
    print(f"Dropped columns: {data.shape[1] - data_cleaned.shape[1]}")
    print(
        f"     of which {len(single_val_cols)} single valued."
        f" Columns: {single_val_cols}"
    )
    print(f"Dropped missing values: {data_mv_tot - data_cl_mv_tot}")
    mem_change = data_mem - data_cl_mem
    mem_perc = round(100 * mem_change / data_mem, 2)
    print(f"Reduced memory by at least: {round(mem_change, 3)} MB (-{mem_perc}%)\n")


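# Illustrative usage sketch for _diff_report (demo frames are made up):
#
#   raw = pd.DataFrame({"a": [1, 1, 2], "b": [np.nan, np.nan, 3]})
#   cleaned, dropped = _drop_duplicates(raw)
#   _diff_report(raw, cleaned, dupl_rows=dropped, show="changes")
#   # prints the cleaned shape, remaining NAs, and counts of dropped
#   # rows / columns / missing values plus the memory reduction

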
def _drop_duplicates(data: pd.DataFrame) -> Tuple[pd.DataFrame, List]:
    """Provide information on and drop duplicate rows.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame

    Returns
    -------
    Tuple[pd.DataFrame, List]
        Deduplicated Pandas DataFrame and list of dropped row indices
    """
    data = pd.DataFrame(data).copy()
    dupl_rows = data[data.duplicated()].index.tolist()
    data = data.drop(dupl_rows, axis="index").reset_index(drop=True)

    return data, dupl_rows


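# Illustrative usage sketch for _drop_duplicates (demo frame is made up):
#
#   df = pd.DataFrame({"a": [1, 1, 2], "b": ["x", "x", "y"]})
#   deduped, dropped = _drop_duplicates(df)
#   # deduped has two rows and a fresh RangeIndex; dropped == [1]

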
def _memory_usage(data: pd.DataFrame, deep: bool = True) -> float:
    """Give the total memory usage in megabytes.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame
    deep : bool, optional
        Whether to run a deep analysis of the memory usage, by default True

    Returns
    -------
    float
        Memory usage in megabytes
    """
    data = pd.DataFrame(data).copy()
    return round(data.memory_usage(index=True, deep=deep).sum() / (1024 ** 2), 2)


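# Illustrative usage sketch for _memory_usage (demo frame is made up):
#
#   df = pd.DataFrame({"a": range(1000), "b": ["text"] * 1000})
#   _memory_usage(df, deep=True)   # inspects object contents; slower, exact
#   _memory_usage(df, deep=False)  # pointer sizes only; fast approximation

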
def _missing_vals(data: pd.DataFrame) -> Dict[str, Any]:
    """Give metrics of missing values in the dataset.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame

    Returns
    -------
    Dict[str, Any]
        mv_total: int, number of missing values in the entire dataset
        mv_rows: pd.Series, number of missing values in each row
        mv_cols: pd.Series, number of missing values in each column
        mv_rows_ratio: pd.Series, ratio of missing values for each row
        mv_cols_ratio: pd.Series, ratio of missing values for each column
    """
    data = pd.DataFrame(data).copy()
    mv_rows = data.isna().sum(axis=1)
    mv_cols = data.isna().sum(axis=0)
    mv_total = data.isna().sum().sum()
    mv_rows_ratio = mv_rows / data.shape[1]
    mv_cols_ratio = mv_cols / data.shape[0]

    return {
        "mv_total": mv_total,
        "mv_rows": mv_rows,
        "mv_cols": mv_cols,
        "mv_rows_ratio": mv_rows_ratio,
        "mv_cols_ratio": mv_cols_ratio,
    }


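# Illustrative usage sketch for _missing_vals (demo frame is made up):
#
#   df = pd.DataFrame({"a": [1, np.nan], "b": [np.nan, np.nan]})
#   mv = _missing_vals(df)
#   # mv["mv_total"] == 3; mv["mv_cols_ratio"] -> a: 0.5, b: 1.0

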
def _validate_input_bool(value, desc):
    if not isinstance(value, bool):
        raise TypeError(
            f"Input value for '{desc}' is {type(value)} but should be a boolean."
        )


def _validate_input_int(value, desc):
    if not isinstance(value, int):
        raise TypeError(
            f"Input value for '{desc}' is {type(value)} but should be an integer."
        )


def _validate_input_range(value, desc, lower, upper):
    if value < lower or value > upper:
        raise ValueError(
            f"'{desc}' = {value} but should be {lower} <= '{desc}' <= {upper}."
        )


def _validate_input_smaller(value1, value2, desc):
    if value1 > value2:
        raise ValueError(
            f"The first input for '{desc}' should be smaller than or equal to the "
            "second."
        )


def _validate_input_sum_smaller(limit, desc, *args):
    if sum(args) > limit:
        raise ValueError(
            f"The sum of input values for '{desc}' should be less than or equal to "
            f"{limit}."
        )


def _validate_input_sum_larger(limit, desc, *args):
    if sum(args) < limit:
        raise ValueError(
            f"The sum of input values for '{desc}' should be greater than or equal "
            f"to {limit}."
        )
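

# Illustrative usage sketch for the validation helpers:
#
#   _validate_input_bool(True, "drop")             # passes silently
#   _validate_input_range(0.5, "threshold", 0, 1)  # passes silently
#   _validate_input_sum_smaller(1, "thresholds", 0.4, 0.7)
#   # -> ValueError: the sum of inputs (1.1) exceeds the limit (1)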