klib.utils._validate_input_bool()    Rating: A

Complexity     Conditions: 2
Size           Total Lines: 4 / Code Lines: 4
Duplication    Lines: 0 / Ratio: 0 %
Importance     Changes: 0

Metric    Value
cc        2
eloc      4
nop       2
dl        0
loc       4
rs        10
c         0
b         0
f         0
"""Utilities and auxiliary functions.

:author: Andreas Kanz

"""

from __future__ import annotations

from typing import Literal
from typing import TypedDict

import numpy as np
import pandas as pd

def _corr_selector(
    corr: pd.Series | pd.DataFrame,
    split: Literal["pos", "neg", "high", "low"] | None = None,
    threshold: float = 0,
) -> pd.Series | pd.DataFrame:
    """Select the desired correlations using this utility function.

    Parameters
    ----------
    corr : pd.Series | pd.DataFrame
        pd.Series or pd.DataFrame of correlations
    split : {None, "pos", "neg", "high", "low"}, optional
        Type of split performed, by default None
    threshold : float, optional
        Value between 0 and 1 to set the correlation threshold, by default 0 unless \
        split = "high" or split = "low", in which case the default is 0.3

    Returns
    -------
    pd.Series | pd.DataFrame
        List or matrix of (filtered) correlations

    """
    if split == "pos":
        corr = corr.where((corr >= threshold) & (corr > 0))
        print(
            'Displaying positive correlations. Specify a positive "threshold" to '
            "limit the results further.",
        )
    elif split == "neg":
        corr = corr.where((corr <= threshold) & (corr < 0))
        print(
            'Displaying negative correlations. Specify a negative "threshold" to '
            "limit the results further.",
        )
    elif split == "high":
        threshold = 0.3 if threshold <= 0 else threshold
        corr = corr.where(np.abs(corr) >= threshold)
        print(
            f"Displaying absolute correlations above the threshold ({threshold}). "
            'Specify a positive "threshold" to limit the results further.',
        )
    elif split == "low":
        threshold = 0.3 if threshold <= 0 else threshold
        corr = corr.where(np.abs(corr) <= threshold)
        print(
            f"Displaying absolute correlations below the threshold ({threshold}). "
            'Specify a positive "threshold" to limit the results further.',
        )

    return corr

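# A minimal usage sketch (illustrative only; `df` below is a hypothetical
# DataFrame, not part of this module). With split="high", values whose absolute
# correlation falls below the threshold are masked with NaN:
#
#   >>> df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 4, 6, 8], "c": [4, 1, 3, 2]})
#   >>> _corr_selector(df.corr(), split="high", threshold=0.5)
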
def _diff_report(
    data: pd.DataFrame,
    data_cleaned: pd.DataFrame,
    dupl_rows: list[str | int] | None = None,
    single_val_cols: list[str] | None = None,
    show: Literal["all", "changes"] | None = "changes",
) -> None:
    """Provide information about changes between two datasets.

    This includes dropped rows and columns, memory usage and missing values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame. Input the initial \
        dataset here
    data_cleaned : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame. Input the cleaned / \
        updated dataset here
    dupl_rows : list[str | int] | None, optional
        List of duplicate row indices, by default None
    single_val_cols : list[str] | None, optional
        List of single-valued column names, i.e. columns where all cells contain \
        the same value. NaNs count as a separate value, by default None
    show : {"all", "changes", None}, optional
        Specify the verbosity of the output, by default "changes":
            * "all": Print information about the data before and after cleaning as \
                well as information about changes and memory usage (deep). Please be \
                aware that this can slow the function down considerably.
            * "changes": Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    None
        Print statement highlighting the datasets or changes between the two datasets.

    """
    if show not in ["changes", "all"]:
        return

    dupl_rows = [] if dupl_rows is None else dupl_rows.copy()
    single_val_cols = [] if single_val_cols is None else single_val_cols.copy()
    data_mem = _memory_usage(data, deep=False)
    data_cl_mem = _memory_usage(data_cleaned, deep=False)
    data_mv_tot = _missing_vals(data)["mv_total"]
    data_cl_mv_tot = _missing_vals(data_cleaned)["mv_total"]

    if show == "all":
        data_mem = _memory_usage(data, deep=True)
        data_cl_mem = _memory_usage(data_cleaned, deep=True)
        _print_cleaning_details("Before data cleaning:\n", data, data_mv_tot, data_mem)
        _print_cleaning_details(
            "After data cleaning:\n",
            data_cleaned,
            data_cl_mv_tot,
            data_cl_mem,
        )

    print(
        f"Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {data_cl_mv_tot}\n\n",
    )
    print(f"Dropped rows: {data.shape[0] - data_cleaned.shape[0]}")
    print(
        f"     of which {len(dupl_rows)} duplicates. (Rows (first 150 shown): {dupl_rows[:150]})\n",
    )
    print(f"Dropped columns: {data.shape[1] - data_cleaned.shape[1]}")
    print(
        f"     of which {len(single_val_cols)} single valued.     Columns: {single_val_cols}",
    )
    print(f"Dropped missing values: {data_mv_tot - data_cl_mv_tot}")
    mem_change = data_mem - data_cl_mem
    mem_perc = round(100 * mem_change / data_mem, 2)
    print(f"Reduced memory by at least: {round(mem_change, 3)} MB (-{mem_perc}%)\n")

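# Illustrative sketch (hypothetical frames, not part of this module): compare a
# raw frame with a cleaned copy and print what changed. With show="all", the
# per-dataset summaries are printed via _print_cleaning_details below.
#
#   >>> raw = pd.DataFrame({"a": [1, 1, None], "b": [7, 7, 7]})
#   >>> cleaned = pd.DataFrame({"a": [1.0, None]})  # duplicate row and "b" dropped
#   >>> _diff_report(raw, cleaned, dupl_rows=[1], single_val_cols=["b"], show="changes")
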
def _print_cleaning_details(
    header: str,
    data: pd.DataFrame | pd.Series,
    missing_vals: int,
    mem_usage: float,
) -> None:
    """Print a summary of dtypes, shape, missing values and memory usage."""
    print(header)
    print(f"dtypes:\n{data.dtypes.value_counts()}")
    print(f"\nNumber of rows: {str(data.shape[0]).rjust(8)}")
    print(f"Number of cols: {str(data.shape[1]).rjust(8)}")
    print(f"Missing values: {str(missing_vals).rjust(8)}")
    print(f"Memory usage: {str(mem_usage).rjust(7)} MB")
    print("_______________________________________________________\n")

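# Direct call sketch (hypothetical values; this helper is normally invoked by
# _diff_report when show="all"):
#
#   >>> _print_cleaning_details("Summary:\n", pd.DataFrame({"a": [1, None]}), 1, 0.01)
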
def _drop_duplicates(data: pd.DataFrame) -> tuple[pd.DataFrame, list[str | int]]:
    """Provide information on and drop duplicate rows.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame

    Returns
    -------
    tuple[pd.DataFrame, list[str | int]]
        Deduplicated Pandas DataFrame and a list of the dropped row indices

    """
    data = pd.DataFrame(data).copy()
    dupl_rows = data[data.duplicated()].index.tolist()
    data = data.drop(dupl_rows, axis="index").reset_index(drop=True)

    return data, dupl_rows

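# Sketch (hypothetical frame): the second return value lists the index labels of
# the rows that were removed, which _diff_report can forward as `dupl_rows`.
#
#   >>> df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 3, 4]})
#   >>> deduped, dropped = _drop_duplicates(df)
#   >>> dropped
#   [1]
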
def _memory_usage(data: pd.DataFrame, deep: bool = True) -> float:
    """Give the total memory usage in megabytes.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame
    deep : bool, optional
        Run a deep analysis of the memory usage, by default True

    Returns
    -------
    float
        Memory usage in megabytes

    """
    return round(data.memory_usage(index=True, deep=deep).sum() / (1024**2), 2)

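# Sketch (hypothetical frame): deep=True inspects the actual contents of object
# columns and is slower but more accurate for string-heavy data.
#
#   >>> df = pd.DataFrame({"txt": ["alpha", "beta"] * 1000})
#   >>> _memory_usage(df, deep=True)   # larger, counts the Python strings
#   >>> _memory_usage(df, deep=False)  # smaller, counts object pointers only
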
class MVResult(TypedDict):
    """TypedDict for the return value of _missing_vals."""

    mv_total: int
    mv_rows: pd.Series
    mv_cols: pd.Series
    mv_rows_ratio: pd.Series
    mv_cols_ratio: pd.Series

def _missing_vals(data: pd.DataFrame) -> MVResult:
    """Give metrics of missing values in the dataset.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame

    Returns
    -------
    MVResult
        mv_total: int, number of missing values in the entire dataset
        mv_rows: pd.Series, number of missing values in each row
        mv_cols: pd.Series, number of missing values in each column
        mv_rows_ratio: pd.Series, ratio of missing values for each row
        mv_cols_ratio: pd.Series, ratio of missing values for each column

    """
    data = pd.DataFrame(data).copy()
    mv_total = int(data.isna().sum().sum())
    mv_rows: pd.Series = data.isna().sum(axis=1)
    mv_cols: pd.Series = data.isna().sum(axis=0)
    mv_rows_ratio: pd.Series = mv_rows / data.shape[1]
    mv_cols_ratio: pd.Series = mv_cols / data.shape[0]

    return {
        "mv_total": mv_total,
        "mv_rows": mv_rows,
        "mv_cols": mv_cols,
        "mv_rows_ratio": mv_rows_ratio,
        "mv_cols_ratio": mv_cols_ratio,
    }

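# Sketch (hypothetical frame): per-row and per-column counts come back as Series,
# the grand total as an int.
#
#   >>> df = pd.DataFrame({"a": [1, None], "b": [None, None]})
#   >>> mv = _missing_vals(df)
#   >>> mv["mv_total"]
#   3
#   >>> mv["mv_cols_ratio"]["b"]
#   1.0
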
def _validate_input_bool(value: bool, desc: str) -> None:
    """Raise a TypeError if value is not a boolean."""
    if not isinstance(value, bool):
        msg = f"Input value for '{desc}' is {type(value)} but should be a boolean."
        raise TypeError(msg)


def _validate_input_int(value: int, desc: str) -> None:
    """Raise a TypeError if value is not an integer."""
    if not isinstance(value, int):
        msg = f"Input value for '{desc}' is {type(value)} but should be an integer."
        raise TypeError(msg)


def _validate_input_range(value: float, desc: str, lower: float, upper: float) -> None:
    """Raise a ValueError if value lies outside [lower, upper]."""
    if value < lower or value > upper:
        msg = f"'{desc}' = {value} but should be {lower} <= '{desc}' <= {upper}."
        raise ValueError(msg)


def _validate_input_smaller(value1: int, value2: int, desc: str) -> None:
    """Raise a ValueError if value1 is larger than value2."""
    if value1 > value2:
        msg = f"The first input for '{desc}' should be smaller or equal to the second."
        raise ValueError(msg)


def _validate_input_sum_smaller(limit: float, desc: str, *args) -> None:  # noqa: ANN002
    """Raise a ValueError if the sum of args exceeds limit."""
    if sum(args) > limit:
        msg = f"The sum of input values for '{desc}' should be less or equal to {limit}."
        raise ValueError(msg)


def _validate_input_sum_larger(limit: float, desc: str, *args) -> None:  # noqa: ANN002
    """Raise a ValueError if the sum of args falls below limit."""
    if sum(args) < limit:
        msg = f"The sum of input values for '{desc}' should be larger/equal to {limit}."
        raise ValueError(msg)


def _validate_input_num_data(value: pd.DataFrame, desc: str) -> None:
    """Raise a TypeError if the DataFrame has no numerical column."""
    if value.select_dtypes(include=["number"]).empty:
        msg = f"Input value for '{desc}' should contain at least one numerical column."
        raise TypeError(msg)

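# Usage sketch for the validators (hypothetical parameter names and values): each
# helper returns None silently on valid input and raises with a descriptive
# message otherwise.
#
#   >>> _validate_input_bool(True, "show_all")         # passes
#   >>> _validate_input_range(0.4, "threshold", 0, 1)  # passes
#   >>> _validate_input_int("3", "col_threshold")
#   Traceback (most recent call last):
#       ...
#   TypeError: Input value for 'col_threshold' is <class 'str'> but should be an integer.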