Test Failed: Push to main ( 477a06...fee6f5 ) by Andreas, created 01:43

klib.utils._validate_input_num_data()   A

Complexity:   Conditions 2
Size:         Total Lines 4, Code Lines 4
Duplication:  Lines 0, Ratio 0 %
Importance:   Changes 0

Metric  Value
cc      2
eloc    4
nop     2
dl      0
loc     4
rs      10
c       0
b       0
f       0
"""
Utilities and auxiliary functions.

:author: Andreas Kanz

"""
from __future__ import annotations

from typing import Literal
from typing import Optional
from typing import TypedDict

import numpy as np
import pandas as pd


def _corr_selector(
    corr: pd.Series | pd.DataFrame,
    split: Optional[Literal["pos", "neg", "high", "low"]] = None,
    threshold: float = 0,
) -> pd.Series | pd.DataFrame:
    """Select correlations based on the chosen split type and threshold.

    Parameters
    ----------
    corr : pd.Series | pd.DataFrame
        pd.Series or pd.DataFrame of correlations
    split : Optional[str], optional
        Type of split performed, by default None
            * {None, "pos", "neg", "high", "low"}
    threshold : float, optional
        Value between 0 and 1 to set the correlation threshold, by default 0 unless \
        split = "high" or split = "low", in which case the default is 0.3

    Returns
    -------
    pd.Series | pd.DataFrame
        List or matrix of (filtered) correlations
    """
    if split == "pos":
        corr = corr.where((corr >= threshold) & (corr > 0))
        print(
            'Displaying positive correlations. Specify a positive "threshold" to '
            "limit the results further."
        )
    elif split == "neg":
        corr = corr.where((corr <= threshold) & (corr < 0))
        print(
            'Displaying negative correlations. Specify a negative "threshold" to '
            "limit the results further."
        )
    elif split == "high":
        threshold = 0.3 if threshold <= 0 else threshold
        corr = corr.where(np.abs(corr) >= threshold)
        print(
            f"Displaying absolute correlations above the threshold ({threshold}). "
            'Specify a positive "threshold" to limit the results further.'
        )
    elif split == "low":
        threshold = 0.3 if threshold <= 0 else threshold
        corr = corr.where(np.abs(corr) <= threshold)
        print(
            f"Displaying absolute correlations below the threshold ({threshold}). "
            'Specify a positive "threshold" to limit the results further.'
        )

    return corr
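
# Usage sketch (illustrative; assumes a small hypothetical DataFrame, and the
# doctest-style lines below are comments, not executed on import):
# >>> df = pd.DataFrame({"a": [1, 2, 3], "b": [3, 2, 1], "c": [1, 3, 2]})
# >>> _corr_selector(df.corr(), split="high", threshold=0.5)
# Keeps only entries with an absolute correlation of at least 0.5; all other
# cells are replaced with NaN by DataFrame.where().
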
def _diff_report(
    data: pd.DataFrame,
    data_cleaned: pd.DataFrame,
    dupl_rows: Optional[list[str | int]] = None,
    single_val_cols: Optional[list[str]] = None,
    show: Optional[Literal["all", "changes"]] = "changes",
) -> None:
    """Provide information about changes between two datasets, such as dropped rows \
        and columns, memory usage and missing values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame. Input the initial \
        dataset here
    data_cleaned : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame. Input the cleaned / \
        updated dataset here
    dupl_rows : Optional[list[str | int]], optional
        List of duplicate row indices, by default None
    single_val_cols : Optional[list[str]], optional
        List of single-valued column indices, i.e. columns where all cells contain \
        the same value. NaNs count as a separate value, by default None
    show : str, optional
        {"all", "changes", None}, by default "changes"
        Specify the verbosity of the output:
            * "all": Print information about the data before and after cleaning as \
                well as information about changes and memory usage (deep). Please be \
                aware that this can slow down the function considerably.
            * "changes": Print the differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    None
        Print statement highlighting the datasets or changes between the two datasets.
    """
    if show not in ["changes", "all"]:
        return

    dupl_rows = [] if dupl_rows is None else dupl_rows.copy()
    single_val_cols = [] if single_val_cols is None else single_val_cols.copy()
    data_mem = _memory_usage(data, deep=False)
    data_cl_mem = _memory_usage(data_cleaned, deep=False)
    data_mv_tot = _missing_vals(data)["mv_total"]
    data_cl_mv_tot = _missing_vals(data_cleaned)["mv_total"]

    if show == "all":
        data_mem = _memory_usage(data, deep=True)
        data_cl_mem = _memory_usage(data_cleaned, deep=True)
        _print_cleaning_details("Before data cleaning:\n", data, data_mv_tot, data_mem)
        _print_cleaning_details(
            "After data cleaning:\n", data_cleaned, data_cl_mv_tot, data_cl_mem
        )

    print(
        f"Shape of cleaned data: {data_cleaned.shape} - "
        f"Remaining NAs: {data_cl_mv_tot}\n\n"
    )
    print(f"Dropped rows: {data.shape[0] - data_cleaned.shape[0]}")
    print(
        f"     of which {len(dupl_rows)} duplicates. (Rows (first 150 shown): {dupl_rows[:150]})\n"  # noqa
    )
    print(f"Dropped columns: {data.shape[1] - data_cleaned.shape[1]}")
    print(
        f"     of which {len(single_val_cols)} single valued."
        f"     Columns: {single_val_cols}"
    )
    print(f"Dropped missing values: {data_mv_tot - data_cl_mv_tot}")
    mem_change = data_mem - data_cl_mem
    mem_perc = round(100 * mem_change / data_mem, 2)
    print(f"Reduced memory by at least: {round(mem_change, 3)} MB (-{mem_perc}%)\n")
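
# Usage sketch (illustrative; builds on _drop_duplicates defined below):
# >>> raw = pd.DataFrame({"x": [1, 1, None], "y": [2, 2, 3]})
# >>> cleaned, dropped = _drop_duplicates(raw)
# >>> _diff_report(raw, cleaned, dupl_rows=dropped, show="changes")
# Prints the cleaned shape, dropped rows/columns, removed missing values and
# the (shallow) memory reduction.
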
def _print_cleaning_details(header, data, mv_total, mem_usage):
    # Helper for _diff_report: prints a summary block for one dataset.
    print(header)
    print(f"dtypes:\n{data.dtypes.value_counts()}")
    print(f"\nNumber of rows: {str(data.shape[0]).rjust(8)}")
    print(f"Number of cols: {str(data.shape[1]).rjust(8)}")
    print(f"Missing values: {str(mv_total).rjust(8)}")
    print(f"Memory usage: {str(mem_usage).rjust(7)} MB")
    print("_______________________________________________________\n")


def _drop_duplicates(data: pd.DataFrame) -> tuple[pd.DataFrame, list[str | int]]:
    """Provide information on and drop duplicate rows.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame

    Returns
    -------
    tuple[pd.DataFrame, list[str | int]]
        Deduplicated Pandas DataFrame and a list of the dropped row indices
    """
    data = pd.DataFrame(data).copy()
    dupl_rows = data[data.duplicated()].index.tolist()
    data = data.drop(dupl_rows, axis="index").reset_index(drop=True)

    return data, dupl_rows
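
# Usage sketch (illustrative):
# >>> cleaned, dropped = _drop_duplicates(
# ...     pd.DataFrame({"x": [1, 1, 2], "y": ["a", "a", "b"]})
# ... )
# >>> dropped
# [1]
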
def _memory_usage(data: pd.DataFrame, deep: bool = True) -> float:
    """Give the total memory usage in megabytes.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame
    deep : bool, optional
        Run a deep analysis of the memory usage, by default True

    Returns
    -------
    float
        Memory usage in megabytes
    """
    return round(data.memory_usage(index=True, deep=deep).sum() / (1024**2), 2)
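
# Usage sketch (illustrative; the exact figure depends on pandas internals):
# >>> _memory_usage(pd.DataFrame({"a": range(1_000)}))
# 0.01
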
class MVResult(TypedDict):
    """TypedDict for the return value of _missing_vals."""

    mv_total: int
    mv_rows: pd.Series
    mv_cols: pd.Series
    mv_rows_ratio: pd.Series
    mv_cols_ratio: pd.Series


def _missing_vals(data: pd.DataFrame) -> MVResult:
    """Give metrics of missing values in the dataset.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame

    Returns
    -------
    MVResult
        mv_total: int, number of missing values in the entire dataset
        mv_rows: pd.Series, number of missing values in each row
        mv_cols: pd.Series, number of missing values in each column
        mv_rows_ratio: pd.Series, ratio of missing values for each row
        mv_cols_ratio: pd.Series, ratio of missing values for each column
    """
    data = pd.DataFrame(data).copy()
    mv_total: int = data.isna().sum().sum()
    mv_rows: pd.Series = data.isna().sum(axis=1)
    mv_cols: pd.Series = data.isna().sum(axis=0)
    mv_rows_ratio: pd.Series = mv_rows / data.shape[1]
    mv_cols_ratio: pd.Series = mv_cols / data.shape[0]

    return {
        "mv_total": mv_total,
        "mv_rows": mv_rows,
        "mv_cols": mv_cols,
        "mv_rows_ratio": mv_rows_ratio,
        "mv_cols_ratio": mv_cols_ratio,
    }
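
# Usage sketch (illustrative):
# >>> mv = _missing_vals(pd.DataFrame({"a": [1, None], "b": [None, None]}))
# >>> mv["mv_total"]
# 3
# >>> mv["mv_cols_ratio"]  # per-column share of missing cells: a 0.5, b 1.0
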
def _validate_input_bool(value: bool, desc):
    if not isinstance(value, bool):
        raise TypeError(
            f"Input value for '{desc}' is {type(value)} but should be a boolean."
        )


def _validate_input_int(value: int, desc):
    if not isinstance(value, int):
        raise TypeError(
            f"Input value for '{desc}' is {type(value)} but should be an integer."
        )


def _validate_input_range(value, desc, lower, upper):
    if value < lower or value > upper:
        raise ValueError(
            f"'{desc}' = {value} but should be {lower} <= '{desc}' <= {upper}."
        )


def _validate_input_smaller(value1, value2, desc):
    if value1 > value2:
        raise ValueError(
            f"The first input for '{desc}' should be smaller or equal to the second."
        )


def _validate_input_sum_smaller(limit, desc, *args):
    if sum(args) > limit:
        raise ValueError(
            f"The sum of input values for '{desc}' should be less or equal to {limit}."
        )


def _validate_input_sum_larger(limit, desc, *args):
    if sum(args) < limit:
        raise ValueError(
            f"The sum of input values for '{desc}' should be larger/equal to {limit}."
        )
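
# Usage sketch (illustrative): the validators raise on bad input and return
# None otherwise.
# >>> _validate_input_range(0.5, "threshold", -1, 1)  # passes silently
# >>> _validate_input_range(1.5, "threshold", -1, 1)
# Traceback (most recent call last):
#     ...
# ValueError: 'threshold' = 1.5 but should be -1 <= 'threshold' <= 1.
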
def _validate_input_num_data(value: pd.DataFrame, desc):
    if value.select_dtypes(include=["number"]).empty:
        raise TypeError(
            f"Input value for '{desc}' should contain at least one numerical column."
        )
282