Test failed: push to main ( 7b7d02...0ccad7 ) by Andreas, 06:54.

klib.utils._print_cleaning_details()   rated A

Complexity:   1 condition
Size:         8 total lines, 8 code lines
Duplication:  0 duplicated lines (0 %)
Importance:   0 changes

Metric  Value
cc      1
eloc    8
nop     4
dl      0
loc     8
rs      10
c       0
b       0
f       0

"""
Utilities and auxiliary functions.

:author: Andreas Kanz

"""

from __future__ import annotations

from typing import Literal
from typing import Optional
from typing import TypedDict

import numpy as np
import pandas as pd


def _corr_selector(
    corr: pd.Series | pd.DataFrame,
    split: Optional[Literal["pos", "neg", "high", "low"]] = None,
    threshold: float = 0,
) -> pd.Series | pd.DataFrame:
    """Select the desired correlations based on the chosen split type and threshold.

    Parameters
    ----------
    corr : pd.Series | pd.DataFrame
        pd.Series or pd.DataFrame of correlations
    split : Optional[str], optional
        Type of split performed, by default None
            * {None, "pos", "neg", "high", "low"}
    threshold : float, optional
        Value between 0 and 1 to set the correlation threshold, by default 0 unless \
        split = "high" or split = "low", in which case the default is 0.3

    Returns
    -------
    pd.Series | pd.DataFrame
        Series or matrix of (filtered) correlations
    """
    if split == "pos":
        corr = corr.where((corr >= threshold) & (corr > 0))
        print(
            'Displaying positive correlations. Specify a positive "threshold" to '
            "limit the results further."
        )
    elif split == "neg":
        corr = corr.where((corr <= threshold) & (corr < 0))
        print(
            'Displaying negative correlations. Specify a negative "threshold" to '
            "limit the results further."
        )
    elif split == "high":
        threshold = 0.3 if threshold <= 0 else threshold
        corr = corr.where(np.abs(corr) >= threshold)
        print(
            f"Displaying absolute correlations above the threshold ({threshold}). "
            'Specify a positive "threshold" to limit the results further.'
        )
    elif split == "low":
        threshold = 0.3 if threshold <= 0 else threshold
        corr = corr.where(np.abs(corr) <= threshold)
        print(
            f"Displaying absolute correlations below the threshold ({threshold}). "
            'Specify a positive "threshold" to limit the results further.'
        )

    return corr
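
A minimal usage sketch for _corr_selector; the DataFrame below is made up for illustration and is not part of the module:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 4, 6, 8], "c": [4, 3, 2, 1]})
    strong = _corr_selector(df.corr(), split="high", threshold=0.5)
    # keeps only cells with an absolute correlation >= 0.5; the rest become NaN
    print(strong)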


def _diff_report(
    data: pd.DataFrame,
    data_cleaned: pd.DataFrame,
    dupl_rows: Optional[list[str | int]] = None,
    single_val_cols: Optional[list[str]] = None,
    show: Optional[Literal["all", "changes"]] = "changes",
) -> None:
    """Provide information about changes between two datasets, such as dropped rows \
        and columns, memory usage and missing values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. Input the initial \
        dataset here
    data_cleaned : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. Input the cleaned / \
        updated dataset here
    dupl_rows : Optional[list[str | int]], optional
        List of duplicate row indices, by default None
    single_val_cols : Optional[list[str]], optional
        List of single-valued column indices, i.e. columns where all cells contain \
        the same value. NaNs count as a separate value, by default None
    show : str, optional
        {"all", "changes", None}, by default "changes"
        Specify verbosity of the output:
            * "all": Print information about the data before and after cleaning as \
                well as information about changes and memory usage (deep). Please be \
                aware that this can slow the function down considerably.
            * "changes": Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    None
        Print statement highlighting the datasets or changes between the two datasets.
    """
    if show not in ["changes", "all"]:
        return

    dupl_rows = [] if dupl_rows is None else dupl_rows.copy()
    single_val_cols = [] if single_val_cols is None else single_val_cols.copy()
    data_mem = _memory_usage(data, deep=False)
    data_cl_mem = _memory_usage(data_cleaned, deep=False)
    data_mv_tot = _missing_vals(data)["mv_total"]
    data_cl_mv_tot = _missing_vals(data_cleaned)["mv_total"]

    if show == "all":
        data_mem = _memory_usage(data, deep=True)
        data_cl_mem = _memory_usage(data_cleaned, deep=True)
        _print_cleaning_details("Before data cleaning:\n", data, data_mv_tot, data_mem)
        _print_cleaning_details(
            "After data cleaning:\n", data_cleaned, data_cl_mv_tot, data_cl_mem
        )

    print(
        f"Shape of cleaned data: {data_cleaned.shape} - "
        f"Remaining NAs: {data_cl_mv_tot}\n"
    )
    print(f"Dropped rows: {data.shape[0] - data_cleaned.shape[0]}")
    print(
        f"     of which {len(dupl_rows)} duplicates. "
        f"(Rows (first 150 shown): {dupl_rows[:150]})\n"
    )
    print(f"Dropped columns: {data.shape[1] - data_cleaned.shape[1]}")
    print(
        f"     of which {len(single_val_cols)} single valued."
        f"     Columns: {single_val_cols}"
    )
    print(f"Dropped missing values: {data_mv_tot - data_cl_mv_tot}")
    mem_change = data_mem - data_cl_mem
    mem_perc = round(100 * mem_change / data_mem, 2)
    print(f"Reduced memory by at least: {round(mem_change, 3)} MB (-{mem_perc}%)\n")


def _print_cleaning_details(
    title: str, data: pd.DataFrame, mv_total: int, mem_usage: float
) -> None:
    """Print a dataset summary: dtypes, shape, missing values and memory usage."""
    print(title)
    print(f"dtypes:\n{data.dtypes.value_counts()}")
    print(f"\nNumber of rows: {str(data.shape[0]).rjust(8)}")
    print(f"Number of cols: {str(data.shape[1]).rjust(8)}")
    print(f"Missing values: {str(mv_total).rjust(8)}")
    print(f"Memory usage: {str(mem_usage).rjust(7)} MB")
    print("_______________________________________________________\n")
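
A minimal sketch of the before/after report produced by _diff_report; the toy frames are made up for illustration:

    import numpy as np
    import pandas as pd

    raw = pd.DataFrame({"a": [1, 1, 2], "b": [np.nan, np.nan, 5.0], "c": [7, 7, 7]})
    # row 1 duplicates row 0, and column "c" holds a single value throughout
    cleaned = raw.drop(index=[1]).drop(columns=["c"]).reset_index(drop=True)
    _diff_report(raw, cleaned, dupl_rows=[1], single_val_cols=["c"], show="all")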


def _drop_duplicates(data: pd.DataFrame) -> tuple[pd.DataFrame, list[str | int]]:
    """Drop duplicate rows and report which rows were dropped.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame

    Returns
    -------
    tuple[pd.DataFrame, list[str | int]]
        Deduplicated Pandas DataFrame and a list of the dropped row indices
    """
    data = pd.DataFrame(data).copy()
    dupl_rows = data[data.duplicated()].index.tolist()
    data = data.drop(dupl_rows, axis="index").reset_index(drop=True)

    return data, dupl_rows
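
A quick sketch of _drop_duplicates on a made-up frame:

    import pandas as pd

    df = pd.DataFrame({"x": [1, 1, 2], "y": ["a", "a", "b"]})
    deduped, dropped = _drop_duplicates(df)
    print(dropped)        # [1] - the second row repeats the first
    print(deduped.shape)  # (2, 2)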


def _memory_usage(data: pd.DataFrame, deep: bool = True) -> float:
    """Give the total memory usage in megabytes.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    deep : bool, optional
        Runs a deep analysis of the memory usage, by default True

    Returns
    -------
    float
        Memory usage in megabytes
    """
    return round(data.memory_usage(index=True, deep=deep).sum() / (1024**2), 2)
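
For example, assuming a small all-numeric frame (made up for illustration):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.ones((1_000, 10)))
    print(_memory_usage(df))              # ~0.08 MB for 10,000 float64 cells
    print(_memory_usage(df, deep=False))  # same here; "deep" matters for object dtypes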


class MVResult(TypedDict):
    """TypedDict for the return value of _missing_vals."""

    mv_total: int
    mv_rows: pd.Series
    mv_cols: pd.Series
    mv_rows_ratio: pd.Series
    mv_cols_ratio: pd.Series


def _missing_vals(data: pd.DataFrame) -> MVResult:
    """Give metrics of missing values in the dataset.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame

    Returns
    -------
    MVResult
        mv_total: int, number of missing values in the entire dataset
        mv_rows: pd.Series, number of missing values in each row
        mv_cols: pd.Series, number of missing values in each column
        mv_rows_ratio: pd.Series, ratio of missing values for each row
        mv_cols_ratio: pd.Series, ratio of missing values for each column
    """
    data = pd.DataFrame(data).copy()
    mv_total: int = data.isna().sum().sum()
    mv_rows: pd.Series = data.isna().sum(axis=1)
    mv_cols: pd.Series = data.isna().sum(axis=0)
    mv_rows_ratio: pd.Series = mv_rows / data.shape[1]
    mv_cols_ratio: pd.Series = mv_cols / data.shape[0]

    return {
        "mv_total": mv_total,
        "mv_rows": mv_rows,
        "mv_cols": mv_cols,
        "mv_rows_ratio": mv_rows_ratio,
        "mv_cols_ratio": mv_cols_ratio,
    }
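
A short sketch of the metrics on a made-up frame:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1.0, np.nan], "b": [np.nan, np.nan]})
    mv = _missing_vals(df)
    print(mv["mv_total"])       # 3
    print(mv["mv_cols_ratio"])  # a: 0.5, b: 1.0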


def _validate_input_bool(value: bool, desc):
    if not isinstance(value, bool):
        raise TypeError(
            f"Input value for '{desc}' is {type(value)} but should be a boolean."
        )


def _validate_input_int(value: int, desc):
    if not isinstance(value, int):
        raise TypeError(
            f"Input value for '{desc}' is {type(value)} but should be an integer."
        )


def _validate_input_range(value, desc, lower, upper):
    if value < lower or value > upper:
        raise ValueError(
            f"'{desc}' = {value} but should be {lower} <= '{desc}' <= {upper}."
        )


def _validate_input_smaller(value1, value2, desc):
    if value1 > value2:
        raise ValueError(
            f"The first input for '{desc}' should be smaller than or equal to "
            "the second."
        )


def _validate_input_sum_smaller(limit, desc, *args):
    if sum(args) > limit:
        raise ValueError(
            f"The sum of input values for '{desc}' should be less than or equal "
            f"to {limit}."
        )


def _validate_input_sum_larger(limit, desc, *args):
    if sum(args) < limit:
        raise ValueError(
            f"The sum of input values for '{desc}' should be larger than or equal "
            f"to {limit}."
        )
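
These guards raise early on malformed arguments. A short sketch of how they behave; the desc labels are made up:

    _validate_input_bool(True, "inplace")                   # passes silently
    _validate_input_range(0.4, "threshold", 0, 1)           # passes: 0 <= 0.4 <= 1
    _validate_input_sum_smaller(1, "col ratios", 0.3, 0.5)  # passes: 0.8 <= 1
    _validate_input_int("3", "n_bins")  # raises TypeError: str instead of int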