Passed: Push to main (476ed9...09d8d6) by Andreas, 02:41

klib.utils._diff_report() (rating: B)

Complexity
    Conditions: 5

Size
    Total lines: 81
    Code lines: 43

Duplication
    Duplicated lines: 0
    Duplication ratio: 0 %

Code Coverage
    Tests: 19
    CRAP score: 7.6321

Importance
    Changes: 0

Metric                              Value
cc (cyclomatic complexity)          5
eloc (executable lines of code)     43
nop (number of parameters)          5
dl (duplicated lines)               0
loc (total lines)                   81
rs                                  8.3813
c                                   0
b                                   0
f                                   0
ccs (covered statements)            19
cts (total statements)              36
cp (coverage, ccs / cts)            0.5278
crap (CRAP score)                   7.6321
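
As a plausibility check (assuming the usual CRAP definition, with coverage expressed as a fraction): cp = ccs / cts = 19 / 36 ≈ 0.5278, and

    CRAP = cc^2 * (1 - cp)^3 + cc = 5^2 * (1 - 0.5278)^3 + 5 ≈ 7.6321

which matches the reported score.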

How to fix: Long Method

Small methods make your code easier to understand, in particular when combined with a good name; and when a method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a good sign to extract the commented part into a new method and to use the comment as a starting point for naming it.

Commonly applied refactorings include Extract Method, sketched below.
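
For illustration, the verbose "all" branch of _diff_report (see the source below) could be extracted into a small helper. The helper name _print_dataset_summary is hypothetical, a sketch of Extract Method rather than klib's actual code:

import pandas as pd

def _print_dataset_summary(data: pd.DataFrame, mem_mb: float, mv_total: int, title: str) -> None:
    """Print dtype counts, shape, missing values and memory usage for one dataset."""
    print(f"{title}\n")
    print(f"dtypes:\n{data.dtypes.value_counts()}")
    print(f"\nNumber of rows: {str(data.shape[0]).rjust(8)}")
    print(f"Number of cols: {str(data.shape[1]).rjust(8)}")
    print(f"Missing values: {str(mv_total).rjust(8)}")
    print(f"Memory usage: {str(mem_mb).rjust(7)} MB")
    print("_______________________________________________________\n")

# The "all" branch of _diff_report would then shrink to two self-describing calls:
#     _print_dataset_summary(data, data_mem, data_mv_tot, "Before data cleaning:")
#     _print_dataset_summary(data_cleaned, data_cl_mem, data_cl_mv_tot, "After data cleaning:")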

"""
Utilities and auxiliary functions.

:author: Andreas Kanz

"""

# Imports
import numpy as np
import pandas as pd
from typing import Any, Dict, List, Optional, Tuple, Union


def _corr_selector(
    corr: Union[pd.Series, pd.DataFrame],
    split: Optional[str] = None,  # Optional[Literal["pos", "neg", "high", "low"]] = None,
    threshold: float = 0,
) -> Union[pd.Series, pd.DataFrame]:
    """Select the desired correlations based on the given split and threshold.

    Parameters
    ----------
    corr : Union[pd.Series, pd.DataFrame]
        pd.Series or pd.DataFrame of correlations
    split : Optional[str], optional
        Type of split performed, by default None
            * {None, "pos", "neg", "high", "low"}
    threshold : float, optional
        Value between 0 and 1 to set the correlation threshold, by default 0 unless \
        split = "high" or split = "low", in which case default is 0.3

    Returns
    -------
    Union[pd.Series, pd.DataFrame]
        Series or matrix of (filtered) correlations
    """
    if split == "pos":
        corr = corr.where((corr >= threshold) & (corr > 0))
        print(
            'Displaying positive correlations. Specify a positive "threshold" to '
            "limit the results further."
        )
    elif split == "neg":
        corr = corr.where((corr <= threshold) & (corr < 0))
        print(
            'Displaying negative correlations. Specify a negative "threshold" to '
            "limit the results further."
        )
    elif split == "high":
        threshold = 0.3 if threshold <= 0 else threshold
        corr = corr.where(np.abs(corr) >= threshold)
        print(
            f"Displaying absolute correlations above the threshold ({threshold}). "
            'Specify a positive "threshold" to limit the results further.'
        )
    elif split == "low":
        threshold = 0.3 if threshold <= 0 else threshold
        corr = corr.where(np.abs(corr) <= threshold)
        print(
            f"Displaying absolute correlations below the threshold ({threshold}). "
            'Specify a positive "threshold" to limit the results further.'
        )

    return corr


def _diff_report(
    data: pd.DataFrame,
    data_cleaned: pd.DataFrame,
    dupl_rows: Optional[List[Union[str, int]]] = None,
    single_val_cols: Optional[List[str]] = None,
    show: Optional[str] = "changes",  # Optional[Literal["all", "changes"]] = "changes"
) -> None:
    """Provide information about changes between two datasets, such as dropped rows \
        and columns, memory usage and missing values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame. Input the initial \
        dataset here
    data_cleaned : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame. Input the cleaned / \
        updated dataset here
    dupl_rows : Optional[List[Union[str, int]]], optional
        List of duplicate row indices, by default None
    single_val_cols : Optional[List[str]], optional
        List of single-valued column indices, i.e. columns where all cells contain \
        the same value. NaNs count as a separate value, by default None
    show : Optional[str], optional
        {"all", "changes", None}, by default "changes"
        Specify the verbosity of the output:
            * "all": Print information about the data before and after cleaning as \
                well as information about changes and memory usage (deep). Please be \
                aware that this can slow down the function by quite a bit.
            * "changes": Print out the differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    None
        Prints a summary of the datasets or of the changes between them.
    """
    if show not in ["changes", "all"]:
        return

    dupl_rows = [] if dupl_rows is None else dupl_rows.copy()
    single_val_cols = [] if single_val_cols is None else single_val_cols.copy()
    data_mem = _memory_usage(data, deep=False)
    data_cl_mem = _memory_usage(data_cleaned, deep=False)
    data_mv_tot = _missing_vals(data)["mv_total"]
    data_cl_mv_tot = _missing_vals(data_cleaned)["mv_total"]

    if show == "all":
        data_mem = _memory_usage(data, deep=True)
        data_cl_mem = _memory_usage(data_cleaned, deep=True)
        print("Before data cleaning:\n")
        print(f"dtypes:\n{data.dtypes.value_counts()}")
        print(f"\nNumber of rows: {str(data.shape[0]).rjust(8)}")
        print(f"Number of cols: {str(data.shape[1]).rjust(8)}")
        print(f"Missing values: {str(data_mv_tot).rjust(8)}")
        print(f"Memory usage: {str(data_mem).rjust(7)} MB")
        print("_______________________________________________________\n")
        print("After data cleaning:\n")
        print(f"dtypes:\n{data_cleaned.dtypes.value_counts()}")
        print(f"\nNumber of rows: {str(data_cleaned.shape[0]).rjust(8)}")
        print(f"Number of cols: {str(data_cleaned.shape[1]).rjust(8)}")
        print(f"Missing values: {str(data_cl_mv_tot).rjust(8)}")
        print(f"Memory usage: {str(data_cl_mem).rjust(7)} MB")
        print("_______________________________________________________\n")

    print(
        f"Shape of cleaned data: {data_cleaned.shape} - "
        f"Remaining NAs: {data_cl_mv_tot}"
    )
    print("\nChanges:")
    print(f"Dropped rows: {data.shape[0] - data_cleaned.shape[0]}")
    print(f"     of which {len(dupl_rows)} duplicates. (Rows: {dupl_rows[:200]})")
    print(f"Dropped columns: {data.shape[1] - data_cleaned.shape[1]}")
    print(
        f"     of which {len(single_val_cols)} single valued."
        f"     Columns: {single_val_cols}"
    )
    print(f"Dropped missing values: {data_mv_tot - data_cl_mv_tot}")
    mem_change = data_mem - data_cl_mem
    mem_perc = round(100 * mem_change / data_mem, 2)
    print(f"Reduced memory by at least: {round(mem_change, 3)} MB (-{mem_perc}%)\n")


def _drop_duplicates(data: pd.DataFrame) -> Tuple[pd.DataFrame, List]:
    """Provide information on and drop duplicate rows.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame

    Returns
    -------
    Tuple[pd.DataFrame, List]
        Deduplicated Pandas DataFrame and list of dropped row indices
    """
    data = pd.DataFrame(data).copy()
    dupl_rows = data[data.duplicated()].index.tolist()
    data = data.drop(dupl_rows, axis="index").reset_index(drop=True)

    return data, dupl_rows


def _memory_usage(data: pd.DataFrame, deep: bool = True) -> float:
    """Give the total memory usage in megabytes.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame
    deep : bool, optional
        Runs a deep analysis of the memory usage, by default True

    Returns
    -------
    float
        Memory usage in megabytes
    """
    data = pd.DataFrame(data).copy()
    return round(data.memory_usage(index=True, deep=deep).sum() / (1024 ** 2), 2)


def _missing_vals(data: pd.DataFrame) -> Dict[str, Any]:
    """Give metrics of missing values in the dataset.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame

    Returns
    -------
    Dict[str, Any]
        mv_total: int, number of missing values in the entire dataset
        mv_rows: pd.Series, number of missing values in each row
        mv_cols: pd.Series, number of missing values in each column
        mv_rows_ratio: pd.Series, ratio of missing values for each row
        mv_cols_ratio: pd.Series, ratio of missing values for each column
    """
    data = pd.DataFrame(data).copy()
    mv_rows = data.isna().sum(axis=1)
    mv_cols = data.isna().sum(axis=0)
    mv_total = data.isna().sum().sum()
    mv_rows_ratio = mv_rows / data.shape[1]
    mv_cols_ratio = mv_cols / data.shape[0]

    return {
        "mv_total": mv_total,
        "mv_rows": mv_rows,
        "mv_cols": mv_cols,
        "mv_rows_ratio": mv_rows_ratio,
        "mv_cols_ratio": mv_cols_ratio,
    }


def _validate_input_bool(value, desc):
    """Raise a TypeError if the given value is not a boolean."""
    if not isinstance(value, bool):
        raise TypeError(
            f"Input value for '{desc}' is {type(value)} but should be a boolean."
        )


def _validate_input_int(value, desc):
    """Raise a TypeError if the given value is not an integer."""
    if not isinstance(value, int):
        raise TypeError(
            f"Input value for '{desc}' is {type(value)} but should be an integer."
        )


def _validate_input_range(value, desc, lower, upper):
    """Raise a ValueError if the given value lies outside [lower, upper]."""
    if value < lower or value > upper:
        raise ValueError(
            f"'{desc}' = {value} but should be {lower} <= '{desc}' <= {upper}."
        )


def _validate_input_smaller(value1, value2, desc):
    """Raise a ValueError if the first value is larger than the second."""
    if value1 > value2:
        raise ValueError(
            f"The first input for '{desc}' should be smaller or equal to the second."
        )


def _validate_input_sum_smaller(limit, desc, *args):
    """Raise a ValueError if the sum of the given values exceeds the limit."""
    if sum(args) > limit:
        raise ValueError(
            f"The sum of input values for '{desc}' should be less or equal to {limit}."
        )


def _validate_input_sum_larger(limit, desc, *args):
    """Raise a ValueError if the sum of the given values falls below the limit."""
    if sum(args) < limit:
        raise ValueError(
            f"The sum of input values for '{desc}' should be larger/equal to {limit}."
        )
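
To see how these helpers fit together, here is a minimal usage sketch on made-up data (the DataFrame and its values are hypothetical; the functions are assumed to be in scope, e.g. defined as above or imported from klib.utils):

import numpy as np
import pandas as pd

# Toy data: 5,000 unique rows, plus 100 duplicated rows and one NaN.
base = pd.DataFrame({"a": np.arange(5000.0), "b": np.arange(5000.0) % 7})
base.loc[10, "a"] = np.nan
df = pd.concat([base, base.iloc[:100]], ignore_index=True)

df_clean, dropped = _drop_duplicates(df)   # dropped lists the 100 duplicate row indices
print(_missing_vals(df)["mv_total"])       # 2: the NaN and its duplicated copy
print(_memory_usage(df, deep=True))        # total footprint in MB
_diff_report(df, df_clean, dupl_rows=dropped, show="changes")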