GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( 5a4fe9...c92c0e )
by Andreas
01:16
created

klib.utils._validate_input_int()   A

Complexity

Conditions 2

Size

Total Lines 3
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 3
nop 2
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
"""
2
Utilities and auxiliary functions.
3
4
:author: Andreas Kanz
5
6
"""
7
8
# Imports
9
import numpy as np
10
import pandas as pd
11
from typing import Any, Dict, List, Optional, Tuple, Union
12
13
14
def _corr_selector(
15
    corr: Union[pd.Series, pd.DataFrame],
16
    split: Optional[str] = None,  # Optional[Literal["pos", "neg", "above", "below"]] = None,
17
    threshold: float = 0,
18
) -> Union[pd.Series, pd.DataFrame]:
19
    """ Utility funciton to select the desired correlations.
20
21
    Parameters
22
    ----------
23
    corr : Union[pd.Series, pd.DataFrame]
24
        pd.Series or pd.DataFrame of correlations
25
    split : Optional[str], optional
26
        Type of split performed, by default None
27
           * {None, 'pos', 'neg', 'high', 'low'}
28
    threshold : float, optional
29
        Value between 0 and 1 to set the correlation threshold, by default 0
30
31
    Returns
32
    -------
33
    pd.DataFrame
34
        List or matrix of (filtered) correlations
35
    """
36
37
    if split == "pos":
38
        corr = corr.where((corr >= threshold) & (corr > 0))
39
        print(
40
            'Displaying positive correlations. Specify a positive "threshold" to further limit the results.'
41
        )
42
    elif split == "neg":
43
        corr = corr.where((corr <= threshold) & (corr < 0))
44
        print(
45
            'Displaying negative correlations. Specify a negative "threshold" to further limit the results.'
46
        )
47
    elif split == "high":
48
        threshold = 0.3 if threshold <= 0 else threshold
49
        corr = corr.where(np.abs(corr) >= threshold)
50
        print(
51
            f"Displaying absolute correlations above the threshold ({threshold}). "
52
            'Specify a positive "threshold" to further limit the results.'
53
        )
54
    elif split == "low":
55
        threshold = 0.3 if threshold <= 0 else threshold
56
        corr = corr.where(np.abs(corr) <= threshold)
57
        print(
58
            f"Displaying absolute correlations below the threshold ({threshold}). "
59
            'Specify a positive "threshold" to further limit the results.'
60
        )
61
62
    return corr
63
64
65
def _diff_report(
    data: pd.DataFrame,
    data_cleaned: pd.DataFrame,
    dupl_rows: Optional[List[Union[str, int]]] = None,
    single_val_cols: Optional[List[str]] = None,
    show: Optional[str] = "changes",  # Optional[Literal["all", "changes"]] = "changes",
) -> None:
    """ Provides information about changes between two datasets, such as dropped rows and columns, memory usage and \
    missing values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. Input the initial dataset here
    data_cleaned : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. Input the cleaned / updated dataset here
    dupl_rows : Optional[List[Union[str, int]]], optional
        List of duplicate row indices, by default None
    single_val_cols : Optional[List[str]], optional
        List of single-valued column indices. I.e. columns where all cells contain the same value. \
        NaNs count as a separate value, by default None
    show : str, optional
        {'all', 'changes', None}, by default "changes"
        Specify verbosity of the output:
            * 'all': Print information about the data before and after cleaning as well as information about changes \
            and memory usage (deep). Please be aware, that this can slow down the function by quite a bit.
            * 'changes': Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    None
        Print statement highlighting the datasets or changes between the two datasets.
    """

    if show not in ["changes", "all"]:
        return

    # Copy the provided lists so the caller's objects are never mutated.
    dupl_rows = [] if dupl_rows is None else dupl_rows.copy()
    single_val_cols = [] if single_val_cols is None else single_val_cols.copy()
    data_mem = _memory_usage(data, deep=False)
    data_cl_mem = _memory_usage(data_cleaned, deep=False)
    data_mv_tot = _missing_vals(data)["mv_total"]
    data_cl_mv_tot = _missing_vals(data_cleaned)["mv_total"]

    if show == "all":
        # Deep memory analysis is more accurate but considerably slower.
        data_mem = _memory_usage(data, deep=True)
        data_cl_mem = _memory_usage(data_cleaned, deep=True)
        print("Before data cleaning:\n")
        print(f"dtypes:\n{data.dtypes.value_counts()}")
        print(f"\nNumber of rows: {data.shape[0]}")
        print(f"Number of cols: {data.shape[1]}")
        print(f"Missing values: {data_mv_tot}")
        print(f"Memory usage: {data_mem} MB")
        print("_______________________________________________________\n")
        print("After data cleaning:\n")
        print(f"dtypes:\n{data_cleaned.dtypes.value_counts()}")
        print(f"\nNumber of rows: {data_cleaned.shape[0]}")
        print(f"Number of cols: {data_cleaned.shape[1]}")
        print(f"Missing values: {data_cl_mv_tot}")
        print(f"Memory usage: {data_cl_mem} MB")
        print("_______________________________________________________\n")

    print(f"Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {data_cl_mv_tot}")
    print("\nChanges:")
    print(f"Dropped rows: {data.shape[0]-data_cleaned.shape[0]}")
    print(f"     of which {len(dupl_rows)} duplicates. (Rows: {dupl_rows})")
    print(f"Dropped columns: {data.shape[1]-data_cleaned.shape[1]}")
    print(f"     of which {len(single_val_cols)} single valued. (Columns: {single_val_cols})")
    print(f"Dropped missing values: {data_mv_tot-data_cl_mv_tot}")
    mem_change = data_mem - data_cl_mem
    # Guard against division by zero when the original data occupies (rounded) zero MB.
    mem_pct = round(100 * mem_change / data_mem, 2) if data_mem > 0 else 0
    print(f"Reduced memory by at least: {round(mem_change,3)} MB (-{mem_pct}%)")
135
136
137
def _drop_duplicates(data: pd.DataFrame) -> Tuple[pd.DataFrame, Any]:
138
    """ Provides information on and drops duplicate rows.
139
140
    Parameters
141
    ----------
142
    data : pd.DataFrame
143
        2D dataset that can be coerced into Pandas DataFrame
144
145
    Returns
146
    -------
147
    Tuple[pd.DataFrame, List]
148
        Deduplicated Pandas DataFrame and Index Object of rows dropped
149
    """
150
151
    data = pd.DataFrame(data).copy()
152
    dupl_rows = data[data.duplicated()].index.tolist()
153
    data = data.drop(dupl_rows, axis="index")
154
155
    return data, dupl_rows
156
157
158
def _memory_usage(data: pd.DataFrame, deep: bool = True) -> float:
159
    """ Gives the total memory usage in megabytes.
160
161
    Parameters
162
    ----------
163
    data : pd.DataFrame
164
        2D dataset that can be coerced into Pandas DataFrame
165
    deep : bool, optional
166
        Runs a deep analysis of the memory usage, by default True
167
168
    Returns
169
    -------
170
    float
171
        Memory usage in megabytes
172
    """
173
174
    data = pd.DataFrame(data).copy()
175
    memory_usage = round(data.memory_usage(index=True, deep=deep).sum() / (1024 ** 2), 2)
176
177
    return memory_usage
178
179
180
def _missing_vals(data: pd.DataFrame) -> Dict[str, Any]:
181
    """ Gives metrics of missing values in the dataset.
182
183
    Parameters
184
    ----------
185
    data : pd.DataFrame
186
        2D dataset that can be coerced into Pandas DataFrame
187
188
    Returns
189
    -------
190
    Dict[str, float]
191
        mv_total: float, number of missing values in the entire dataset
192
        mv_rows: float, number of missing values in each row
193
        mv_cols: float, number of missing values in each column
194
        mv_rows_ratio: float, ratio of missing values for each row
195
        mv_cols_ratio: float, ratio of missing values for each column
196
    """
197
198
    data = pd.DataFrame(data).copy()
199
    mv_rows = data.isna().sum(axis=1)
200
    mv_cols = data.isna().sum(axis=0)
201
    mv_total = data.isna().sum().sum()
202
    mv_rows_ratio = mv_rows / data.shape[1]
203
    mv_cols_ratio = mv_cols / data.shape[0]
204
205
    return {
206
        "mv_total": mv_total,
207
        "mv_rows": mv_rows,
208
        "mv_cols": mv_cols,
209
        "mv_rows_ratio": mv_rows_ratio,
210
        "mv_cols_ratio": mv_cols_ratio,
211
    }
212
213
214
def _validate_input_bool(value, desc):
215
    if not (isinstance(value, bool)):
216
        raise TypeError(f"Input value for '{desc}' is {type(value)} but should be a boolean.")
217
218
219
def _validate_input_int(value, desc):
220
    if not isinstance(value, int):
221
        raise TypeError(f"Input value for '{desc}' is {type(value)} but should be an integer.")
222
223
224
def _validate_input_range(value, desc, lower, upper):
225
    if value < lower or value > upper:
226
        raise ValueError(f"'{desc}' = {value} but should be within the range {lower} <= '{desc}' <= {upper}.")
227
228
229
def _validate_input_smaller(value1, value2, desc):
230
    if value1 > value2:
231
        raise ValueError(f"The first input for '{desc}' should be smaller or equal to the second input.")
232
233
234
def _validate_input_sum_smaller(limit, desc, *args):
235
    if sum(args) > limit:
236
        raise ValueError(f"The sum of imput values provided for '{desc}' should be less or equal to {limit}.")
237
238
239
def _validate_input_sum_larger(limit, desc, *args):
240
    if sum(args) < limit:
241
        raise ValueError(
242
            f"The sum of imput values provided for '{desc}' should be larger or equal to {limit}."
243
        )
244