Passed: push to master ( c20d75...406f67 ) by Andreas, created 01:47

klib.utils — Rating: A

Complexity
    Total Complexity: 24

Size / Duplication
    Total Lines: 248
    Duplicated Lines: 0 %

Importance
    Changes: 0

Metric   Value
eloc     111
dl       0
loc      248
rs       10
c        0
b        0
f        0
wmc      24

10 Functions

Rating   Name                         Duplication   Size   Complexity
A        _memory_usage()              0             22     1
A        _drop_duplicates()           0             19     1
A        _validate_input_int()        0             4      2
A        _validate_input_bool()       0             4      2
B        _diff_report()               0             75     5
A        _validate_input_range()      0             4      3
A        _validate_input_sum()        0             4      2
B        _corr_selector()             0             43     5
A        _missing_vals()              0             31     1
A        _validate_input_smaller()    0             4      2
"""
Utilities and auxiliary functions.

:author: Andreas Kanz

"""

# Imports
import numpy as np
import pandas as pd
from typing import Any, Dict, List, Optional, Tuple, Union

def _corr_selector(
    corr: Union[pd.Series, pd.DataFrame],
    split: Optional[
        str
    ] = None,  # Optional[Literal["pos", "neg", "above", "below"]] = None,
    threshold: float = 0,
) -> Union[pd.Series, pd.DataFrame]:
    """ Selects the desired correlations based on the chosen split type and threshold.

    Parameters
    ----------
    corr : Union[pd.Series, pd.DataFrame]
        pd.Series or pd.DataFrame of correlations
    split : Optional[str], optional
        Type of split performed, by default None
           * {None, 'pos', 'neg', 'above', 'below'}
    threshold : float, optional
        Value between 0 and 1 to set the correlation threshold, by default 0

    Returns
    -------
    pd.DataFrame
        List or matrix of (filtered) correlations
    """

    if split == "pos":
        corr = corr.where((corr >= threshold) & (corr > 0))
        print(
            'Displaying positive correlations. Use "threshold" to further limit the results.'
        )
    elif split == "neg":
        corr = corr.where((corr <= threshold) & (corr < 0))
        print(
            'Displaying negative correlations. Use "threshold" to further limit the results.'
        )
    elif split == "above":
        corr = corr.where(np.abs(corr) >= threshold)
        print(f"Displaying absolute correlations above the threshold ({threshold}).")
    elif split == "below":
        corr = corr.where(np.abs(corr) <= threshold)
        print(f"Displaying absolute correlations below the threshold ({threshold}).")

    return corr

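# Illustrative usage sketch for _corr_selector (hypothetical data):
#
# >>> df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 5], "c": [4, 3, 2, 1]})
# >>> _corr_selector(df.corr(), split="pos", threshold=0.9)
# keeps only the positive correlations of at least 0.9; all other cells become NaN.
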
def _diff_report(
    data: pd.DataFrame,
    data_cleaned: pd.DataFrame,
    dupl_rows: Optional[List[Union[str, int]]] = None,
    single_val_cols: Optional[List[str]] = None,
    show: Optional[str] = "changes",  # Optional[Literal["all", "changes"]] = "changes",
) -> None:
    """ Provides information about changes between two datasets, such as dropped rows and columns, memory usage and \
    missing values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. Input the initial dataset here
    data_cleaned : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. Input the cleaned / updated dataset here
    dupl_rows : Optional[List[Union[str, int]]], optional
        List of duplicate row indices, by default None
    single_val_cols : Optional[List[str]], optional
        List of single-valued column indices, i.e. columns where all cells contain the same value. \
        NaNs count as a separate value, by default None
    show : str, optional
        {'all', 'changes', None}, by default "changes"
        Specify verbosity of the output:
            * 'all': Print information about the data before and after cleaning as well as information about changes \
            and memory usage (deep). Please be aware that this can slow down the function considerably.
            * 'changes': Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    None
        Print statement highlighting the datasets or changes between the two datasets.
    """

    if show in ["changes", "all"]:
        dupl_rows = [] if dupl_rows is None else dupl_rows.copy()
        single_val_cols = [] if single_val_cols is None else single_val_cols.copy()
        data_mem = _memory_usage(data, deep=False)
        data_cl_mem = _memory_usage(data_cleaned, deep=False)
        data_mv_tot = _missing_vals(data)["mv_total"]
        data_cl_mv_tot = _missing_vals(data_cleaned)["mv_total"]

        if show == "all":
            data_mem = _memory_usage(data, deep=True)
            data_cl_mem = _memory_usage(data_cleaned, deep=True)
            print("Before data cleaning:\n")
            print(f"dtypes:\n{data.dtypes.value_counts()}")
            print(f"\nNumber of rows: {data.shape[0]}")
            print(f"Number of cols: {data.shape[1]}")
            print(f"Missing values: {data_mv_tot}")
            print(f"Memory usage: {data_mem} MB")
            print("_______________________________________________________\n")
            print("After data cleaning:\n")
            print(f"dtypes:\n{data_cleaned.dtypes.value_counts()}")
            print(f"\nNumber of rows: {data_cleaned.shape[0]}")
            print(f"Number of cols: {data_cleaned.shape[1]}")
            print(f"Missing values: {data_cl_mv_tot}")
            print(f"Memory usage: {data_cl_mem} MB")
            print("_______________________________________________________\n")

        print(
            f"Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {data_cl_mv_tot}"
        )
        print("\nChanges:")
        print(f"Dropped rows: {data.shape[0]-data_cleaned.shape[0]}")
        print(f"     of which {len(dupl_rows)} duplicates. (Rows: {dupl_rows})")
        print(f"Dropped columns: {data.shape[1]-data_cleaned.shape[1]}")
        print(
            f"     of which {len(single_val_cols)} single valued. (Columns: {single_val_cols})"
        )
        print(f"Dropped missing values: {data_mv_tot-data_cl_mv_tot}")
        mem_change = data_mem - data_cl_mem
        print(
            f"Reduced memory by at least: {round(mem_change,2)} MB (-{round(100*mem_change/data_mem,1)}%)"
        )

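# Illustrative usage sketch for _diff_report (hypothetical variable names):
#
# >>> cleaned, dropped_rows = _drop_duplicates(raw_df)
# >>> _diff_report(raw_df, cleaned, dupl_rows=dropped_rows, single_val_cols=[], show="changes")
# prints the shape of the cleaned data, the number of dropped rows/columns/missing values
# and the (shallow) memory reduction.
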
def _drop_duplicates(data: pd.DataFrame) -> Tuple[pd.DataFrame, Any]:
    """ Provides information on and drops duplicate rows.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame

    Returns
    -------
    Tuple[pd.DataFrame, List]
        Deduplicated Pandas DataFrame and list of the dropped row indices
    """

    data = pd.DataFrame(data).copy()
    dupl_rows = data[data.duplicated()].index.tolist()
    data = data.drop(dupl_rows, axis="index")

    return data, dupl_rows

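# Illustrative usage sketch for _drop_duplicates (hypothetical data):
#
# >>> df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 3, 4]})
# >>> deduped, dropped = _drop_duplicates(df)
# >>> dropped
# [1]   # row 1 duplicates row 0 and is removed; deduped keeps rows 0 and 2
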
def _memory_usage(data: pd.DataFrame, deep: bool = True) -> float:
    """ Gives the total memory usage in megabytes.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame
    deep : bool, optional
        Runs a deep analysis of the memory usage, by default True

    Returns
    -------
    float
        Memory usage in megabytes
    """

    data = pd.DataFrame(data).copy()
    memory_usage = round(
        data.memory_usage(index=True, deep=deep).sum() / (1024 ** 2), 2
    )

    return memory_usage

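# Illustrative usage sketch for _memory_usage (hypothetical data). With deep=True, object
# columns are measured element-wise, which is more accurate but noticeably slower:
#
# >>> df = pd.DataFrame({"a": range(1000), "b": ["x"] * 1000})
# >>> _memory_usage(df, deep=True)   # float in MB, roughly 0.06 for this frame
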
def _missing_vals(data: pd.DataFrame) -> Dict[str, Any]:
    """ Gives metrics of missing values in the dataset.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame

    Returns
    -------
    Dict[str, Any]
        mv_total: int, number of missing values in the entire dataset
        mv_rows: Series, number of missing values in each row
        mv_cols: Series, number of missing values in each column
        mv_rows_ratio: Series, ratio of missing values for each row
        mv_cols_ratio: Series, ratio of missing values for each column
    """

    data = pd.DataFrame(data).copy()
    mv_rows = data.isna().sum(axis=1)
    mv_cols = data.isna().sum(axis=0)
    mv_total = data.isna().sum().sum()
    mv_rows_ratio = mv_rows / data.shape[1]
    mv_cols_ratio = mv_cols / data.shape[0]

    return {
        "mv_total": mv_total,
        "mv_rows": mv_rows,
        "mv_cols": mv_cols,
        "mv_rows_ratio": mv_rows_ratio,
        "mv_cols_ratio": mv_cols_ratio,
    }

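# Illustrative usage sketch for _missing_vals (hypothetical data):
#
# >>> df = pd.DataFrame({"a": [1, np.nan], "b": [np.nan, np.nan]})
# >>> _missing_vals(df)["mv_total"]
# 3
# >>> _missing_vals(df)["mv_cols_ratio"].tolist()
# [0.5, 1.0]
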
def _validate_input_bool(value, desc):
    if not isinstance(value, bool):
        raise TypeError(
            f"Input value for '{desc}' is {type(value)} but should be a boolean."
        )


def _validate_input_int(value, desc):
    if not isinstance(value, int):
        raise TypeError(
            f"Input value for '{desc}' is {type(value)} but should be an integer."
        )


def _validate_input_range(value, desc, lower, upper):
    if value < lower or value > upper:
        raise ValueError(
            f"'{desc}' = {value} but should be within the range {lower} <= '{desc}' <= {upper}."
        )


def _validate_input_smaller(value1, value2, desc):
    if value1 > value2:
        raise ValueError(
            f"The first input for '{desc}' should be smaller than or equal to the second input."
        )


def _validate_input_sum(limit, desc, *args):
    if sum(args) > limit:
        raise ValueError(
            f"The sum of input values provided for '{desc}' should be less than or equal to {limit}."
        )
249