Passed: Push to master ( 32c39e...8bebb3 ) by Andreas, created 01:15

klib.utils._corr_selector()   Rating: B

Complexity: 7 conditions
Size: 39 total lines, 19 code lines
Duplication: 0 lines (0 %)
Importance: 0 changes
Metric  Value
cc      7    (cyclomatic complexity)
eloc    19   (effective lines of code)
nop     3    (number of parameters)
dl      0    (duplicated lines)
loc     39   (lines of code)
rs      8
c       0
b       0
f       0
"""
Utilities and auxiliary functions.

:author: Andreas Kanz

"""

# Imports
import numpy as np
import pandas as pd
from typing import Any, Dict, List, Optional, Tuple, Union


def _corr_selector(
    corr: Union[pd.Series, pd.DataFrame],
    split: Optional[str] = None,  # Optional[Literal["pos", "neg", "high", "low"]] = None,
    threshold: float = 0,
) -> Union[pd.Series, pd.DataFrame]:
    """ Selects positive, negative, high or low correlations from a correlation Series or \
    DataFrame, depending on the split type and threshold.

    Parameters
    ----------
    corr : Union[pd.Series, pd.DataFrame]
        pd.Series or pd.DataFrame of correlations
    split : Optional[str], optional
        Type of split performed, by default None
           * {None, 'pos', 'neg', 'high', 'low'}
    threshold : float, optional
        Value between 0 and 1 to set the correlation threshold, by default 0

    Returns
    -------
    Union[pd.Series, pd.DataFrame]
        Series or matrix of the (filtered) correlations
    """

    if split == "pos":
        corr = corr.where((corr >= threshold) & (corr > 0))
        print('Displaying positive correlations. Use a positive "threshold" to further limit the results.')
    elif split == "neg":
        corr = corr.where((corr <= threshold) & (corr < 0))
        print('Displaying negative correlations. Use a negative "threshold" to further limit the results.')
    elif split == "high":
        threshold = 0.5 if threshold == 0 else threshold
        corr = corr.where(np.abs(corr) >= threshold)
        print(f"Displaying absolute correlations above the threshold ({threshold}).")
    elif split == "low":
        threshold = 0.5 if threshold == 0 else threshold
        corr = corr.where(np.abs(corr) <= threshold)
        print(f"Displaying absolute correlations below the threshold ({threshold}).")

    return corr
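
To illustrate how _corr_selector behaves, here is a minimal usage sketch. The sample DataFrame is hypothetical, and the function is assumed to be in scope (it is a private helper of this module):

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 4, 6, 8], "c": [4, 3, 2, 1]})
corr_matrix = df.corr()
# Keep only correlations with an absolute value of at least 0.8;
# all other cells become NaN via .where().
strong = _corr_selector(corr_matrix, split="high", threshold=0.8)

Note that for split="high" and split="low" a threshold of 0 silently falls back to 0.5, which is why the sketch passes the threshold explicitly.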


def _diff_report(
    data: pd.DataFrame,
    data_cleaned: pd.DataFrame,
    dupl_rows: Optional[List[Union[str, int]]] = None,
    single_val_cols: Optional[List[str]] = None,
    show: Optional[str] = "changes",  # Optional[Literal["all", "changes"]] = "changes",
) -> None:
    """ Provides information about changes between two datasets, such as dropped rows and columns, memory usage and \
    missing values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame. Input the initial dataset here
    data_cleaned : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame. Input the cleaned / updated dataset here
    dupl_rows : Optional[List[Union[str, int]]], optional
        List of duplicate row indices, by default None
    single_val_cols : Optional[List[str]], optional
        List of single-valued columns, i.e. columns where all cells contain the same value. \
        NaNs count as a separate value, by default None
    show : Optional[str], optional
        {'all', 'changes', None}, by default "changes"
        Specify the verbosity of the output:
            * 'all': Print information about the data before and after cleaning as well as information about changes \
            and memory usage (deep). Please be aware that this can slow the function down considerably.
            * 'changes': Print out the differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    None
        Print statement highlighting the datasets or changes between the two datasets.
    """

    if show in ["changes", "all"]:
        dupl_rows = [] if dupl_rows is None else dupl_rows.copy()
        single_val_cols = [] if single_val_cols is None else single_val_cols.copy()
        data_mem = _memory_usage(data, deep=False)
        data_cl_mem = _memory_usage(data_cleaned, deep=False)
        data_mv_tot = _missing_vals(data)["mv_total"]
        data_cl_mv_tot = _missing_vals(data_cleaned)["mv_total"]

        if show == "all":
            data_mem = _memory_usage(data, deep=True)
            data_cl_mem = _memory_usage(data_cleaned, deep=True)
            print("Before data cleaning:\n")
            print(f"dtypes:\n{data.dtypes.value_counts()}")
            print(f"\nNumber of rows: {data.shape[0]}")
            print(f"Number of cols: {data.shape[1]}")
            print(f"Missing values: {data_mv_tot}")
            print(f"Memory usage: {data_mem} MB")
            print("_______________________________________________________\n")
            print("After data cleaning:\n")
            print(f"dtypes:\n{data_cleaned.dtypes.value_counts()}")
            print(f"\nNumber of rows: {data_cleaned.shape[0]}")
            print(f"Number of cols: {data_cleaned.shape[1]}")
            print(f"Missing values: {data_cl_mv_tot}")
            print(f"Memory usage: {data_cl_mem} MB")
            print("_______________________________________________________\n")

        print(f"Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {data_cl_mv_tot}")
        print("\nChanges:")
        print(f"Dropped rows: {data.shape[0]-data_cleaned.shape[0]}")
        print(f"     of which {len(dupl_rows)} duplicates. (Rows: {dupl_rows})")
        print(f"Dropped columns: {data.shape[1]-data_cleaned.shape[1]}")
        print(f"     of which {len(single_val_cols)} single valued. (Columns: {single_val_cols})")
        print(f"Dropped missing values: {data_mv_tot-data_cl_mv_tot}")
        mem_change = data_mem - data_cl_mem
        print(f"Reduced memory by at least: {round(mem_change,3)} MB (-{round(100*mem_change/data_mem,2)}%)")
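
A short sketch of how _diff_report might be called after a cleaning step; the frames and the dropped-row index below are illustrative only:

import numpy as np
import pandas as pd

raw = pd.DataFrame({"a": [1, 1, 2], "b": [np.nan, np.nan, 3]})
cleaned = raw.drop(index=[1])  # row 1 duplicates row 0
# Prints the cleaned shape, remaining NAs, dropped rows/columns and memory savings.
_diff_report(raw, cleaned, dupl_rows=[1], show="changes")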


def _drop_duplicates(data: pd.DataFrame) -> Tuple[pd.DataFrame, Any]:
    """ Provides information on and drops duplicate rows.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame

    Returns
    -------
    Tuple[pd.DataFrame, List]
        Deduplicated Pandas DataFrame and a list of the dropped row indices
    """

    data = pd.DataFrame(data).copy()
    dupl_rows = data[data.duplicated()].index.tolist()
    data = data.drop(dupl_rows, axis="index")

    return data, dupl_rows
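
For illustration, _drop_duplicates on a small, hypothetical frame:

import pandas as pd

df = pd.DataFrame({"a": [1, 1, 2], "b": [5, 5, 6]})
deduped, dropped = _drop_duplicates(df)
# deduped keeps rows 0 and 2; dropped == [1]

Because the dropped indices come back as a plain list, they can be passed straight to _diff_report via its dupl_rows parameter.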


def _memory_usage(data: pd.DataFrame, deep: bool = True) -> float:
    """ Gives the total memory usage in megabytes.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame
    deep : bool, optional
        Runs a deep analysis of the memory usage, by default True

    Returns
    -------
    float
        Memory usage in megabytes
    """

    data = pd.DataFrame(data).copy()
    memory_usage = round(data.memory_usage(index=True, deep=deep).sum() / (1024 ** 2), 2)

    return memory_usage
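
A quick sketch showing the effect of the deep flag; the sample frame is illustrative. With object (string) columns, deep=True measures the actual string payloads, while deep=False only counts the object pointers, so the two results can differ substantially:

import pandas as pd

df = pd.DataFrame({"name": ["x" * 100] * 10_000})
print(_memory_usage(df, deep=True))   # includes string contents
print(_memory_usage(df, deep=False))  # pointer-level estimate only

This is also why _diff_report defaults to the cheaper deep=False pass and labels its output "Reduced memory by at least".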


def _missing_vals(data: pd.DataFrame) -> Dict[str, Any]:
    """ Gives metrics of missing values in the dataset.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame

    Returns
    -------
    Dict[str, Any]
        mv_total: int, number of missing values in the entire dataset
        mv_rows: pd.Series, number of missing values in each row
        mv_cols: pd.Series, number of missing values in each column
        mv_rows_ratio: pd.Series, ratio of missing values for each row
        mv_cols_ratio: pd.Series, ratio of missing values for each column
    """

    data = pd.DataFrame(data).copy()
    mv_rows = data.isna().sum(axis=1)
    mv_cols = data.isna().sum(axis=0)
    mv_total = data.isna().sum().sum()
    mv_rows_ratio = mv_rows / data.shape[1]
    mv_cols_ratio = mv_cols / data.shape[0]

    return {
        "mv_total": mv_total,
        "mv_rows": mv_rows,
        "mv_cols": mv_cols,
        "mv_rows_ratio": mv_rows_ratio,
        "mv_cols_ratio": mv_cols_ratio,
    }
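
A brief sketch of _missing_vals on a hypothetical frame:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, np.nan], "b": [np.nan, np.nan]})
mv = _missing_vals(df)
# mv["mv_total"] == 3
# mv["mv_cols_ratio"] -> a: 0.5, b: 1.0 (share of missing cells per column)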


def _validate_input_bool(value, desc):
    if not isinstance(value, bool):
        raise TypeError(f"Input value for '{desc}' is {type(value)} but should be a boolean.")


def _validate_input_int(value, desc):
    if not isinstance(value, int):
        raise TypeError(f"Input value for '{desc}' is {type(value)} but should be an integer.")


def _validate_input_range(value, desc, lower, upper):
    if value < lower or value > upper:
        raise ValueError(f"'{desc}' = {value} but should be within the range {lower} <= '{desc}' <= {upper}.")


def _validate_input_smaller(value1, value2, desc):
    if value1 > value2:
        raise ValueError(f"The first input for '{desc}' should be smaller than or equal to the second input.")


def _validate_input_sum(limit, desc, *args):
    if sum(args) > limit:
        raise ValueError(f"The sum of input values provided for '{desc}' should be less than or equal to {limit}.")
227