Passed: Push to master (012cfd...853c75) by Andreas, created 01:13

klib.utils._corr_selector()   A

Complexity
    Conditions: 5

Size
    Total Lines: 37
    Code Lines: 17

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric    Value
cc        5
eloc      17
nop       3
dl        0
loc       37
rs        9.0833
c         0
b         0
f         0
"""
Utilities and auxiliary functions.

:author: Andreas Kanz

"""

# Imports
import numpy as np
import pandas as pd
from typing import Any, Dict, List, Optional, Tuple, Union

def _corr_selector(
    corr: Union[pd.Series, pd.DataFrame],
    split: Optional[str] = None,  # Optional[Literal["pos", "neg", "above", "below"]] = None,
    threshold: float = 0,
) -> Union[pd.Series, pd.DataFrame]:
    """Select correlations based on the chosen split type and threshold.

    Parameters
    ----------
    corr : Union[pd.Series, pd.DataFrame]
        pd.Series or pd.DataFrame of correlations
    split : Optional[str], optional
        Type of split performed, by default None
            * {None, 'pos', 'neg', 'above', 'below'}
    threshold : float, optional
        Value between 0 and 1 to set the correlation threshold, by default 0

    Returns
    -------
    Union[pd.Series, pd.DataFrame]
        List or matrix of (filtered) correlations
    """

    if split == "pos":
        corr = corr.where((corr >= threshold) & (corr > 0))
        print('Displaying positive correlations. Use "threshold" to further limit the results.')
    elif split == "neg":
        corr = corr.where((corr <= threshold) & (corr < 0))
        print('Displaying negative correlations. Use "threshold" to further limit the results.')
    elif split == "above":
        corr = corr.where(np.abs(corr) >= threshold)
        print(f"Displaying absolute correlations above the threshold ({threshold}).")
    elif split == "below":
        corr = corr.where(np.abs(corr) <= threshold)
        print(f"Displaying absolute correlations below the threshold ({threshold}).")

    return corr

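# Illustrative usage of _corr_selector (editor's sketch, not part of the original
# module; assumes a numeric DataFrame "df" is available):
#
#   corr_matrix = df.corr()
#   strong_pos = _corr_selector(corr_matrix, split="pos", threshold=0.5)
#   weak_abs = _corr_selector(corr_matrix, split="below", threshold=0.3)
#
# Entries that fail the chosen condition are replaced with NaN by pd.DataFrame.where(),
# so the shape of the input is preserved.
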
def _diff_report(
    data: pd.DataFrame,
    data_cleaned: pd.DataFrame,
    dupl_rows: Optional[List[Union[str, int]]] = None,
    single_val_cols: Optional[List[str]] = None,
    show: Optional[str] = "changes",  # Optional[Literal["all", "changes"]] = "changes",
) -> None:
    """ Provides information about changes between two datasets, such as dropped rows and columns, memory usage and \
    missing values.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame. Input the initial dataset here
    data_cleaned : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame. Input the cleaned / updated dataset here
    dupl_rows : Optional[List[Union[str, int]]], optional
        List of duplicate row indices, by default None
    single_val_cols : Optional[List[str]], optional
        List of single-valued column indices, i.e. columns where all cells contain the same value. \
        NaNs count as a separate value, by default None
    show : str, optional
        {'all', 'changes', None}, by default "changes"
        Specify the verbosity of the output:
            * 'all': Print information about the data before and after cleaning as well as information about changes \
            and memory usage (deep). Please be aware that this can slow down the function by quite a bit.
            * 'changes': Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    None
        Print statement highlighting the datasets or changes between the two datasets.
    """

    if show in ["changes", "all"]:
        dupl_rows = [] if dupl_rows is None else dupl_rows.copy()
        single_val_cols = [] if single_val_cols is None else single_val_cols.copy()
        data_mem = _memory_usage(data, deep=False)
        data_cl_mem = _memory_usage(data_cleaned, deep=False)
        data_mv_tot = _missing_vals(data)["mv_total"]
        data_cl_mv_tot = _missing_vals(data_cleaned)["mv_total"]

        if show == "all":
            data_mem = _memory_usage(data, deep=True)
            data_cl_mem = _memory_usage(data_cleaned, deep=True)
            print("Before data cleaning:\n")
            print(f"dtypes:\n{data.dtypes.value_counts()}")
            print(f"\nNumber of rows: {data.shape[0]}")
            print(f"Number of cols: {data.shape[1]}")
            print(f"Missing values: {data_mv_tot}")
            print(f"Memory usage: {data_mem} MB")
            print("_______________________________________________________\n")
            print("After data cleaning:\n")
            print(f"dtypes:\n{data_cleaned.dtypes.value_counts()}")
            print(f"\nNumber of rows: {data_cleaned.shape[0]}")
            print(f"Number of cols: {data_cleaned.shape[1]}")
            print(f"Missing values: {data_cl_mv_tot}")
            print(f"Memory usage: {data_cl_mem} MB")
            print("_______________________________________________________\n")

        print(f"Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {data_cl_mv_tot}")
        print("\nChanges:")
        print(f"Dropped rows: {data.shape[0] - data_cleaned.shape[0]}")
        print(f"     of which {len(dupl_rows)} duplicates. (Rows: {dupl_rows})")
        print(f"Dropped columns: {data.shape[1] - data_cleaned.shape[1]}")
        print(f"     of which {len(single_val_cols)} single valued. (Columns: {single_val_cols})")
        print(f"Dropped missing values: {data_mv_tot - data_cl_mv_tot}")
        mem_change = data_mem - data_cl_mem
        print(f"Reduced memory by at least: {round(mem_change, 2)} MB (-{round(100 * mem_change / data_mem, 1)}%)")

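# Illustrative usage of _diff_report (editor's sketch, not part of the original
# module; assumes "df" is the raw dataset and "df_cleaned" its cleaned version):
#
#   df_cleaned, dropped_rows = _drop_duplicates(df)
#   _diff_report(df, df_cleaned, dupl_rows=dropped_rows, show="changes")
#
# With show="all", memory usage is re-measured with deep=True, which inspects
# object columns and can be noticeably slower on large frames.
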
def _drop_duplicates(data: pd.DataFrame) -> Tuple[pd.DataFrame, Any]:
    """ Provides information on and drops duplicate rows.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame

    Returns
    -------
    Tuple[pd.DataFrame, List]
        Deduplicated Pandas DataFrame and list of the dropped row indices
    """

    data = pd.DataFrame(data).copy()
    dupl_rows = data[data.duplicated()].index.tolist()
    data = data.drop(dupl_rows, axis="index")

    return data, dupl_rows

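# Illustrative usage of _drop_duplicates (editor's sketch, not part of the
# original module):
#
#   df = pd.DataFrame({"a": [1, 1, 2], "b": ["x", "x", "y"]})
#   deduped, dropped = _drop_duplicates(df)
#   # deduped keeps the first occurrence of each duplicated row; dropped == [1],
#   # the index label of the removed row.
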
def _memory_usage(data: pd.DataFrame, deep: bool = True) -> float:
    """ Gives the total memory usage in megabytes.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame
    deep : bool, optional
        Runs a deep analysis of the memory usage, by default True

    Returns
    -------
    float
        Memory usage in megabytes
    """

    data = pd.DataFrame(data).copy()
    memory_usage = round(data.memory_usage(index=True, deep=deep).sum() / (1024 ** 2), 2)

    return memory_usage

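# Illustrative usage of _memory_usage (editor's sketch, not part of the
# original module):
#
#   mem_mb = _memory_usage(df, deep=True)
#
# deep=True makes pandas inspect object columns (e.g. Python strings) for a more
# accurate but slower estimate; the result is bytes / 1024 ** 2, rounded to two decimals.
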
def _missing_vals(data: pd.DataFrame) -> Dict[str, Any]:
    """ Gives metrics of missing values in the dataset.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame

    Returns
    -------
    Dict[str, Any]
        mv_total: float, number of missing values in the entire dataset
        mv_rows: float, number of missing values in each row
        mv_cols: float, number of missing values in each column
        mv_rows_ratio: float, ratio of missing values for each row
        mv_cols_ratio: float, ratio of missing values for each column
    """

    data = pd.DataFrame(data).copy()
    mv_rows = data.isna().sum(axis=1)
    mv_cols = data.isna().sum(axis=0)
    mv_total = data.isna().sum().sum()
    mv_rows_ratio = mv_rows / data.shape[1]
    mv_cols_ratio = mv_cols / data.shape[0]

    return {
        "mv_total": mv_total,
        "mv_rows": mv_rows,
        "mv_cols": mv_cols,
        "mv_rows_ratio": mv_rows_ratio,
        "mv_cols_ratio": mv_cols_ratio,
    }

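# Illustrative usage of _missing_vals (editor's sketch, not part of the
# original module):
#
#   df = pd.DataFrame({"a": [1, np.nan], "b": [np.nan, np.nan]})
#   mv = _missing_vals(df)
#   # mv["mv_total"] == 3 and mv["mv_cols_ratio"]["b"] == 1.0
#
# The per-row and per-column entries are pandas Series; the row ratios divide by
# the number of columns and the column ratios by the number of rows.
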
def _validate_input_bool(value, desc):
    if not isinstance(value, bool):
        raise TypeError(f"Input value for '{desc}' is {type(value)} but should be a boolean.")


def _validate_input_int(value, desc):
    if not isinstance(value, int):
        raise TypeError(f"Input value for '{desc}' is {type(value)} but should be an integer.")


def _validate_input_range(value, desc, lower, upper):
    if value < lower or value > upper:
        raise ValueError(f"'{desc}' = {value} but should be within the range {lower} <= '{desc}' <= {upper}.")


def _validate_input_smaller(value1, value2, desc):
    if value1 > value2:
        raise ValueError(f"The first input for '{desc}' should be smaller than or equal to the second input.")


def _validate_input_sum(limit, desc, *args):
    if sum(args) > limit:
        raise ValueError(f"The sum of input values provided for '{desc}' should be less than or equal to {limit}.")
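# Illustrative usage of the validation helpers (editor's sketch, not part of the
# original module; the parameter names below are made up for the example):
#
#   _validate_input_bool(True, "drop_duplicates")       # passes silently
#   _validate_input_range(0.5, "threshold", 0, 1)       # passes silently
#   _validate_input_sum(1, "thresholds", 0.4, 0.7)      # raises ValueError (0.4 + 0.7 > 1)
#
# Each helper either returns None or raises a TypeError / ValueError that names
# the offending parameter via "desc".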