Passed. Branch master (5deb01), by Andreas, created 02:32.

klib.utils (Rating: A)

Complexity: Total Complexity 24
Size/Duplication: Total Lines 209, Duplicated Lines 0 %
Importance: Changes 0
Metric   Value
eloc        87
dl           0
loc        209
rs          10
c            0
b            0
f            0
wmc         24

10 Functions

Rating  Name                       Duplication  Size  Complexity
A       _memory_usage()                      0    17           1
A       _drop_duplicates()                   0    19           1
A       _validate_input_int()                0     3           2
A       _validate_input_bool()               0     3           2
B       _diff_report()                       0    64           5
A       _validate_input_range()              0     4           3
A       _validate_input_sum()                0     3           2
A       _corr_selector()                     0    33           5
A       _missing_vals()                      0    29           1
A       _validate_input_smaller()            0     3           2
'''
Utilities and auxiliary functions.

:author: Andreas Kanz
'''

# Imports
import numpy as np
import pandas as pd

def _corr_selector(corr, split=None, threshold=0):
    '''
    Select correlations based on the provided parameters.

    Parameters
    ----------
    corr: pd.Series or pd.DataFrame of correlations.

    split: {None, 'pos', 'neg', 'above', 'below'}, default None
        Type of split to be performed.

    threshold: float, default 0
        Value in the range 0 <= threshold <= 1.

    Returns
    -------
    corr: List or matrix of (filtered) correlations.
    '''

    if split == 'pos':
        corr = corr.where((corr >= threshold) & (corr > 0))
        print('Displaying positive correlations. Use "threshold" to further limit the results.')
    elif split == 'neg':
        corr = corr.where((corr <= threshold) & (corr < 0))
        print('Displaying negative correlations. Use "threshold" to further limit the results.')
    elif split == 'above':
        corr = corr.where(np.abs(corr) >= threshold)
        print(f'Displaying absolute correlations above the threshold ({threshold}).')
    elif split == 'below':
        corr = corr.where(np.abs(corr) <= threshold)
        print(f'Displaying absolute correlations below the threshold ({threshold}).')

    return corr

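A minimal usage sketch (the toy DataFrame below is made up for illustration and is not part of the module): pass _corr_selector the matrix returned by DataFrame.corr() to keep only strongly correlated pairs.

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [2, 4, 6, 8], 'c': [4, 3, 1, 2]})
    strong = _corr_selector(df.corr(), split='above', threshold=0.8)
    # Entries with an absolute correlation below 0.8 are masked to NaN
    # by DataFrame.where; dropping or styling them is left to the caller.
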
def _diff_report(data, data_cleaned, dupl_rows=None, single_val_cols=None, show='changes'):
    '''
    Provides information about changes between two datasets, such as dropped rows and columns, memory usage and \
    missing values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.
        Input the initial dataset here.

    data_cleaned: 2D dataset that can be coerced into a Pandas DataFrame.
        Input the cleaned / updated dataset here.

    dupl_rows: list, default None
        List of duplicate row indices.

    single_val_cols: list, default None
        List of single-valued column indices, i.e. columns in which all cells contain the same value. \
        NaNs count as a separate value.

    show: {'all', 'changes', None}, default 'changes'
        Specify the verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about the changes.
        * 'changes': Print only the differences between the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    Print statement highlighting the datasets or the changes between the two datasets.
    '''

    if show in ['changes', 'all']:
        dupl_rows = [] if dupl_rows is None else dupl_rows.copy()
        single_val_cols = [] if single_val_cols is None else single_val_cols.copy()
        data_mem = _memory_usage(data)
        data_cl_mem = _memory_usage(data_cleaned)
        data_mv_tot = _missing_vals(data)['mv_total']
        data_cl_mv_tot = _missing_vals(data_cleaned)['mv_total']

        if show == 'all':
            print('Before data cleaning:\n')
            print(f'dtypes:\n{data.dtypes.value_counts()}')
            print(f'\nNumber of rows: {data.shape[0]}')
            print(f'Number of cols: {data.shape[1]}')
            print(f'Missing values: {data_mv_tot}')
            print(f'Memory usage: {data_mem} KB')
            print('_______________________________________________________\n')
            print('After data cleaning:\n')
            print(f'dtypes:\n{data_cleaned.dtypes.value_counts()}')
            print(f'\nNumber of rows: {data_cleaned.shape[0]}')
            print(f'Number of cols: {data_cleaned.shape[1]}')
            print(f'Missing values: {data_cl_mv_tot}')
            print(f'Memory usage: {data_cl_mem} KB')
            print('_______________________________________________________\n')

        print(f'Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {data_cl_mv_tot}')
        print('\nChanges:')
        print(f'Dropped rows: {data.shape[0]-data_cleaned.shape[0]}')
        print(f'     of which {len(dupl_rows)} duplicates. (Rows: {dupl_rows})')
        print(f'Dropped columns: {data.shape[1]-data_cleaned.shape[1]}')
        print(f'     of which {len(single_val_cols)} single valued. (Columns: {single_val_cols})')
        print(f'Dropped missing values: {data_mv_tot-data_cl_mv_tot}')
        mem_change = data_mem - data_cl_mem
        print(f'Reduced memory by: {round(mem_change, 2)} KB (-{round(100*mem_change/data_mem, 1)}%)')

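A self-contained sketch (the frames and the dropped row are hypothetical examples): compare a raw frame against a cleaned copy and print only the differences.

    import numpy as np
    import pandas as pd

    raw = pd.DataFrame({'x': [1, 1, 2, np.nan], 'y': [1.0, 1.0, 2.0, 3.0]})
    cleaned = raw.drop(index=[1])  # row 1 duplicates row 0
    _diff_report(raw, cleaned, dupl_rows=[1], show='changes')
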
def _drop_duplicates(data):
    '''
    Provides information about and drops duplicate rows.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    Returns
    -------
    data: Deduplicated Pandas DataFrame.
    dupl_rows: list of the indices of the dropped duplicate rows.
    '''

    data = pd.DataFrame(data).copy()
    dupl_rows = data[data.duplicated()].index.tolist()
    data = data.drop(dupl_rows, axis='index')

    return data, dupl_rows

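Usage sketch (hypothetical data): the first occurrence of each row is kept, and the indices of the removed duplicates are returned alongside the deduplicated frame.

    df = pd.DataFrame({'x': [1, 1, 2], 'y': ['a', 'a', 'b']})
    deduped, dupl_rows = _drop_duplicates(df)  # dupl_rows == [1]
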
def _memory_usage(data):
    '''
    Gives the total memory usage in kilobytes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    Returns
    -------
    memory_usage: float
    '''

    data = pd.DataFrame(data).copy()
    memory_usage = round(data.memory_usage(index=True, deep=True).sum() / 1024, 2)

    return memory_usage

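Usage sketch (hypothetical data): memory_usage(deep=True) reports bytes, so the division by 1024 yields kilobytes.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.ones((1000, 4)))
    kb = _memory_usage(df)  # roughly 31.4 KB: 4 float64 columns plus the index
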
def _missing_vals(data):
    '''
    Gives metrics of missing values in the dataset.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    Returns
    -------
    Dict with the following keys:
    mv_total: int, number of missing values in the entire dataset
    mv_rows: Series, number of missing values in each row
    mv_cols: Series, number of missing values in each column
    mv_rows_ratio: Series, ratio of missing values for each row
    mv_cols_ratio: Series, ratio of missing values for each column
    '''

    data = pd.DataFrame(data).copy()
    mv_rows = data.isna().sum(axis=1)
    mv_cols = data.isna().sum(axis=0)
    mv_total = data.isna().sum().sum()
    mv_rows_ratio = mv_rows / data.shape[1]
    mv_cols_ratio = mv_cols / data.shape[0]

    return {'mv_total': mv_total,
            'mv_rows': mv_rows,
            'mv_cols': mv_cols,
            'mv_rows_ratio': mv_rows_ratio,
            'mv_cols_ratio': mv_cols_ratio}

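Usage sketch (hypothetical data): the returned dict bundles the total count with per-row and per-column breakdowns.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'x': [1, np.nan, 3], 'y': [np.nan, np.nan, 6]})
    mv = _missing_vals(df)
    # mv['mv_total'] == 3; mv['mv_cols_ratio'] is a Series (x: 1/3, y: 2/3)
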
def _validate_input_bool(value, desc):
    if not isinstance(value, bool):
        raise TypeError(f"Input value for '{desc}' is {type(value)} but should be a boolean.")

def _validate_input_int(value, desc):
    if type(value) is not int:
        raise TypeError(f"Input value for '{desc}' is {type(value)} but should be an integer.")

def _validate_input_range(value, desc, lower, upper):
    if value < lower or value > upper:
        raise ValueError(
            f"'{desc}' = {value} but should be within the range {lower} <= '{desc}' <= {upper}.")

def _validate_input_smaller(value1, value2, desc):
    if value1 > value2:
        raise ValueError(f"The first input for '{desc}' should be smaller than or equal to the second input.")

def _validate_input_sum(limit, desc, *args):
    if sum(args) > limit:
        raise ValueError(f"The sum of input values provided for '{desc}' should be less than or equal to {limit}.")
209