Passed: Push to master (2625ff...cc4c68) by Andreas, created at 01:13

klib.utils (Rating: A)

Complexity

Total Complexity: 22

Size/Duplication

Total Lines: 207
Duplicated Lines: 0%

Importance

Changes: 0
Metric   Value
eloc     85
dl       0
loc      207
rs       10
c        0
b        0
f        0
wmc      22

9 Functions

Rating   Name                        Duplication   Size   Complexity
A        _corr_selector()            0             35     5
A        _memory_usage()             0             17     1
A        _drop_duplicates()          0             19     1
B        _diff_report()              0             65     5
A        _missing_vals()             0             29     1
A        _validate_input_int()       0             3      2
A        _validate_input_bool()      0             3      2
A        _validate_input_range()     0             4      3
A        _validate_input_smaller()   0             3      2
'''
Utilities and auxiliary functions.

:author: Andreas Kanz
'''

# Imports
import numpy as np
import pandas as pd

def _corr_selector(corr, split=None, threshold=0):
    '''
    Select correlations based on the provided parameters.

    Parameters
    ----------
    corr: DataFrame or Series of correlations.

    split: {None, 'pos', 'neg', 'high', 'low'}, default None
        Type of split to be performed.

    threshold: float, default 0
        Value in the range of 0 <= threshold <= 1.

    Returns
    -------
    corr: DataFrame or Series of (filtered) correlations.
    '''

    if split == 'pos':
        corr = corr.where((corr >= threshold) & (corr > 0))
        print('Displaying positive correlations. Use "threshold" to further limit the results.')
    elif split == 'neg':
        corr = corr.where((corr <= threshold) & (corr < 0))
        print('Displaying negative correlations. Use "threshold" to further limit the results.')
    elif split == 'high':
        corr = corr.where(np.abs(corr) >= threshold)
        print(f'Displaying absolute correlations above the threshold ({threshold}).')
    elif split == 'low':
        corr = corr.where(np.abs(corr) <= threshold)
        print(f'Displaying absolute correlations below the threshold ({threshold}).')

    return corr
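
# Hypothetical usage sketch (not part of the module): selecting strong
# correlations from the matrix returned by pandas' DataFrame.corr().
# >>> corr = pd.DataFrame([[1.0, -0.9], [-0.9, 1.0]], columns=['a', 'b'], index=['a', 'b'])
# >>> _corr_selector(corr, split='high', threshold=0.8)
# All four cells survive, since every absolute correlation is >= 0.8.
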

def _diff_report(data, data_cleaned, dupl_rows=None, single_val_cols=None, show='changes'):
    '''
    Provide a report highlighting the changes between the initial and the cleaned / updated
    dataset, such as dropped duplicate rows and single-valued columns as well as changes in
    memory usage and missing values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.
        Input the initial dataset here.

    data_cleaned: 2D dataset that can be coerced into Pandas DataFrame.
        Input the cleaned / updated dataset here.

    dupl_rows: list, default None
        List of duplicate row indices.

    single_val_cols: list, default None
        List of single-valued column indices, i.e. columns where all cells contain the same value.
        NaNs count as a separate value.

    show: {'all', 'changes', None}, default 'changes'
        Specify the verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    Print statement highlighting the datasets or the changes between the two datasets.
    '''

    if show in ['changes', 'all']:
        dupl_rows = [] if dupl_rows is None else dupl_rows.copy()
        single_val_cols = [] if single_val_cols is None else single_val_cols.copy()
        data_mem = _memory_usage(data)
        data_cl_mem = _memory_usage(data_cleaned)
        data_mv_tot = _missing_vals(data)['mv_total']
        data_cl_mv_tot = _missing_vals(data_cleaned)['mv_total']

        if show == 'all':
            print('Before data cleaning:\n')
            print(f'dtypes:\n{data.dtypes.value_counts()}')
            print(f'\nNumber of rows: {data.shape[0]}')
            print(f'Number of cols: {data.shape[1]}')
            print(f'Missing values: {data_mv_tot}')
            print(f'Memory usage: {data_mem} KB')
            print('_______________________________________________________\n')
            print('After data cleaning:\n')
            print(f'dtypes:\n{data_cleaned.dtypes.value_counts()}')
            print(f'\nNumber of rows: {data_cleaned.shape[0]}')
            print(f'Number of cols: {data_cleaned.shape[1]}')
            print(f'Missing values: {data_cl_mv_tot}')
            print(f'Memory usage: {data_cl_mem} KB')
            print('_______________________________________________________\n')

        print(f'Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {data_cl_mv_tot}')
        print('\nChanges:')
        print(f'Dropped rows: {data.shape[0] - data_cleaned.shape[0]}')
        print(f'     of which {len(dupl_rows)} duplicates. (Rows: {dupl_rows})')
        print(f'Dropped columns: {data.shape[1] - data_cleaned.shape[1]}')
        print(f'     of which {len(single_val_cols)} single valued. (Columns: {single_val_cols})')
        print(f'Dropped missing values: {data_mv_tot - data_cl_mv_tot}')
        mem_change = data_mem - data_cl_mem
        print(f'Reduced memory by: {round(mem_change, 2)} KB (-{round(100 * mem_change / data_mem, 1)}%)')
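
# Hypothetical usage sketch (not part of the module): comparing a dataset
# before and after deduplication.
# >>> df = pd.DataFrame({'a': [1, 1, 2], 'b': [3.0, 3.0, None]})
# >>> df_clean, dupl = _drop_duplicates(df)
# >>> _diff_report(df, df_clean, dupl_rows=dupl, show='changes')
# Prints the cleaned shape, 1 dropped (duplicate) row and the memory savings.
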

def _drop_duplicates(data):
    '''
    Drop duplicate rows and provide information about them.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    Returns
    -------
    data: Deduplicated Pandas DataFrame.
    dupl_rows: list of the indices of the dropped duplicate rows.
    '''

    data = pd.DataFrame(data).copy()
    dupl_rows = data[data.duplicated()].index.tolist()
    data = data.drop(dupl_rows, axis='index')

    return data, dupl_rows
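
# Hypothetical usage sketch (not part of the module):
# >>> df = pd.DataFrame({'a': [1, 1, 2]})
# >>> deduped, dupl_rows = _drop_duplicates(df)
# >>> dupl_rows
# [1]
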

def _memory_usage(data):
    '''
    Gives the total memory usage in kilobytes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    Returns
    -------
    memory_usage: float
    '''

    data = pd.DataFrame(data).copy()
    memory_usage = round(data.memory_usage(index=True, deep=True).sum() / 1024, 2)

    return memory_usage
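
# Hypothetical usage sketch (not part of the module); the exact value
# depends on the platform and the pandas version.
# >>> _memory_usage(pd.DataFrame({'a': range(1000)}))
# 7.94
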

def _missing_vals(data):
    '''
    Gives metrics of missing values in the dataset.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    Returns
    -------
    Dictionary with the following keys:
    mv_total: int, number of missing values in the entire dataset
    mv_rows: Series, number of missing values in each row
    mv_cols: Series, number of missing values in each column
    mv_rows_ratio: Series, ratio of missing values for each row
    mv_cols_ratio: Series, ratio of missing values for each column
    '''

    data = pd.DataFrame(data).copy()
    mv_rows = data.isna().sum(axis=1)
    mv_cols = data.isna().sum(axis=0)
    mv_total = data.isna().sum().sum()
    mv_rows_ratio = mv_rows / data.shape[1]
    mv_cols_ratio = mv_cols / data.shape[0]

    return {'mv_total': mv_total,
            'mv_rows': mv_rows,
            'mv_cols': mv_cols,
            'mv_rows_ratio': mv_rows_ratio,
            'mv_cols_ratio': mv_cols_ratio}
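
# Hypothetical usage sketch (not part of the module):
# >>> mv = _missing_vals(pd.DataFrame({'a': [1, None], 'b': [None, None]}))
# >>> mv['mv_total']
# 3
# >>> mv['mv_cols_ratio'].tolist()
# [0.5, 1.0]
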

def _validate_input_bool(value, desc):
    if not isinstance(value, bool):
        raise TypeError(f'Input value for {desc} is {type(value)} but should be a boolean.')


def _validate_input_int(value, desc):
    if type(value) is not int:
        raise TypeError(f'Input value for {desc} is {type(value)} but should be an integer.')


def _validate_input_range(value, desc, lower, upper):
    if value < lower or value > upper:
        raise ValueError(
            f'Input value for {desc} is {value} but should be in the range {lower} <= {desc} <= {upper}.')


def _validate_input_smaller(value1, value2, desc):
    if value1 > value2:
        raise ValueError(f'The first input for {desc} should be smaller than or equal to the second input.')
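
# Hypothetical usage sketch (not part of the module): the validators pass
# silently on valid input and raise on invalid input.
# >>> _validate_input_range(0.3, 'threshold', 0, 1)   # OK, returns None
# >>> _validate_input_bool('yes', 'inplace')
# TypeError: Input value for inplace is <class 'str'> but should be a boolean.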