GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( 1ffeda...94eb5d )
by Andreas
01:28
created

klib.utils._validate_input_sum()   A

Complexity

Conditions 2

Size

Total Lines 3
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 3
nop 3
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
'''
2
Utilities and auxiliary functions.
3
4
:author: Andreas Kanz
5
6
'''
7
8
# Imports
9
import numpy as np
10
import pandas as pd
11
12
13
def _corr_selector(corr, split=None, threshold=0):
14
    '''
15
    Select correlations based on the provided parameters.
16
17
    Parameters
18
    ----------
19
    corr: List or matrix of correlations.
20
21
    split: {None, 'pos', 'neg', 'above', 'below'}, default None
22
        Type of split to be performed.
23
24
    threshold: float, default 0
25
        Value between 0 <= threshold <= 1
26
27
    Returns:
28
    -------
29
    corr: List or matrix of (filtered) correlations.
30
    '''
31
32
    if split == 'pos':
33
        corr = corr.where((corr >= threshold) & (corr > 0))
34
        print('Displaying positive correlations. Use "threshold" to further limit the results.')
35
    elif split == 'neg':
36
        corr = corr.where((corr <= threshold) & (corr < 0))
37
        print('Displaying negative correlations. Use "threshold" to further limit the results.')
38
    elif split == 'above':
39
        corr = corr.where(np.abs(corr) >= threshold)
40
        print(f'Displaying absolute correlations above the threshold ({threshold}).')
41
    elif split == 'below':
42
        corr = corr.where(np.abs(corr) <= threshold)
43
        print(f'Displaying absolute correlations below the threshold ({threshold}).')
44
    else:
45
        corr = corr
46
47
    return corr
48
49
50
def _diff_report(data, data_cleaned, dupl_rows=None, single_val_cols=None, show='changes'):
    """
    Print a summary of the differences between a dataset and its cleaned / updated version, \
        such as dropped rows and columns, removed missing values and memory savings.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.
        Input the initial dataset here.

    data_cleaned: 2D dataset that can be coerced into Pandas DataFrame.
        Input the cleaned / updated dataset here.

    dupl_rows: list, default None
        List of duplicate row indices.

    single_val_cols: list, default None
        List of single-valued column indices. I.e. columns where all cells contain the same value. \
        NaNs count as a separate value.

    show: {'all', 'changes', None}, default 'changes'
        Specify verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: No information about the data and the data cleaning is printed.

    Returns
    -------
    None. Prints statements highlighting the datasets or changes between the two datasets.
    """
    if show in ['changes', 'all']:
        # Copy the caller's lists so they are never mutated here.
        dupl_rows = [] if dupl_rows is None else dupl_rows.copy()
        single_val_cols = [] if single_val_cols is None else single_val_cols.copy()
        data_mem = _memory_usage(data)
        data_cl_mem = _memory_usage(data_cleaned)
        data_mv_tot = _missing_vals(data)['mv_total']
        data_cl_mv_tot = _missing_vals(data_cleaned)['mv_total']

        if show == 'all':
            print('Before data cleaning:\n')
            print(f'dtypes:\n{data.dtypes.value_counts()}')
            print(f'\nNumber of rows: {data.shape[0]}')
            print(f'Number of cols: {data.shape[1]}')
            print(f'Missing values: {data_mv_tot}')
            print(f'Memory usage: {data_mem} KB')
            print('_______________________________________________________\n')
            print('After data cleaning:\n')
            print(f'dtypes:\n{data_cleaned.dtypes.value_counts()}')
            print(f'\nNumber of rows: {data_cleaned.shape[0]}')
            print(f'Number of cols: {data_cleaned.shape[1]}')
            print(f'Missing values: {data_cl_mv_tot}')
            print(f'Memory usage: {data_cl_mem} KB')
            print('_______________________________________________________\n')

        print(f'Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {data_cl_mv_tot}')
        print('\nChanges:')  # plain string: no placeholders, so no f-prefix
        print(f'Dropped rows: {data.shape[0]-data_cleaned.shape[0]}')
        print(f'     of which {len(dupl_rows)} duplicates. (Rows: {dupl_rows})')
        print(f'Dropped columns: {data.shape[1]-data_cleaned.shape[1]}')
        print(f'     of which {len(single_val_cols)} single valued. (Columns: {single_val_cols})')
        print(f'Dropped missing values: {data_mv_tot-data_cl_mv_tot}')
        mem_change = data_mem-data_cl_mem
        # Guard against a zero-memory baseline to avoid ZeroDivisionError.
        mem_pct = round(100*mem_change/data_mem, 1) if data_mem else 0.0
        print(f'Reduced memory by: {round(mem_change,2)} KB (-{mem_pct}%)')
114
115
116
def _drop_duplicates(data):
117
    '''
118
    Provides information and drops duplicate rows.
119
120
    Parameters
121
    ----------
122
    data: 2D dataset that can be coerced into Pandas DataFrame.
123
124
    Returns
125
    -------
126
    data: Deduplicated Pandas DataFrame
127
    rows_dropped: Index Object of rows dropped.
128
    '''
129
130
    data = pd.DataFrame(data).copy()
131
    dupl_rows = data[data.duplicated()].index.tolist()
132
    data = data.drop(dupl_rows, axis='index')
133
134
    return data, dupl_rows
135
136
137
def _memory_usage(data):
138
    '''
139
    Gives the total memory usage in kilobytes.
140
141
    Parameters
142
    ----------
143
    data: 2D dataset that can be coerced into Pandas DataFrame.
144
145
    Returns
146
    -------
147
    memory_usage: float
148
    '''
149
150
    data = pd.DataFrame(data).copy()
151
    memory_usage = round(data.memory_usage(index=True, deep=True).sum()/1024, 2)
152
153
    return memory_usage
154
155
156
def _missing_vals(data):
157
    '''
158
    Gives metrics of missing values in the dataset.
159
160
    Parameters
161
    ----------
162
    data: 2D dataset that can be coerced into Pandas DataFrame.
163
164
    Returns
165
    -------
166
    mv_total: float, number of missing values in the entire dataset
167
    mv_rows: float, number of missing values in each row
168
    mv_cols: float, number of missing values in each column
169
    mv_rows_ratio: float, ratio of missing values for each row
170
    mv_cols_ratio: float, ratio of missing values for each column
171
    '''
172
173
    data = pd.DataFrame(data).copy()
174
    mv_rows = data.isna().sum(axis=1)
175
    mv_cols = data.isna().sum(axis=0)
176
    mv_total = data.isna().sum().sum()
177
    mv_rows_ratio = mv_rows/data.shape[1]
178
    mv_cols_ratio = mv_cols/data.shape[0]
179
180
    return {'mv_total': mv_total,
181
            'mv_rows': mv_rows,
182
            'mv_cols': mv_cols,
183
            'mv_rows_ratio': mv_rows_ratio,
184
            'mv_cols_ratio': mv_cols_ratio}
185
186
187
def _validate_input_bool(value, desc):
188
    if not(isinstance(value, bool)):
189
        raise TypeError(f"Input value for '{desc}' is {type(value)} but should be a boolean.")
190
191
192
def _validate_input_int(value, desc):
193
    if type(value) != int:
194
        raise TypeError(f"Input value for '{desc}' is {type(value)} but should be an integer.")
195
196
197
def _validate_input_range(value, desc, lower, upper):
198
    if value < lower or value > upper:
199
        raise ValueError(
200
            f"'{desc}' = {value} but should be within the range {lower} <= '{desc}' <= {upper}.")
201
202
203
def _validate_input_smaller(value1, value2, desc):
204
    if value1 > value2:
205
        raise ValueError(f"The first input for '{desc}' should be smaller or equal to the second input.")
206
207
208
def _validate_input_sum(limit, desc, *args):
209
    if sum(args) > limit:
210
        raise ValueError(f"The sum of imput values provided for '{desc}' should be less or equal to {limit}.")
211