klib.clean.memory_usage()
# function to drop empty rows and columns based on thresholds and reindex?
# setting for row-wise or column-wise or both to drop (i.e. it might make little sense to drop rows in a time series)

# setting for "hard drop" (if NaN in this field, drop the row/column) --> Must exist. --> Consider imputation

# list all dropped columns and rows and provide a before and after summary of shape, memory, etc. (see the sketch below)
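# A minimal sketch of such a dropper, assuming plain NaN-share thresholds;
# the name, defaults and print-out are illustrative, not existing klib API.
def drop_missing(data, col_threshold=0.9, row_threshold=0.9):
    '''
    Drop columns/rows whose share of missing values exceeds the given
    thresholds, print a before/after shape summary and reindex the rows.
    '''
    shape_before = data.shape
    # keep columns with at least (1 - col_threshold) * n_rows non-missing values
    data = data.dropna(axis=1, thresh=int((1 - col_threshold) * data.shape[0]))
    # keep rows with at least (1 - row_threshold) * n_cols non-missing values
    data = data.dropna(axis=0, thresh=int((1 - row_threshold) * data.shape[1]))
    print(f'Shape before: {shape_before} - after: {data.shape}')
    return data.reset_index(drop=True)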


# drop further columns and rows based on criteria (e.g. constant columns or boolean filters, see below)
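# e.g. (sketch, column names are illustrative):
# data = data.drop(columns=['constant_col'])
# data = data[data['amount'] > 0]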


# deal with outliers --> Outlier models? Possible options? Default values?
# list possible outliers based on the standard deviation
# winsorize?
# quantile based
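# Sketch of a standard-deviation based listing (the 3-sigma cutoff is an
# assumed default, not a klib setting):
# z_scores = (data['column'] - data['column'].mean()) / data['column'].std()
# possible_outliers = data[z_scores.abs() > 3]
# Winsorizing could lean on scipy, e.g.:
# from scipy.stats.mstats import winsorize
# data['column'] = winsorize(data['column'], limits=(0.05, 0.05))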
# Dropping the outlier rows with percentiles
# upper_lim = data['column'].quantile(.95)
# lower_lim = data['column'].quantile(.05)
# data = data[(data['column'] < upper_lim) & (data['column'] > lower_lim)]

# Capping the outlier rows with percentiles (capping the data at a certain value)
# upper_lim = data['column'].quantile(.95)
# lower_lim = data['column'].quantile(.05)
# data.loc[data['column'] > upper_lim, 'column'] = upper_lim
# data.loc[data['column'] < lower_lim, 'column'] = lower_lim

# imputation methods
# per column or per row?
# mean
# median - more resistant to outliers
# ...
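# A minimal sketch of column-wise imputation for numeric columns; the
# function name, signature and defaults are illustrative, not klib API.
def impute_numeric(data, method='median'):
    '''
    Fill missing values in numeric columns with the column mean or the
    (more outlier-resistant) column median.
    '''
    data = data.copy()
    num_cols = data.select_dtypes(include='number').columns
    fill_values = data[num_cols].median() if method == 'median' else data[num_cols].mean()
    # fillna with a Series aligns on the column labels
    data[num_cols] = data[num_cols].fillna(fill_values)
    return data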


# further feature engineering

# extract information from datetimes
# create features for year, month, day, day of the week, weekend, holidays, ...

# from datetime import date

# data = pd.DataFrame({'date':
# ['01-01-2017',
# '04-12-2008',
# '23-06-1988',
# '25-08-1999',
# '20-02-1993',
# ]})

# # Transform string to date
# data['date'] = pd.to_datetime(data.date, format="%d-%m-%Y")

# # Extracting the year
# data['year'] = data['date'].dt.year

# # Extracting the month
# data['month'] = data['date'].dt.month

# # Extracting the years passed since the date
# data['passed_years'] = date.today().year - data['date'].dt.year

# # Extracting the months passed since the date
# data['passed_months'] = (date.today().year - data['date'].dt.year) * 12 + date.today().month - data['date'].dt.month

# # Extracting the weekday name of the date
# data['day_name'] = data['date'].dt.day_name()

#         date  year  month  passed_years  passed_months   day_name
# 0 2017-01-01  2017      1             2             26     Sunday
# 1 2008-12-04  2008     12            11            123   Thursday
# 2 1988-06-23  1988      6            31            369   Thursday
# 3 1999-08-25  1999      8            20            235  Wednesday
# 4 1993-02-20  1993      2            26            313   Saturday
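# A possible extension for the weekday/weekend features mentioned above
# (sketch, not part of the original example):
# data['weekday'] = data['date'].dt.dayofweek        # Monday=0 ... Sunday=6
# data['is_weekend'] = data['date'].dt.dayofweek >= 5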

# binning (give option to choose features to bin and how)
# binning of numerical variables (see the pd.cut sketch below)
# binning of categorical data
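# Numerical binning sketch with pd.cut (bin edges and labels are illustrative):
# data['bin'] = pd.cut(data['value'], bins=[0, 30, 70, 100], labels=['low', 'mid', 'high'])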

# Categorical Binning Example
#      Country
# 0      Spain
# 1      Chile
# 2  Australia
# 3      Italy
# 4     Brazil

# conditions = [
#     data['Country'].str.contains('Spain'),
#     data['Country'].str.contains('Italy'),
#     data['Country'].str.contains('Chile'),
#     data['Country'].str.contains('Brazil')]

# choices = ['Europe', 'Europe', 'South America', 'South America']

# data['Continent'] = np.select(conditions, choices, default='Other')

#      Country      Continent
# 0      Spain         Europe
# 1      Chile  South America
# 2  Australia          Other
# 3      Italy         Europe
# 4     Brazil  South America

# encode dummies from categorical features using sklearn's one-hot encoder (check intercept, regularization, etc.); add a note on why sklearn is preferred over pd.get_dummies
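# Sketch (assumed usage of the sklearn API): unlike pd.get_dummies, a
# fitted encoder can be re-applied to unseen/test data with the same columns.
# from sklearn.preprocessing import OneHotEncoder
# encoder = OneHotEncoder(handle_unknown='ignore')
# dummies = encoder.fit_transform(data[['Country']])  # sparse matrix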

# export / save "clean data"?


def convert_datatypes(data, category=True, cat_threshold=0.05, exclude=None):
    '''
    Convert columns to the best possible dtypes, i.e. dtypes that support
    pd.NA. If category is True, a column is additionally cast to 'category'
    when its ratio of unique values is below cat_threshold, unless the
    column is listed in exclude.
    '''
    data = data.copy()
    exclude = exclude if exclude is not None else []
    for col in data.columns:
        data[col] = data[col].convert_dtypes()
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if category and unique_vals_ratio < cat_threshold and col not in exclude:
            data[col] = data[col].astype('category')
    return data
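# Usage sketch (illustrative): data = convert_datatypes(data) keeps the values
# unchanged but yields nullable dtypes such as Int64, string or boolean, plus
# category for low-cardinality columns.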


def memory_usage(data):
    '''
    Total memory usage in kibibytes (KiB), i.e. bytes / 1024, rounded to
    two decimals.
    '''
    return round(data.memory_usage(index=True, deep=True).sum() / 1024, 2)
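# Usage sketch (illustrative): memory_usage(data) returns e.g. 153.42,
# i.e. the DataFrame's deep memory footprint in KiB.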