Passed
Push to master (aa5eb4...c0ff44) by Andreas, 01:11

klib.clean (Rating: A)

Complexity
    Total Complexity: 10

Size/Duplication
    Total Lines: 217
    Duplicated Lines: 0 %

Importance
    Changes: 0

Metric   Value
wmc      10
eloc     33
dl       0
loc      217
rs       10
c        0
b        0
f        0

5 Functions

Rating   Name                   Duplication   Size   Complexity
A        missing_vals()         0             16     1
A        memory_usage()         0             17     1
A        summary_stats()        0             11     1
A        convert_datatypes()    0             30     5
A        data_cleaning()        0             15     2

'''
Utilities for data cleaning.

:author: Andreas Kanz

'''

# Imports
import pandas as pd
import statsmodels


# function to drop empty rows and columns based on thresholds and reindex?
# setting for row-wise or column-wise or both to drop (i.e. it might make little sense to drop rows in a time series)

# setting for "hard drop" (if NaN in this field, drop the row/column) --> Must exist. --> Consider imputation

# list all dropped columns and rows and provide a before and after summary of shape, memory etc.


# drop further columns and rows based on criteria

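# A minimal sketch of the thresholded drop described above (illustrative only;
# the name drop_missing and the thresh_row / thresh_col parameters are placeholders):
# def drop_missing(data, thresh_row=0.9, thresh_col=0.9):
#     data = pd.DataFrame(data).copy()
#     data = data.dropna(axis=0, thresh=int(thresh_row * data.shape[1]))  # keep rows with enough non-NA cells
#     data = data.dropna(axis=1, thresh=int(thresh_col * data.shape[0]))  # keep columns with enough non-NA cells
#     return data.reset_index(drop=True)
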
# deal with outliers --> Outlier models? Possible Options? Default values?
# list possible outliers based on standard deviation
# winsorize?
# quantile based
# Dropping the outlier rows with Percentiles
# upper_lim = data['column'].quantile(.95)
# lower_lim = data['column'].quantile(.05)

# capping the data at a certain value
# Capping the outlier rows with Percentiles
# upper_lim = data['column'].quantile(.95)
# lower_lim = data['column'].quantile(.05)
# data.loc[data['column'] > upper_lim, 'column'] = upper_lim
# data.loc[data['column'] < lower_lim, 'column'] = lower_lim


# data = data[(data['column'] < upper_lim) & (data['column'] > lower_lim)]

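# A possible winsorizing sketch for the idea above (assumes scipy is available;
# trimming 5% on each tail is just an example value):
# from scipy.stats.mstats import winsorize
# data['column'] = winsorize(data['column'], limits=[0.05, 0.05])
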
# imputation methods
# col or row?
# mean
# median - more outlier resistant
# ...

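# Column-wise imputation sketch for the options listed above (illustrative only):
# data['column'] = data['column'].fillna(data['column'].mean())    # mean
# data['column'] = data['column'].fillna(data['column'].median())  # median, more outlier resistant

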
# further feature engineering

# extract information from datetimes
# create features for year, month, day, weekday, weekend, day of the week, holiday, ...

# from datetime import date

# data = pd.DataFrame({'date':
# ['01-01-2017',
# '04-12-2008',
# '23-06-1988',
# '25-08-1999',
# '20-02-1993',
# ]})

# #Transform string to date
# data['date'] = pd.to_datetime(data.date, format="%d-%m-%Y")

# #Extracting Year
# data['year'] = data['date'].dt.year

# #Extracting Month
# data['month'] = data['date'].dt.month

# #Extracting passed years since the date
# data['passed_years'] = date.today().year - data['date'].dt.year

# #Extracting passed months since the date
# data['passed_months'] = (date.today().year - data['date'].dt.year) * 12 + date.today().month - data['date'].dt.month

# #Extracting the weekday name of the date
# data['day_name'] = data['date'].dt.day_name()
#         date  year  month  passed_years  passed_months   day_name
# 0 2017-01-01  2017      1             2             26     Sunday
# 1 2008-12-04  2008     12            11            123   Thursday
# 2 1988-06-23  1988      6            31            369   Thursday
# 3 1999-08-25  1999      8            20            235  Wednesday
# 4 1993-02-20  1993      2            26            313   Saturday

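# Possible additions to the example above for weekday/weekend flags (sketch only):
# data['weekday'] = data['date'].dt.dayofweek          # Monday = 0, Sunday = 6
# data['weekend'] = data['date'].dt.dayofweek >= 5
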
# binning (give option to choose features to bin and how)
# binning of numerical variables
# binning of categorical data

# Categorical Binning Example
#      Country
# 0      Spain
# 1      Chile
# 2  Australia
# 3      Italy
# 4     Brazil
# conditions = [
#     data['Country'].str.contains('Spain'),
#     data['Country'].str.contains('Italy'),
#     data['Country'].str.contains('Chile'),
#     data['Country'].str.contains('Brazil')]

# choices = ['Europe', 'Europe', 'South America', 'South America']

# data['Continent'] = np.select(conditions, choices, default='Other')
#      Country      Continent
# 0      Spain         Europe
# 1      Chile  South America
# 2  Australia          Other
# 3      Italy         Europe
# 4     Brazil  South America

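# Numerical binning sketch to complement the categorical example above
# (pd.cut / pd.qcut; bin edges, labels and the 'value' column are illustrative only):
# data['value_bin'] = pd.cut(data['value'], bins=[0, 10, 50, 100], labels=['low', 'mid', 'high'])
# data['value_quartile'] = pd.qcut(data['value'], q=4)
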
# encode dummies from categorical features using sklearn one hot (check intercept, regularization etc.); provide description why sklearn instead of pd.get_dummies

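# Sketch of the two encoding options mentioned above (illustrative; the 'Country'
# column is made up and sklearn is not yet imported in this module):
# pd.get_dummies(data, columns=['Country'])
# from sklearn.preprocessing import OneHotEncoder
# encoder = OneHotEncoder(handle_unknown='ignore')
# dummies = encoder.fit_transform(data[['Country']])
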
# export / save "clean data"?


def convert_datatypes(data, category=True, cat_threshold=0.05, exclude=[]):
    '''
    Convert columns to best possible dtypes using dtypes supporting pd.NA.

    Parameters:
    ----------
    data: Pandas DataFrame or numpy ndarray.

    category: bool, default True
        Change dtypes of columns to "category". Set the threshold using cat_threshold.

    cat_threshold: float, default 0.05
        Ratio of unique values below which a column's dtype is changed to categorical.

    exclude: default [] (empty list)
        List of columns to exclude from categorical conversion.

    Returns:
    -------
    Pandas DataFrame with updated dtypes.

    '''

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        data[col] = data[col].convert_dtypes()
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if category and unique_vals_ratio < cat_threshold and col not in exclude:
            data[col] = data[col].astype('category')
    return data
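
# Example usage (illustrative; column names and values are made up):
# df = pd.DataFrame({'city': ['Berlin', 'Hamburg'] * 50, 'sales': range(100)})
# convert_datatypes(df).dtypes  # 'city' becomes category, 'sales' becomes Int64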


def memory_usage(data):  # --> describe.py & setup imports
    '''
    Total memory usage in kilobytes.

    Parameters:
    ----------
    data: Pandas DataFrame or numpy ndarray.

    Returns:
    -------
    memory_usage: float

    '''

    data = pd.DataFrame(data)
    memory_usage = round(data.memory_usage(index=True, deep=True).sum() / 1024, 2)
    return memory_usage


def missing_vals(data):  # --> describe.py & setup imports
    '''
    Total missing values in the dataset.

    Parameters:
    ----------
    data: Pandas DataFrame or numpy ndarray.

    Returns:
    -------
    missing_vals: int

    '''

    data = pd.DataFrame(data)
    missing_vals = data.isna().sum().sum()
    return missing_vals


def summary_stats(data):
    '''
    Print the dataset's shape and return rounded summary statistics.
    '''
    rows = data.shape[0]
    cols = data.shape[1]
    describe = data.describe().round(1)
    # look for possible statistics (statsmodels)
    # no visualizations here
    # Shape
    print('Summary statistics:\n___________________\n')
    print(f'Number of rows: {rows}')
    print(f'Number of columns: {cols}\n___________________\n')
    return describe


def data_cleaning(data, showall=True):
    '''
    Perform initial data cleaning (dtype conversion) and optionally report memory usage before and after.
    '''
    data_cleaned = convert_datatypes(data)

    if showall:
        print(f'Memory usage before data cleaning: {memory_usage(data)} kilobytes.')
        print(f'Memory usage after data cleaning: {memory_usage(data_cleaned)} kilobytes.')

    return data_cleaned

#     before: number of columns, number of rows, memory usage, number of NAs
#     cleaning empty columns etc.
#     after: number of columns, number of rows, memory usage, number of NAs
#     improvement / changes
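
# A possible sketch of that before/after report, reusing the helpers above
# (cleaning_report is a placeholder name; not yet wired into data_cleaning):
# def cleaning_report(data, data_cleaned):
#     data = pd.DataFrame(data)
#     print(f'Shape: {data.shape} --> {data_cleaned.shape}')
#     print(f'Missing values: {missing_vals(data)} --> {missing_vals(data_cleaned)}')
#     print(f'Memory usage: {memory_usage(data)} --> {memory_usage(data_cleaned)} kilobytes')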
222