'''
Utilities for data cleaning.

:author: Andreas Kanz

'''

# Imports
import pandas as pd
import statsmodels


# function to drop empty rows and columns based on thresholds and reindex?
# setting for row-wise or column-wise or both to drop (i.e. it might make little sense to drop rows in a time series)

# setting for "hard drop" (if NaN in this field, drop the row/column) --> Must exist. --> Consider imputation

# list all dropped columns and rows and provide a before/after summary of shape, memory usage, etc.

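# A minimal sketch of the threshold-based dropping described above; the helper
# name drop_missing and its parameters are illustrative assumptions, not part of
# the existing API. The thresholds give the minimum share of non-missing values a
# column/row must have in order to be kept.
# def drop_missing(data, drop_threshold_cols=0.9, drop_threshold_rows=0.9):
#     data = pd.DataFrame(data).copy()
#     # keep columns with at least drop_threshold_cols share of non-NA values
#     data = data.dropna(axis=1, thresh=int(drop_threshold_cols * data.shape[0]))
#     # keep rows with at least drop_threshold_rows share of non-NA values
#     data = data.dropna(axis=0, thresh=int(drop_threshold_rows * data.shape[1]))
#     return data.reset_index(drop=True)
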
# drop further columns and rows based on criteria


# deal with outliers --> Outlier models? Possible options? Default values?
# list possible outliers based on standard deviation
# winsorize?
# quantile based

# Dropping the outlier rows with percentiles
# upper_lim = data['column'].quantile(.95)
# lower_lim = data['column'].quantile(.05)
# data = data[(data['column'] < upper_lim) & (data['column'] > lower_lim)]

# Capping the outlier rows with percentiles (capping the data at a certain value)
# upper_lim = data['column'].quantile(.95)
# lower_lim = data['column'].quantile(.05)
# data.loc[data['column'] > upper_lim, 'column'] = upper_lim
# data.loc[data['column'] < lower_lim, 'column'] = lower_lim

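# A minimal sketch of the standard-deviation check mentioned above: flag values
# more than n_std standard deviations away from the column mean. The helper name
# and parameters are illustrative assumptions.
# def outliers_by_std(series, n_std=3):
#     mean, std = series.mean(), series.std()
#     return series[(series - mean).abs() > n_std * std]
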
# imputation methods
# column-wise or row-wise?
# mean
# median - more outlier resistant
# ...

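# A minimal sketch of the simple imputation strategies listed above, assuming a
# numeric column; the helper name and the strategy parameter are illustrative
# assumptions rather than an existing API.
# def impute_simple(series, strategy='median'):
#     fill_value = series.median() if strategy == 'median' else series.mean()
#     return series.fillna(fill_value)
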
# further feature engineering

# extract information from datetimes
# create features for year, month, day, weekday, weekend, day of the week, holiday, ...

# from datetime import date

# data = pd.DataFrame({'date':
#                      ['01-01-2017',
#                       '04-12-2008',
#                       '23-06-1988',
#                       '25-08-1999',
#                       '20-02-1993',
#                       ]})

# # Transform string to date
# data['date'] = pd.to_datetime(data.date, format="%d-%m-%Y")

# # Extracting year
# data['year'] = data['date'].dt.year

# # Extracting month
# data['month'] = data['date'].dt.month

# # Extracting passed years since the date
# data['passed_years'] = date.today().year - data['date'].dt.year

# # Extracting passed months since the date
# data['passed_months'] = (date.today().year - data['date'].dt.year) * 12 + date.today().month - data['date'].dt.month

# # Extracting the weekday name of the date
# data['day_name'] = data['date'].dt.day_name()

#         date  year  month  passed_years  passed_months   day_name
# 0 2017-01-01  2017      1             2             26     Sunday
# 1 2008-12-04  2008     12            11            123   Thursday
# 2 1988-06-23  1988      6            31            369   Thursday
# 3 1999-08-25  1999      8            20            235  Wednesday
# 4 1993-02-20  1993      2            26            313   Saturday

# binning (give option to choose features to bin and how)
# binning of numerical variables
# binning of categorical data

# Categorical binning example
#      Country
# 0      Spain
# 1      Chile
# 2  Australia
# 3      Italy
# 4     Brazil

# conditions = [
#     data['Country'].str.contains('Spain'),
#     data['Country'].str.contains('Italy'),
#     data['Country'].str.contains('Chile'),
#     data['Country'].str.contains('Brazil')]

# choices = ['Europe', 'Europe', 'South America', 'South America']

# data['Continent'] = np.select(conditions, choices, default='Other')

#      Country      Continent
# 0      Spain         Europe
# 1      Chile  South America
# 2  Australia          Other
# 3      Italy         Europe
# 4     Brazil  South America

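# A minimal sketch of the numerical binning mentioned above using pd.cut; the
# column name 'age' and the bin edges/labels are illustrative assumptions.
# data['age_group'] = pd.cut(data['age'], bins=[0, 18, 65, 120],
#                            labels=['minor', 'adult', 'senior'])
# pd.qcut(data['age'], q=4) would instead create four equally populated bins.
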
# encode dummies from categorical features using sklearn's OneHotEncoder (check intercept, regularization etc.);
# provide a description of why sklearn is preferred over pd.get_dummies

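# A minimal sketch of one-hot encoding with scikit-learn; the column selection
# and parameters are illustrative assumptions. Unlike pd.get_dummies, a fitted
# OneHotEncoder can be reapplied consistently to unseen data.
# from sklearn.preprocessing import OneHotEncoder
# encoder = OneHotEncoder(handle_unknown='ignore')
# dummies = encoder.fit_transform(data[['Country']]).toarray()
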
# export / save "clean data"?


def convert_datatypes(data, category=True, cat_threshold=0.05, exclude=[]):
    '''
    Convert columns to the best possible dtypes using dtypes supporting pd.NA.

    Parameters:
    ----------
    data: Pandas DataFrame or numpy ndarray.

    category: bool, default True
        Change dtypes of columns to "category". Set the threshold using cat_threshold.

    cat_threshold: float, default 0.05
        Ratio of unique values below which the column dtype is changed to categorical.

    exclude: list, default [] (empty list)
        List of columns to exclude from categorical conversion.

    Returns:
    -------
    Pandas DataFrame.

    '''

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        data[col] = data[col].convert_dtypes()
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if category and unique_vals_ratio < cat_threshold and col not in exclude:
            data[col] = data[col].astype('category')
    return data

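# Usage sketch (illustrative; the frame and column names are assumptions):
# df_converted = convert_datatypes(df, cat_threshold=0.05, exclude=['id'])

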
def memory_usage(data):  # --> describe.py & setup imports
    '''
    Total memory usage in kilobytes.

    Parameters:
    ----------
    data: Pandas DataFrame or numpy ndarray.

    Returns:
    -------
    memory_usage: float
    '''

    data = pd.DataFrame(data)
    memory_usage = round(data.memory_usage(index=True, deep=True).sum() / 1024, 2)
    return memory_usage


def missing_vals(data):  # --> describe.py & setup imports
    '''
    Total number of missing values in the dataset.

    Parameters:
    ----------
    data: Pandas DataFrame or numpy ndarray.

    Returns:
    -------
    missing_vals: int
    '''

    data = pd.DataFrame(data)
    missing_vals = data.isna().sum().sum()
    return missing_vals


def summary_stats(data):
    '''
    Print the shape of the dataset and return rounded descriptive statistics.
    '''
    rows = data.shape[0]
    cols = data.shape[1]
    describe = data.describe().round(1)
    # look for possible statistics (statsmodels)
    # no visualizations here

    # Shape
    print('Summary statistics:\n___________________\n')
    print(f'Number of rows: {rows}')
    print(f'Number of columns: {cols}\n___________________\n')
    return describe


def data_cleaning(data, showall=True):
    '''
    Initial data cleaning: convert dtypes and report memory usage before and after.
    '''
    data_cleaned = convert_datatypes(data)

    if showall:
        print(f'Memory usage before data cleaning: {memory_usage(data)} kilobytes.')
        print(f'Memory usage after data cleaning: {memory_usage(data_cleaned)} kilobytes.')

    return data_cleaned


# before: number of columns, number of rows, memory usage, number of NAs
# cleaning empty columns etc.
# after: number of columns, number of rows, memory usage, number of NAs
# improvement / changes

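# A minimal sketch of the before/after report outlined above, reusing the helpers
# defined in this module; the helper name and the exact wording are assumptions.
# def cleaning_report(before, after):
#     print(f'Shape: {before.shape} --> {after.shape}')
#     print(f'Memory usage: {memory_usage(before)} KB --> {memory_usage(after)} KB')
#     print(f'Missing values: {missing_vals(before)} --> {missing_vals(after)}')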