# function to drop empty rows and columns based on thresholds and reindex?
# setting for row-wise, column-wise or both (e.g. dropping rows might make little sense in a time series)

# setting for a "hard drop" (if NaN in this field, drop the row/column) --> field must exist --> consider imputation

# list all dropped columns and rows and provide a before/after summary of shape, memory usage, etc.
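# A rough, commented sketch of such a helper (the names drop_missing, drop_threshold_cols/_rows
# and hard_drop_cols are placeholders, not a final API); a column/row is dropped once its
# NaN-fraction reaches the respective threshold:
# def drop_missing(data, drop_threshold_cols=0.9, drop_threshold_rows=0.9, hard_drop_cols=None):
#     data = data.copy()
#     if hard_drop_cols:
#         # "hard drop": remove rows with NaN in columns that must exist
#         data = data.dropna(subset=hard_drop_cols)
#     # drop columns whose NaN-fraction is at or above the threshold
#     data = data.loc[:, data.isna().mean() < drop_threshold_cols]
#     # drop rows whose NaN-fraction is at or above the threshold, then reindex
#     data = data.loc[data.isna().mean(axis=1) < drop_threshold_rows, :].reset_index(drop=True)
#     return data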

# drop further columns and rows based on criteria
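# A possible commented sketch (the column names and query string are made up for illustration):
# # drop columns by name
# data = data.drop(columns=['id', 'comments'], errors='ignore')
# # drop rows by a boolean condition or a query
# data = data[data['age'].between(0, 120)]
# data = data.query("status != 'cancelled'")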

# deal with outliers --> Outlier models? Possible options? Default values?
# list possible outliers based on standard deviation
# winsorize?
# quantile based

# Dropping the outlier rows with percentiles
# upper_lim = data['column'].quantile(.95)
# lower_lim = data['column'].quantile(.05)
# data = data[(data['column'] < upper_lim) & (data['column'] > lower_lim)]

# Capping the outlier rows with percentiles (capping the data at a certain value instead of dropping)
# upper_lim = data['column'].quantile(.95)
# lower_lim = data['column'].quantile(.05)
# data.loc[data['column'] > upper_lim, 'column'] = upper_lim
# data.loc[data['column'] < lower_lim, 'column'] = lower_lim
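# A commented sketch of the standard-deviation approach and of winsorizing ('column' and the
# factor of 3 are illustrative defaults, not settled choices):
# import numpy as np
# from scipy.stats.mstats import winsorize

# # flag values that lie more than 3 standard deviations from the mean (z-score)
# z_scores = (data['column'] - data['column'].mean()) / data['column'].std()
# print(data.loc[z_scores.abs() > 3, 'column'])

# # winsorize: replace the lowest and highest 5% of values instead of dropping them
# data['column'] = np.asarray(winsorize(data['column'].to_numpy(), limits=(0.05, 0.05)))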

# imputation methods
# column-wise or row-wise?
# mean
# median - more robust to outliers
# ...
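# A commented sketch of simple column-wise imputation (sklearn's SimpleImputer is one option;
# the column selection and the strategy are only examples):
# from sklearn.impute import SimpleImputer

# num_cols = data.select_dtypes(include='number').columns
# imputer = SimpleImputer(strategy='median')  # or 'mean', 'most_frequent', 'constant'
# data[num_cols] = imputer.fit_transform(data[num_cols])

# # plain pandas alternative for a single column
# data['column'] = data['column'].fillna(data['column'].median())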

# further feature engineering

# extract information from datetimes
# create features for year, month, day, weekday, weekend, day of the week, holiday, ...

# import pandas as pd
# from datetime import date

# data = pd.DataFrame({'date':
#                      ['01-01-2017',
#                       '04-12-2008',
#                       '23-06-1988',
#                       '25-08-1999',
#                       '20-02-1993',
#                       ]})

# # Transform string to date
# data['date'] = pd.to_datetime(data.date, format="%d-%m-%Y")

# # Extract the year
# data['year'] = data['date'].dt.year

# # Extract the month
# data['month'] = data['date'].dt.month

# # Extract the number of years passed since the date
# data['passed_years'] = date.today().year - data['date'].dt.year

# # Extract the number of months passed since the date
# data['passed_months'] = (date.today().year - data['date'].dt.year) * 12 + date.today().month - data['date'].dt.month

# # Extract the weekday name of the date
# data['day_name'] = data['date'].dt.day_name()

#         date  year  month  passed_years  passed_months   day_name
# 0 2017-01-01  2017      1             2             26     Sunday
# 1 2008-12-04  2008     12            11            123   Thursday
# 2 1988-06-23  1988      6            31            369   Thursday
# 3 1999-08-25  1999      8            20            235  Wednesday
# 4 1993-02-20  1993      2            26            313   Saturday
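# The weekend and holiday features mentioned above are still missing from the example; a possible
# commented sketch (the US federal holiday calendar is just one assumption, any calendar works):
# from pandas.tseries.holiday import USFederalHolidayCalendar

# # Saturday and Sunday have dayofweek values 5 and 6
# data['is_weekend'] = data['date'].dt.dayofweek >= 5

# # flag dates that fall on a (US federal) holiday
# holidays = USFederalHolidayCalendar().holidays(start=data['date'].min(), end=data['date'].max())
# data['is_holiday'] = data['date'].isin(holidays)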

# binning (give option to choose features to bin and how)
# binning of numerical variables
# binning of categorical data
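# A commented sketch of numerical binning with pandas (column names, bin edges and labels are
# placeholders); a categorical binning example follows below:
# # fixed-width bins with explicit edges and labels
# data['age_group'] = pd.cut(data['age'], bins=[0, 18, 35, 65, 120],
#                            labels=['child', 'young', 'adult', 'senior'])
# # quantile-based bins with (roughly) equal counts per bin
# data['income_quartile'] = pd.qcut(data['income'], q=4, labels=['q1', 'q2', 'q3', 'q4'])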

# Categorical binning example
# import numpy as np

#      Country
# 0      Spain
# 1      Chile
# 2  Australia
# 3      Italy
# 4     Brazil

# conditions = [
#     data['Country'].str.contains('Spain'),
#     data['Country'].str.contains('Italy'),
#     data['Country'].str.contains('Chile'),
#     data['Country'].str.contains('Brazil')]

# choices = ['Europe', 'Europe', 'South America', 'South America']

# data['Continent'] = np.select(conditions, choices, default='Other')

#      Country      Continent
# 0      Spain         Europe
# 1      Chile  South America
# 2  Australia          Other
# 3      Italy         Europe
# 4     Brazil  South America

# encode dummies from categorical features using sklearn one-hot encoding (check intercept, regularization, etc.) and describe why sklearn is preferable to pd.get_dummies
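# One reason to prefer sklearn's OneHotEncoder over pd.get_dummies: the fitted encoder stores the
# category levels and can be reapplied consistently to new/unseen data at prediction time.
# A commented sketch (cat_cols and drop='first' are illustrative choices, e.g. to avoid the dummy
# variable trap when a model includes an intercept):
# import pandas as pd
# from sklearn.preprocessing import OneHotEncoder

# cat_cols = data.select_dtypes(include=['object', 'category']).columns
# # sparse_output requires scikit-learn >= 1.2 (older versions use sparse=False)
# encoder = OneHotEncoder(drop='first', sparse_output=False)
# dummies = pd.DataFrame(encoder.fit_transform(data[cat_cols]),
#                        columns=encoder.get_feature_names_out(cat_cols),
#                        index=data.index)
# data = pd.concat([data.drop(columns=cat_cols), dummies], axis=1)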

# export / save the "clean" data?
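# A possible commented sketch (file names and formats are placeholders; parquet preserves dtypes,
# csv does not):
# data.to_parquet('data_clean.parquet')
# data.to_csv('data_clean.csv', index=False)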


def convert_datatypes(data, category=True, cat_threshold=0.05, exclude=[]):
    '''
    Convert columns to the best possible dtypes using dtypes supporting pd.NA.

    Columns whose ratio of unique values to rows is below cat_threshold are
    additionally converted to the 'category' dtype, unless listed in exclude.
    '''
    data = data.copy()
    for col in data.columns:
        data[col] = data[col].convert_dtypes()
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if category and unique_vals_ratio < cat_threshold and col not in exclude:
            data[col] = data[col].astype('category')
    return data


def memory_usage(data):
    '''
    Total memory usage of the DataFrame in kilobytes.
    '''
    memory_usage = round(data.memory_usage(index=True, deep=True).sum() / 1024, 2)
    return memory_usage