1
|
|
|
""" |
2
|
|
|
Summary: |
3
|
|
|
Function fetch_and_preprocess from tutorial_pamap2.py helps to fetch and |
4
|
|
|
preprocess the data. |
5
|
|
|
Example function calls in 'Tutorial mcfly on PAMAP2.ipynb' |
6
|
|
|
""" |
7
|
1 |
|
import numpy as np |
8
|
1 |
|
from numpy import genfromtxt |
9
|
1 |
|
import pandas as pd |
10
|
1 |
|
import matplotlib.pyplot as plt |
11
|
1 |
|
from os import listdir |
12
|
1 |
|
import os.path |
13
|
1 |
|
import zipfile |
14
|
1 |
|
import keras |
15
|
1 |
|
from keras.utils.np_utils import to_categorical |
16
|
1 |
|
import sys |
17
|
1 |
|
if sys.version_info <= (3,): #python2 |
18
|
|
|
import urllib |
19
|
|
|
else: #python3 |
20
|
1 |
|
import urllib.request |
21
|
|
|
|
22
|
1 |
|
def split_activities(labels, X, borders=10*100):
    """
    Split the data per activity and exclude activity=0.

    A new segment starts wherever a label differs from the previous label.
    From every segment with a non-zero label, `borders` samples are removed
    at both ends. Segments that are too short to survive the trimming are
    dropped.

    labels: sequence with one activity label per row of X
    X: array that is indexed as X[rows, :]
    borders: number of samples to cut from both ends of each segment
        (expected to be non-negative)

    Returns (Xlist, ylist): the trimmed sub-arrays and, for each of them,
    the activity label of the segment it came from.
    """
    tot_len = len(labels)
    if tot_len == 0:
        # Nothing to split; indexing labels[0] below would fail.
        return [], []
    label_arr = np.asarray(labels)
    # Index i+1 starts a new segment when label i+1 differs from label i.
    changes = np.where(label_arr[1:] != label_arr[:-1])[0] + 1
    startpoints = np.append([0], changes)
    endpoints = np.append(startpoints[1:] - 1, tot_len - 1)
    Xlist = []
    ylist = []
    for s, e in zip(startpoints, endpoints):
        activity = labels[s]
        if activity != 0:
            lo = s + borders
            hi = e - borders + 1
            # A segment shorter than 2*borders has hi <= lo. Checking this
            # explicitly matters: hi can become negative, and a negative
            # slice stop would wrap around and select rows that belong to
            # other activities.
            if hi > lo:
                segment = X[lo:hi, :]
                if len(segment) > 0:
                    Xlist.append(segment)
                    ylist.append(activity)
    return Xlist, ylist
40
|
|
|
|
41
|
1 |
|
def sliding_window(frame_length, step, Xsamples,
                   ysamples, ysampleslist, Xsampleslist):
    """
    Cut every time series in Xsampleslist into overlapping frames.

    A window of frame_length rows is moved over each series in strides of
    step rows. Each frame is appended to Xsamples and the matching entry of
    ysampleslist is appended to ysamples, so both output lists are extended
    in place and nothing is returned. Frames of all series end up in the
    same lists, so the distinction between participants is not kept.

    NOTE(review): window starts run up to, but not including,
    len(series) - frame_length, so a series of exactly frame_length rows
    yields no frame. Confirm whether that is intended.
    """
    for j, series in enumerate(Xsampleslist):
        target = ysampleslist[j]
        last_start = series.shape[0] - frame_length
        for start in range(0, last_start, step):
            Xsamples.append(series[start:start + frame_length, :])
            ysamples.append(target)
58
|
|
|
|
59
|
1 |
|
def transform_y(y, mapclasses, nr_classes):
    """
    Turn a sequence of class labels into a binary class matrix.

    y: sequence of labels, each of which must be a key of mapclasses
    mapclasses: mapping from label to integer class index
    nr_classes: number of classes passed on to to_categorical

    Returns the result of to_categorical applied to the mapped indices.
    """
    class_indices = []
    for label in y:
        class_indices.append(mapclasses[label])
    return to_categorical(np.array(class_indices, dtype='int'), nr_classes)
68
|
|
|
|
69
|
1 |
|
def addheader(datasets):
    """
    Give every data frame in datasets the PAMAP2 column names.

    The frames come in with numbered columns. This assigns the same list of
    names to the `columns` attribute of each of them: three general columns
    followed by the 17 IMU columns for hand, chest and ankle.

    Returns the same datasets object that was passed in.
    """
    imu_fields = ['temperature']
    for prefix in ('acc_16g_', 'acc_6g_', 'gyroscope_', 'magnometer_'):
        imu_fields.extend(prefix + axis for axis in ('x', 'y', 'z'))
    imu_fields.extend('orientation_' + str(k) for k in range(4))

    column_names = ["timestamp", "activityID", "heartrate"]
    for location in ("hand_", "chest_", "ankle_"):
        column_names.extend(location + field for field in imu_fields)

    for frame in datasets:
        frame.columns = column_names
    return datasets
88
|
|
|
|
89
|
1 |
|
def numpify_and_store(X, y, xname, yname, outdatapath, shuffle=False):
    """
    Convert X and y to numpy arrays and save them with np.save.

    X, y: python lists; when shuffling, X is indexed with three axes and
        y with two
    xname, yname: file names, appended to outdatapath
    outdatapath: path prefix of the output files
    shuffle: only when this is exactly True are the samples reordered,
        using a permutation drawn after seeding numpy's global random
        generator with 123
    """
    samples, targets = np.array(X), np.array(y)
    if shuffle is True:
        # Fixed seed, so the stored order is the same on every run.
        np.random.seed(123)
        order = np.random.permutation(samples.shape[0])
        samples, targets = samples[order, :, :], targets[order, :]
    # Write both arrays as binary files.
    for stem, data in ((xname, samples), (yname, targets)):
        np.save(outdatapath + stem, data)
106
|
|
|
|
107
|
|
|
|
108
|
1 |
|
def fetch_data(directory_to_extract_to):
    """
    Fetch the data and extract the contents of the zip file
    to the sub-directory 'PAMAP2' of directory_to_extract_to.

    If that sub-directory already exists, the data is assumed to be there
    and nothing is done. If 'PAMAP2_Dataset.zip' is already present in
    directory_to_extract_to it is used instead of downloading again.

    The target directory is created only once the archive could be opened,
    and it is removed again when the extraction fails. A failed or
    interrupted run can therefore not be mistaken for a finished one.

    Returns the path of the target directory.
    """
    targetdir = directory_to_extract_to + '/PAMAP2'
    if os.path.exists(targetdir):
        print('Data previously downloaded and stored in ' + targetdir)
        return targetdir

    # The zip file is stored here, so the directory has to exist before
    # the download starts.
    if directory_to_extract_to and \
            not os.path.isdir(directory_to_extract_to):
        os.makedirs(directory_to_extract_to)

    path_to_zip_file = directory_to_extract_to + '/PAMAP2_Dataset.zip'
    if os.path.isfile(path_to_zip_file):
        print('The data was previously downloaded and stored in ' +
              path_to_zip_file)
    else:
        # download the PAMAP2 data, this is 688 Mb
        url = ('https://archive.ics.uci.edu/ml/' +
               'machine-learning-databases/00231/PAMAP2_Dataset.zip')
        if sys.version_info <= (3,):  # python2
            urllib.urlretrieve(url, filename=path_to_zip_file)
        else:  # python3
            urllib.request.urlretrieve(url, filename=path_to_zip_file)
        print('Download complete and stored in: ' + path_to_zip_file)

    # unzip; opening the archive first means a corrupt file raises before
    # the target directory is created
    with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
        import shutil
        os.makedirs(targetdir)
        extracted = False
        try:
            zip_ref.extractall(targetdir)
            extracted = True
        finally:
            if not extracted:
                # Do not leave a partial directory behind: its existence
                # is what marks the data as available.
                shutil.rmtree(targetdir, ignore_errors=True)
    return targetdir
140
|
|
|
|
141
|
|
|
|
142
|
1 |
|
def fetch_and_preprocess(directory_to_extract_to, columns_to_use=None):
    """
    High level function to fetch_and_preprocess the PAMAP2 dataset

    directory_to_extract_to: the directory where the data will be stored
    columns_to_use: the columns to use; defaults to the x, y and z axes of
        the 16g accelerometers on hand, ankle and chest

    The files in the 'Protocol' directory are read in sorted file name
    order. The first six files form the train set, the seventh the
    validation set and the remaining ones the test set. Each set is cut
    into frames of 512 samples with a stride of 100 samples and written to
    the output directory as X_train/y_train, X_val/y_val and X_test/y_test
    .npy files.

    When all six output files already exist the pre-processing is skipped.
    Note that this check does not look at columns_to_use.

    Returns the path of the output directory (with trailing slash).
    """
    if columns_to_use is None:
        columns_to_use = ['hand_acc_16g_x', 'hand_acc_16g_y', 'hand_acc_16g_z',
                          'ankle_acc_16g_x', 'ankle_acc_16g_y',
                          'ankle_acc_16g_z',
                          'chest_acc_16g_x', 'chest_acc_16g_y',
                          'chest_acc_16g_z']
    targetdir = fetch_data(directory_to_extract_to)
    outdatapath = targetdir + '/PAMAP2_Dataset' + '/slidingwindow512cleaned/'
    if not os.path.exists(outdatapath):
        os.makedirs(outdatapath)

    # The names must match what is written at the end of this function
    # (capital 'X'); np.save appends the '.npy' extension. Checking all six
    # files also catches a run that was interrupted half-way.
    stored_names = ['X_train', 'y_train', 'X_val', 'y_val',
                    'X_test', 'y_test']
    if all(os.path.isfile(outdatapath + name + '.npy')
           for name in stored_names):
        print('Data previously pre-processed and np-files saved to ' +
              outdatapath)
        return outdatapath

    datadir = targetdir + '/PAMAP2_Dataset/Protocol'
    # listdir returns the names in arbitrary order; sort them so that the
    # train/val/test split below is the same on every machine.
    filenames = sorted(listdir(datadir))
    print('Start pre-processing all ' + str(len(filenames)) + ' files...')
    # load the files and put them in a list of pandas dataframes:
    datasets = [pd.read_csv(datadir + '/' + fn, header=None, sep=' ')
                for fn in filenames]
    datasets = addheader(datasets)  # add headers to the datasets
    # Interpolate dataset to get same sample rate between channels
    datasets_filled = [d.interpolate() for d in datasets]
    # Create mapping for class labels; activity 0 is excluded. Sorting
    # makes the label -> index mapping explicit and reproducible.
    ysetall = [set(np.array(data.activityID)) - set([0])
               for data in datasets_filled]
    classlabels = sorted(set.union(*ysetall))
    nr_classes = len(classlabels)
    mapclasses = {label: index for index, label in enumerate(classlabels)}
    # Create input (x) and output (y) sets
    xall = [np.array(data[columns_to_use]) for data in datasets_filled]
    yall = [np.array(data.activityID) for data in datasets_filled]

    xylists = [split_activities(y, x) for x, y in zip(xall, yall)]
    Xlists, ylists = zip(*xylists)
    ybinarylists = [transform_y(y, mapclasses, nr_classes) for y in ylists]
    # Split in train, test and val
    train_range = slice(0, 6)
    val_range = 6
    test_range = slice(7, len(datasets_filled))
    x_trainlist = [X for Xlist in Xlists[train_range] for X in Xlist]
    x_vallist = list(Xlists[val_range])
    x_testlist = [X for Xlist in Xlists[test_range] for X in Xlist]
    y_trainlist = [y for ylist in ybinarylists[train_range] for y in ylist]
    y_vallist = list(ybinarylists[val_range])
    y_testlist = [y for ylist in ybinarylists[test_range] for y in ylist]

    # Take sliding-window frames. Every frame gets the label of the
    # activity segment it was cut from.
    # Data is 100 Hz
    frame_length = int(5.12 * 100)
    step = 1 * 100
    x_train = []
    y_train = []
    x_val = []
    y_val = []
    x_test = []
    y_test = []
    sliding_window(frame_length, step, x_train, y_train, y_trainlist,
                   x_trainlist)
    sliding_window(frame_length, step, x_val, y_val, y_vallist, x_vallist)
    sliding_window(frame_length, step, x_test, y_test,
                   y_testlist, x_testlist)
    numpify_and_store(x_train, y_train, 'X_train', 'y_train',
                      outdatapath, shuffle=True)
    numpify_and_store(x_val, y_val, 'X_val', 'y_val', outdatapath,
                      shuffle=False)
    numpify_and_store(x_test, y_test, 'X_test', 'y_test', outdatapath,
                      shuffle=False)
    print('Processed data succesfully stored in ' + outdatapath)
    return outdatapath
216
|
|
|
|
217
|
1 |
|
def load_data(outputpath):
    """
    Load the six stored .npy files from outputpath.

    outputpath: path prefix of the files; the file names are appended to
        it directly

    Returns a tuple (x_train, y_train_binary, x_val, y_val_binary,
    x_test, y_test_binary).
    """
    stems = ('X_train', 'y_train', 'X_val', 'y_val', 'X_test', 'y_test')
    return tuple(np.load(outputpath + stem + '.npy') for stem in stems)
226
|
|
|
|
This check looks for invalid names for a range of different identifiers.
You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.
If your project includes a Pylint configuration file, the settings contained in that file take precedence.
To find out more about Pylint, please refer to their site.