# import required python modules
import numpy as np
from numpy import genfromtxt
import pandas as pd
import matplotlib.pyplot as plt
from os import listdir
import os.path
import urllib.request
import zipfile
import keras
from keras.utils.np_utils import to_categorical

def split_activities(labels, X, borders=10*100):
    """
    Splits the data into a separate subset per activity and excludes
    activity=0. Also removes the borders (by default 10 seconds at 100 Hz)
    at the start and end of each activity segment.
    Returns lists with the subdatasets and their activity labels.
    """
    tot_len = len(labels)
    startpoints = np.where([1] + [labels[i] != labels[i-1]
                                  for i in range(1, tot_len)])[0]
    endpoints = np.append(startpoints[1:] - 1, tot_len - 1)
    acts = [labels[s] for s, e in zip(startpoints, endpoints)]
    # Also split up the data, and only keep the non-zero activities
    Xy_split = [(X[s+borders:e-borders+1, :], a)
                for s, e, a in zip(startpoints, endpoints, acts) if a != 0]
    Xy_split = [(X, y) for X, y in Xy_split if len(X) > 0]
    X_list = [X for X, y in Xy_split]
    y_list = [y for X, y in Xy_split]
    return X_list, y_list

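# A minimal usage sketch for split_activities; the toy arrays below are
# illustrative assumptions, not part of the pipeline. With borders=0 the
# function returns one (X, label) pair per contiguous non-zero activity run:
#
#     labels = np.array([0, 0, 1, 1, 1, 2, 2, 0])
#     X = np.arange(16).reshape(8, 2)
#     X_list, y_list = split_activities(labels, X, borders=0)
#     # y_list == [1, 2]; X_list holds rows 2..4 and rows 5..6 of X
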
def sliding_window(frame_length, step, X_samples, y_samples,
                   y_samples_list, X_samples_list):
    """Cut each subdataset into fixed-length frames with a sliding window
    and append the frames (in place) to X_samples and y_samples."""
    for j in range(len(X_samples_list)):
        X = X_samples_list[j]
        y_binary = y_samples_list[j]
        for i in range(0, X.shape[0] - frame_length, step):
            X_sub = X[i:i + frame_length, :]
            y_sub = y_binary
            X_samples.append(X_sub)
            y_samples.append(y_sub)

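# Sketch of how sliding_window is meant to be called (toy names assumed):
# it fills the X_samples/y_samples lists in place rather than returning them.
#
#     X_frames, y_frames = [], []
#     sliding_window(frame_length=512, step=100, X_samples=X_frames,
#                    y_samples=y_frames, y_samples_list=y_list,
#                    X_samples_list=X_list)
#     # each element of X_frames has shape (512, n_channels)
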
def transform_y(y, mapclasses, nr_classes):
    """Map the activity IDs in y to consecutive class indices and
    one-hot encode them."""
    y_mapped = np.array([mapclasses[c] for c in y], dtype='int')
    y_binary = to_categorical(y_mapped, nr_classes)
    return y_binary

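# Worked example for transform_y (the mapping below is an assumption
# for illustration):
#
#     mapclasses = {1: 0, 2: 1, 3: 2}   # activity ID -> class index
#     transform_y(np.array([1, 3, 2]), mapclasses, nr_classes=3)
#     # -> [[1, 0, 0], [0, 0, 1], [0, 1, 0]]
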
def addheader(datasets):
    # The columns are numbers, which is not very practical.
    # Add column labels to the pandas dataframes.
    # (The 'magnometer' spelling is kept as-is, since caller-supplied
    # column names must match it.)
    axes = ['x', 'y', 'z']
    IMUsensor_columns = ['temperature'] + \
        ['acc_16g_' + i for i in axes] + \
        ['acc_6g_' + i for i in axes] + \
        ['gyroscope_' + i for i in axes] + \
        ['magnometer_' + i for i in axes] + \
        ['orientation_' + str(i) for i in range(4)]
    header = ["timestamp", "activityID", "heartrate"] + \
        ["hand_" + s for s in IMUsensor_columns] + \
        ["chest_" + s for s in IMUsensor_columns] + \
        ["ankle_" + s for s in IMUsensor_columns]
    for i in range(len(datasets)):
        datasets[i].columns = header
    return datasets

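# Sanity-check sketch: each IMU contributes 17 columns (1 temperature,
# 4 sensor triplets of x/y/z, and 4 orientation values), so the header has
# 3 + 3*17 = 54 entries, matching the 54 space-separated columns of the
# PAMAP2 .dat files.
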
def split_dataset(datasets_filled, X_lists, y_binary_lists):
    # Split per subject: train on subjects 0-5, validate on subject 6,
    # and test on the remaining subjects
    train_range = slice(0, 6)
    val_range = 6
    test_range = slice(7, len(datasets_filled))
    X_train_list = [X for X_list in X_lists[train_range] for X in X_list]
    X_val_list = list(X_lists[val_range])
    X_test_list = [X for X_list in X_lists[test_range] for X in X_list]
    y_train_list = [y for y_list in y_binary_lists[train_range]
                    for y in y_list]
    y_val_list = list(y_binary_lists[val_range])
    y_test_list = [y for y_list in y_binary_lists[test_range]
                   for y in y_list]
    return X_train_list, X_val_list, X_test_list, \
        y_train_list, y_val_list, y_test_list

def numpify_and_store(x, y, Xname, yname, outdatapath, shuffle=False):
    x = np.array(x)
    y = np.array(y)
    # Shuffle the samples (fixed seed for reproducibility)
    if shuffle:
        np.random.seed(123)
        neworder = np.random.permutation(x.shape[0])
        x = x[neworder, :, :]
        y = y[neworder, :]
    # Save as binary .npy files (np.save appends the extension)
    np.save(outdatapath + Xname, x)
    np.save(outdatapath + yname, y)


def fetch_data(directory_to_extract_to):
    targetdir = directory_to_extract_to + '/PAMAP2'
    if os.path.exists(targetdir):
        print('Data previously downloaded and stored in ' + targetdir)
    else:
        os.makedirs(targetdir)  # create target directory
        # Download the PAMAP2 data; the zip file is 688 MB
        path_to_zip_file = directory_to_extract_to + '/PAMAP2_Dataset.zip'
        if not os.path.isfile(path_to_zip_file):
            url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00231/PAMAP2_Dataset.zip'
            # retrieve the data from the url
            local_fn, headers = urllib.request.urlretrieve(
                url, filename=path_to_zip_file)
            print('Download complete and stored in: ' + path_to_zip_file)
        else:
            print('The data was previously downloaded and stored in ' +
                  path_to_zip_file)
        # unzip
        with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
            zip_ref.extractall(targetdir)
    return targetdir


def fetch_and_preprocess(directory_to_extract_to, columns_to_use):
    targetdir = fetch_data(directory_to_extract_to)
    outdatapath = targetdir + '/PAMAP2_Dataset/slidingwindow512cleaned/'
    if not os.path.exists(outdatapath):
        os.makedirs(outdatapath)
    if os.path.isfile(outdatapath + 'X_train.npy'):
        print('Data previously pre-processed and np-files saved to ' +
              outdatapath)
    else:
        datadir = targetdir + '/PAMAP2_Dataset/Protocol'
        filenames = listdir(datadir)
        print('Start pre-processing all ' + str(len(filenames)) + ' files...')
        # Load the files and put them in a list of pandas dataframes:
        datasets = [pd.read_csv(datadir + '/' + fn, header=None, sep=' ')
                    for fn in filenames]
        datasets = addheader(datasets)  # add headers to the datasets
        # Interpolate to fill the gaps caused by the differing sample rates
        # between channels
        datasets_filled = [d.interpolate() for d in datasets]
        # Create mapping for class labels
        y_set_all = [set(np.array(data.activityID)) - set([0])
                     for data in datasets_filled]
        classlabels = list(set.union(*[set(y) for y in y_set_all]))
        nr_classes = len(classlabels)
        mapclasses = {classlabels[i]: i for i in range(len(classlabels))}
        # Create input (X) and output (y) sets
        X_all = [np.array(data[columns_to_use]) for data in datasets_filled]
        y_all = [np.array(data.activityID) for data in datasets_filled]
        Xy_lists = [split_activities(y, X) for X, y in zip(X_all, y_all)]
        X_lists, y_lists = zip(*Xy_lists)
        y_binary_lists = [transform_y(y, mapclasses, nr_classes)
                          for y in y_lists]
        # Split in train, test and val
        X_train_list, X_val_list, X_test_list, y_train_list, y_val_list, \
            y_test_list = split_dataset(datasets_filled, X_lists,
                                        y_binary_lists)
        # Take sliding-window frames; the target is the activity label
        # of the segment. The data is sampled at 100 Hz.
        frame_length = int(5.12 * 100)
        step = 1 * 100
        X_train, y_train = [], []
        X_val, y_val = [], []
        X_test, y_test = [], []
        sliding_window(frame_length, step, X_train, y_train,
                       y_train_list, X_train_list)
        sliding_window(frame_length, step, X_val, y_val,
                       y_val_list, X_val_list)
        sliding_window(frame_length, step, X_test, y_test,
                       y_test_list, X_test_list)
        # Store under the file names that load_data expects
        numpify_and_store(X_train, y_train, 'X_train', 'y_train_binary',
                          outdatapath, shuffle=True)
        numpify_and_store(X_val, y_val, 'X_val', 'y_val_binary',
                          outdatapath, shuffle=True)
        numpify_and_store(X_test, y_test, 'X_test', 'y_test_binary',
                          outdatapath, shuffle=True)
        print('Processed data successfully stored in ' + outdatapath)
    return outdatapath

def load_data(outputpath):
    ext = '.npy'
    X_train = np.load(outputpath + 'X_train' + ext)
    y_train_binary = np.load(outputpath + 'y_train_binary' + ext)
    X_val = np.load(outputpath + 'X_val' + ext)
    y_val_binary = np.load(outputpath + 'y_val_binary' + ext)
    X_test = np.load(outputpath + 'X_test' + ext)
    y_test_binary = np.load(outputpath + 'y_test_binary' + ext)
    return X_train, y_train_binary, X_val, y_val_binary, \
        X_test, y_test_binary
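
# A minimal end-to-end sketch of how this module might be driven; the
# target directory and the column selection below are assumptions for
# illustration, not fixed choices of the pipeline:
if __name__ == '__main__':
    columns_to_use = ['hand_acc_16g_x', 'hand_acc_16g_y', 'hand_acc_16g_z',
                      'chest_acc_16g_x', 'chest_acc_16g_y', 'chest_acc_16g_z',
                      'ankle_acc_16g_x', 'ankle_acc_16g_y', 'ankle_acc_16g_z']
    outdatapath = fetch_and_preprocess('.', columns_to_use)
    X_train, y_train_binary, X_val, y_val_binary, X_test, y_test_binary = \
        load_data(outdatapath)
    print('X_train shape: ' + str(X_train.shape))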