Completed
Push — master ( 12b66d...1bf4c2 )
by Dafne van
10s
created

slidingwindow_store()   B

Complexity

Conditions 1

Size

Total Lines 27

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 1
CRAP Score 1.6296

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 1
c 2
b 0
f 0
dl 0
loc 27
ccs 1
cts 7
cp 0.1429
crap 1.6296
rs 8.8571

1 Method

Rating   Name   Duplication   Size   Complexity  
A map_class() 0 7 4
"""
 Summary:
 Function fetch_and_preprocess from tutorial_pamap2.py helps to fetch and
 preprocess the data.
 Example function calls in 'Tutorial mcfly on PAMAP2.ipynb'
"""
import os.path
import shutil
import sys
import zipfile
from os import listdir

import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import six.moves.urllib as urllib
from keras.utils.np_utils import to_categorical
from numpy import genfromtxt


def split_activities(labels, X, exclude_activities, borders=10 * 100):
    """
    Splits up the data per activity and drops the excluded activities.
    Also removes the borders of each activity segment.
    Returns lists with subdatasets

    Parameters
    ----------
    labels : numpy array
        Activity labels, one per timestep
    X : numpy array
        Data points, 2D with one row per timestep
    exclude_activities : list or tuple
        activities to exclude from the output
    borders : int
        Nr of timesteps to remove from the borders of an activity

    Returns
    -------
    X_list : list
        One array per retained segment (rows of X without the borders)
    y_list : list
        Activity label of each retained segment
    """
    labels = np.asarray(labels)
    tot_len = len(labels)
    if tot_len == 0:
        # No timesteps means no segments; labels[0] below would raise.
        return [], []
    # A segment starts at index 0 and at every timestep whose label differs
    # from the previous one; it ends right before the next segment starts.
    startpoints = np.concatenate(
        ([0], np.flatnonzero(labels[1:] != labels[:-1]) + 1))
    endpoints = np.append(startpoints[1:] - 1, tot_len - 1)
    Xlist = []
    ylist = []
    for start, end in zip(startpoints, endpoints):
        activity = labels[start]
        if activity in exclude_activities:
            continue
        first = start + borders
        stop = end - borders + 1
        # Check the bounds explicitly: a negative `stop` used directly as a
        # slice end would wrap around and select rows of other activities.
        if stop <= first:
            continue
        segment = X[first:stop, :]
        if len(segment) > 0:
            Xlist.append(segment)
            ylist.append(activity)
    return Xlist, ylist


def sliding_window(frame_length, step, Xsampleslist, ysampleslist):
    """
    Splits the time series in Xsampleslist into segments by applying
    a sliding overlapping window of size equal to frame_length with
    steps equal to step. It does this for all the samples and appends
    all the output together, so the distinction between samples is
    not kept.

    Parameters
    ----------
    frame_length : int
        Length of sliding window
    step : int
        Stepsize between windows
    Xsampleslist : list
        Samples (2D arrays, one row per timestep) to take sliding
        windows from
    ysampleslist : list
        One target per sample; it is repeated for every window taken
        from that sample

    Returns
    -------
    Xsamples : list
        Windows of frame_length rows each
    ysamples : list
        Target of the sample each window was taken from
    """
    Xsamples = []
    ysamples = []
    for X, ybinary in zip(Xsampleslist, ysampleslist):
        # Last index at which a complete window still fits. The range end
        # is exclusive, hence the + 1: a sample of exactly frame_length
        # rows yields one window.
        last_start = X.shape[0] - frame_length
        for i in range(0, last_start + 1, step):
            Xsamples.append(X[i:i + frame_length, :])
            ysamples.append(ybinary)
    return Xsamples, ysamples


def transform_y(y, mapclasses, nr_classes):
    """
    Converts a sequence of class labels into a binary (one-hot)
    Numpy matrix with one row per element of y and one column per
    class.

    Parameters
    ----------
    y : list or array
        List of classes
    mapclasses : dict
        dictionary that maps the classes to numbers
    nr_classes : int
        total number of classes

    Returns
    -------
    ybinary
        Result of to_categorical applied to the mapped class numbers
    """
    # Look up the integer index of every label before encoding.
    class_indices = np.array([mapclasses[label] for label in y],
                             dtype='int')
    return to_categorical(class_indices, nr_classes)


def get_header():
    """
    Builds the list of column names for a PAMAP2 data file.

    The header consists of three general columns followed by the same
    set of IMU sensor columns for the hand, chest and ankle sensor.

    Returns
    -------
    header : list
        Column names (str)
    """
    axes = ['x', 'y', 'z']
    imu_columns = ['temperature']
    # 'magnometer_' (sic): the spelling is part of the column names that
    # callers select on, so it is kept as is.
    for sensor in ('acc_16g_', 'acc_6g_', 'gyroscope_', 'magnometer_'):
        imu_columns += [sensor + axis for axis in axes]
    imu_columns += ['orientation_' + str(i) for i in range(4)]

    header = ["timestamp", "activityID", "heartrate"]
    for location in ('hand_', 'chest_', 'ankle_'):
        header += [location + column for column in imu_columns]
    return header


def addheader(datasets):
    """
    The columns of the pandas data frame are numbers
    this function adds the column labels

    Parameters
    ----------
    datasets : list
        List of pandas dataframes

    Returns
    -------
    datasets : list
        The same list; the dataframes are relabelled in place
    """
    header = get_header()
    for dataset in datasets:
        dataset.columns = header
    return datasets


def numpify_and_store(X, y, X_name, y_name, outdatapath, shuffle=False):
    """
    Converts the python lists X and y into numpy arrays
    and stores the numpy arrays in directory outdatapath
    shuffle is optional and shuffles the samples

    Parameters
    ----------
    X : list
        list with data
    y : list
        list with data
    X_name : str
        name to store the x arrays
    y_name : str
        name to store the y arrays
    outdatapath : str
        path to the directory to store the data
    shuffle : bool
        whether to shuffle the data before storing
    """
    X = np.array(X)
    y = np.array(y)
    if shuffle:
        # Fixed seed so the stored order is reproducible. Note that this
        # reseeds numpy's global random state.
        np.random.seed(123)
        neworder = np.random.permutation(X.shape[0])
        # Index the first axis only, so that empty input and labels of
        # any dimensionality are handled as well.
        X = X[neworder]
        y = y[neworder]
    # Save binary file (np.save appends the '.npy' extension)
    xpath = os.path.join(outdatapath, X_name)
    ypath = os.path.join(outdatapath, y_name)
    np.save(xpath, X)
    np.save(ypath, y)
    print('Stored ' + xpath, y_name)


def fetch_data(directory_to_extract_to):
    """
    Fetch the data and extract the contents of the zip file
    to the directory_to_extract_to.
    First check whether this was done before, if yes, then skip

    Parameters
    ----------
    directory_to_extract_to : str
        directory to create subfolder 'PAMAP2'

    Returns
    -------
    targetdir: str
        directory where the data is extracted
    """
    targetdir = os.path.join(directory_to_extract_to, 'PAMAP2/')
    if os.path.exists(targetdir):
        print('Data previously downloaded and stored in ' + targetdir)
        return targetdir

    # The target directory is only created by the extraction itself, so a
    # failed download cannot leave an empty directory behind that a later
    # call would mistake for previously fetched data.
    parent_missing = not os.path.isdir(directory_to_extract_to)
    if directory_to_extract_to and parent_missing:
        os.makedirs(directory_to_extract_to)

    path_to_zip_file = os.path.join(directory_to_extract_to,
                                    'PAMAP2_Dataset.zip')
    if os.path.isfile(path_to_zip_file):
        print('The data was previously downloaded and stored in ' +
              path_to_zip_file)
    else:
        # Download the PAMAP2 data, this is 688 Mb
        url = str('https://archive.ics.uci.edu/ml/' +
                  'machine-learning-databases/00231/PAMAP2_Dataset.zip')
        # Download under a temporary name and rename when complete, so an
        # interrupted download is never taken for the full archive.
        partial_path = path_to_zip_file + '.part'
        urllib.request.urlretrieve(url, filename=partial_path)
        os.rename(partial_path, path_to_zip_file)
        print('Download complete and stored in: ' + path_to_zip_file)

    # unzip
    try:
        with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
            zip_ref.extractall(targetdir)
    except Exception:
        # Remove a partial extraction before re-raising; otherwise the
        # existence check above would skip the data on the next call.
        shutil.rmtree(targetdir, ignore_errors=True)
        raise
    return targetdir


def map_class(datasets_filled):
    """
    Creates a mapping from the activity labels in the datasets to
    consecutive class numbers. Activity 0 is left out.

    Parameters
    ----------
    datasets_filled : list
        List of datasets, each with an attribute activityID holding
        the activity label per timestep

    Returns
    -------
    classlabels : list
        The distinct activity labels (without 0), sorted
    nr_classes : int
        Number of distinct labels
    mapclasses : dict
        Maps each label to its position in classlabels
    """
    ysetall = [set(np.array(data.activityID)) - set([0])
               for data in datasets_filled]
    # Sorted, so the label -> index mapping does not depend on set
    # iteration order; set().union also copes with an empty list.
    classlabels = sorted(set().union(*ysetall))
    nr_classes = len(classlabels)
    mapclasses = {label: i for i, label in enumerate(classlabels)}
    return classlabels, nr_classes, mapclasses


def split_data(Xlists, ybinarylists, indices):
    """ Function takes subset from list given indices

    Parameters
    ----------
    Xlists: tuple
        tuple (samples) of lists (windows) of numpy-arrays (time, variable)
    ybinarylists :
        list (samples) of numpy-arrays (window, class)
    indices : slice or int
        indices of the slice of data (samples) to be taken. With a
        slice the selected samples are flattened into one list, with
        a single index the elements of that one sample are returned.

    Returns
    -------
    x_setlist : list
        list (windows across samples) of numpy-arrays (time, variable)
    y_setlist: list
        list (windows across samples) of numpy-arrays (class, )
    """
    if isinstance(indices, slice):
        x_setlist = [X for Xlist in Xlists[indices] for X in Xlist]
        y_setlist = [y for ylist in ybinarylists[indices] for y in ylist]
    else:
        x_setlist = list(Xlists[indices])
        y_setlist = list(ybinarylists[indices])
    return x_setlist, y_setlist


def split_data_random(X, y, val_size, test_size):
    """
    Randomly splits X and y into a train, validation and test set.

    Parameters
    ----------
    X : list or array
        Data; the split is made along the first axis
    y : list or array
        Targets belonging to X
    val_size : int
        Number of samples in the validation set
    test_size : int
        Number of samples in the test set

    Returns
    -------
    X_train, y_train, X_val, y_val, X_test, y_test : numpy arrays
        The train set holds the samples that are left after taking
        the validation and test samples

    Raises
    ------
    ValueError
        If a size is negative or val_size + test_size exceeds the
        number of samples
    """
    X = np.array(X)
    y = np.array(y)
    size = len(X)
    train_size = size - val_size - test_size
    if val_size < 0 or test_size < 0 or train_size < 0:
        # A negative size would be read as a from-the-end slice bound
        # and silently yield overlapping sets.
        raise ValueError(
            'val_size (%s) and test_size (%s) must be non-negative and '
            'sum to at most the number of samples (%s)'
            % (val_size, test_size, size))
    indices = np.random.permutation(size)
    train_idx = indices[:train_size]
    val_idx = indices[train_size:train_size + val_size]
    test_idx = indices[train_size + val_size:]
    return (X[train_idx], y[train_idx],
            X[val_idx], y[val_idx],
            X[test_idx], y[test_idx])


def preprocess(targetdir, outdatapath, columns_to_use, exclude_activities,
               fold, val_test_size=None):
    """ Function to preprocess the PAMAP2 data after it is fetched

    Parameters
    ----------
    targetdir : str
        subdirectory of directory_to_extract_to, targetdir
        is defined by function fetch_data
    outdatapath : str
        a subdirectory of directory_to_extract_to, outdatapath
        is the directory where the Numpy output will be stored.
    columns_to_use : list
        list of column names to use
    exclude_activities : list or tuple
        activities to exclude from the output
    fold : boolean
        Whether to store each fold separately ('False' creates
        Train, Test and Validation sets)
    val_test_size : tuple or None
        Only used when fold is False. A pair (val_size, test_size):
        the windows of all files are pooled and split randomly into
        sets of these sizes, the remainder being the train set.
        If None, the split is made per data file (in sorted filename
        order): files 0-5 are the train set, file 6 the validation
        set and files 7 and up the test set.

    Returns
    -------
    None
    """
    datadir = os.path.join(targetdir, 'PAMAP2_Dataset', 'Protocol')
    filenames = sorted(listdir(datadir))
    print('Start pre-processing all ' + str(len(filenames)) + ' files...')
    # load the files and put them in a list of pandas dataframes:
    datasets = [pd.read_csv(os.path.join(datadir, fn), header=None, sep=' ')
                for fn in filenames]
    datasets = addheader(datasets)  # add headers to the datasets
    # Interpolate dataset to get same sample rate between channels
    datasets_filled = [d.interpolate() for d in datasets]
    # Create mapping for class labels
    # NOTE: map_class always leaves out activity 0, independent of
    # exclude_activities.
    _, nr_classes, mapclasses = map_class(datasets_filled)
    # Create input (x) and output (y) sets
    xall = [np.array(data[columns_to_use]) for data in datasets_filled]
    yall = [np.array(data.activityID) for data in datasets_filled]
    xylists = [split_activities(y, x, exclude_activities)
               for x, y in zip(xall, yall)]
    Xlists, ylists = zip(*xylists)
    ybinarylists = [transform_y(y, mapclasses, nr_classes) for y in ylists]
    frame_length = int(5.12 * 100)
    step = 1 * 100
    if not fold:
        if val_test_size is None:
            # Split in train, test and val
            x_vallist, y_vallist = split_data(Xlists, ybinarylists,
                                              indices=6)
            test_range = slice(7, len(datasets_filled))
            x_testlist, y_testlist = split_data(Xlists, ybinarylists,
                                                test_range)
            x_trainlist, y_trainlist = split_data(Xlists, ybinarylists,
                                                  indices=slice(0, 6))
            # Take sliding-window frames; every frame gets the target of
            # the activity segment it was taken from.
            x_train, y_train = sliding_window(frame_length, step,
                                              x_trainlist, y_trainlist)
            x_val, y_val = sliding_window(frame_length, step,
                                          x_vallist, y_vallist)
            x_test, y_test = sliding_window(frame_length, step,
                                            x_testlist, y_testlist)
        else:
            val_size, test_size = val_test_size
            # Pool the segments of all files, then split the windows
            # randomly.
            X_list, y_list = split_data(Xlists, ybinarylists,
                                        slice(0, len(datasets_filled)))
            X, y = sliding_window(frame_length, step, X_list, y_list)
            (x_train, y_train, x_val, y_val,
             x_test, y_test) = split_data_random(X, y, val_size, test_size)

        # Store as numpy files; only the train set is shuffled.
        numpify_and_store(x_train, y_train, X_name='X_train',
                          y_name='y_train', outdatapath=outdatapath,
                          shuffle=True)
        numpify_and_store(x_val, y_val, X_name='X_val', y_name='y_val',
                          outdatapath=outdatapath, shuffle=False)
        numpify_and_store(x_test, y_test, X_name='X_test', y_name='y_test',
                          outdatapath=outdatapath, shuffle=False)
    else:
        # One shuffled X_<i>/y_<i> pair per data file.
        for i in range(len(Xlists)):
            X_i, y_i = split_data(Xlists, ybinarylists, i)
            X, y = sliding_window(frame_length, step, X_i, y_i)
            numpify_and_store(X, y, X_name='X_' + str(i),
                              y_name='y_' + str(i),
                              outdatapath=outdatapath, shuffle=True)

    print('Processed data succesfully stored in ' + outdatapath)
    return None


def fetch_and_preprocess(directory_to_extract_to, columns_to_use=None,
                         output_dir='slidingwindow512cleaned',
                         exclude_activities=(0,), fold=False,
                         val_test_size=None):
    """
    High level function to fetch_and_preprocess the PAMAP2 dataset

    Parameters
    ----------
    directory_to_extract_to : str
        the directory where the data will be stored
    columns_to_use : list
        the columns to use; if None, the 16g accelerometer x, y and z
        columns of the hand, ankle and chest sensors are used
    output_dir : str
        name of the directory to write the outputdata to
    exclude_activities : list or tuple
        activities to exclude from the output
    fold : boolean
        Whether to store each fold separately ('False' creates
        Train, Test and Validation sets)
    val_test_size : tuple or None
        Passed on to preprocess unchanged

    Returns
    -------
    outdatapath: str
        The directory in which the numpy files are stored
    """
    if columns_to_use is None:
        columns_to_use = [
            'hand_acc_16g_x', 'hand_acc_16g_y', 'hand_acc_16g_z',
            'ankle_acc_16g_x', 'ankle_acc_16g_y', 'ankle_acc_16g_z',
            'chest_acc_16g_x', 'chest_acc_16g_y', 'chest_acc_16g_z',
        ]
    targetdir = fetch_data(directory_to_extract_to)
    outdatapath = os.path.join(targetdir, 'PAMAP2_Dataset/', output_dir)
    if not os.path.exists(outdatapath):
        os.makedirs(outdatapath)
    # NOTE: preprocessing is always run, also when output files of an
    # earlier run are already present in outdatapath.
    preprocess(targetdir, outdatapath, columns_to_use, exclude_activities,
               fold, val_test_size)
    return outdatapath


def load_data(outputpath):
    """ Function to load the numpy data as stored in directory
    outputpath.

    Parameters
    ----------
    outputpath : str
        directory where the numpy files are stored

    Returns
    -------
    x_train
    y_train_binary
    x_val
    y_val_binary
    x_test
    y_test_binary
    """
    ext = '.npy'
    # File names in the order in which the arrays are returned.
    names = ('X_train', 'y_train', 'X_val', 'y_val', 'X_test', 'y_test')
    return tuple(np.load(os.path.join(outputpath, name + ext))
                 for name in names)