Completed
Push — master ( 70f326...352d23 )
by Dafne van
05:32
created

addheader()   A

Complexity

Conditions 2

Size

Total Lines 14

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 2

Importance

Changes 4
Bugs 0 Features 0
Metric Value
cc 2
c 4
b 0
f 0
dl 0
loc 14
ccs 5
cts 5
cp 1
crap 2
rs 9.4285
1
"""
2
 Summary:
3
 Function fetch_and_preprocess from tutorial_pamap2.py helps to fetch and
4
 preprocess the data.
5
 Example function calls in 'Tutorial mcfly on PAMAP2.ipynb'
6
"""
7 1
import numpy as np
8 1
from numpy import genfromtxt
9 1
import pandas as pd
10 1
import matplotlib.pyplot as plt
11 1
from os import listdir
12 1
import os.path
13 1
import zipfile
14 1
import keras
15 1
from keras.utils.np_utils import to_categorical
16 1
import sys
17 1
import six.moves.urllib as urllib
18
19
20 1
def split_activities(labels, X, exclude_activities, borders=10 * 100):
    """
    Splits up the data per activity and drops the excluded activities.
    Also removes the borders of each activity segment.
    Returns lists with subdatasets

    Parameters
    ----------
    labels : numpy array
        Activity labels, one per row of X
    X : numpy array
        Data points (2D: the rows are sliced per activity segment)
    exclude_activities : list or tuple
        activities to exclude from the output
    borders : int
        Nr of timesteps to remove from the borders of an activity

    Returns
    -------
    X_list : list
        One array per retained segment, with the borders removed
    y_list : list
        The activity label belonging to each entry of X_list
    """
    labels = np.asarray(labels)
    tot_len = len(labels)
    if tot_len == 0:
        # Nothing to split; indexing labels[0] below would fail.
        return [], []
    # A segment starts at index 0 and wherever the label differs from
    # the label of the previous timestep.
    changes = np.flatnonzero(labels[1:] != labels[:-1]) + 1
    startpoints = np.concatenate(([0], changes))
    endpoints = np.append(startpoints[1:] - 1, tot_len - 1)

    Xlist = []
    ylist = []
    for start, end in zip(startpoints, endpoints):
        activity = labels[start]
        if activity in exclude_activities:
            continue
        first = start + borders
        stop = end - borders + 1
        # A segment shorter than the two borders has nothing left. The
        # explicit check matters: a negative `stop` would otherwise be
        # interpreted as an index counted from the end of X and select
        # rows that belong to other activities.
        if stop <= first:
            continue
        segment = X[first:stop, :]
        if len(segment) > 0:
            Xlist.append(segment)
            ylist.append(activity)
    return Xlist, ylist
55
56
57 1
def sliding_window(frame_length, step, Xsamples,
                   ysamples, Xsampleslist, ysampleslist):
    """
    Splits the time series in Xsampleslist into segments by applying
    a sliding overlapping window of size equal to frame_length with
    steps equal to step. It does this for all the samples and appends
    all the output together, so the participant distinction is not kept.
    The results are appended in place to Xsamples and ysamples; nothing
    is returned.

    Parameters
    ----------
    frame_length : int
        Length of sliding window
    step : int
        Stepsize between windows
    Xsamples : list
        Existing list of window fragments (extended in place)
    ysamples : list
        Existing list of window targets (extended in place)
    Xsampleslist : list
        Samples to take sliding windows from
    ysampleslist : list
        Target of each sample; it is repeated for every window
        taken from that sample
    """
    for X, ybinary in zip(Xsampleslist, ysampleslist):
        # The last valid window starts at n_rows - frame_length, so the
        # range bound is that value + 1; without the + 1 the final full
        # window (and any sample of exactly frame_length rows) is lost.
        last_start = X.shape[0] - frame_length
        for start in range(0, last_start + 1, step):
            Xsamples.append(X[start:start + frame_length, :])
            ysamples.append(ybinary)
90
91
92 1
def transform_y(y, mapclasses, nr_classes):
    """
    Transforms y, a list with one sequence of A timesteps
    and B unique classes into a binary Numpy matrix of
    shape (A, B)

    Parameters
    ----------
    y : list or array
        List of classes
    mapclasses : dict
        dictionary that maps the classes to numbers; every class in y
        must be a key, otherwise a KeyError is raised
    nr_classes : int
        total number of classes

    Returns
    -------
    ybinary
        Result of keras' to_categorical on the mapped class numbers
    """
    # Translate the raw class labels into consecutive integer indices
    ymapped = np.array([mapclasses[c] for c in y], dtype='int')
    ybinary = to_categorical(ymapped, nr_classes)
    return ybinary
110
111 1
def get_header():
    """
    Builds the list of column names for a PAMAP2 data file.

    Returns
    -------
    header : list
        54 names: timestamp, activityID and heartrate, followed by the
        17 IMU columns for each of the hand, chest and ankle sensors
        (in that order), prefixed with the sensor location.
    """
    axes = ['x', 'y', 'z']
    IMUsensor_columns = ['temperature'] + \
        ['acc_16g_' + i for i in axes] + \
        ['acc_6g_' + i for i in axes] + \
        ['gyroscope_' + i for i in axes] + \
        ['magnometer_' + i for i in axes] + \
        ['orientation_' + str(i) for i in range(4)]
    header = ["timestamp", "activityID", "heartrate"]
    # Column names are kept exactly as before (including the
    # 'magnometer' spelling) because callers select columns by name.
    for location in ("hand_", "chest_", "ankle_"):
        header += [location + s for s in IMUsensor_columns]
    return header
124
125 1
def addheader(datasets):
    """
    The columns of the pandas data frame are numbers
    this function adds the column labels

    Parameters
    ----------
    datasets : list
        List of pandas dataframes; each is modified in place

    Returns
    -------
    datasets : list
        The same list that was passed in, with the column names of
        every dataframe replaced by the names from get_header()
    """
    header = get_header()
    for dataset in datasets:
        dataset.columns = header
    return datasets
139
140
141 1
def numpify_and_store(X, y, xname, yname, outdatapath, shuffle=False):
    """
    Converts python lists x 3D and y 1D into numpy arrays
    and stores the numpy array in directory outdatapath
    shuffle is optional and shuffles the samples

    Parameters
    ----------
    X : list
        list with data
    y : list
        list with data
    xname : str
        name to store the x arrays
    yname : str
        name to store the y arrays
    outdatapath : str
        path to the directory to store the data
    shuffle : bool
        whether to shuffle the data before storing
    """
    X = np.array(X)
    y = np.array(y)
    # Shuffle the train set. Any truthy flag counts (e.g. a numpy bool),
    # not only the `True` singleton.
    if shuffle:
        # Fixed seed: the stored order is reproducible between runs.
        # Note that this reseeds numpy's global random state.
        np.random.seed(123)
        neworder = np.random.permutation(X.shape[0])
        # Reorder along the first (sample) axis only, so this also
        # works when there are no samples at all.
        X = X[neworder]
        y = y[neworder]
    # Save binary file (np.save appends the '.npy' extension)
    xpath = os.path.join(outdatapath, xname)
    ypath = os.path.join(outdatapath, yname)
    np.save(xpath, X)
    np.save(ypath, y)
    print('Stored '+ xpath, yname)
176
177
178 1
def fetch_data(directory_to_extract_to):
    """
    Fetch the data and extract the contents of the zip file
    to the directory_to_extract_to.
    First check whether this was done before, if yes, then skip

    Parameters
    ----------
    directory_to_extract_to : str
        directory to create subfolder 'PAMAP2'

    Returns
    -------
    targetdir: str
        directory where the data is extracted
    """
    targetdir = os.path.join(directory_to_extract_to, 'PAMAP2/')
    if os.path.exists(targetdir):
        print('Data previously downloaded and stored in ' + targetdir)
        return targetdir

    # The existence of targetdir is what marks the data as available, so
    # it is only created after the zip file has been obtained. Creating
    # it up front would make a failed download look like a success on
    # the next call.
    if directory_to_extract_to and \
            not os.path.isdir(directory_to_extract_to):
        os.makedirs(directory_to_extract_to)
    path_to_zip_file = os.path.join(directory_to_extract_to,
                                    'PAMAP2_Dataset.zip')
    if not os.path.isfile(path_to_zip_file):
        # Download the PAMAP2 data, this is 688 Mb
        url = ('https://archive.ics.uci.edu/ml/' +
               'machine-learning-databases/00231/PAMAP2_Dataset.zip')
        # Download under a temporary name and rename when complete, so
        # an interrupted download is never mistaken for the full file.
        partial_path = path_to_zip_file + '.part'
        urllib.request.urlretrieve(url, filename=partial_path)
        os.rename(partial_path, path_to_zip_file)
        print('Download complete and stored in: ' + path_to_zip_file)
    else:
        print('The data was previously downloaded and stored in ' +
              path_to_zip_file)
    # unzip
    os.makedirs(targetdir)  # create target directory
    with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
        zip_ref.extractall(targetdir)
    return targetdir
216
217
218 1
def slidingwindow_store(y_list, x_list, X_name, y_name, outdatapath, shuffle,
                        frame_length=int(5.12 * 100), step=1 * 100):
    """
    Take sliding-window frames and store them as numpy files.
    Data is 100 Hz, so the default window is 5.12 seconds long and
    consecutive windows start 1 second apart.

    Parameters
    ----------
    y_list : list
        list of arrays with classes
    x_list : list
        list of numpy arrays with data
    X_name : str
        Name for X file
    y_name : str
        Name for y file
    outdatapath : str
        directory to store the data
    shuffle : bool
        whether to shuffle the data
    frame_length : int
        Length of the sliding window in timesteps
    step : int
        Number of timesteps between the starts of two windows
    """
    x_set = []
    y_set = []
    # sliding_window fills x_set and y_set in place
    sliding_window(frame_length, step, x_set, y_set, x_list, y_list)
    numpify_and_store(x_set, y_set, X_name, y_name,
                      outdatapath, shuffle)
245
246
247 1
def map_class(datasets_filled):
    """
    Creates a mapping from the activity IDs found in the data to
    consecutive class numbers. Activity 0 is left out of the mapping.

    Parameters
    ----------
    datasets_filled : list
        List of pandas dataframes with a column 'activityID'

    Returns
    -------
    classlabels : list
        The activity IDs (without 0) in ascending order
    nr_classes : int
        Number of class labels
    mapclasses : dict
        Maps each activity ID to its position in classlabels
    """
    ysetall = [set(np.array(data.activityID)) - set([0])
               for data in datasets_filled]
    # set().union(...) also works when there are no datasets at all.
    # Sorting makes the class numbering independent of set iteration
    # order.
    classlabels = sorted(set().union(*ysetall))
    nr_classes = len(classlabels)
    mapclasses = {label: i for i, label in enumerate(classlabels)}
    return classlabels, nr_classes, mapclasses
254
255
256 1
def split_data(Xlists, ybinarylists, indices):
    """ Function takes subset from list given indices

    Parameters
    ----------
    Xlists: tuple
        tuple (samples) of lists (windows) of numpy-arrays (time, variable)
    ybinarylist :
        list (samples) of numpy-arrays (window, class)
    indices :
        indices of the slice of data (samples) to be taken; either a
        slice (several samples, flattened into one list) or a single
        index (one sample)

    Returns
    -------
    x_setlist : list
        list (windows across samples) of numpy-arrays (time, variable)
    y_setlist: list
        list (windows across samples) of numpy-arrays (class, )
    """
    if isinstance(indices, slice):
        # Several samples selected: concatenate their windows
        x_setlist = [X for Xlist in Xlists[indices] for X in Xlist]
        y_setlist = [y for ylist in ybinarylists[indices] for y in ylist]
    else:
        # A single sample selected: copy its windows into new lists
        x_setlist = list(Xlists[indices])
        y_setlist = list(ybinarylists[indices])
    return x_setlist, y_setlist
285
286
287 1
def preprocess(targetdir, outdatapath, columns_to_use, exclude_activities,
               fold):
    """ Function to preprocess the PAMAP2 data after it is fetched

    Parameters
    ----------
    targetdir : str
        subdirectory of directory_to_extract_to, targetdir
        is defined by function fetch_data
    outdatapath : str
        a subdirectory of directory_to_extract_to, outdatapath
        is the directory where the Numpy output will be stored.
    columns_to_use : list
        list of column names to use
    exclude_activities : list or tuple
        activities to exclude from the output
    fold : boolean
        Whether to store each fold separately ('False' creates
        Train, Test and Validation sets)

    Returns
    -------
    None
    """
    datadir = os.path.join(targetdir, 'PAMAP2_Dataset', 'Protocol')
    # listdir returns the names in arbitrary order. The train/val/test
    # split below is made by position in this list, so sort the names to
    # always assign the same files to the same set.
    filenames = sorted(listdir(datadir))
    print('Start pre-processing all ' + str(len(filenames)) + ' files...')
    # load the files and put them in a list of pandas dataframes:
    datasets = [pd.read_csv(os.path.join(datadir, fn), header=None, sep=' ')
                for fn in filenames]
    datasets = addheader(datasets)  # add headers to the datasets
    # Interpolate dataset to get same sample rate between channels
    datasets_filled = [d.interpolate() for d in datasets]
    # Create mapping for class labels
    classlabels, nr_classes, mapclasses = map_class(datasets_filled)
    # Create input (x) and output (y) sets
    xall = [np.array(data[columns_to_use]) for data in datasets_filled]
    yall = [np.array(data.activityID) for data in datasets_filled]
    xylists = [split_activities(y, x, exclude_activities)
               for x, y in zip(xall, yall)]
    Xlists, ylists = zip(*xylists)
    ybinarylists = [transform_y(y, mapclasses, nr_classes) for y in ylists]

    if not fold:
        # Split in train, test and val: files 0-5 are used for training,
        # file 6 for validation and the remaining files for testing
        x_vallist, y_vallist = split_data(Xlists, ybinarylists, indices=6)
        test_range = slice(7, len(datasets_filled))
        x_testlist, y_testlist = split_data(Xlists, ybinarylists, test_range)
        x_trainlist, y_trainlist = split_data(Xlists, ybinarylists,
                                              indices=slice(0, 6))
        # Take sliding-window frames, target is label of last time step,
        # and store as numpy file
        slidingwindow_store(y_list=y_trainlist, x_list=x_trainlist,
                            X_name='X_train', y_name='y_train',
                            outdatapath=outdatapath, shuffle=True)
        slidingwindow_store(y_list=y_vallist, x_list=x_vallist,
                            X_name='X_val', y_name='y_val',
                            outdatapath=outdatapath, shuffle=False)
        slidingwindow_store(y_list=y_testlist, x_list=x_testlist,
                            X_name='X_test', y_name='y_test',
                            outdatapath=outdatapath, shuffle=False)
    else:
        # Store the windows of every file separately as X_<i> / y_<i>
        for i in range(len(Xlists)):
            X_i, y_i = split_data(Xlists, ybinarylists, i)
            slidingwindow_store(y_list=y_i, x_list=X_i,
                                X_name='X_'+str(i), y_name='y_'+str(i),
                                outdatapath=outdatapath, shuffle=True)

    print('Processed data succesfully stored in ' + outdatapath)
    return None
356
357
358 1
def fetch_and_preprocess(directory_to_extract_to, columns_to_use=None,
                         output_dir='slidingwindow512cleaned',
                         exclude_activities=(0,), fold=False):
    """
    High level function to fetch_and_preprocess the PAMAP2 dataset

    Parameters
    ----------
    directory_to_extract_to : str
        the directory where the data will be stored
    columns_to_use : list
        the columns to use; when None, the 16g accelerometer x, y and z
        columns of the hand, ankle and chest sensors are used
    output_dir : str
        name of the directory to write the outputdata to
    exclude_activities : list or tuple
        activities to exclude from the output
    fold : boolean
        Whether to store each fold separately ('False' creates
        Train, Test and Validation sets)

    Returns
    -------
    outdatapath: str
        The directory in which the numpy files are stored
    """
    if columns_to_use is None:
        columns_to_use = ['hand_acc_16g_x', 'hand_acc_16g_y',
                          'hand_acc_16g_z',
                          'ankle_acc_16g_x', 'ankle_acc_16g_y',
                          'ankle_acc_16g_z',
                          'chest_acc_16g_x', 'chest_acc_16g_y',
                          'chest_acc_16g_z']
    targetdir = fetch_data(directory_to_extract_to)
    outdatapath = os.path.join(targetdir, 'PAMAP2_Dataset/', output_dir)
    if not os.path.exists(outdatapath):
        os.makedirs(outdatapath)
    # NOTE: the preprocessing is always (re)run, also when output files
    # from an earlier run are already present in outdatapath.
    preprocess(targetdir, outdatapath, columns_to_use, exclude_activities,
               fold)
    return outdatapath
395
396
397 1
def load_data(outputpath):
    """ Function to load the numpy data as stored in directory
    outputpath.

    Parameters
    ----------
    outputpath : str
        directory where the numpy files are stored

    Returns
    -------
    x_train
    y_train_binary
    x_val
    y_val_binary
    x_test
    y_test_binary
    """
    ext = '.npy'
    # File names in the order in which the arrays are returned
    names = ('X_train', 'y_train', 'X_val', 'y_val', 'X_test', 'y_test')
    x_train, y_train_binary, x_val, y_val_binary, x_test, y_test_binary = [
        np.load(os.path.join(outputpath, name + ext)) for name in names]
    return x_train, y_train_binary, x_val, y_val_binary, x_test, y_test_binary
423