Completed
Push to master (e6f995...b6bff9) by Dafne van, created 05:48

download_preprocessed_data()    B

Complexity:   Conditions 4
Size:         Total Lines 25
Duplication:  Lines 25, Ratio 100 %
Importance:   Changes 1, Bugs 0, Features 0

Metric  Value
cc      4
c       1
b       0
f       0
dl      25
loc     25
rs      8.5806
"""
Summary:
Function fetch_and_preprocess from tutorial_pamap2.py helps to fetch and
preprocess the data.
Example function calls in 'Tutorial mcfly on PAMAP2.ipynb'
"""
import numpy as np
from numpy import genfromtxt
import pandas as pd
import matplotlib.pyplot as plt
from os import listdir
import os.path
import zipfile
import keras
from keras.utils.np_utils import to_categorical
import sys
import six.moves.urllib as urllib


def split_activities(labels, X, exclude_activities, borders=10 * 100):
    """
    Splits up the data per activity and excludes the activities listed
    in exclude_activities (e.g. activity 0).
    Also removes the borders for each activity.
    Returns lists with subdatasets.

    Parameters
    ----------
    labels : numpy array
        Activity labels
    X : numpy array
        Data points
    borders : int
        Nr of timesteps to remove from the borders of an activity
    exclude_activities : list or tuple
        Activities to exclude from the output

    Returns
    -------
    X_list
    y_list
    """
    tot_len = len(labels)
    startpoints = np.where([1] + [labels[i] != labels[i - 1]
                                  for i in range(1, tot_len)])[0]
    endpoints = np.append(startpoints[1:] - 1, tot_len - 1)
    acts = [labels[s] for s, e in zip(startpoints, endpoints)]
    # Also split up the data, and only keep the non-excluded activities
    xysplit = [(X[s + borders:e - borders + 1, :], a)
               for s, e, a in zip(startpoints, endpoints, acts)
               if a not in exclude_activities]
    xysplit = [(X, y) for X, y in xysplit if len(X) > 0]
    Xlist = [X for X, y in xysplit]
    ylist = [y for X, y in xysplit]
    return Xlist, ylist
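
# A minimal usage sketch (hypothetical toy arrays; borders=0 so the short
# example is not trimmed away). split_activities returns one (data, label)
# pair per non-excluded activity block:
#
#     labels = np.array([0, 0, 1, 1, 1, 2, 2, 0])
#     X = np.random.rand(8, 3)
#     Xlist, ylist = split_activities(labels, X, exclude_activities=[0],
#                                     borders=0)
#     # Xlist == [X[2:5], X[5:7]] and ylist == [1, 2]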


def sliding_window(frame_length, step, Xsampleslist, ysampleslist):
    """
    Splits the time series in ysampleslist and Xsampleslist
    into segments by applying a sliding overlapping window
    of size equal to frame_length with steps equal to step.
    It does this for all the samples and appends all the output together,
    so the participant distinction is not kept.

    Parameters
    ----------
    frame_length : int
        Length of sliding window
    step : int
        Stepsize between windows
    Xsampleslist : list
        Samples to take sliding windows from
    ysampleslist : list
        Labels belonging to the samples in Xsampleslist

    Returns
    -------
    Xsamples : list
        List of window fragments
    ysamples : list
        List of labels, one per window fragment
    """
    Xsamples = []
    ysamples = []
    for j in range(len(Xsampleslist)):
        X = Xsampleslist[j]
        ybinary = ysampleslist[j]
        for i in range(0, X.shape[0] - frame_length, step):
            xsub = X[i:i + frame_length, :]
            ysub = ybinary
            Xsamples.append(xsub)
            ysamples.append(ysub)
    return Xsamples, ysamples
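
# A minimal usage sketch, assuming Xlist and ylist come from the previous
# example. Note that the loop uses range(0, X.shape[0] - frame_length, step),
# so series no longer than frame_length yield no windows and any trailing
# remainder is dropped.
#
#     x_windows, y_windows = sliding_window(frame_length=512, step=100,
#                                           Xsampleslist=Xlist,
#                                           ysampleslist=ylist)
#     # each element of x_windows has shape (512, n_channels); y_windows
#     # repeats the corresponding label once per window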


def transform_y(y, mapclasses, nr_classes):
    """
    Transforms y, a list with one sequence of A timesteps
    and B unique classes into a binary Numpy matrix of
    shape (A, B)

    Parameters
    ----------
    y : list or array
        List of classes
    mapclasses : dict
        dictionary that maps the classes to numbers
    nr_classes : int
        total number of classes
    """
    ymapped = np.array([mapclasses[c] for c in y], dtype='int')
    ybinary = to_categorical(ymapped, nr_classes)
    return ybinary
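
# A minimal usage sketch with a hypothetical class mapping of the kind
# produced by map_class below:
#
#     mapclasses = {1: 0, 2: 1, 7: 2}
#     ybinary = transform_y([1, 7, 2, 1], mapclasses, nr_classes=3)
#     # ybinary.shape == (4, 3); e.g. row 1 is [0., 0., 1.] because
#     # label 7 maps to class index 2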


def get_header():
    axes = ['x', 'y', 'z']
    IMUsensor_columns = ['temperature'] + \
        ['acc_16g_' + i for i in axes] + \
        ['acc_6g_' + i for i in axes] + \
        ['gyroscope_' + i for i in axes] + \
        ['magnometer_' + i for i in axes] + \
        ['orientation_' + str(i) for i in range(4)]
    header = ["timestamp", "activityID", "heartrate"] + \
        ["hand_" + s for s in IMUsensor_columns] + \
        ["chest_" + s for s in IMUsensor_columns] + \
        ["ankle_" + s for s in IMUsensor_columns]
    return header


def addheader(datasets):
    """
    The columns of the pandas data frames are numbers;
    this function adds the column labels.

    Parameters
    ----------
    datasets : list
        List of pandas dataframes
    """
    header = get_header()
    for i in range(0, len(datasets)):
        datasets[i].columns = header
    return datasets


def numpify_and_store(X, y, X_name, y_name, outdatapath, shuffle=False):
    """
    Converts the python lists X and y into numpy arrays
    and stores them in the directory outdatapath.
    Shuffling the samples before storing is optional.

    Parameters
    ----------
    X : list
        list with data
    y : list
        list with data
    X_name : str
        name to store the x arrays
    y_name : str
        name to store the y arrays
    outdatapath : str
        path to the directory to store the data
    shuffle : bool
        whether to shuffle the data before storing
    """
    X = np.array(X)
    y = np.array(y)
    # Shuffle the train set
    if shuffle:
        np.random.seed(123)
        neworder = np.random.permutation(X.shape[0])
        X = X[neworder, :, :]
        y = y[neworder, :]
    # Save binary file
    xpath = os.path.join(outdatapath, X_name)
    ypath = os.path.join(outdatapath, y_name)
    np.save(xpath, X)
    np.save(ypath, y)
    print('Stored ' + xpath, y_name)
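
# A minimal usage sketch; the output directory is hypothetical:
#
#     numpify_and_store(x_train, y_train, X_name='X_train', y_name='y_train',
#                       outdatapath='/tmp/pamap2_out', shuffle=True)
#     # np.save appends the extension, so this writes
#     # /tmp/pamap2_out/X_train.npy and /tmp/pamap2_out/y_train.npy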


def fetch_data(directory_to_extract_to):
    """
    Fetch the data and extract the contents of the zip file
    to the directory_to_extract_to.
    First check whether this was done before; if yes, then skip.

    Parameters
    ----------
    directory_to_extract_to : str
        directory to create subfolder 'PAMAP2'

    Returns
    -------
    targetdir: str
        directory where the data is extracted
    """
    targetdir = os.path.join(directory_to_extract_to, "PAMAP2")
    if os.path.exists(targetdir):
        print('Data previously downloaded and stored in ' + targetdir)
    else:
        os.makedirs(targetdir)  # create target directory
        # Download the PAMAP2 data, this is 688 Mb
        path_to_zip_file = os.path.join(directory_to_extract_to,
                                        'PAMAP2_Dataset.zip')
        test_file_exist = os.path.isfile(path_to_zip_file)
        if test_file_exist is False:
            url = str('https://archive.ics.uci.edu/ml/' +
                      'machine-learning-databases/00231/PAMAP2_Dataset.zip')
            # retrieve data from url
            local_fn, headers = urllib.request.urlretrieve(
                url, filename=path_to_zip_file)
            print('Download complete and stored in: ' + path_to_zip_file)
        else:
            print('The data was previously downloaded and stored in ' +
                  path_to_zip_file)
        # unzip

        with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
            zip_ref.extractall(targetdir)
        os.remove(path_to_zip_file)
    return targetdir
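
# A minimal usage sketch; the directory is hypothetical:
#
#     targetdir = fetch_data('/tmp/data')
#     # on the first call this downloads PAMAP2_Dataset.zip (about 688 Mb)
#     # from the UCI repository and extracts it into /tmp/data/PAMAP2;
#     # later calls detect the existing folder and skip the download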


def map_class(datasets_filled):
    ysetall = [set(np.array(data.activityID)) - set([0])
               for data in datasets_filled]
    classlabels = list(set.union(*[set(y) for y in ysetall]))
    nr_classes = len(classlabels)
    mapclasses = {classlabels[i]: i for i in range(len(classlabels))}
    return classlabels, nr_classes, mapclasses


def split_data(Xlists, ybinarylists, indices):
    """ Function takes subset from list given indices

    Parameters
    ----------
    Xlists : tuple
        tuple (samples) of lists (windows) of numpy-arrays (time, variable)
    ybinarylists : list
        list (samples) of numpy-arrays (window, class)
    indices : slice or int
        indices of the slice of data (samples) to be taken

    Returns
    -------
    x_setlist : list
        list (windows across samples) of numpy-arrays (time, variable)
    y_setlist : list
        list (windows across samples) of numpy-arrays (class, )
    """
    # isinstance covers the slice type in both Python 2 and Python 3
    if isinstance(indices, slice):
        x_setlist = [X for Xlist in Xlists[indices] for X in Xlist]
        y_setlist = [y for ylist in ybinarylists[indices] for y in ylist]
    else:
        x_setlist = [X for X in Xlists[indices]]
        y_setlist = [y for y in ybinarylists[indices]]
    return x_setlist, y_setlist
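
# A minimal usage sketch, mirroring how preprocess() below splits the PAMAP2
# subjects: an int selects the list for one subject, a slice concatenates the
# lists of several subjects.
#
#     x_val, y_val = split_data(Xlists, ybinarylists, indices=6)
#     x_test, y_test = split_data(Xlists, ybinarylists,
#                                 indices=slice(7, len(Xlists)))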


def split_data_random(X, y, val_size, test_size):
    X = np.array(X)
    y = np.array(y)
    size = len(X)
    train_size = size - val_size - test_size
    indices = np.random.permutation(size)
    X_train = X[indices[:train_size]]
    y_train = y[indices[:train_size]]
    X_val = X[indices[train_size:train_size + val_size]]
    y_val = y[indices[train_size:train_size + val_size]]
    X_test = X[indices[train_size + val_size:]]
    y_test = y[indices[train_size + val_size:]]
    return X_train, y_train, X_val, y_val, X_test, y_test
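
# A minimal usage sketch with hypothetical set sizes: the windows are permuted
# once and then cut into train, validation and test parts of
# len(X) - 300, 100 and 200 windows respectively.
#
#     x_train, y_train, x_val, y_val, x_test, y_test = \
#         split_data_random(X, y, val_size=100, test_size=200)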


def preprocess(targetdir, outdatapath, columns_to_use, exclude_activities,
               fold, val_test_size=None):
    """ Function to preprocess the PAMAP2 data after it is fetched

    Parameters
    ----------
    targetdir : str
        subdirectory of directory_to_extract_to, targetdir
        is defined by function fetch_data
    outdatapath : str
        a subdirectory of directory_to_extract_to, outdatapath
        is the directory where the Numpy output will be stored.
    columns_to_use : list
        list of column names to use
    exclude_activities : list or tuple
        activities to exclude from the dataset
    fold : boolean
        Whether to store each fold separately ('False' creates
        Train, Test and Validation sets)
    val_test_size : tuple, optional
        Tuple (val_size, test_size); when given, the windows are split
        randomly instead of per subject

    Returns
    -------
    None
    """
    datadir = os.path.join(targetdir, 'PAMAP2_Dataset', 'Protocol')
    filenames = listdir(datadir)
    filenames.sort()
    print('Start pre-processing all ' + str(len(filenames)) + ' files...')
    # load the files and put them in a list of pandas dataframes:
    datasets = [pd.read_csv(os.path.join(datadir, fn), header=None, sep=' ')
                for fn in filenames]
    datasets = addheader(datasets)  # add headers to the datasets
    # Interpolate dataset to get same sample rate between channels
    datasets_filled = [d.interpolate() for d in datasets]
    # Create mapping for class labels
    classlabels, nr_classes, mapclasses = map_class(datasets_filled)
    # Create input (x) and output (y) sets
    xall = [np.array(data[columns_to_use]) for data in datasets_filled]
    yall = [np.array(data.activityID) for data in datasets_filled]
    xylists = [split_activities(y, x, exclude_activities)
               for x, y in zip(xall, yall)]
    Xlists, ylists = zip(*xylists)
    ybinarylists = [transform_y(y, mapclasses, nr_classes) for y in ylists]
    frame_length = int(5.12 * 100)
    step = 1 * 100
    if not fold:
        if val_test_size is None:
            # Split in train, test and val
            x_vallist, y_vallist = split_data(Xlists, ybinarylists, indices=6)
            test_range = slice(7, len(datasets_filled))
            x_testlist, y_testlist = split_data(Xlists, ybinarylists,
                                                test_range)
            x_trainlist, y_trainlist = split_data(Xlists, ybinarylists,
                                                  indices=slice(0, 6))
            # Take sliding-window frames, target is label of last time step,
            # and store as numpy file
            x_train, y_train = sliding_window(frame_length, step, x_trainlist,
                                              y_trainlist)
            x_val, y_val = sliding_window(frame_length, step, x_vallist,
                                          y_vallist)
            x_test, y_test = sliding_window(frame_length, step, x_testlist,
                                            y_testlist)
        else:
            val_size, test_size = val_test_size
            X_list, y_list = split_data(Xlists, ybinarylists,
                                        slice(0, len(datasets_filled)))
            X, y = sliding_window(frame_length, step, X_list, y_list)
            x_train, y_train, x_val, y_val, x_test, y_test = \
                split_data_random(X, y, val_size, test_size)

        numpify_and_store(x_train, y_train, X_name='X_train',
                          y_name='y_train', outdatapath=outdatapath,
                          shuffle=True)
        numpify_and_store(x_val, y_val, X_name='X_val', y_name='y_val',
                          outdatapath=outdatapath, shuffle=False)
        numpify_and_store(x_test, y_test, X_name='X_test', y_name='y_test',
                          outdatapath=outdatapath, shuffle=False)
    else:
        for i in range(len(Xlists)):
            X_i, y_i = split_data(Xlists, ybinarylists, i)
            X, y = sliding_window(frame_length, step, X_i, y_i)
            numpify_and_store(X, y, X_name='X_' + str(i),
                              y_name='y_' + str(i),
                              outdatapath=outdatapath, shuffle=True)

    print('Processed data successfully stored in ' + outdatapath)
    return None
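
# A minimal usage sketch; targetdir and outdatapath are hypothetical and would
# normally come from fetch_data and fetch_and_preprocess below. The windows
# produced above are int(5.12 * 100) = 512 timesteps long with a step of
# 100 timesteps.
#
#     preprocess(targetdir='/tmp/data/PAMAP2',
#                outdatapath='/tmp/data/PAMAP2/preprocessed',
#                columns_to_use=['hand_acc_16g_x', 'hand_acc_16g_y',
#                                'hand_acc_16g_z'],
#                exclude_activities=[0], fold=False)
#     # with fold=False and val_test_size=None this writes X_train.npy,
#     # y_train.npy, X_val.npy, y_val.npy, X_test.npy and y_test.npy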


def fetch_and_preprocess(directory_to_extract_to, columns_to_use=None,
                         output_dir='preprocessed', exclude_activities=[0],
                         fold=False, val_test_size=None):
    """
    High level function to fetch_and_preprocess the PAMAP2 dataset

    Parameters
    ----------
    directory_to_extract_to : str
        the directory where the data will be stored
    columns_to_use : list
        the columns to use
    output_dir : str
        name of the directory to write the output data to
    exclude_activities : list or tuple
        activities to exclude from the dataset
    fold : boolean
        Whether to store each fold separately ('False' creates
        Train, Test and Validation sets)
    val_test_size : tuple, optional
        Tuple (val_size, test_size); when given, the windows are split
        randomly instead of per subject

    Returns
    -------
    outdatapath: str
        The directory in which the numpy files are stored
    """
    if columns_to_use is None:
        columns_to_use = ['hand_acc_16g_x', 'hand_acc_16g_y',
                          'hand_acc_16g_z', 'ankle_acc_16g_x',
                          'ankle_acc_16g_y', 'ankle_acc_16g_z',
                          'chest_acc_16g_x', 'chest_acc_16g_y',
                          'chest_acc_16g_z']
    targetdir = fetch_data(directory_to_extract_to)
    outdatapath = os.path.join(targetdir, output_dir)
    if not os.path.exists(outdatapath):
        os.makedirs(outdatapath)
    if os.path.isfile(os.path.join(outdatapath, 'X_train.npy')):
        print('Data previously pre-processed and np-files saved to ' +
              outdatapath)
    else:
        preprocess(targetdir, outdatapath, columns_to_use,
                   exclude_activities, fold, val_test_size)
    return outdatapath
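
# A minimal usage sketch, following the call pattern referenced in the module
# docstring ('Tutorial mcfly on PAMAP2.ipynb'); the directory is hypothetical:
#
#     outdatapath = fetch_and_preprocess('/tmp/data')
#     # downloads PAMAP2 if needed, preprocesses it with the default
#     # accelerometer columns and returns the directory holding the .npy
#     # files (here /tmp/data/PAMAP2/preprocessed)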


def load_data(outputpath):
    """ Function to load the numpy data as stored in directory
    outputpath.

    Parameters
    ----------
    outputpath : str
        directory where the numpy files are stored

    Returns
    -------
    x_train
    y_train_binary
    x_val
    y_val_binary
    x_test
    y_test_binary
    """
    ext = '.npy'
    x_train = np.load(os.path.join(outputpath, 'X_train' + ext))
    y_train_binary = np.load(os.path.join(outputpath, 'y_train' + ext))
    x_val = np.load(os.path.join(outputpath, 'X_val' + ext))
    y_val_binary = np.load(os.path.join(outputpath, 'y_val' + ext))
    x_test = np.load(os.path.join(outputpath, 'X_test' + ext))
    y_test_binary = np.load(os.path.join(outputpath, 'y_test' + ext))
    return x_train, y_train_binary, x_val, y_val_binary, x_test, y_test_binary
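
# A minimal usage sketch: reloading the arrays written by preprocess, given
# the path returned by fetch_and_preprocess above:
#
#     x_train, y_train_binary, x_val, y_val_binary, x_test, y_test_binary = \
#         load_data(outdatapath)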


def download_preprocessed_data(directory_to_extract_to):
    data_path = os.path.join(directory_to_extract_to,
                             'data/PAMAP2/preprocessed')

    if not os.path.isdir(data_path):
        path_to_zip_file = os.path.join(directory_to_extract_to, 'data.zip')

        # Download zip file with data
        if not os.path.isfile(path_to_zip_file):
            print("Downloading data...")
            local_fn, headers = urllib.request.urlretrieve(
                'https://zenodo.org/record/345082/files/data.zip',
                filename=path_to_zip_file)
        else:
            print("Data already downloaded")

        # Extract the zip file
        with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
            print("Extracting data...")
            zip_ref.extractall(directory_to_extract_to)
        print("Done")
    else:
        print("Data already downloaded and extracted.")

    return data_path
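
# A minimal usage sketch; the target directory is hypothetical:
#
#     data_path = download_preprocessed_data('/tmp')
#     # downloads data.zip from Zenodo on the first call, extracts it, and
#     # returns /tmp/data/PAMAP2/preprocessed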