split_data_random() - Code Metrics - Inspection of "Merge pull request #93 from NLeSC/workshop" - NLeSC/mcfly - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 12b66d...1bf4c2 )

by Dafne van

created 2017-01-18 12:27 UTC

split_data_random() A

↳ Parent: Project

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	1
CRAP Score	1.7865

Importance

Changes

Metric	Value
cc	1
dl	0
loc	13
ccs	1
cts	13
cp	0.0769
crap	1.7865
rs	9.4285
c	0
b	0
f	0

"""
 Summary:
 Function fetch_and_preprocess from tutorial_pamap2.py helps to fetch and
 preprocess the data.
 Example function calls in 'Tutorial mcfly on PAMAP2.ipynb'
"""
import numpy as np
from numpy import genfromtxt
import pandas as pd
import matplotlib.pyplot as plt
from os import listdir
import os.path
import zipfile
import keras
from keras.utils.np_utils import to_categorical
import sys
import six.moves.urllib as urllib


def split_activities(labels, X, exclude_activities, borders=10 * 100):
    """
    Splits up the data per activity and drops the excluded activities.
    Also removes the borders of each activity segment.
    Returns lists with subdatasets

    Parameters
    ----------
    labels : numpy array
        Activity labels, one per timestep
    X : numpy array
        Data points, indexed as (timestep, variable)
    exclude_activities : list or tuple
        activities to exclude from the output
    borders : int
        Nr of timesteps to remove from the borders of an activity

    Returns
    -------
    X_list : list
        One array per kept segment, holding the rows of X that remain
        after the borders are removed
    y_list : list
        The activity label of each kept segment
    """
    labels = np.asarray(labels)
    tot_len = len(labels)
    if tot_len == 0:
        # No timesteps means no segments
        return [], []
    # A segment starts at index 0 and wherever the label differs from the
    # label of the previous timestep
    changes = np.flatnonzero(labels[1:] != labels[:-1]) + 1
    startpoints = np.concatenate(([0], changes))
    endpoints = np.append(startpoints[1:] - 1, tot_len - 1)
    Xlist = []
    ylist = []
    for start, end in zip(startpoints, endpoints):
        activity = labels[start]
        if activity in exclude_activities:
            continue
        first = start + borders
        stop = end - borders + 1
        # A segment that is too short for its borders must be skipped
        # explicitly: a negative stop would otherwise be interpreted as
        # "counted from the end of X" and pull in rows of other segments.
        if stop <= first:
            continue
        segment = X[first:stop, :]
        if len(segment) > 0:
            Xlist.append(segment)
            ylist.append(activity)
    return Xlist, ylist


def sliding_window(frame_length, step, Xsampleslist, ysampleslist):
    """
    Cuts every time series in Xsampleslist into overlapping windows of
    frame_length timesteps, moving the window start by step each time.
    Each window is paired with the complete entry of ysampleslist that
    belongs to the series it was cut from.
    The windows of all series end up in one flat list, so the distinction
    between the input series is not kept.

    A window is only taken for start positions strictly smaller than
    (series length - frame_length).

    Parameters
    ----------
    frame_length : int
        Length of sliding window
    step : int
        Stepsize between windows
    Xsampleslist : list
        Samples to take sliding windows from, indexed as (time, variable)
    ysampleslist : list
        One target per entry of Xsampleslist

    Returns
    -------
    Xsamples : list
        All windows of all series
    ysamples : list
        The target belonging to each window
    """
    windows = []
    targets = []
    for index, series in enumerate(Xsampleslist):
        target = ysampleslist[index]
        starts = range(0, series.shape[0] - frame_length, step)
        windows.extend(series[start:start + frame_length, :]
                       for start in starts)
        targets.extend(target for _ in starts)
    return windows, targets


def transform_y(y, mapclasses, nr_classes):
    """
    Turns y, a sequence of A class labels, into a binary Numpy matrix
    of shape (A, B), where B is the number of classes.

    Parameters
    ----------
    y : list or array
        List of classes
    mapclasses : dict
        dictionary that maps the classes to numbers
    nr_classes : int
        total number of classes

    Returns
    -------
    ybinary
        the output of keras' to_categorical for the mapped labels
    """
    class_numbers = [mapclasses[label] for label in y]
    return to_categorical(np.array(class_numbers, dtype='int'), nr_classes)

def get_header():
    """
    Builds the list of column names for the data frames.

    The header consists of the three general columns "timestamp",
    "activityID" and "heartrate", followed by the same 17 IMU sensor
    columns for each of the prefixes "hand_", "chest_" and "ankle_"
    (in that order).

    Returns
    -------
    header : list
        List with the 54 column names
    """
    axes = ['x', 'y', 'z']
    IMUsensor_columns = (
        ['temperature'] +
        ['acc_16g_' + i for i in axes] +
        ['acc_6g_' + i for i in axes] +
        ['gyroscope_' + i for i in axes] +
        ['magnometer_' + i for i in axes] +
        ['orientation_' + str(i) for i in range(4)]
    )
    header = ["timestamp", "activityID", "heartrate"]
    # Parentheses instead of backslash continuations: a stray blank line
    # after a backslash silently ends the statement.
    for location in ("hand_", "chest_", "ankle_"):
        header += [location + s for s in IMUsensor_columns]
    return header

def addheader(datasets):
    """
    Replaces the numeric column labels of every data frame by the
    column names from get_header. The data frames are changed in place.

    Parameters
    ----------
    datasets : list
        List of pandas dataframes

    Returns
    -------
    datasets : list
        The same list that was passed in
    """
    header = get_header()
    for frame in datasets:
        frame.columns = header
    return datasets


def numpify_and_store(X, y, X_name, y_name, outdatapath, shuffle=False):
    """
    Converts the python lists X and y into numpy arrays
    and stores the numpy arrays in directory outdatapath
    shuffle is optional and shuffles the samples

    Parameters
    ----------
    X : list
        list with data
    y : list
        list with data, one entry per entry of X
    X_name : str
        name to store the x arrays
    y_name : str
        name to store the y arrays
    outdatapath : str
        path to the directory to store the data
    shuffle : bool
        whether to shuffle the data before storing. X and y are
        reordered along their first axis with the same permutation.
    """
    X = np.array(X)
    y = np.array(y)
    if shuffle:
        # A private generator with the fixed seed gives the same order on
        # every run without resetting the global numpy random state.
        rng = np.random.RandomState(123)
        neworder = rng.permutation(X.shape[0])
        # Index the first axis only, so arrays of any rank are accepted
        X = X[neworder]
        y = y[neworder]
    # Save binary file
    xpath = os.path.join(outdatapath, X_name)
    ypath = os.path.join(outdatapath, y_name)
    np.save(xpath, X)
    np.save(ypath, y)
    print('Stored ' + xpath, y_name)


def fetch_data(directory_to_extract_to):
    """
    Fetch the data and extract the contents of the zip file
    to the subfolder 'PAMAP2' of directory_to_extract_to.
    First check whether this was done before, if yes, then skip.

    The subfolder is only left behind when the extraction succeeded, and
    the zip file only gets its final name when the download succeeded, so
    a failed attempt is not mistaken for finished work on the next call.

    Parameters
    ----------
    directory_to_extract_to : str
        directory to create subfolder 'PAMAP2'

    Returns
    -------
    targetdir: str
        directory where the data is extracted
    """
    import shutil  # only needed to clean up after a failed extraction

    targetdir = os.path.join(directory_to_extract_to, 'PAMAP2/')
    if os.path.exists(targetdir):
        print('Data previously downloaded and stored in ' + targetdir)
        return targetdir

    path_to_zip_file = os.path.join(directory_to_extract_to,
                                    'PAMAP2_Dataset.zip')
    if os.path.isfile(path_to_zip_file):
        print('The data was previously downloaded and stored in ' +
              path_to_zip_file)
    else:
        # Download the PAMAP2 data, this is 688 Mb
        url = str('https://archive.ics.uci.edu/ml/' +
                  'machine-learning-databases/00231/PAMAP2_Dataset.zip')
        # Download under a temporary name first: an interrupted download
        # then never looks like a complete zip file.
        partial_path = path_to_zip_file + '.part'
        urllib.request.urlretrieve(url, filename=partial_path)
        os.rename(partial_path, path_to_zip_file)
        print('Download complete and stored in: ' + path_to_zip_file)

    # unzip; opening the archive first means an unreadable zip file fails
    # before the target directory is created
    with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
        os.makedirs(targetdir)
        extracted = False
        try:
            zip_ref.extractall(targetdir)
            extracted = True
        finally:
            if not extracted:
                # targetdir did not exist before this call, so everything
                # in it is a partial extraction that can be discarded
                shutil.rmtree(targetdir, ignore_errors=True)
    return targetdir


def map_class(datasets_filled):
    """
    Collects all activity labels that occur in the datasets, except
    label 0, and assigns a class number to each of them.

    The labels are sorted before they are numbered, so the mapping does
    not depend on the iteration order of a set.

    Parameters
    ----------
    datasets_filled : list
        List of objects with an activityID attribute (the callers in this
        file pass pandas dataframes)

    Returns
    -------
    classlabels : list
        Sorted list of the labels that were found
    nr_classes : int
        Number of labels in classlabels
    mapclasses : dict
        Maps each label to its position in classlabels
    """
    labelsets = [set(np.array(data.activityID)) - set([0])
                 for data in datasets_filled]
    # set().union(...) also works when there are no datasets at all
    classlabels = sorted(set().union(*labelsets))
    nr_classes = len(classlabels)
    mapclasses = {label: index for index, label in enumerate(classlabels)}
    return classlabels, nr_classes, mapclasses


def split_data(Xlists, ybinarylists, indices):
    """ Function takes subset from list given indices

    Parameters
    ----------
    Xlists: tuple
        tuple (samples) of lists (windows) of numpy-arrays (time, variable)
    ybinarylists :
        list (samples) of numpy-arrays (window, class)
    indices :
        either a slice selecting several samples, whose contents are then
        concatenated, or a single index selecting one sample

    Returns
    -------
    x_setlist : list
        list (windows across samples) of numpy-arrays (time, variable)
    y_setlist: list
        list (windows across samples) of numpy-arrays (class, )
    """
    # isinstance works the same under python2 and python3, unlike
    # comparing the printed name of the type
    if isinstance(indices, slice):
        x_setlist = [X for Xlist in Xlists[indices] for X in Xlist]
        y_setlist = [y for ylist in ybinarylists[indices] for y in ylist]
    else:
        x_setlist = list(Xlists[indices])
        y_setlist = list(ybinarylists[indices])
    return x_setlist, y_setlist

def split_data_random(X, y, val_size, test_size):
    """ Randomly divides the samples over a train, validation and test set

    The samples are permuted with numpy's global random generator. The
    last test_size samples of the permutation form the test set, the
    val_size samples before those the validation set, and the rest the
    train set.

    Parameters
    ----------
    X : list or array
        the samples
    y : list or array
        the target of each sample
    val_size : int
        number of samples in the validation set
    test_size : int
        number of samples in the test set

    Returns
    -------
    X_train, y_train, X_val, y_val, X_test, y_test : numpy arrays

    Raises
    ------
    ValueError
        if a size is negative or val_size + test_size exceeds the number
        of samples
    """
    X = np.array(X)
    y = np.array(y)
    size = len(X)
    train_size = size - val_size - test_size
    # Without this check a negative train_size is read as "counted from
    # the end" by the slices below, giving overlapping sets of wrong size.
    if val_size < 0 or test_size < 0 or train_size < 0:
        raise ValueError(
            'val_size (%s) and test_size (%s) must be non-negative and '
            'together not exceed the number of samples (%s)'
            % (val_size, test_size, size))
    indices = np.random.permutation(size)
    train_idx = indices[:train_size]
    val_idx = indices[train_size:train_size + val_size]
    test_idx = indices[train_size + val_size:]
    return (X[train_idx], y[train_idx],
            X[val_idx], y[val_idx],
            X[test_idx], y[test_idx])

def preprocess(targetdir, outdatapath, columns_to_use, exclude_activities, fold,
               val_test_size=None):
    """ Function to preprocess the PAMAP2 data after it is fetched

    Reads all files in the 'PAMAP2_Dataset/Protocol' folder of targetdir,
    splits them into activity segments, cuts those into sliding windows
    of 512 timesteps (step 100) and stores the result as numpy files.

    Parameters
    ----------
    targetdir : str
        subdirectory of directory_to_extract_to, targetdir
        is defined by function fetch_data
    outdatapath : str
        the directory where the Numpy output will be stored.
    columns_to_use : list
        list of column names to use
    exclude_activities : list or tuple
        activities to exclude from the output
    fold : boolean
        Whether to store each file as a separate fold ('X_<i>', 'y_<i>').
        'False' creates Train, Test and Validation sets.
    val_test_size : tuple or None
        Only used when fold is False. A pair (val_size, test_size) makes
        split_data_random divide the windows of all files over the sets.
        With None the files at positions 0-5 form the train set, the file
        at position 6 the validation set and the remaining files the
        test set.

    Returns
    -------
    None
    """
    datadir = targetdir + '/PAMAP2_Dataset/Protocol'
    filenames = sorted(listdir(datadir))
    print('Start pre-processing all ' + str(len(filenames)) + ' files...')
    # Every file becomes one pandas dataframe with named columns
    datasets = addheader([pd.read_csv(datadir + '/' + name, header=None,
                                      sep=' ')
                          for name in filenames])
    # Interpolate dataset to get same sample rate between channels
    datasets_filled = [frame.interpolate() for frame in datasets]
    # Mapping from class label to class number
    classlabels, nr_classes, mapclasses = map_class(datasets_filled)
    # Input (x) and output (y) per file, divided into activity segments
    xylists = []
    for frame in datasets_filled:
        values = np.array(frame[columns_to_use])
        activity_ids = np.array(frame.activityID)
        xylists.append(split_activities(activity_ids, values,
                                        exclude_activities))
    Xlists, ylists = zip(*xylists)
    ybinarylists = [transform_y(labels, mapclasses, nr_classes)
                    for labels in ylists]
    frame_length = int(5.12 * 100)
    step = 1 * 100

    if fold:
        # One pair of numpy files per input file
        for i in range(len(Xlists)):
            segments_x, segments_y = split_data(Xlists, ybinarylists, i)
            windows_x, windows_y = sliding_window(frame_length, step,
                                                  segments_x, segments_y)
            numpify_and_store(windows_x, windows_y, X_name='X_' + str(i),
                              y_name='y_' + str(i), outdatapath=outdatapath,
                              shuffle=True)
    else:
        nr_files = len(datasets_filled)
        if val_test_size is not None:
            # Random division of the windows of all files
            val_size, test_size = val_test_size
            all_x, all_y = split_data(Xlists, ybinarylists,
                                      slice(0, nr_files))
            windows_x, windows_y = sliding_window(frame_length, step,
                                                  all_x, all_y)
            (x_train, y_train, x_val, y_val,
             x_test, y_test) = split_data_random(windows_x, windows_y,
                                                 val_size, test_size)
        else:
            # Fixed division by file position
            x_vallist, y_vallist = split_data(Xlists, ybinarylists,
                                              indices=6)
            x_testlist, y_testlist = split_data(Xlists, ybinarylists,
                                                slice(7, nr_files))
            x_trainlist, y_trainlist = split_data(Xlists, ybinarylists,
                                                  indices=slice(0, 6))
            x_train, y_train = sliding_window(frame_length, step,
                                              x_trainlist, y_trainlist)
            x_val, y_val = sliding_window(frame_length, step,
                                          x_vallist, y_vallist)
            x_test, y_test = sliding_window(frame_length, step,
                                            x_testlist, y_testlist)

        # Only the train set is shuffled before it is stored
        outputs = (
            (x_train, y_train, 'X_train', 'y_train', True),
            (x_val, y_val, 'X_val', 'y_val', False),
            (x_test, y_test, 'X_test', 'y_test', False),
        )
        for x_part, y_part, x_name, y_name, shuffle_part in outputs:
            numpify_and_store(x_part, y_part, X_name=x_name, y_name=y_name,
                              outdatapath=outdatapath, shuffle=shuffle_part)

    print('Processed data succesfully stored in ' + outdatapath)


def fetch_and_preprocess(directory_to_extract_to, columns_to_use=None,
                         output_dir='slidingwindow512cleaned',
                         exclude_activities=(0,), fold=False,
                         val_test_size=None):
    """
    High level function to fetch_and_preprocess the PAMAP2 dataset

    Parameters
    ----------
    directory_to_extract_to : str
        the directory where the data will be stored
    columns_to_use : list
        the columns to use. With None the 16g accelerometer x, y and z
        columns of hand, ankle and chest are used.
    output_dir : str
        name of the directory to write the outputdata to
    exclude_activities : list or tuple
        activities to exclude from the output
    fold : boolean
        Whether to store each fold seperately ('False' creates
        Train, Test and Validation sets)
    val_test_size : tuple or None
        passed on to preprocess unchanged

    Returns
    -------
    outdatapath: str
        The directory in which the numpy files are stored
    """
    if columns_to_use is None:
        columns_to_use = [
            'hand_acc_16g_x', 'hand_acc_16g_y', 'hand_acc_16g_z',
            'ankle_acc_16g_x', 'ankle_acc_16g_y', 'ankle_acc_16g_z',
            'chest_acc_16g_x', 'chest_acc_16g_y', 'chest_acc_16g_z',
        ]
    targetdir = fetch_data(directory_to_extract_to)
    outdatapath = os.path.join(targetdir, 'PAMAP2_Dataset/', output_dir)
    if not os.path.exists(outdatapath):
        os.makedirs(outdatapath)
    # The data is always pre-processed again, also when numpy files from
    # an earlier run are present in outdatapath.
    preprocess(targetdir, outdatapath, columns_to_use, exclude_activities,
               fold, val_test_size)
    return outdatapath


def load_data(outputpath):
    """ Function to load the six numpy files 'X_train', 'y_train',
    'X_val', 'y_val', 'X_test' and 'y_test' (extension '.npy') from
    directory outputpath.

    Parameters
    ----------
    outputpath : str
        directory where the numpy files are stored

    Returns
    -------
    x_train
    y_train_binary
    x_val
    y_val_binary
    x_test
    y_test_binary
    """
    ext = '.npy'
    # The order of the names is the order of the returned arrays
    names = ('X_train', 'y_train', 'X_val', 'y_val', 'X_test', 'y_test')
    return tuple(np.load(os.path.join(outputpath, name + ext))
                 for name in names)


1		"""
2		Summary:
3		Function fetch_and_preprocess from tutorial_pamap2.py helps to fetch and
4		preproces the data.
5		Example function calls in 'Tutorial mcfly on PAMAP2.ipynb'
6		"""
7	1	import numpy as np
8	1	from numpy import genfromtxt
9	1	import pandas as pd
10	1	import matplotlib.pyplot as plt
11	1	from os import listdir
12	1	import os.path
13	1	import zipfile
14	1	import keras
15	1	from keras.utils.np_utils import to_categorical
16	1	import sys
17	1	import six.moves.urllib as urllib
18
19
20	1	def split_activities(labels, X, exclude_activities, borders=10 * 100):
21		"""
22		Splits up the data per activity and exclude activity=0.
23		Also remove borders for each activity.
24		Returns lists with subdatasets
25
26		Parameters
27		----------
28		labels : numpy array
29		Activity labels
30		X : numpy array
31		Data points
32		borders : int
33		Nr of timesteps to remove from the borders of an activity
34		exclude_activities : list or tuple
35		activities to exclude from the
36
37		Returns
38		-------
39		X_list
40		y_list
41		"""
42	1	tot_len = len(labels)
43	1	startpoints = np.where([1] + [labels[i] != labels[i - 1]
44		for i in range(1, tot_len)])[0]
45	1	endpoints = np.append(startpoints[1:] - 1, tot_len - 1)
46	1	acts = [labels[s] for s, e in zip(startpoints, endpoints)]
47		# Also split up the data, and only keep the non-zero activities
48	1	xysplit = [(X[s + borders:e - borders + 1, :], a)
49		for s, e, a in zip(startpoints, endpoints, acts)
50		if a not in exclude_activities]
51	1	xysplit = [(X, y) for X, y in xysplit if len(X) > 0]
52	1	Xlist = [X for X, y in xysplit]
53	1	ylist = [y for X, y in xysplit]
54	1	return Xlist, ylist
55
56
57	1	def sliding_window(frame_length, step, Xsampleslist, ysampleslist):
58		"""
59		Splits time series in ysampleslist and Xsampleslist
60		into segments by applying a sliding overlapping window
61		of size equal to frame_length with steps equal to step
62		it does this for all the samples and appends all the output together.
63		So, the participant distinction is not kept
64
65		Parameters
66		----------
67		frame_length : int
68		Length of sliding window
69		step : int
70		Stepsize between windows
71		Xsamples : list
72		Existing list of window fragments
73		ysamples : list
74		Existing list of window fragments
75		Xsampleslist : list
76		Samples to take sliding windows from
77		ysampleslist
78		Samples to take sliding windows from
79
80		"""
81	1	Xsamples = []
82	1	ysamples = []
83	1	for j in range(len(Xsampleslist)):
84	1	X = Xsampleslist[j]
85	1	ybinary = ysampleslist[j]
86	1	for i in range(0, X.shape[0] - frame_length, step):
87	1	xsub = X[i:i + frame_length, :]
88	1	ysub = ybinary
89	1	Xsamples.append(xsub)
90	1	ysamples.append(ysub)
91	1	return Xsamples, ysamples
92
93
94	1	def transform_y(y, mapclasses, nr_classes):
95		"""
96		Transforms y, a list with one sequence of A timesteps
97		and B unique classes into a binary Numpy matrix of
98		shape (A, B)
99
100		Parameters
101		----------
102		y : list or array
103		List of classes
104		mapclasses : dict
105		dictionary that maps the classes to numbers
106		nr_classes : int
107		total number of classes
108		"""
109	1	ymapped = np.array([mapclasses[c] for c in y], dtype='int')
110	1	ybinary = to_categorical(ymapped, nr_classes)
111	1	return ybinary
112
113	1	def get_header():
114	1	axes = ['x', 'y', 'z']
115	1	IMUsensor_columns = ['temperature'] + \
116		['acc_16g_' + i for i in axes] + \
117		['acc_6g_' + i for i in axes] + \
118		['gyroscope_' + i for i in axes] + \
119		['magnometer_' + i for i in axes] + \
120		['orientation_' + str(i) for i in range(4)]
121	1	header = ["timestamp", "activityID", "heartrate"] + ["hand_" + s
122		for s in IMUsensor_columns] \
		0 ignored issues – show Coding Style introduced 2016-09-02 09:06 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (86/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
123		+ ["chest_" + s for s in IMUsensor_columns] + ["ankle_" + s
124		for s in IMUsensor_columns]
		0 ignored issues – show Coding Style introduced 2016-10-04 14:26 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (82/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
125	1	return header
126
127	1	def addheader(datasets):
128		"""
129		The columns of the pandas data frame are numbers
130		this function adds the column labels
131
132		Parameters
133		----------
134		datasets : list
135		List of pandas dataframes
136		"""
137	1	header = get_header()
138	1	for i in range(0, len(datasets)):
139	1	datasets[i].columns = header
140	1	return datasets
141
142
143	1	def numpify_and_store(X, y, X_name, y_name, outdatapath, shuffle=False):
144		"""
145		Converts python lists x 3D and y 1D into numpy arrays
146		and stores the numpy array in directory outdatapath
147		shuffle is optional and shuffles the samples
148
149		Parameters
150		----------
151		X : list
152		list with data
153		y : list
154		list with data
155		X_name : str
156		name to store the x arrays
157		y_name : str
158		name to store the y arrays
159		outdatapath : str
160		path to the directory to store the data
161		shuffle : bool
162		whether to shuffle the data before storing
163		"""
164	1	X = np.array(X)
165	1	y = np.array(y)
166		# Shuffle the train set
167	1	if shuffle is True:
168	1	np.random.seed(123)
169	1	neworder = np.random.permutation(X.shape[0])
170	1	X = X[neworder, :, :]
171	1	y = y[neworder, :]
172		# Save binary file
173	1	xpath = os.path.join(outdatapath, X_name)
174	1	ypath = os.path.join(outdatapath, y_name)
175	1	np.save(xpath, X)
176	1	np.save(ypath, y)
177	1	print('Stored ' + xpath, y_name)
178
179
180	1	def fetch_data(directory_to_extract_to):
181		"""
182		Fetch the data and extract the contents of the zip file
183		to the directory_to_extract_to.
184		First check whether this was done before, if yes, then skip
185
186		Parameters
187		----------
188		directory_to_extract_to : str
189		directory to create subfolder 'PAMAP2'
190
191		Returns
192		-------
193		targetdir: str
194		directory where the data is extracted
195		"""
196		targetdir = os.path.join(directory_to_extract_to, 'PAMAP2/')
197		if os.path.exists(targetdir):
198		print('Data previously downloaded and stored in ' + targetdir)
199		else:
200		os.makedirs(targetdir) # create target directory
201		# Download the PAMAP2 data, this is 688 Mb
202		path_to_zip_file = directory_to_extract_to + '/PAMAP2_Dataset.zip'
203		test_file_exist = os.path.isfile(path_to_zip_file)
204		if test_file_exist is False:
205		url = str('https://archive.ics.uci.edu/ml/' +
206		'machine-learning-databases/00231/PAMAP2_Dataset.zip')
207		# retrieve data from url
208		local_fn, headers = urllib.request.urlretrieve(url,
209		filename=path_to_zip_file)
		0 ignored issues – show Coding Style introduced 2016-10-04 14:26 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (85/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
210		print('Download complete and stored in: ' + path_to_zip_file)
211		else:
212		print('The data was previously downloaded and stored in ' +
213		path_to_zip_file)
214		# unzip
215		with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
216		zip_ref.extractall(targetdir)
217		return targetdir
218
219
220	1	def map_class(datasets_filled):
221		ysetall = [set(np.array(data.activityID)) - set([0])
222		for data in datasets_filled]
223		classlabels = list(set.union(*[set(y) for y in ysetall]))
224		nr_classes = len(classlabels)
225		mapclasses = {classlabels[i]: i for i in range(len(classlabels))}
226		return classlabels, nr_classes, mapclasses
227
228
229	1	def split_data(Xlists, ybinarylists, indices):
230		""" Function takes subset from list given indices
231
232		Parameters
233		----------
234		Xlists: tuple
235		tuple (samples) of lists (windows) of numpy-arrays (time, variable)
236		ybinarylist :
237		list (samples) of numpy-arrays (window, class)
238		indices :
239		indices of the slice of data (samples) to be taken
240
241		Returns
242		-------
243		x_setlist : list
244		list (windows across samples) of numpy-arrays (time, variable)
245		y_setlist: list
246		list (windows across samples) of numpy-arrays (class, )
247		"""
248	1	tty = str(type(indices))
249		# or statement in next line is to account for python2 and python3
250		# difference
251	1	if tty == "<class 'slice'>" or tty == "<type 'slice'>":
252	1	x_setlist = [X for Xlist in Xlists[indices] for X in Xlist]
253	1	y_setlist = [y for ylist in ybinarylists[indices] for y in ylist]
254		else:
255		x_setlist = [X for X in Xlists[indices]]
256		y_setlist = [y for y in ybinarylists[indices]]
257	1	return x_setlist, y_setlist
258
259	1	def split_data_random(X, y, val_size, test_size):
260		X = np.array(X)
261		y = np.array(y)
262		size = len(X)
263		train_size = size - val_size - test_size
264		indices = np.random.permutation(size)
265		X_train = X[indices[:train_size]]
266		y_train = y[indices[:train_size]]
267		X_val = X[indices[train_size:train_size+val_size]]
268		y_val = y[indices[train_size:train_size+val_size]]
269		X_test = X[indices[train_size+val_size:]]
270		y_test = y[indices[train_size+val_size:]]
271		return X_train, y_train, X_val, y_val, X_test, y_test
272
273	1	def preprocess(targetdir, outdatapath, columns_to_use, exclude_activities, fold,
		0 ignored issues – show Coding Style introduced 2016-11-15 15:07 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (80/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
274		val_test_size=None):
275		""" Function to preprocess the PAMAP2 data after it is fetched
276
277		Parameters
278		----------
279		targetdir : str
280		subdirectory of directory_to_extract_to, targetdir
281		is defined by function fetch_data
282		outdatapath : str
283		a subdirectory of directory_to_extract_to, outdatapath
284		is the direcotry where the Numpy output will be stored.
285		columns_to_use : list
286		list of column names to use
287		exclude_activities : list or tuple
288		activities to exclude from the
289		fold : boolean
290		Whether to store each fold seperately ('False' creates
291		Train, Test and Validation sets)
292
293		Returns
294		-------
295		None
296		"""
297		datadir = targetdir + '/PAMAP2_Dataset/Protocol'
298		filenames = listdir(datadir)
299		filenames.sort()
300		print('Start pre-processing all ' + str(len(filenames)) + ' files...')
301		# load the files and put them in a list of pandas dataframes:
302		datasets = [pd.read_csv(datadir + '/' + fn, header=None, sep=' ')
303		for fn in filenames]
304		datasets = addheader(datasets) # add headers to the datasets
305		# Interpolate dataset to get same sample rate between channels
306		datasets_filled = [d.interpolate() for d in datasets]
307		# Create mapping for class labels
308		classlabels, nr_classes, mapclasses = map_class(datasets_filled)
309		# Create input (x) and output (y) sets
310		xall = [np.array(data[columns_to_use]) for data in datasets_filled]
311		yall = [np.array(data.activityID) for data in datasets_filled]
312		xylists = [split_activities(y, x, exclude_activities) for x, y in zip(xall, yall)]
		0 ignored issues – show Coding Style introduced 2016-11-15 15:07 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (86/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
313		Xlists, ylists = zip(*xylists)
314		ybinarylists = [transform_y(y, mapclasses, nr_classes) for y in ylists]
315		frame_length = int(5.12 * 100)
316		step = 1 * 100
317		if not fold:
318		if val_test_size is None:
319		# Split in train, test and val
320		x_vallist, y_vallist = split_data(Xlists, ybinarylists, indices=6)
321		test_range = slice(7, len(datasets_filled))
322		x_testlist, y_testlist = split_data(Xlists, ybinarylists, test_range)
		0 ignored issues – show Coding Style introduced 2017-01-16 15:09 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (81/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
323		x_trainlist, y_trainlist = split_data(Xlists, ybinarylists,
324		indices=slice(0, 6))
325		# Take sliding-window frames, target is label of last time step,
326		# and store as numpy file
327		x_train, y_train = sliding_window(frame_length, step, x_trainlist,
328		y_trainlist)
329		x_val, y_val = sliding_window(frame_length, step, x_vallist,
330		y_vallist)
331		x_test, y_test = sliding_window(frame_length, step, x_testlist,
332		y_testlist)
333
334		else:
335		val_size, test_size = val_test_size
336		X_list, y_list = split_data(Xlists, ybinarylists,
337		slice(0, len(datasets_filled)))
338		X, y = sliding_window(frame_length, step, X_list,
339		y_list)
340		x_train, y_train, x_val, y_val, x_test, y_test = split_data_random(X, y, val_size, test_size)
		0 ignored issues – show Coding Style introduced 2017-01-16 15:09 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (105/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
341
342
343		numpify_and_store(x_train, y_train, X_name='X_train', y_name='y_train',
344		outdatapath=outdatapath, shuffle=True)
345		numpify_and_store(x_val, y_val, X_name='X_val', y_name='y_val',
346		outdatapath=outdatapath, shuffle=False)
347		numpify_and_store(x_test, y_test, X_name='X_test', y_name='y_test',
348		outdatapath=outdatapath, shuffle=False)
349		else :
		0 ignored issues – show Coding Style introduced 2016-11-15 15:07 UTC by Report Bug Copy Issue Report No space allowed before : else : ^ Loading history...
350		for i in range(len(Xlists)):
351		X_i, y_i = split_data(Xlists, ybinarylists, i)
352		X, y = sliding_window(frame_length, step, X_i,
353		y_i)
354		numpify_and_store(X, y, X_name='X_'+str(i), y_name='y_'+str(i),
355		outdatapath=outdatapath, shuffle=True)
356
357
358		print('Processed data succesfully stored in ' + outdatapath)
359		return None
360
361
362	1	def fetch_and_preprocess(directory_to_extract_to, columns_to_use=None, output_dir='slidingwindow512cleaned', exclude_activities=[0], fold=False,
		0 ignored issues – show Coding Style introduced 2016-11-15 15:07 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (144/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
363		val_test_size=None):
364		"""
365		High level function to fetch_and_preprocess the PAMAP2 dataset
366
367		Parameters
368		----------
369		directory_to_extract_to : str
370		the directory where the data will be stored
371		columns_to_use : list
372		the columns to use
373		ouptput_dir : str
374		name of the directory to write the outputdata to
375		exclude_activities : list or tuple
376		activities to exclude from the
377		fold : boolean
378		Whether to store each fold seperately ('False' creates
379		Train, Test and Validation sets)
380
381		Returns
382		-------
383		outdatapath: str
384		The directory in which the numpy files are stored
385		"""
386		if columns_to_use is None:
387		columns_to_use = ['hand_acc_16g_x', 'hand_acc_16g_y', 'hand_acc_16g_z',
388		'ankle_acc_16g_x', 'ankle_acc_16g_y', 'ankle_acc_16g_z',
		0 ignored issues – show Coding Style introduced 2016-10-04 14:26 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (82/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
389		'chest_acc_16g_x', 'chest_acc_16g_y', 'chest_acc_16g_z']
		0 ignored issues – show Coding Style introduced 2016-10-04 14:26 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (82/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
390		targetdir = fetch_data(directory_to_extract_to)
391		outdatapath = os.path.join(targetdir, 'PAMAP2_Dataset/', output_dir)
392		if not os.path.exists(outdatapath):
393		os.makedirs(outdatapath)
394		# if os.path.isfile(outdatapath + 'x_train.npy'):
395		# print('Data previously pre-processed and np-files saved to ' +
396		# outdatapath)
397		# else:
398		preprocess(targetdir, outdatapath, columns_to_use, exclude_activities, fold, val_test_size)
		0 ignored issues – show Coding Style introduced 2017-01-16 15:09 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (95/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
399		return outdatapath
400
401
402	1	def load_data(outputpath):
403		""" Function to load the numpy data as stored in directory
404		outputpath.
405
406		Parameters
407		----------
408		outputpath : str
409		directory where the numpy files are stored
410
411		Returns
412		-------
413		x_train
414		y_train_binary
415		x_val
416		y_val_binary
417		x_test
418		y_test_binary
419		"""
420		ext = '.npy'
421		x_train = np.load(os.path.join(outputpath, 'X_train' + ext))
422		y_train_binary = np.load(os.path.join(outputpath, 'y_train' + ext))
423		x_val = np.load(os.path.join(outputpath, 'X_val' + ext))
424		y_val_binary = np.load(os.path.join(outputpath, 'y_val' + ext))
		0 ignored issues – show Coding Style introduced 2016-11-15 15:07 UTC by Report Bug Copy Issue Report Exactly one space required after comma y_val_binary = np.load(os.path.join(outputpath, 'y_val' + ext)) ^ Loading history...
425		x_test = np.load(os.path.join(outputpath, 'X_test' + ext))
426		y_test_binary = np.load(os.path.join(outputpath, 'y_test' + ext))
		0 ignored issues – show Coding Style introduced 2016-11-15 15:07 UTC by Report Bug Copy Issue Report Exactly one space required after comma y_test_binary = np.load(os.path.join(outputpath, 'y_test' + ext)) ^ Loading history...
427		return x_train, y_train_binary, x_val, y_val_binary, x_test, y_test_binary
428

NLeSC / mcfly

Push — master ( 12b66d...1bf4c2 )

split_data_random() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like