addheader() - Code Metrics - Inspection of "Changed preprocessing of PAMAP2 to exclude more ac..." - NLeSC/mcfly - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 70f326...352d23 )

by Dafne van

created 2016-11-15 15:02 UTC

addheader() A

↳ Parent: Project

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	5
CRAP Score	2

Importance

Changes	4
Bugs	0	Features	0

Metric	Value
cc	2
c	4
b	0
f	0
dl	0
loc	14
ccs	5
cts	5
cp	1
crap	2
rs	9.4285

"""
 Summary:
 Function fetch_and_preprocess from tutorial_pamap2.py helps to fetch and
 preprocess the data.
 Example function calls in 'Tutorial mcfly on PAMAP2.ipynb'
"""
import numpy as np
from numpy import genfromtxt
import pandas as pd
import matplotlib.pyplot as plt
from os import listdir
import os.path
import zipfile
import keras
from keras.utils.np_utils import to_categorical
import sys
import six.moves.urllib as urllib


def split_activities(labels, X, exclude_activities, borders=10 * 100):
    """
    Splits up the data per activity and leaves out the activities listed in
    exclude_activities.
    Also removes the borders of each activity.
    Returns lists with subdatasets

    Parameters
    ----------
    labels : numpy array
        Activity labels, one per timestep
    X : numpy array
        Data points, indexed as X[timestep, channel]
    exclude_activities : list or tuple
        activities to exclude from the returned subdatasets
    borders : int
        Nr of timesteps to remove from the borders of an activity

    Returns
    -------
    X_list : list
        One array per kept activity segment, with the borders removed
    y_list : list
        The activity label of each segment in X_list
    """
    labels = np.asarray(labels)
    tot_len = len(labels)
    if tot_len == 0:
        # Nothing to split; labels[0] below would raise on empty input.
        return [], []
    # A segment starts at index 0 and at every timestep whose label differs
    # from the label of the previous timestep.
    changes = np.flatnonzero(labels[1:] != labels[:-1]) + 1
    startpoints = np.concatenate(([0], changes))
    endpoints = np.append(startpoints[1:] - 1, tot_len - 1)

    Xlist = []
    ylist = []
    for s, e in zip(startpoints, endpoints):
        activity = labels[s]
        if activity in exclude_activities:
            continue
        first = s + borders
        stop = e - borders + 1  # exclusive end of the trimmed segment
        # Segments shorter than 2 * borders have nothing left after trimming.
        # They must be skipped explicitly: a negative `stop` would otherwise
        # be interpreted as "counted from the end of X" and select unrelated
        # rows.
        if stop <= first:
            continue
        segment = X[first:stop, :]
        if len(segment) > 0:
            Xlist.append(segment)
            ylist.append(activity)
    return Xlist, ylist


def sliding_window(frame_length, step, Xsamples,
                   ysamples, Xsampleslist, ysampleslist):
    """
    Splits the time series in Xsampleslist into segments by applying a
    sliding overlapping window of size equal to frame_length with steps
    equal to step. Every window gets the label of the series it was cut
    from (the matching entry of ysampleslist).
    It does this for all the samples and appends all the output together,
    so the participant distinction is not kept.

    The results are appended in place to Xsamples and ysamples; nothing is
    returned.

    Parameters
    ----------
    frame_length : int
        Length of sliding window
    step : int
        Stepsize between windows
    Xsamples : list
        Existing list of window fragments, extended in place
    ysamples : list
        Existing list of window labels, extended in place
    Xsampleslist : list
        Samples to take sliding windows from
    ysampleslist : list
        One label per entry of Xsampleslist

    """
    for X, label in zip(Xsampleslist, ysampleslist):
        # The last full window starts at X.shape[0] - frame_length, so that
        # index has to be included in the range ("+ 1"); without it a series
        # of exactly frame_length rows would produce no window at all.
        last_start = X.shape[0] - frame_length
        for start in range(0, last_start + 1, step):
            Xsamples.append(X[start:start + frame_length, :])
            ysamples.append(label)


def transform_y(y, mapclasses, nr_classes):
    """
    Maps every class in y to its number and converts the result into a
    binary class matrix with to_categorical.
    For A entries in y and nr_classes equal to B the result is expected to
    have shape (A, B).

    Parameters
    ----------
    y : list or array
        List of classes
    mapclasses : dict
        dictionary that maps the classes to numbers
    nr_classes : int
        total number of classes

    Returns
    -------
    ybinary
        the output of to_categorical for the mapped classes
    """
    # Look up the class number of every entry; unknown classes raise KeyError
    class_numbers = list(map(mapclasses.__getitem__, y))
    ymapped = np.array(class_numbers, dtype='int')
    return to_categorical(ymapped, nr_classes)

def get_header():
    """
    Builds the list of column labels that addheader assigns to the
    dataframes.

    The list starts with "timestamp", "activityID" and "heartrate", followed
    by the same block of sensor columns for the hand, the chest and the
    ankle (in that order). Each block holds a temperature, four three-axis
    sensors and four orientation values.

    Returns
    -------
    header : list of str
        The 54 column labels
    """
    axes = ['x', 'y', 'z']
    # Column names are used as keys by callers (e.g. columns_to_use), so the
    # existing spelling 'magnometer_' is kept as is.
    IMUsensor_columns = ['temperature']
    for sensor in ('acc_16g_', 'acc_6g_', 'gyroscope_', 'magnometer_'):
        IMUsensor_columns += [sensor + axis for axis in axes]
    IMUsensor_columns += ['orientation_' + str(i) for i in range(4)]

    header = ["timestamp", "activityID", "heartrate"]
    for location in ("hand_", "chest_", "ankle_"):
        header += [location + s for s in IMUsensor_columns]
    return header

def addheader(datasets):
    """
    Replaces the numeric column labels of every pandas data frame in
    datasets by the labels from get_header. The data frames are changed
    in place.

    Parameters
    ----------
    datasets : list
        List of pandas dataframes

    Returns
    -------
    datasets : list
        The same list that was passed in
    """
    column_labels = get_header()
    for frame in datasets:
        frame.columns = column_labels
    return datasets


def numpify_and_store(X, y, xname, yname, outdatapath, shuffle=False):
    """
    Converts the python lists X and y into numpy arrays
    and stores the numpy arrays in directory outdatapath.
    shuffle is optional and shuffles the samples (X and y in the same
    order) before storing.

    Parameters
    ----------
    X : list
        list with data
    y : list
        list with data
    xname : str
        name to store the x arrays
    yname : str
        name to store the y arrays
    outdatapath : str
        path to the directory to store the data
    shuffle : bool
        whether to shuffle the data before storing
    """
    X = np.array(X)
    y = np.array(y)
    # Truthiness test so that flags such as numpy booleans are honoured too
    if shuffle:
        # A private generator seeded with 123 yields the same permutation as
        # seeding the global generator with 123, but does not reset the
        # random state of the rest of the program.
        neworder = np.random.RandomState(123).permutation(X.shape[0])
        # Index the first axis only, so that empty inputs do not fail
        X = X[neworder]
        y = y[neworder]
    # Save binary file
    xpath = os.path.join(outdatapath, xname)
    ypath = os.path.join(outdatapath, yname)
    np.save(xpath, X)
    np.save(ypath, y)
    print('Stored '+ xpath, yname)


def fetch_data(directory_to_extract_to):
    """
    Fetch the data and extract the contents of the zip file
    to the subfolder 'PAMAP2' of directory_to_extract_to.
    First check whether this was done before, if yes, then skip.

    The subfolder is only left behind when the extraction succeeded, so a
    failed attempt is not mistaken for a finished one on the next call.

    Parameters
    ----------
    directory_to_extract_to : str
        directory to create subfolder 'PAMAP2'

    Returns
    -------
    targetdir: str
        directory where the data is extracted
    """
    import shutil  # only needed here, to clean up after a failed extraction

    targetdir = os.path.join(directory_to_extract_to, 'PAMAP2/')
    if os.path.exists(targetdir):
        print('Data previously downloaded and stored in ' + targetdir)
        return targetdir

    # The zip file is stored next to the target directory, so make sure the
    # parent exists before downloading into it.
    if directory_to_extract_to and \
            not os.path.isdir(directory_to_extract_to):
        os.makedirs(directory_to_extract_to)
    path_to_zip_file = os.path.join(directory_to_extract_to,
                                    'PAMAP2_Dataset.zip')
    if os.path.isfile(path_to_zip_file):
        print('The data was previously downloaded and stored in ' +
              path_to_zip_file)
    else:
        # Download the PAMAP2 data, this is 688 Mb
        url = str('https://archive.ics.uci.edu/ml/' +
                  'machine-learning-databases/00231/PAMAP2_Dataset.zip')
        # Download under a temporary name and rename afterwards, so that an
        # interrupted download is never picked up as a complete zip file.
        partial_download = path_to_zip_file + '.part'
        urllib.request.urlretrieve(url, filename=partial_download)
        os.rename(partial_download, path_to_zip_file)
        print('Download complete and stored in: ' + path_to_zip_file)

    # unzip
    os.makedirs(targetdir)  # create target directory
    extracted = False
    try:
        with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
            zip_ref.extractall(targetdir)
        extracted = True
    finally:
        if not extracted:
            # Remove the partially filled directory; its mere existence
            # would make the next call skip the extraction.
            shutil.rmtree(os.path.normpath(targetdir), ignore_errors=True)
    return targetdir


def slidingwindow_store(y_list, x_list, X_name, y_name, outdatapath, shuffle,
                        frame_length=int(5.12 * 100), step=1 * 100):
    """
    Take sliding-window frames from every array in x_list and store them,
    together with their targets, as numpy files.
    The target of a frame is the entry of y_list that belongs to the array
    the frame was cut from.
    Data is 100 Hz, so the default window is 512 timesteps long and a new
    window starts every 100 timesteps.

    Parameters
    ----------
    y_list : list
        list of arrays with classes
    x_list : list
        list of numpy arrays with data
    X_name : str
        Name for X file
    y_name : str
        Name for y file
    outdatapath : str
        directory to store the data
    shuffle : bool
        whether to shuffle the data
    frame_length : int
        Length of the sliding window in timesteps (default 512)
    step : int
        Number of timesteps between the starts of two windows (default 100)
    """
    x_set = []
    y_set = []
    # sliding_window appends the frames and targets to x_set and y_set
    sliding_window(frame_length, step, x_set, y_set, x_list, y_list)
    numpify_and_store(x_set, y_set, X_name, y_name,
                      outdatapath, shuffle)


def map_class(datasets_filled):
    """
    Collects the activity labels that occur in the datasets and assigns a
    class number to each of them. Activity 0 never gets a class number.

    The labels are sorted before numbering, so the same set of labels always
    results in the same mapping.

    Parameters
    ----------
    datasets_filled : list
        List of pandas dataframes with a column activityID

    Returns
    -------
    classlabels : list
        The activity labels (without 0) in ascending order
    nr_classes : int
        The number of labels in classlabels
    mapclasses : dict
        Maps each label in classlabels to its position in that list
    """
    present = set()
    for data in datasets_filled:
        present.update(np.array(data.activityID))
    present.discard(0)
    # Iteration order of a set is an implementation detail; sorting makes
    # the label -> class number mapping reproducible.
    classlabels = sorted(present)
    nr_classes = len(classlabels)
    mapclasses = {label: number for number, label in enumerate(classlabels)}
    return classlabels, nr_classes, mapclasses


def split_data(Xlists, ybinarylists, indices):
    """ Function takes subset from list given indices

    When indices is a slice, the selected samples are flattened into one
    list; when it is a single index, the entries of that one sample are
    returned.

    Parameters
    ----------
    Xlists: tuple
        tuple (samples) of lists (windows) of numpy-arrays (time, variable)
    ybinarylist :
        list (samples) of numpy-arrays (window, class)
    indices :
        indices of the slice of data (samples) to be taken

    Returns
    -------
    x_setlist : list
        list (windows across samples) of numpy-arrays (time, variable)
    y_setlist: list
        list (windows across samples) of numpy-arrays (class, )
    """
    if not isinstance(indices, slice):
        # A single sample: return its entries as new lists
        return list(Xlists[indices]), list(ybinarylists[indices])

    # Several samples: concatenate the entries of all selected samples
    x_setlist = []
    for sample in Xlists[indices]:
        x_setlist.extend(sample)
    y_setlist = []
    for sample in ybinarylists[indices]:
        y_setlist.extend(sample)
    return x_setlist, y_setlist


def preprocess(targetdir, outdatapath, columns_to_use, exclude_activities,
               fold):
    """ Function to preprocess the PAMAP2 data after it is fetched

    Every file in the 'PAMAP2_Dataset/Protocol' folder of targetdir is read
    as one sample. The files are processed in sorted filename order, so the
    fixed sample indices used for the train/validation/test split always
    refer to the same files.

    Parameters
    ----------
    targetdir : str
        subdirectory of directory_to_extract_to, targetdir
        is defined by function fetch_data
    outdatapath : str
        a subdirectory of directory_to_extract_to, outdatapath
        is the directory where the Numpy output will be stored.
    columns_to_use : list
        list of column names to use
    exclude_activities : list or tuple
        activities to exclude from the output. map_class assigns no class
        number to activity 0, so 0 should be part of this list; otherwise
        transform_y fails with a KeyError.
    fold : boolean
        Whether to store each fold separately ('False' creates
        Train, Test and Validation sets)

    Returns
    -------
    None
    """
    datadir = os.path.join(targetdir, 'PAMAP2_Dataset', 'Protocol')
    # listdir returns the names in arbitrary order; sort them because the
    # split below selects samples by their position in this list.
    filenames = sorted(listdir(datadir))
    print('Start pre-processing all ' + str(len(filenames)) + ' files...')
    # load the files and put them in a list of pandas dataframes:
    datasets = [pd.read_csv(os.path.join(datadir, fn), header=None, sep=' ')
                for fn in filenames]
    datasets = addheader(datasets)  # add headers to the datasets
    # Interpolate dataset to get same sample rate between channels
    datasets_filled = [d.interpolate() for d in datasets]
    # Create mapping for class labels
    classlabels, nr_classes, mapclasses = map_class(datasets_filled)
    # Create input (x) and output (y) sets
    xall = [np.array(data[columns_to_use]) for data in datasets_filled]
    yall = [np.array(data.activityID) for data in datasets_filled]
    xylists = [split_activities(y, x, exclude_activities)
               for x, y in zip(xall, yall)]
    Xlists, ylists = zip(*xylists)
    ybinarylists = [transform_y(y, mapclasses, nr_classes) for y in ylists]

    if not fold:
        # Split in train, test and val: samples 0-5 are used for training,
        # sample 6 for validation and the remaining samples for testing
        x_vallist, y_vallist = split_data(Xlists, ybinarylists, indices=6)
        test_range = slice(7, len(datasets_filled))
        x_testlist, y_testlist = split_data(Xlists, ybinarylists, test_range)
        x_trainlist, y_trainlist = split_data(Xlists, ybinarylists,
                                              indices=slice(0, 6))
        # Take sliding-window frames, each labelled with the activity of
        # the segment it was cut from, and store as numpy file
        slidingwindow_store(y_list=y_trainlist, x_list=x_trainlist,
                            X_name='X_train', y_name='y_train',
                            outdatapath=outdatapath, shuffle=True)
        slidingwindow_store(y_list=y_vallist, x_list=x_vallist,
                            X_name='X_val', y_name='y_val',
                            outdatapath=outdatapath, shuffle=False)
        slidingwindow_store(y_list=y_testlist, x_list=x_testlist,
                            X_name='X_test', y_name='y_test',
                            outdatapath=outdatapath, shuffle=False)
    else:
        # Store every sample as its own fold: X_0/y_0, X_1/y_1, ...
        for i in range(len(Xlists)):
            X_i, y_i = split_data(Xlists, ybinarylists, i)
            slidingwindow_store(y_list=y_i, x_list=X_i,
                                X_name='X_' + str(i), y_name='y_' + str(i),
                                outdatapath=outdatapath, shuffle=True)

    print('Processed data succesfully stored in ' + outdatapath)
    return None


def fetch_and_preprocess(directory_to_extract_to, columns_to_use=None,
                         output_dir='slidingwindow512cleaned',
                         exclude_activities=(0,), fold=False):
    """
    High level function to fetch_and_preprocess the PAMAP2 dataset

    The data is fetched with fetch_data and then always (re)processed with
    preprocess; existing output files are overwritten.

    Parameters
    ----------
    directory_to_extract_to : str
        the directory where the data will be stored
    columns_to_use : list
        the columns to use; by default the x, y and z columns of the
        acc_16g sensor of the hand, the ankle and the chest
    output_dir : str
        name of the directory to write the outputdata to
    exclude_activities : list or tuple
        activities to exclude from the output
    fold : boolean
        Whether to store each fold separately ('False' creates
        Train, Test and Validation sets)

    Returns
    -------
    outdatapath: str
        The directory in which the numpy files are stored
    """
    if columns_to_use is None:
        columns_to_use = ['hand_acc_16g_x', 'hand_acc_16g_y',
                          'hand_acc_16g_z',
                          'ankle_acc_16g_x', 'ankle_acc_16g_y',
                          'ankle_acc_16g_z',
                          'chest_acc_16g_x', 'chest_acc_16g_y',
                          'chest_acc_16g_z']
    targetdir = fetch_data(directory_to_extract_to)
    outdatapath = os.path.join(targetdir, 'PAMAP2_Dataset/', output_dir)
    if not os.path.exists(outdatapath):
        os.makedirs(outdatapath)
    preprocess(targetdir, outdatapath, columns_to_use, exclude_activities,
               fold)
    return outdatapath


def load_data(outputpath):
    """ Function to load the train, validation and test arrays from the
    '.npy' files in directory outputpath.

    Parameters
    ----------
    outputpath : str
        directory where the numpy files are stored

    Returns
    -------
    x_train
    y_train_binary
    x_val
    y_val_binary
    x_test
    y_test_binary
    """
    # File names (without extension) in the order of the returned tuple
    basenames = ('X_train', 'y_train', 'X_val', 'y_val', 'X_test', 'y_test')
    arrays = []
    for basename in basenames:
        filepath = os.path.join(outputpath, basename + '.npy')
        arrays.append(np.load(filepath))
    return tuple(arrays)


1		"""
2		Summary:
3		Function fetch_and_preprocess from tutorial_pamap2.py helps to fetch and
4		preproces the data.
5		Example function calls in 'Tutorial mcfly on PAMAP2.ipynb'
6		"""
7	1	import numpy as np
8	1	from numpy import genfromtxt
9	1	import pandas as pd
10	1	import matplotlib.pyplot as plt
11	1	from os import listdir
12	1	import os.path
13	1	import zipfile
14	1	import keras
15	1	from keras.utils.np_utils import to_categorical
16	1	import sys
17	1	import six.moves.urllib as urllib
18
19
20	1	def split_activities(labels, X, exclude_activities, borders=10 * 100):
21		"""
22		Splits up the data per activity and exclude activity=0.
23		Also remove borders for each activity.
24		Returns lists with subdatasets
25
26		Parameters
27		----------
28		labels : numpy array
29		Activity labels
30		X : numpy array
31		Data points
32		borders : int
33		Nr of timesteps to remove from the borders of an activity
34		exclude_activities : list or tuple
35		activities to exclude from the
36
37		Returns
38		-------
39		X_list
40		y_list
41		"""
42		tot_len = len(labels)
43		startpoints = np.where([1] + [labels[i] != labels[i - 1]
44		for i in range(1, tot_len)])[0]
45		endpoints = np.append(startpoints[1:] - 1, tot_len - 1)
46		acts = [labels[s] for s, e in zip(startpoints, endpoints)]
47		# Also split up the data, and only keep the non-zero activities
48		xysplit = [(X[s + borders:e - borders + 1, :], a)
49		for s, e, a in zip(startpoints, endpoints, acts)
50		if a not in exclude_activities]
51		xysplit = [(X, y) for X, y in xysplit if len(X) > 0]
52		Xlist = [X for X, y in xysplit]
53		ylist = [y for X, y in xysplit]
54		return Xlist, ylist
55
56
57	1	def sliding_window(frame_length, step, Xsamples,
58		ysamples, Xsampleslist, ysampleslist):
59		"""
60		Splits time series in ysampleslist and Xsampleslist
61		into segments by applying a sliding overlapping window
62		of size equal to frame_length with steps equal to step
63		it does this for all the samples and appends all the output together.
64		So, the participant distinction is not kept
65
66		Parameters
67		----------
68		frame_length : int
69		Length of sliding window
70		step : int
71		Stepsize between windows
72		Xsamples : list
73		Existing list of window fragments
74		ysamples : list
75		Existing list of window fragments
76		Xsampleslist : list
77		Samples to take sliding windows from
78		ysampleslist
79		Samples to take sliding windows from
80
81		"""
82	1	for j in range(len(Xsampleslist)):
83	1	X = Xsampleslist[j]
84	1	ybinary = ysampleslist[j]
85	1	for i in range(0, X.shape[0] - frame_length, step):
86	1	xsub = X[i:i + frame_length, :]
87	1	ysub = ybinary
88	1	Xsamples.append(xsub)
89	1	ysamples.append(ysub)
90
91
92	1	def transform_y(y, mapclasses, nr_classes):
93		"""
94		Transforms y, a list with one sequence of A timesteps
95		and B unique classes into a binary Numpy matrix of
96		shape (A, B)
97
98		Parameters
99		----------
100		y : list or array
101		List of classes
102		mapclasses : dict
103		dictionary that maps the classes to numbers
104		nr_classes : int
105		total number of classes
106		"""
107	1	ymapped = np.array([mapclasses[c] for c in y], dtype='int')
108	1	ybinary = to_categorical(ymapped, nr_classes)
109	1	return ybinary
110
111	1	def get_header():
112	1	axes = ['x', 'y', 'z']
113	1	IMUsensor_columns = ['temperature'] + \
114		['acc_16g_' + i for i in axes] + \
115		['acc_6g_' + i for i in axes] + \
116		['gyroscope_' + i for i in axes] + \
117		['magnometer_' + i for i in axes] + \
118		['orientation_' + str(i) for i in range(4)]
119	1	header = ["timestamp", "activityID", "heartrate"] + ["hand_" + s
120		for s in IMUsensor_columns] \
		0 ignored issues – show Coding Style introduced 2016-09-02 09:06 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (86/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
121		+ ["chest_" + s for s in IMUsensor_columns] + ["ankle_" + s
122		for s in IMUsensor_columns]
		0 ignored issues – show Coding Style introduced 2016-10-04 14:26 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (82/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
123	1	return header
124
125	1	def addheader(datasets):
126		"""
127		The columns of the pandas data frame are numbers
128		this function adds the column labels
129
130		Parameters
131		----------
132		datasets : list
133		List of pandas dataframes
134		"""
135	1	header = get_header()
136	1	for i in range(0, len(datasets)):
137	1	datasets[i].columns = header
138	1	return datasets
139
140
141	1	def numpify_and_store(X, y, xname, yname, outdatapath, shuffle=False):
142		"""
143		Converts python lists x 3D and y 1D into numpy arrays
144		and stores the numpy array in directory outdatapath
145		shuffle is optional and shuffles the samples
146
147		Parameters
148		----------
149		X : list
150		list with data
151		y : list
152		list with data
153		xname : str
154		name to store the x arrays
155		yname : str
156		name to store the y arrays
157		outdatapath : str
158		path to the directory to store the data
159		shuffle : bool
160		whether to shuffle the data before storing
161		"""
162	1	X = np.array(X)
163	1	y = np.array(y)
164		# Shuffle the train set
165	1	if shuffle is True:
166	1	np.random.seed(123)
167	1	neworder = np.random.permutation(X.shape[0])
168	1	X = X[neworder, :, :]
169	1	y = y[neworder, :]
170		# Save binary file
171	1	xpath = os.path.join(outdatapath, xname)
172	1	ypath = os.path.join(outdatapath, yname)
173	1	np.save(xpath, X)
174	1	np.save(ypath, y)
175	1	print('Stored '+ xpath, yname)
176
177
178	1	def fetch_data(directory_to_extract_to):
179		"""
180		Fetch the data and extract the contents of the zip file
181		to the directory_to_extract_to.
182		First check whether this was done before, if yes, then skip
183
184		Parameters
185		----------
186		directory_to_extract_to : str
187		directory to create subfolder 'PAMAP2'
188
189		Returns
190		-------
191		targetdir: str
192		directory where the data is extracted
193		"""
194		targetdir = os.path.join(directory_to_extract_to, 'PAMAP2/')
195		if os.path.exists(targetdir):
196		print('Data previously downloaded and stored in ' + targetdir)
197		else:
198		os.makedirs(targetdir) # create target directory
199		# Download the PAMAP2 data, this is 688 Mb
200		path_to_zip_file = directory_to_extract_to + '/PAMAP2_Dataset.zip'
201		test_file_exist = os.path.isfile(path_to_zip_file)
202		if test_file_exist is False:
203		url = str('https://archive.ics.uci.edu/ml/' +
204		'machine-learning-databases/00231/PAMAP2_Dataset.zip')
205		# retrieve data from url
206		local_fn, headers = urllib.request.urlretrieve(url,
207		filename=path_to_zip_file)
		0 ignored issues – show Coding Style introduced 2016-10-04 14:26 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (85/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
208		print('Download complete and stored in: ' + path_to_zip_file)
209		else:
210		print('The data was previously downloaded and stored in ' +
211		path_to_zip_file)
212		# unzip
213		with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
214		zip_ref.extractall(targetdir)
215		return targetdir
216
217
218	1	def slidingwindow_store(y_list, x_list, X_name, y_name, outdatapath, shuffle):
219		"""
220		Take sliding-window frames. Target is label of last time step
221		Data is 100 Hz
222
223		Parameters
224		----------
225		y_list : list
226		list of arrays with classes
227		x_list : list
228		list of numpy arrays with data
229		X_name : str
230		Name for X file
231		y_name : str
232		Name for y file
233		outdatapath : str
234		directory to store the data
235		shuffle : bool
236		whether to shuffle the data
237		"""
238		frame_length = int(5.12 * 100)
239		step = 1 * 100
240		x_set = []
241		y_set = []
242		sliding_window(frame_length, step, x_set, y_set, x_list, y_list)
243		numpify_and_store(x_set, y_set, X_name, y_name,
244		outdatapath, shuffle)
245
246
247	1	def map_class(datasets_filled):
248		ysetall = [set(np.array(data.activityID)) - set([0])
249		for data in datasets_filled]
250		classlabels = list(set.union(*[set(y) for y in ysetall]))
251		nr_classes = len(classlabels)
252		mapclasses = {classlabels[i]: i for i in range(len(classlabels))}
253		return classlabels, nr_classes, mapclasses
254
255
256	1	def split_data(Xlists, ybinarylists, indices):
257		""" Function takes subset from list given indices
258
259		Parameters
260		----------
261		Xlists: tuple
262		tuple (samples) of lists (windows) of numpy-arrays (time, variable)
263		ybinarylist :
264		list (samples) of numpy-arrays (window, class)
265		indices :
266		indices of the slice of data (samples) to be taken
267
268		Returns
269		-------
270		x_setlist : list
271		list (windows across samples) of numpy-arrays (time, variable)
272		y_setlist: list
273		list (windows across samples) of numpy-arrays (class, )
274		"""
275	1	tty = str(type(indices))
276		# or statement in next line is to account for python2 and python3
277		# difference
278	1	if tty == "<class 'slice'>" or tty == "<type 'slice'>":
279	1	x_setlist = [X for Xlist in Xlists[indices] for X in Xlist]
280	1	y_setlist = [y for ylist in ybinarylists[indices] for y in ylist]
281		else:
282		x_setlist = [X for X in Xlists[indices]]
283		y_setlist = [y for y in ybinarylists[indices]]
284	1	return x_setlist, y_setlist
285
286
287	1	def preprocess(targetdir, outdatapath, columns_to_use, exclude_activities, fold):
		0 ignored issues – show Coding Style introduced 2016-11-15 15:07 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (81/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
288		""" Function to preprocess the PAMAP2 data after it is fetched
289
290		Parameters
291		----------
292		targetdir : str
293		subdirectory of directory_to_extract_to, targetdir
294		is defined by function fetch_data
295		outdatapath : str
296		a subdirectory of directory_to_extract_to, outdatapath
297		is the direcotry where the Numpy output will be stored.
298		columns_to_use : list
299		list of column names to use
300		exclude_activities : list or tuple
301		activities to exclude from the
302		fold : boolean
303		Whether to store each fold seperately ('False' creates
304		Train, Test and Validation sets)
305
306		Returns
307		-------
308		None
309		"""
310		datadir = targetdir + '/PAMAP2_Dataset/Protocol'
311		filenames = listdir(datadir)
312		print('Start pre-processing all ' + str(len(filenames)) + ' files...')
313		# load the files and put them in a list of pandas dataframes:
314		datasets = [pd.read_csv(datadir + '/' + fn, header=None, sep=' ')
315		for fn in filenames]
316		datasets = addheader(datasets) # add headers to the datasets
317		# Interpolate dataset to get same sample rate between channels
318		datasets_filled = [d.interpolate() for d in datasets]
319		# Create mapping for class labels
320		classlabels, nr_classes, mapclasses = map_class(datasets_filled)
321		# Create input (x) and output (y) sets
322		xall = [np.array(data[columns_to_use]) for data in datasets_filled]
323		yall = [np.array(data.activityID) for data in datasets_filled]
324		xylists = [split_activities(y, x, exclude_activities) for x, y in zip(xall, yall)]
		0 ignored issues – show Coding Style introduced 2016-11-15 15:07 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (86/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
325		Xlists, ylists = zip(*xylists)
326		ybinarylists = [transform_y(y, mapclasses, nr_classes) for y in ylists]
327
328		if not fold:
329		# Split in train, test and val
330		x_vallist, y_vallist = split_data(Xlists, ybinarylists, indices=6)
331		test_range = slice(7, len(datasets_filled))
332		x_testlist, y_testlist = split_data(Xlists, ybinarylists, test_range)
333		x_trainlist, y_trainlist = split_data(Xlists, ybinarylists,
334		indices=slice(0, 6))
335		# Take sliding-window frames, target is label of last time step,
336		# and store as numpy file
337		slidingwindow_store(y_list=y_trainlist, x_list=x_trainlist,
338		X_name='X_train', y_name='y_train',
339		outdatapath=outdatapath, shuffle=True)
340		slidingwindow_store(y_list=y_vallist, x_list=x_vallist,
341		X_name='X_val', y_name='y_val',
342		outdatapath=outdatapath, shuffle=False)
343		slidingwindow_store(y_list=y_testlist, x_list=x_testlist,
344		X_name='X_test', y_name='y_test',
345		outdatapath=outdatapath, shuffle=False)
346		else :
		0 ignored issues – show Coding Style introduced 2016-11-15 15:07 UTC by Report Bug Copy Issue Report No space allowed before : else : ^ Loading history...
347		for i in range(len(Xlists)):
348		X_i, y_i = split_data(Xlists, ybinarylists, i)
349		slidingwindow_store(y_list=y_i, x_list=X_i,
350		X_name='X_'+str(i), y_name='y_'+str(i),
351		outdatapath=outdatapath, shuffle=True)
352
353
354		print('Processed data succesfully stored in ' + outdatapath)
355		return None
356
357
358	1	def fetch_and_preprocess(directory_to_extract_to, columns_to_use=None, output_dir='slidingwindow512cleaned', exclude_activities=[0], fold=False):
		0 ignored issues – show Coding Style introduced 2016-11-15 15:07 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (145/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
359		"""
360		High level function to fetch_and_preprocess the PAMAP2 dataset
361
362		Parameters
363		----------
364		directory_to_extract_to : str
365		the directory where the data will be stored
366		columns_to_use : list
367		the columns to use
368		ouptput_dir : str
369		name of the directory to write the outputdata to
370		exclude_activities : list or tuple
371		activities to exclude from the
372		fold : boolean
373		Whether to store each fold seperately ('False' creates
374		Train, Test and Validation sets)
375
376		Returns
377		-------
378		outdatapath: str
379		The directory in which the numpy files are stored
380		"""
381		if columns_to_use is None:
382		columns_to_use = ['hand_acc_16g_x', 'hand_acc_16g_y', 'hand_acc_16g_z',
383		'ankle_acc_16g_x', 'ankle_acc_16g_y', 'ankle_acc_16g_z',
		0 ignored issues – show Coding Style introduced 2016-10-04 14:26 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (82/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
384		'chest_acc_16g_x', 'chest_acc_16g_y', 'chest_acc_16g_z']
		0 ignored issues – show Coding Style introduced 2016-10-04 14:26 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (82/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
385		targetdir = fetch_data(directory_to_extract_to)
386		outdatapath = os.path.join(targetdir, 'PAMAP2_Dataset/', output_dir)
387		if not os.path.exists(outdatapath):
388		os.makedirs(outdatapath)
389		# if os.path.isfile(outdatapath + 'x_train.npy'):
390		# print('Data previously pre-processed and np-files saved to ' +
391		# outdatapath)
392		# else:
393		preprocess(targetdir, outdatapath, columns_to_use, exclude_activities, fold)
		0 ignored issues – show Coding Style introduced 2016-11-15 15:07 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (80/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
394		return outdatapath
395
396
397	1	def load_data(outputpath):
398		""" Function to load the numpy data as stored in directory
399		outputpath.
400
401		Parameters
402		----------
403		outputpath : str
404		directory where the numpy files are stored
405
406		Returns
407		-------
408		x_train
409		y_train_binary
410		x_val
411		y_val_binary
412		x_test
413		y_test_binary
414		"""
415		ext = '.npy'
416		x_train = np.load(os.path.join(outputpath, 'X_train' + ext))
417		y_train_binary = np.load(os.path.join(outputpath, 'y_train' + ext))
418		x_val = np.load(os.path.join(outputpath, 'X_val' + ext))
419		y_val_binary = np.load(os.path.join(outputpath, 'y_val' + ext))
		0 ignored issues – show Coding Style introduced 2016-11-15 15:07 UTC by Report Bug Copy Issue Report Exactly one space required after comma y_val_binary = np.load(os.path.join(outputpath, 'y_val' + ext)) ^ Loading history...
420		x_test = np.load(os.path.join(outputpath, 'X_test' + ext))
421		y_test_binary = np.load(os.path.join(outputpath, 'y_test' + ext))
		0 ignored issues – show Coding Style introduced 2016-11-15 15:07 UTC by Report Bug Copy Issue Report Exactly one space required after comma y_test_binary = np.load(os.path.join(outputpath, 'y_test' + ext)) ^ Loading history...
422		return x_train, y_train_binary, x_val, y_val_binary, x_test, y_test_binary
423

NLeSC / mcfly

Push — master ( 70f326...352d23 )

addheader() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like