split_data() - Code Metrics - Inspection of "function split_data now python2&3lack of consisten..." - NLeSC/mcfly - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 7b8a0a...e386d2 )

unknown

created 2016-09-08 13:10 UTC

split_data() C

↳ Parent: Project

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	6
CRAP Score	10.2655

Importance

Changes	3
Bugs	0	Features	0

Metric	Value
cc	9
c	3
b	0
f	0
dl	0
loc	21
ccs	6
cts	8
cp	0.75
crap	10.2655
rs	5.4999

"""
 Summary:
 Function fetch_and_preprocess from tutorial_pamap2.py helps to fetch and
 preprocess the data.
 Example function calls in 'Tutorial mcfly on PAMAP2.ipynb'
"""
import numpy as np
from numpy import genfromtxt
import pandas as pd
import matplotlib.pyplot as plt
from os import listdir
import os.path
import zipfile
import keras
from keras.utils.np_utils import to_categorical
import sys
if sys.version_info <= (3,): #python2
    import urllib
else: #python3
    import urllib.request


def split_activities(labels, X, borders=10*100):
    """
    Splits up the data per activity and excludes activity=0.
    Also removes the borders of each activity segment.
    Arguments:
    - labels: sequence with one activity label per time step
    - X: numpy-array (time, variable), aligned with labels
    - borders: number of time steps dropped at the start and at the end
        of every activity segment
    Value (output):
    - Xlist: list of numpy-arrays (time, variable), one per kept segment
    - ylist: list with the activity label of every kept segment
    """
    tot_len = len(labels)
    if tot_len == 0:
        # No time steps, so there is no first segment to look at.
        return [], []
    label_array = np.asarray(labels)
    # A segment starts at index 0 and at every index where the label changes.
    changes = np.flatnonzero(label_array[1:] != label_array[:-1]) + 1
    startpoints = np.concatenate(([0], changes))
    endpoints = np.append(startpoints[1:]-1, tot_len-1)
    Xlist = []
    ylist = []
    for start, end in zip(startpoints, endpoints):
        activity = labels[start]
        if activity == 0:
            continue
        first = start + borders
        stop = end - borders + 1
        # Skip segments that are too short to survive the border removal.
        # Without this guard a negative `stop` would be interpreted as an
        # index counted from the end of X and select unrelated data.
        if stop <= first:
            continue
        segment = X[first:stop, :]
        if len(segment) > 0:
            Xlist.append(segment)
            ylist.append(activity)
    return Xlist, ylist

def sliding_window(frame_length, step, Xsamples,
                   ysamples, Xsampleslist, ysampleslist):
    """
    Cuts every time series in Xsampleslist into overlapping windows of
    frame_length time steps, moving the window start by step each time.
    Each window is appended to Xsamples and the label of the series it
    came from (taken from ysampleslist) is appended to ysamples.
    The output of all series ends up in the same two lists, so the
    distinction between the individual series is not kept.
    Nothing is returned; Xsamples and ysamples are extended in place.
    """
    for index, series in enumerate(Xsampleslist):
        label = ysampleslist[index]
        # Window starts run up to, but not including, this bound.
        start_bound = series.shape[0] - frame_length
        for start in range(0, start_bound, step):
            Xsamples.append(series[start:start + frame_length, :])
            ysamples.append(label)

def transform_y(y, mapclasses, nr_classes):
    """
    Converts the class labels in y into a binary class matrix.
    Every label is first replaced by its class index as given by
    mapclasses; the integer indices are then passed to to_categorical
    together with nr_classes.
    Arguments:
    - y: sequence of class labels, all of them keys of mapclasses
    - mapclasses: dictionary mapping class label to class index
    - nr_classes: total number of classes
    Value (output):
    - the output of to_categorical for the mapped labels
    """
    class_indices = [mapclasses[label] for label in y]
    ymapped = np.array(class_indices, dtype='int')
    return to_categorical(ymapped, nr_classes)

def addheader(datasets):
    """
    The columns of the pandas data frames are numbers,
    this function replaces them by descriptive column labels.
    Arguments:
    - datasets: list of data frames, each with 54 columns
        (3 general columns plus 17 columns for each of 3 sensor locations)
    Value (output):
    - datasets: the same list; the data frames are modified in place
    """
    axes = ['x', 'y', 'z']
    # Columns delivered by one IMU sensor. The spelling 'magnometer_' is
    # kept as is, because callers select columns by these exact names.
    imu_columns = ['temperature']
    for prefix in ['acc_16g_', 'acc_6g_', 'gyroscope_', 'magnometer_']:
        imu_columns += [prefix + axis for axis in axes]
    imu_columns += ['orientation_' + str(i) for i in range(4)]
    header = ["timestamp", "activityID", "heartrate"]
    for location in ["hand_", "chest_", "ankle_"]:
        header += [location + column for column in imu_columns]
    for dataset in datasets:
        dataset.columns = header
    return datasets

def numpify_and_store(X, y, xname, yname, outdatapath, shuffle=False):
    """
    Turns the python lists X and y into numpy arrays and saves them
    with np.save under outdatapath + xname and outdatapath + yname.
    Arguments:
    - X: list of equally shaped 2D arrays, stored as one 3D array
    - y: list of equally long 1D label vectors, stored as one 2D array
    - xname, yname: file names, appended to outdatapath without separator
    - outdatapath: path prefix of the output files
    - shuffle: only when this is exactly True the samples of X and y are
        reordered, with the same permutation for both
    Value (output):
    - None
    """
    samples = np.array(X)
    targets = np.array(y)
    if shuffle is True:
        # Fixed seed of the global numpy generator: the order is
        # reproducible between runs.
        np.random.seed(123)
        order = np.random.permutation(samples.shape[0])
        samples, targets = samples[order, :, :], targets[order, :]
    # Save binary files
    for name, array in ((xname, samples), (yname, targets)):
        np.save(outdatapath + name, array)


def fetch_data(directory_to_extract_to):
    """
    Fetch the data and extract the contents of the zip file
    to the subdirectory PAMAP2 of directory_to_extract_to.
    First check whether this was done before, if yes, then skip.
    Arguments:
    - directory_to_extract_to: directory in which the zip file is stored
        and in which the subdirectory PAMAP2 is created
    Value (output):
    - targetdir: directory_to_extract_to + '/PAMAP2'
    Errors raised while downloading or opening the zip file are passed on
    to the caller; in that case targetdir is not created, so that a next
    call tries again instead of reporting the data as already present.
    """
    targetdir = directory_to_extract_to + '/PAMAP2'
    if os.path.exists(targetdir):
        print('Data previously downloaded and stored in ' + targetdir)
        return targetdir
    path_to_zip_file = directory_to_extract_to + '/PAMAP2_Dataset.zip'
    if os.path.isfile(path_to_zip_file):
        print('The data was previously downloaded and stored in ' +
              path_to_zip_file)
    else:
        # The download needs an existing directory to write the zip file to.
        if directory_to_extract_to and \
                not os.path.isdir(directory_to_extract_to):
            os.makedirs(directory_to_extract_to)
        #download the PAMAP2 data, this is 688 Mb
        url = str('https://archive.ics.uci.edu/ml/' +
                  'machine-learning-databases/00231/PAMAP2_Dataset.zip')
        #retrieve data from url
        if sys.version_info <= (3,): #python2
            urllib.urlretrieve(url, filename=path_to_zip_file)
        else: #python3
            urllib.request.urlretrieve(url, filename=path_to_zip_file)
        print('Download complete and stored in: ' + path_to_zip_file)
    # unzip; the target directory is only created once the zip file could
    # be opened, because its existence marks the data as fetched
    with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
        os.makedirs(targetdir)
        zip_ref.extractall(targetdir)
    return targetdir


def slidingwindow_store(y_list, x_list, X_name, y_name, outdatapath, shuffle):
    """
    Applies sliding_window to the time series in x_list with the labels
    in y_list and hands the resulting windows and labels to
    numpify_and_store.
    Arguments:
    - y_list: labels, one entry per time series in x_list
    - x_list: list of time series
    - X_name, y_name: output file names passed to numpify_and_store
    - outdatapath: output path passed to numpify_and_store
    - shuffle: passed to numpify_and_store
    Value (output):
    - None
    """
    sample_rate = 100  # Data is 100 Hz
    # Windows of 5.12 seconds (512 time steps), shifted by 1 second
    window_size = int(5.12 * sample_rate)
    stride = 1 * sample_rate
    windows, targets = [], []
    sliding_window(window_size, stride, windows, targets, x_list, y_list)
    numpify_and_store(windows, targets, X_name, y_name,
                      outdatapath, shuffle)

def map_clas(datasets_filled):
    """
    Creates the mapping from activity label to class index.
    Arguments:
    - datasets_filled: list of data frames with a column activityID
    Value (output):
    - classlabels: sorted list of all activity labels that occur in any
        of the data frames, except label 0
    - nr_classes: number of class labels
    - mapclasses: dictionary mapping each class label to its position
        in classlabels
    """
    labels = set()
    for data in datasets_filled:
        labels.update(np.array(data.activityID))
    labels.discard(0)
    # Sort, so that the class index of a label does not depend on the
    # iteration order of the set.
    classlabels = sorted(labels)
    nr_classes = len(classlabels)
    mapclasses = {label: index for index, label in enumerate(classlabels)}
    return classlabels, nr_classes, mapclasses

def split_data(Xlists, ybinarylists, indices):
    """ Function takes subset from list given indices
    Arguments:
    - Xlists: tuple (samples) of lists (windows)
            of numpy-arrays (time, variable)
    - ybinarylist: list (samples) of numpy-arrays (window, class)
    - indices: either a slice selecting several samples, or a single
            index selecting one sample
    Value (output):
    - x_setlist: list (windows across samples) of numpy-arrays (time, variable)
    - y_setlist: list (windows across samples) of numpy-arrays (class, )
    """
    if isinstance(indices, slice):
        # Several samples: concatenate the windows of all selected samples
        x_setlist = [X for Xlist in Xlists[indices] for X in Xlist]
        y_setlist = [y for ylist in ybinarylists[indices] for y in ylist]
    else:
        # One sample: its windows are the result
        x_setlist = list(Xlists[indices])
        y_setlist = list(ybinarylists[indices])
    return x_setlist, y_setlist

def preprocess(targetdir, outdatapath, columns_to_use):
    """ Function to preprocess the PAMAP2 data after it is fetched
    Arguments:
    - targetdir: subdirectory of directory_to_extract_to, targetdir
        is defined by function fetch_data
    - outdatapath: a subdirectory of directory_to_extract_to, outdatapath
        is the directory where the Numpy output will be stored.
    - columns_to_use: list of column names (as set by addheader) that
        are used as input variables
    Value (output):
    - None
    """
    datadir = targetdir + '/PAMAP2_Dataset/Protocol'
    # listdir returns the names in arbitrary order; sort them so that the
    # position-based train/val/test split below is the same on every system
    filenames = sorted(listdir(datadir))
    print('Start pre-processing all ' + str(len(filenames)) + ' files...')
    # load the files and put them in a list of pandas dataframes:
    datasets = [pd.read_csv(datadir+'/'+fn, header=None, sep=' ')
                for fn in filenames]
    datasets = addheader(datasets) # add headers to the datasets
    #Interpolate dataset to get same sample rate between channels
    datasets_filled = [d.interpolate() for d in datasets]
    # Create mapping for class labels
    _, nr_classes, mapclasses = map_clas(datasets_filled)
    #Create input (x) and output (y) sets
    xall = [np.array(data[columns_to_use]) for data in datasets_filled]
    yall = [np.array(data.activityID) for data in datasets_filled]
    xylists = [split_activities(y, x) for x, y in zip(xall, yall)]
    Xlists, ylists = zip(*xylists)
    ybinarylists = [transform_y(y, mapclasses, nr_classes) for y in ylists]
    # Split in train, test and val: the first six files are used for
    # training, the seventh for validation and the remaining for testing
    x_vallist, y_vallist = split_data(Xlists, ybinarylists, indices=6)
    test_range = slice(7, len(datasets_filled))
    x_testlist, y_testlist = split_data(Xlists, ybinarylists, test_range)
    x_trainlist, y_trainlist = split_data(Xlists, ybinarylists,
                                          indices=slice(0, 6))
    # Take sliding-window frames and store as numpy file;
    # only the train set is shuffled
    slidingwindow_store(y_list=y_trainlist, x_list=x_trainlist,
                        X_name='X_train', y_name='y_train',
                        outdatapath=outdatapath, shuffle=True)
    slidingwindow_store(y_list=y_vallist, x_list=x_vallist,
                        X_name='X_val', y_name='y_val',
                        outdatapath=outdatapath, shuffle=False)
    slidingwindow_store(y_list=y_testlist, x_list=x_testlist,
                        X_name='X_test', y_name='y_test',
                        outdatapath=outdatapath, shuffle=False)
    print('Processed data succesfully stored in ' + outdatapath)
    return None

def fetch_and_preprocess(directory_to_extract_to, columns_to_use=None):
    """
    High level function to fetch_and_preprocess the PAMAP2 dataset
    Arguments:
    - directory_to_extract_to: the directory where the data will be stored
    - columns_to_use: the columns to use; if None, the x, y and z columns
        of the 16g accelerometers at hand, ankle and chest are used
    Values (output):
    - outdatapath: The directory in which the numpy files are stored
    """
    if columns_to_use is None:
        columns_to_use = ['hand_acc_16g_x', 'hand_acc_16g_y', 'hand_acc_16g_z',
                     'ankle_acc_16g_x', 'ankle_acc_16g_y', 'ankle_acc_16g_z',
                     'chest_acc_16g_x', 'chest_acc_16g_y', 'chest_acc_16g_z']
    targetdir = fetch_data(directory_to_extract_to)
    outdatapath = targetdir + '/PAMAP2_Dataset/slidingwindow512cleaned/'
    if not os.path.exists(outdatapath):
        os.makedirs(outdatapath)
    # The names must match the files written by preprocess exactly
    # (capital X), otherwise the check fails on case-sensitive file systems.
    # All files are checked, so that an interrupted run is not mistaken
    # for a complete one.
    expected = ['X_train', 'y_train', 'X_val', 'y_val', 'X_test', 'y_test']
    if all(os.path.isfile(outdatapath + name + '.npy') for name in expected):
        print('Data previously pre-processed and np-files saved to ' +
            outdatapath)
    else:
        preprocess(targetdir, outdatapath, columns_to_use)
    return outdatapath

def load_data(outputpath):
    """ Function to load the six numpy files X_train, y_train, X_val,
    y_val, X_test and y_test (extension .npy) from directory outputpath.
    Arguments:
    - outputpath: path prefix of the files; the file names are appended
        without separator
    Value (output):
    - tuple with the six loaded arrays, in the order given above
    """
    ext = '.npy'
    names = ('X_train', 'y_train', 'X_val', 'y_val', 'X_test', 'y_test')
    return tuple(np.load(outputpath + name + ext) for name in names)


1		"""
2		Summary:
3		Function fetch_and_preprocess from tutorial_pamap2.py helps to fetch and
4		preproces the data.
5		Example function calls in 'Tutorial mcfly on PAMAP2.ipynb'
6		"""
7	1	import numpy as np
8	1	from numpy import genfromtxt
9	1	import pandas as pd
10	1	import matplotlib.pyplot as plt
11	1	from os import listdir
12	1	import os.path
13	1	import zipfile
14	1	import keras
15	1	from keras.utils.np_utils import to_categorical
16	1	import sys
17	1	if sys.version_info <= (3,): #python2
18	1	import urllib
19		else: #python3
20		import urllib.request
21
22
23	1	def split_activities(labels, X, borders=10*100):
		0 ignored issues – show Coding Style Naming introduced 2016-09-01 14:23 UTC by Report Bug Copy Issue Report The name `X` does not conform to the argument naming conventions (`[a-z_][a-z0-9_]{1,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
24		"""
25		Splits up the data per activity and exclude activity=0.
26		Also remove borders for each activity.
27		Returns lists with subdatasets
28		"""
29	1	tot_len = len(labels)
30	1	startpoints = np.where([1] + [labels[i] != labels[i-1] \
31		for i in range(1, tot_len)])[0]
32	1	endpoints = np.append(startpoints[1:]-1, tot_len-1)
33	1	acts = [labels[s] for s, e in zip(startpoints, endpoints)]
34		#Also split up the data, and only keep the non-zero activities
35	1	xysplit = [(X[s+borders:e-borders+1, :], a) \
36		for s, e, a in zip(startpoints, endpoints, acts) if a != 0]
37	1	xysplit = [(X, y) for X, y in xysplit if len(X) > 0]
38	1	Xlist = [X for X, y in xysplit]
		0 ignored issues – show Coding Style Naming introduced 2016-09-02 09:06 UTC by Report Bug Copy Issue Report The name `Xlist` does not conform to the variable naming conventions (`[a-z_][a-z0-9_]{1,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
39	1	ylist = [y for X, y in xysplit]
40	1	return Xlist, ylist
41
42	1	def sliding_window(frame_length, step, Xsamples,\
		0 ignored issues – show Coding Style Naming introduced 2016-09-02 09:06 UTC by Report Bug Copy Issue Report The name `Xsamples` does not conform to the argument naming conventions (`[a-z_][a-z0-9_]{1,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history... Coding Style Naming introduced 2016-09-02 09:06 UTC by Report Bug Copy Issue Report The name `Xsampleslist` does not conform to the argument naming conventions (`[a-z_][a-z0-9_]{1,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
43		ysamples, Xsampleslist, ysampleslist):
44		"""
45		Splits time series in ysampleslist and Xsampleslist
46		into segments by applying a sliding overlapping window
47		of size equal to frame_length with steps equal to step
48		it does this for all the samples and appends all the output together.
49		So, the participant distinction is not kept
50		"""
51	1	for j in range(len(Xsampleslist)):
52	1	X = Xsampleslist[j]
		0 ignored issues – show Coding Style Naming introduced 2016-09-01 15:34 UTC by Report Bug Copy Issue Report The name `X` does not conform to the variable naming conventions (`[a-z_][a-z0-9_]{1,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
53	1	ybinary = ysampleslist[j]
54	1	for i in range(0, X.shape[0]-frame_length, step):
55	1	xsub = X[i:i+frame_length, :]
56	1	ysub = ybinary
57	1	Xsamples.append(xsub)
58	1	ysamples.append(ysub)
59
60	1	def transform_y(y, mapclasses, nr_classes):
		0 ignored issues – show Coding Style Naming introduced 2016-09-01 15:34 UTC by Report Bug Copy Issue Report The name `y` does not conform to the argument naming conventions (`[a-z_][a-z0-9_]{1,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
61		"""
62		Transforms y, a list with one sequence of A timesteps
63		and B unique classes into a binary Numpy matrix of
64		shape (A, B)
65		"""
66	1	ymapped = np.array([mapclasses[c] for c in y], dtype='int')
67	1	ybinary = to_categorical(ymapped, nr_classes)
68	1	return ybinary
69
70	1	def addheader(datasets):
71		"""
72		The columns of the pandas data frame are numbers
73		this function adds the column labels
74		"""
75	1	axes = ['x', 'y', 'z']
76	1	IMUsensor_columns = ['temperature'] + \
		0 ignored issues – show Coding Style Naming introduced 2016-09-01 14:23 UTC by Report Bug Copy Issue Report The name `IMUsensor_columns` does not conform to the variable naming conventions (`[a-z_][a-z0-9_]{1,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
77		['acc_16g_' + i for i in axes] + \
78		['acc_6g_' + i for i in axes] + \
79		['gyroscope_' + i for i in axes] + \
80		['magnometer_' + i for i in axes] + \
81		['orientation_' + str(i) for i in range(4)]
82	1	header = ["timestamp", "activityID", "heartrate"] + ["hand_"+s \
83		for s in IMUsensor_columns] \
84		+ ["chest_"+s for s in IMUsensor_columns]+ ["ankle_"+s \
85		for s in IMUsensor_columns]
86	1	for i in range(0, len(datasets)):
87	1	datasets[i].columns = header
88	1	return datasets
89
90	1	def numpify_and_store(X, y, xname, yname, outdatapath, shuffle=False):
		0 ignored issues – show Coding Style Naming introduced 2016-09-05 08:07 UTC by Report Bug Copy Issue Report The name `X` does not conform to the argument naming conventions (`[a-z_][a-z0-9_]{1,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history... Coding Style Naming introduced 2016-09-01 15:34 UTC by Report Bug Copy Issue Report The name `y` does not conform to the argument naming conventions (`[a-z_][a-z0-9_]{1,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
91		"""
92		Converts python lists x 3D and y 1D into numpy arrays
93		and stores the numpy array in directory outdatapath
94		shuffle is optional and shuffles the samples
95		"""
96	1	X = np.array(X)
97	1	y = np.array(y)
98		#Shuffle around the train set
99	1	if shuffle is True:
100	1	np.random.seed(123)
101	1	neworder = np.random.permutation(X.shape[0])
102	1	X = X[neworder, :, :]
103	1	y = y[neworder, :]
104		# Save binary file
105	1	np.save(outdatapath+ xname, X)
106	1	np.save(outdatapath+ yname, y)
107
108
109	1	def fetch_data(directory_to_extract_to):
110		"""
111		Fetch the data and extract the contents of the zip file
112		to the directory_to_extract_to.
113		First check whether this was done before, if yes, then skip
114		"""
115		targetdir = directory_to_extract_to + '/PAMAP2'
116		if os.path.exists(targetdir):
117		print('Data previously downloaded and stored in ' + targetdir)
118		else:
119		os.makedirs(targetdir) # create target directory
120		#download the PAMAP2 data, this is 688 Mb
121		path_to_zip_file = directory_to_extract_to + '/PAMAP2_Dataset.zip'
122		test_file_exist = os.path.isfile(path_to_zip_file)
123		if test_file_exist is False:
124		url = str('https://archive.ics.uci.edu/ml/' +
125		'machine-learning-databases/00231/PAMAP2_Dataset.zip')
126		#retrieve data from url
127		if sys.version_info <= (3,): #python2
128		local_fn, headers = urllib.urlretrieve(url, \
129		filename=path_to_zip_file)
130		else: #python3
131		local_fn, headers = urllib.request.urlretrieve(url,\
132		filename=path_to_zip_file)
133		print('Download complete and stored in: ' + path_to_zip_file)
134		else:
135		print('The data was previously downloaded and stored in ' +
136		path_to_zip_file)
137		# unzip
138		with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
139		zip_ref.extractall(targetdir)
140		return targetdir
141
142
143	1	def slidingwindow_store(y_list, x_list, X_name, y_name, outdatapath, shuffle):
		0 ignored issues – show Coding Style Naming introduced 2016-09-07 16:24 UTC by Report Bug Copy Issue Report The name `X_name` does not conform to the argument naming conventions (`[a-z_][a-z0-9_]{1,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
144		# Take sliding-window frames. Target is label of last time step
145		# Data is 100 Hz
146		frame_length = int(5.12 * 100)
147		step = 1 * 100
148		x_set = []
149		y_set = []
150		sliding_window(frame_length, step, x_set, y_set, x_list, y_list)
151		numpify_and_store(x_set, y_set, X_name, y_name, \
152		outdatapath, shuffle)
153
154	1	def map_clas(datasets_filled):
155		ysetall = [set(np.array(data.activityID)) - set([0]) \
156		for data in datasets_filled]
157		classlabels = list(set.union(*[set(y) for y in ysetall]))
158		nr_classes = len(classlabels)
159		mapclasses = {classlabels[i] : i for i in range(len(classlabels))}
160		return classlabels, nr_classes, mapclasses
161
162	1	def split_data(Xlists, ybinarylists, indices):
		0 ignored issues – show Coding Style Naming introduced 2016-09-07 16:26 UTC by Report Bug Copy Issue Report The name `Xlists` does not conform to the argument naming conventions (`[a-z_][a-z0-9_]{1,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
163		""" Function takes subset from list given indices
164		Arguments:
165		- Xlists: tuple (samples) of lists (windows)
166		of numpy-arrays (time, variable)
167		- ybinarylist: list (samples) of numpy-arrays (window, class)
168		- indices: indices of the slice of data (samples) to be taken
169		Value (output):
170		- x_setlist: list (windows across samples) of numpy-arrays (time, variable)
171		- y_setlist: list (windows across samples) of numpy-arrays (class, )
172		"""
173	1	tty = str(type(indices))
174		# or statement in next line is to account for python2 and python3
175		# difference
176	1	if tty == "<class 'slice'>" or tty == "<type 'slice'>":
177	1	x_setlist = [X for Xlist in Xlists[indices] for X in Xlist]
178	1	y_setlist = [y for ylist in ybinarylists[indices] for y in ylist]
179		else:
180		x_setlist = [X for X in Xlists[indices]]
181		y_setlist = [y for y in ybinarylists[indices]]
182	1	return x_setlist, y_setlist
183
184	1	def preprocess(targetdir, outdatapath, columns_to_use):
185		""" Function to preprocess the PAMAP2 data after it is fetched
186		Arguments:
187		- targetdir: subdirectory of directory_to_extract_to, targetdir
188		is defined by function fetch_data
189		- outdatapath: a subdirectory of directory_to_extract_to, outdatapath
190		is the direcotry where the Numpy output will be stored.
191		Value (output):
192		- None
193		"""
194		datadir = targetdir + '/PAMAP2_Dataset/Protocol'
195		filenames = listdir(datadir)
196		print('Start pre-processing all ' + str(len(filenames)) + ' files...')
197		# load the files and put them in a list of pandas dataframes:
198		datasets = [pd.read_csv(datadir+'/'+fn, header=None, sep=' ') \
199		for fn in filenames]
200		datasets = addheader(datasets) # add headers to the datasets
201		#Interpolate dataset to get same sample rate between channels
202		datasets_filled = [d.interpolate() for d in datasets]
203		# Create mapping for class labels
204		classlabels, nr_classes, mapclasses = map_clas(datasets_filled)
205		#Create input (x) and output (y) sets
206		xall = [np.array(data[columns_to_use]) for data in datasets_filled]
207		yall = [np.array(data.activityID) for data in datasets_filled]
208		xylists = [split_activities(y, x) for x, y in zip(xall, yall)]
209		Xlists, ylists = zip(*xylists)
		0 ignored issues – show Coding Style Naming introduced 2016-09-02 09:06 UTC by Report Bug Copy Issue Report The name `Xlists` does not conform to the variable naming conventions (`[a-z_][a-z0-9_]{1,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
210		ybinarylists = [transform_y(y, mapclasses, nr_classes) for y in ylists]
211		# Split in train, test and val
212		x_vallist, y_vallist = split_data(Xlists, ybinarylists, indices=6)
213		test_range = slice(7, len(datasets_filled))
214		x_testlist, y_testlist = split_data(Xlists, ybinarylists, test_range)
215		x_trainlist, y_trainlist = split_data(Xlists, ybinarylists, \
216		indices=slice(0, 6))
217		# Take sliding-window frames, target is label of last time step,
218		# and store as numpy file
219		slidingwindow_store(y_list=y_trainlist, x_list=x_trainlist, \
220		X_name='X_train', y_name='y_train', \
221		outdatapath=outdatapath, shuffle=True)
222		slidingwindow_store(y_list=y_vallist, x_list=x_vallist, \
223		X_name='X_val', y_name='y_val', \
224		outdatapath=outdatapath, shuffle=False)
225		slidingwindow_store(y_list=y_testlist, x_list=x_testlist, \
226		X_name='X_test', y_name='y_test', \
227		outdatapath=outdatapath, shuffle=False)
228		print('Processed data succesfully stored in ' + outdatapath)
229		return None
230
231	1	def fetch_and_preprocess(directory_to_extract_to, columns_to_use=None):
232		"""
233		High level function to fetch_and_preprocess the PAMAP2 dataset
234		Arguments:
235		- directory_to_extract_to: the directory where the data will be stored
236		- columns_to_use: the columns to use
237		Values (output):
238		- outdatapath: The directory in which the numpy files are stored
239		"""
240		if columns_to_use is None:
241		columns_to_use = ['hand_acc_16g_x', 'hand_acc_16g_y', 'hand_acc_16g_z',
242		'ankle_acc_16g_x', 'ankle_acc_16g_y', 'ankle_acc_16g_z',
243		'chest_acc_16g_x', 'chest_acc_16g_y', 'chest_acc_16g_z']
244		targetdir = fetch_data(directory_to_extract_to)
245		outdatapath = targetdir + '/PAMAP2_Dataset/slidingwindow512cleaned/'
246		if not os.path.exists(outdatapath):
247		os.makedirs(outdatapath)
248		if os.path.isfile(outdatapath+'x_train.npy'):
249		print('Data previously pre-processed and np-files saved to ' +
250		outdatapath)
251		else:
252		preprocess(targetdir, outdatapath, columns_to_use)
253		return outdatapath
254
255	1	def load_data(outputpath):
256		""" Function to load the numpy data as stored in directory
257		outputpath.
258		"""
259		ext = '.npy'
260		x_train = np.load(outputpath+'X_train'+ext)
261		y_train_binary = np.load(outputpath+'y_train'+ext)
262		x_val = np.load(outputpath+'X_val'+ext)
263		y_val_binary = np.load(outputpath+'y_val'+ext)
264		x_test = np.load(outputpath+'X_test'+ext)
265		y_test_binary = np.load(outputpath+'y_test'+ext)
266		return x_train, y_train_binary, x_val, y_val_binary, x_test, y_test_binary
267

NLeSC / mcfly

Push — master ( 7b8a0a...e386d2 )

split_data() C

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like