Completed
Push to master (e6f995...b6bff9) by Dafne van, created 05:48

download_preprocessed_data()    B

Complexity:   Conditions 4
Size:         Total Lines 25
Duplication:  Lines 25, Ratio 100 %
Importance:   Changes 1, Bugs 0, Features 0

Metric  Value
cc      4
c       1
b       0
f       0
dl      25
loc     25
rs      8.5806
"""
Summary:
Function fetch_and_preprocess from tutorial_pamap2.py helps to fetch and
preprocess the data.
Example function calls in 'Tutorial mcfly on PAMAP2.ipynb'
"""
import numpy as np
from numpy import genfromtxt
import pandas as pd
import matplotlib.pyplot as plt
from os import listdir
import os.path
import zipfile
import keras
from keras.utils.np_utils import to_categorical
import sys
import six.moves.urllib as urllib


def split_activities(labels, X, exclude_activities, borders=10 * 100):
    """
    Splits up the data per activity and excludes the activities listed
    in exclude_activities (e.g. activity 0).
    Also removes the borders for each activity.
    Returns lists with subdatasets.

    Parameters
    ----------
    labels : numpy array
        Activity labels
    X : numpy array
        Data points
    borders : int
        Nr of timesteps to remove from the borders of an activity
    exclude_activities : list or tuple
        Activities to exclude from the output

    Returns
    -------
    X_list
    y_list
    """
    tot_len = len(labels)
    startpoints = np.where([1] + [labels[i] != labels[i - 1]
                                  for i in range(1, tot_len)])[0]
    endpoints = np.append(startpoints[1:] - 1, tot_len - 1)
    acts = [labels[s] for s, e in zip(startpoints, endpoints)]
    # Also split up the data, and only keep the non-excluded activities
    xysplit = [(X[s + borders:e - borders + 1, :], a)
               for s, e, a in zip(startpoints, endpoints, acts)
               if a not in exclude_activities]
    xysplit = [(X, y) for X, y in xysplit if len(X) > 0]
    Xlist = [X for X, y in xysplit]
    ylist = [y for X, y in xysplit]
    return Xlist, ylist
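
# A minimal usage sketch (hypothetical toy arrays; borders=0 so the short
# example is not trimmed away). split_activities returns one (data, label)
# pair per non-excluded activity block:
#
#     labels = np.array([0, 0, 1, 1, 1, 2, 2, 0])
#     X = np.random.rand(8, 3)
#     Xlist, ylist = split_activities(labels, X, exclude_activities=[0],
#                                     borders=0)
#     # Xlist == [X[2:5], X[5:7]] and ylist == [1, 2]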


def sliding_window(frame_length, step, Xsampleslist, ysampleslist):
    """
    Splits the time series in ysampleslist and Xsampleslist
    into segments by applying a sliding overlapping window
    of size equal to frame_length with steps equal to step.
    It does this for all the samples and appends all the output together,
    so the participant distinction is not kept.

    Parameters
    ----------
    frame_length : int
        Length of sliding window
    step : int
        Stepsize between windows
    Xsampleslist : list
        Samples to take sliding windows from
    ysampleslist : list
        Labels belonging to the samples in Xsampleslist

    Returns
    -------
    Xsamples : list
        List of window fragments
    ysamples : list
        List of labels, one per window fragment
    """
    Xsamples = []
    ysamples = []
    for j in range(len(Xsampleslist)):
        X = Xsampleslist[j]
        ybinary = ysampleslist[j]
        for i in range(0, X.shape[0] - frame_length, step):
            xsub = X[i:i + frame_length, :]
            ysub = ybinary
            Xsamples.append(xsub)
            ysamples.append(ysub)
    return Xsamples, ysamples
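
# A minimal usage sketch, assuming Xlist and ylist come from the previous
# example. Note that the loop uses range(0, X.shape[0] - frame_length, step),
# so series no longer than frame_length yield no windows and any trailing
# remainder is dropped.
#
#     x_windows, y_windows = sliding_window(frame_length=512, step=100,
#                                           Xsampleslist=Xlist,
#                                           ysampleslist=ylist)
#     # each element of x_windows has shape (512, n_channels); y_windows
#     # repeats the corresponding label once per window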


def transform_y(y, mapclasses, nr_classes):
    """
    Transforms y, a list with one sequence of A timesteps
    and B unique classes into a binary Numpy matrix of
    shape (A, B)

    Parameters
    ----------
    y : list or array
        List of classes
    mapclasses : dict
        dictionary that maps the classes to numbers
    nr_classes : int
        total number of classes
    """
    ymapped = np.array([mapclasses[c] for c in y], dtype='int')
    ybinary = to_categorical(ymapped, nr_classes)
    return ybinary
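
# A minimal usage sketch with a hypothetical class mapping of the kind
# produced by map_class below:
#
#     mapclasses = {1: 0, 2: 1, 7: 2}
#     ybinary = transform_y([1, 7, 2, 1], mapclasses, nr_classes=3)
#     # ybinary.shape == (4, 3); e.g. row 1 is [0., 0., 1.] because
#     # label 7 maps to class index 2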


def get_header():
    axes = ['x', 'y', 'z']
    IMUsensor_columns = ['temperature'] + \
        ['acc_16g_' + i for i in axes] + \
        ['acc_6g_' + i for i in axes] + \
        ['gyroscope_' + i for i in axes] + \
        ['magnometer_' + i for i in axes] + \
        ['orientation_' + str(i) for i in range(4)]
    header = ["timestamp", "activityID", "heartrate"] + \
        ["hand_" + s for s in IMUsensor_columns] + \
        ["chest_" + s for s in IMUsensor_columns] + \
        ["ankle_" + s for s in IMUsensor_columns]
    return header


def addheader(datasets):
    """
    The columns of the pandas data frames are numbers;
    this function adds the column labels.

    Parameters
    ----------
    datasets : list
        List of pandas dataframes
    """
    header = get_header()
    for i in range(0, len(datasets)):
        datasets[i].columns = header
    return datasets


def numpify_and_store(X, y, X_name, y_name, outdatapath, shuffle=False):
    """
    Converts the python lists X and y into numpy arrays
    and stores them in the directory outdatapath.
    Shuffling the samples before storing is optional.

    Parameters
    ----------
    X : list
        list with data
    y : list
        list with data
    X_name : str
        name to store the x arrays
    y_name : str
        name to store the y arrays
    outdatapath : str
        path to the directory to store the data
    shuffle : bool
        whether to shuffle the data before storing
    """
    X = np.array(X)
    y = np.array(y)
    # Shuffle the train set
    if shuffle:
        np.random.seed(123)
        neworder = np.random.permutation(X.shape[0])
        X = X[neworder, :, :]
        y = y[neworder, :]
    # Save binary file
    xpath = os.path.join(outdatapath, X_name)
    ypath = os.path.join(outdatapath, y_name)
    np.save(xpath, X)
    np.save(ypath, y)
    print('Stored ' + xpath, y_name)
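
# A minimal usage sketch; the output directory is hypothetical:
#
#     numpify_and_store(x_train, y_train, X_name='X_train', y_name='y_train',
#                       outdatapath='/tmp/pamap2_out', shuffle=True)
#     # np.save appends the extension, so this writes
#     # /tmp/pamap2_out/X_train.npy and /tmp/pamap2_out/y_train.npy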


def fetch_data(directory_to_extract_to):
    """
    Fetch the data and extract the contents of the zip file
    to the directory_to_extract_to.
    First check whether this was done before; if yes, then skip.

    Parameters
    ----------
    directory_to_extract_to : str
        directory to create subfolder 'PAMAP2'

    Returns
    -------
    targetdir: str
        directory where the data is extracted
    """
    targetdir = os.path.join(directory_to_extract_to, "PAMAP2")
    if os.path.exists(targetdir):
        print('Data previously downloaded and stored in ' + targetdir)
    else:
        os.makedirs(targetdir)  # create target directory
        # Download the PAMAP2 data, this is 688 Mb
        path_to_zip_file = os.path.join(directory_to_extract_to,
                                        'PAMAP2_Dataset.zip')
        test_file_exist = os.path.isfile(path_to_zip_file)
        if test_file_exist is False:
            url = str('https://archive.ics.uci.edu/ml/' +
                      'machine-learning-databases/00231/PAMAP2_Dataset.zip')
            # retrieve data from url
            local_fn, headers = urllib.request.urlretrieve(
                url, filename=path_to_zip_file)
            print('Download complete and stored in: ' + path_to_zip_file)
        else:
            print('The data was previously downloaded and stored in ' +
                  path_to_zip_file)
        # unzip

        with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
            zip_ref.extractall(targetdir)
        os.remove(path_to_zip_file)
    return targetdir
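
# A minimal usage sketch; the directory is hypothetical:
#
#     targetdir = fetch_data('/tmp/data')
#     # on the first call this downloads PAMAP2_Dataset.zip (about 688 Mb)
#     # from the UCI repository and extracts it into /tmp/data/PAMAP2;
#     # later calls detect the existing folder and skip the download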


def map_class(datasets_filled):
    ysetall = [set(np.array(data.activityID)) - set([0])
               for data in datasets_filled]
    classlabels = list(set.union(*[set(y) for y in ysetall]))
    nr_classes = len(classlabels)
    mapclasses = {classlabels[i]: i for i in range(len(classlabels))}
    return classlabels, nr_classes, mapclasses


def split_data(Xlists, ybinarylists, indices):
    """ Function takes subset from list given indices

    Parameters
    ----------
    Xlists : tuple
        tuple (samples) of lists (windows) of numpy-arrays (time, variable)
    ybinarylists : list
        list (samples) of numpy-arrays (window, class)
    indices : slice or int
        indices of the slice of data (samples) to be taken

    Returns
    -------
    x_setlist : list
        list (windows across samples) of numpy-arrays (time, variable)
    y_setlist : list
        list (windows across samples) of numpy-arrays (class, )
    """
    # isinstance covers the slice type in both Python 2 and Python 3
    if isinstance(indices, slice):
        x_setlist = [X for Xlist in Xlists[indices] for X in Xlist]
        y_setlist = [y for ylist in ybinarylists[indices] for y in ylist]
    else:
        x_setlist = [X for X in Xlists[indices]]
        y_setlist = [y for y in ybinarylists[indices]]
    return x_setlist, y_setlist
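
# A minimal usage sketch, mirroring how preprocess() below splits the PAMAP2
# subjects: an int selects the list for one subject, a slice concatenates the
# lists of several subjects.
#
#     x_val, y_val = split_data(Xlists, ybinarylists, indices=6)
#     x_test, y_test = split_data(Xlists, ybinarylists,
#                                 indices=slice(7, len(Xlists)))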


def split_data_random(X, y, val_size, test_size):
    X = np.array(X)
    y = np.array(y)
    size = len(X)
    train_size = size - val_size - test_size
    indices = np.random.permutation(size)
    X_train = X[indices[:train_size]]
    y_train = y[indices[:train_size]]
    X_val = X[indices[train_size:train_size + val_size]]
    y_val = y[indices[train_size:train_size + val_size]]
    X_test = X[indices[train_size + val_size:]]
    y_test = y[indices[train_size + val_size:]]
    return X_train, y_train, X_val, y_val, X_test, y_test
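
# A minimal usage sketch with hypothetical set sizes: the windows are permuted
# once and then cut into train, validation and test parts of
# len(X) - 300, 100 and 200 windows respectively.
#
#     x_train, y_train, x_val, y_val, x_test, y_test = \
#         split_data_random(X, y, val_size=100, test_size=200)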


def preprocess(targetdir, outdatapath, columns_to_use, exclude_activities,
               fold, val_test_size=None):
    """ Function to preprocess the PAMAP2 data after it is fetched

    Parameters
    ----------
    targetdir : str
        subdirectory of directory_to_extract_to, targetdir
        is defined by function fetch_data
    outdatapath : str
        a subdirectory of directory_to_extract_to, outdatapath
        is the directory where the Numpy output will be stored.
    columns_to_use : list
        list of column names to use
    exclude_activities : list or tuple
        activities to exclude from the dataset
    fold : boolean
        Whether to store each fold separately ('False' creates
        Train, Test and Validation sets)
    val_test_size : tuple, optional
        Tuple (val_size, test_size); when given, the windows are split
        randomly instead of per subject

    Returns
    -------
    None
    """
    datadir = os.path.join(targetdir, 'PAMAP2_Dataset', 'Protocol')
    filenames = listdir(datadir)
    filenames.sort()
    print('Start pre-processing all ' + str(len(filenames)) + ' files...')
    # load the files and put them in a list of pandas dataframes:
    datasets = [pd.read_csv(os.path.join(datadir, fn), header=None, sep=' ')
                for fn in filenames]
    datasets = addheader(datasets)  # add headers to the datasets
    # Interpolate dataset to get same sample rate between channels
    datasets_filled = [d.interpolate() for d in datasets]
    # Create mapping for class labels
    classlabels, nr_classes, mapclasses = map_class(datasets_filled)
    # Create input (x) and output (y) sets
    xall = [np.array(data[columns_to_use]) for data in datasets_filled]
    yall = [np.array(data.activityID) for data in datasets_filled]
    xylists = [split_activities(y, x, exclude_activities)
               for x, y in zip(xall, yall)]
    Xlists, ylists = zip(*xylists)
    ybinarylists = [transform_y(y, mapclasses, nr_classes) for y in ylists]
    frame_length = int(5.12 * 100)
    step = 1 * 100
    if not fold:
        if val_test_size is None:
            # Split in train, test and val
            x_vallist, y_vallist = split_data(Xlists, ybinarylists, indices=6)
            test_range = slice(7, len(datasets_filled))
            x_testlist, y_testlist = split_data(Xlists, ybinarylists,
                                                test_range)
            x_trainlist, y_trainlist = split_data(Xlists, ybinarylists,
                                                  indices=slice(0, 6))
            # Take sliding-window frames, target is label of last time step,
            # and store as numpy file
            x_train, y_train = sliding_window(frame_length, step, x_trainlist,
                                              y_trainlist)
            x_val, y_val = sliding_window(frame_length, step, x_vallist,
                                          y_vallist)
            x_test, y_test = sliding_window(frame_length, step, x_testlist,
                                            y_testlist)
        else:
            val_size, test_size = val_test_size
            X_list, y_list = split_data(Xlists, ybinarylists,
                                        slice(0, len(datasets_filled)))
            X, y = sliding_window(frame_length, step, X_list, y_list)
            x_train, y_train, x_val, y_val, x_test, y_test = \
                split_data_random(X, y, val_size, test_size)

        numpify_and_store(x_train, y_train, X_name='X_train',
                          y_name='y_train', outdatapath=outdatapath,
                          shuffle=True)
        numpify_and_store(x_val, y_val, X_name='X_val', y_name='y_val',
                          outdatapath=outdatapath, shuffle=False)
        numpify_and_store(x_test, y_test, X_name='X_test', y_name='y_test',
                          outdatapath=outdatapath, shuffle=False)
    else:
        for i in range(len(Xlists)):
            X_i, y_i = split_data(Xlists, ybinarylists, i)
            X, y = sliding_window(frame_length, step, X_i, y_i)
            numpify_and_store(X, y, X_name='X_' + str(i),
                              y_name='y_' + str(i),
                              outdatapath=outdatapath, shuffle=True)

    print('Processed data successfully stored in ' + outdatapath)
    return None
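
# A minimal usage sketch; targetdir and outdatapath are hypothetical and would
# normally come from fetch_data and fetch_and_preprocess below. The windows
# produced above are int(5.12 * 100) = 512 timesteps long with a step of
# 100 timesteps.
#
#     preprocess(targetdir='/tmp/data/PAMAP2',
#                outdatapath='/tmp/data/PAMAP2/preprocessed',
#                columns_to_use=['hand_acc_16g_x', 'hand_acc_16g_y',
#                                'hand_acc_16g_z'],
#                exclude_activities=[0], fold=False)
#     # with fold=False and val_test_size=None this writes X_train.npy,
#     # y_train.npy, X_val.npy, y_val.npy, X_test.npy and y_test.npy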


def fetch_and_preprocess(directory_to_extract_to, columns_to_use=None,
                         output_dir='preprocessed', exclude_activities=[0],
                         fold=False, val_test_size=None):
    """
    High level function to fetch_and_preprocess the PAMAP2 dataset

    Parameters
    ----------
    directory_to_extract_to : str
        the directory where the data will be stored
    columns_to_use : list
        the columns to use
    output_dir : str
        name of the directory to write the output data to
    exclude_activities : list or tuple
        activities to exclude from the dataset
    fold : boolean
        Whether to store each fold separately ('False' creates
        Train, Test and Validation sets)
    val_test_size : tuple, optional
        Tuple (val_size, test_size); when given, the windows are split
        randomly instead of per subject

    Returns
    -------
    outdatapath: str
        The directory in which the numpy files are stored
    """
    if columns_to_use is None:
        columns_to_use = ['hand_acc_16g_x', 'hand_acc_16g_y',
                          'hand_acc_16g_z', 'ankle_acc_16g_x',
                          'ankle_acc_16g_y', 'ankle_acc_16g_z',
                          'chest_acc_16g_x', 'chest_acc_16g_y',
                          'chest_acc_16g_z']
    targetdir = fetch_data(directory_to_extract_to)
    outdatapath = os.path.join(targetdir, output_dir)
    if not os.path.exists(outdatapath):
        os.makedirs(outdatapath)
    if os.path.isfile(os.path.join(outdatapath, 'X_train.npy')):
        print('Data previously pre-processed and np-files saved to ' +
              outdatapath)
    else:
        preprocess(targetdir, outdatapath, columns_to_use,
                   exclude_activities, fold, val_test_size)
    return outdatapath
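
# A minimal usage sketch, following the call pattern referenced in the module
# docstring ('Tutorial mcfly on PAMAP2.ipynb'); the directory is hypothetical:
#
#     outdatapath = fetch_and_preprocess('/tmp/data')
#     # downloads PAMAP2 if needed, preprocesses it with the default
#     # accelerometer columns and returns the directory holding the .npy
#     # files (here /tmp/data/PAMAP2/preprocessed)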


def load_data(outputpath):
    """ Function to load the numpy data as stored in directory
    outputpath.

    Parameters
    ----------
    outputpath : str
        directory where the numpy files are stored

    Returns
    -------
    x_train
    y_train_binary
    x_val
    y_val_binary
    x_test
    y_test_binary
    """
    ext = '.npy'
    x_train = np.load(os.path.join(outputpath, 'X_train' + ext))
    y_train_binary = np.load(os.path.join(outputpath, 'y_train' + ext))
    x_val = np.load(os.path.join(outputpath, 'X_val' + ext))
    y_val_binary = np.load(os.path.join(outputpath, 'y_val' + ext))
    x_test = np.load(os.path.join(outputpath, 'X_test' + ext))
    y_test_binary = np.load(os.path.join(outputpath, 'y_test' + ext))
    return x_train, y_train_binary, x_val, y_val_binary, x_test, y_test_binary
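
# A minimal usage sketch: reloading the arrays written by preprocess, given
# the path returned by fetch_and_preprocess above:
#
#     x_train, y_train_binary, x_val, y_val_binary, x_test, y_test_binary = \
#         load_data(outdatapath)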


def download_preprocessed_data(directory_to_extract_to):
    data_path = os.path.join(directory_to_extract_to,
                             'data/PAMAP2/preprocessed')

    if not os.path.isdir(data_path):
        path_to_zip_file = os.path.join(directory_to_extract_to, 'data.zip')

        # Download zip file with data
        if not os.path.isfile(path_to_zip_file):
            print("Downloading data...")
            local_fn, headers = urllib.request.urlretrieve(
                'https://zenodo.org/record/345082/files/data.zip',
                filename=path_to_zip_file)
        else:
            print("Data already downloaded")

        # Extract the zip file
        with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
            print("Extracting data...")
            zip_ref.extractall(directory_to_extract_to)
        print("Done")
    else:
        print("Data already downloaded and extracted.")

    return data_path
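
# A minimal usage sketch; the target directory is hypothetical:
#
#     data_path = download_preprocessed_data('/tmp')
#     # downloads data.zip from Zenodo on the first call, extracts it, and
#     # returns /tmp/data/PAMAP2/preprocessed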