fetch_data() - Code Metrics - Inspection of "code style improvements" - NLeSC/mcfly - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 5f17ca...c9250c )

unknown

created 2016-09-02 09:01 UTC

fetch_data() B

↳ Parent: Project

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	0
CRAP Score	20

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
cc	4
c	1
b	0
f	0
dl	0
loc	25
ccs	0
cts	15
cp	0
crap	20
rs	8.5806

#import required python modules
import numpy as np
from numpy import genfromtxt
import pandas as pd
import matplotlib.pyplot as plt
from os import listdir
import os.path
import urllib.request
import zipfile
import keras
from keras.utils.np_utils import to_categorical

def split_activities(labels, X, borders=10*100):
    """
    Split the data into one sub-array per contiguous activity segment.

    Segments labelled 0 are dropped, and `borders` rows are trimmed from
    both ends of every remaining segment. Segments that are too short to
    survive the trimming are dropped as well.

    labels: sequence with one activity label per row of X
    X: 2-dimensional array, rows aligned with labels
    borders: number of rows to remove at the start and at the end of
        each segment

    Returns (Xlist, ylist): a list with the trimmed sub-arrays and a list
    with the matching activity label of each sub-array.
    """
    tot_len = len(labels)
    if tot_len == 0:
        # Nothing to split; avoids indexing labels[0] on empty input
        return [], []

    # A new segment starts at row 0 and wherever the label changes
    labels_arr = np.asarray(labels)
    changes = np.flatnonzero(labels_arr[1:] != labels_arr[:-1]) + 1
    startpoints = np.concatenate(([0], changes))
    endpoints = np.append(startpoints[1:] - 1, tot_len - 1)

    Xlist = []
    ylist = []
    for start, end in zip(startpoints, endpoints):
        activity = labels[start]
        if activity == 0:
            continue
        first = start + borders
        stop = end - borders + 1
        # Guard against stop <= first: a negative stop would otherwise be
        # read as "count from the end" and return rows of other segments
        if stop <= first:
            continue
        segment = X[first:stop, :]
        if len(segment) > 0:
            Xlist.append(segment)
            ylist.append(activity)
    return Xlist, ylist

def sliding_window(frame_length, step, Xsamples, ysamples,ysampleslist,Xsampleslist):
    """
    Cut every time series in Xsampleslist into overlapping frames.

    A window of frame_length rows is moved over each series in
    increments of step rows. Window start positions run over
    range(0, number_of_rows - frame_length, step). Every frame is
    appended to Xsamples and the label found at the same position in
    ysampleslist is appended to ysamples, once per frame.

    The output lists are extended in place and nothing is returned.
    Frames of all series end up in the same lists, so it is no longer
    visible which series a frame came from.
    """
    for index, series in enumerate(Xsampleslist):
        label = ysampleslist[index]
        last_start = series.shape[0] - frame_length
        frames = [series[begin:begin + frame_length, :]
                  for begin in range(0, last_start, step)]
        Xsamples.extend(frames)
        # one copy of the series label per generated frame
        ysamples.extend([label] * len(frames))

def transform_y(y,mapclasses,nr_classes):
    """
    Translate the class labels in y into a binary class matrix.

    Each label is looked up in mapclasses to obtain an integer class
    index; the indices are then passed to to_categorical together with
    nr_classes.
    """
    class_indices = [mapclasses[label] for label in y]
    return to_categorical(np.array(class_indices, dtype='int'), nr_classes)

def addheader(datasets):
    """
    The columns of the pandas data frames are numbers;
    this function replaces them with descriptive column labels.

    The header consists of timestamp, activityID and heartrate, followed
    by the same block of IMU sensor columns for the hand, chest and ankle
    (in that order).

    datasets: list of pandas data frames, modified in place
    Returns the same list.
    """
    axes = ['x', 'y', 'z']
    # Parenthesised so the expression can span lines without backslashes
    imu_sensor_columns = (
        ['temperature']
        + ['acc_16g_' + axis for axis in axes]
        + ['acc_6g_' + axis for axis in axes]
        + ['gyroscope_' + axis for axis in axes]
        + ['magnometer_' + axis for axis in axes]
        + ['orientation_' + str(i) for i in range(4)]
    )
    header = ["timestamp", "activityID", "heartrate"]
    for location in ("hand_", "chest_", "ankle_"):
        header += [location + column for column in imu_sensor_columns]

    for dataset in datasets:
        dataset.columns = header
    return datasets

def split_dataset(datasets_filled,Xlists,ybinarylists):
    """
    Divide Xlists and ybinarylists into a train, validation and test part.

    The entries at positions 0-5 form the train set, the entry at
    position 6 is the validation set and the entries from position 7 up
    to len(datasets_filled) form the test set. Train and test entries
    are flattened into a single list each.

    Returns Xtrainlist, Xvallist, Xtestlist, ytrainlist, yvallist,
    ytestlist.
    """
    train_part = slice(0, 6)
    val_index = 6
    test_part = slice(7, len(datasets_filled))

    def flatten(groups):
        # concatenate the items of all groups into one flat list
        flat = []
        for group in groups:
            flat.extend(group)
        return flat

    Xtrainlist = flatten(Xlists[train_part])
    Xvallist = list(Xlists[val_index])
    Xtestlist = flatten(Xlists[test_part])

    ytrainlist = flatten(ybinarylists[train_part])
    yvallist = list(ybinarylists[val_index])
    ytestlist = flatten(ybinarylists[test_part])
    return Xtrainlist, Xvallist, Xtestlist, ytrainlist, yvallist, ytestlist

def numpify_and_store(x,y,Xname,yname,outdatapath,shuffle=False):
    """
    Turn the python lists x and y into numpy arrays and save them.

    The arrays are written with np.save to outdatapath + Xname and
    outdatapath + yname. When shuffle is True the samples of x and y are
    first reordered with one and the same random permutation, drawn after
    seeding numpy's global random generator with 123.
    """
    features = np.array(x)
    targets = np.array(y)

    if shuffle is True:
        # fixed seed, so every run produces the same sample order
        np.random.seed(123)
        order = np.random.permutation(features.shape[0])
        features, targets = features[order, :, :], targets[order, :]

    # write both arrays as binary .npy files
    for name, array in ((Xname, features), (yname, targets)):
        np.save(outdatapath + name, array)


def fetch_data(directory_to_extract_to):
    """
    Fetch the data and extract the contents of the zip file
    to the directory_to_extract_to.
    First check whether this was done before, if yes, then skip.

    The target directory <directory_to_extract_to>/PAMAP2 only comes into
    existence after the archive was extracted completely, so a failed
    download or extraction is not mistaken for a finished one on the
    next call.

    directory_to_extract_to: directory in which the zip file and the
        extracted PAMAP2 directory are stored
    Returns the path of the directory with the extracted data.
    Errors raised by the download or by zipfile are passed on.
    """
    targetdir = directory_to_extract_to + '/PAMAP2'
    if os.path.exists(targetdir):
        print('Data previously downloaded and stored in ' + targetdir)
        return targetdir

    # Make sure the directory that will hold the zip file exists
    parentdir = os.path.dirname(targetdir)
    if parentdir:
        os.makedirs(parentdir, exist_ok=True)

    #download the PAMAP2 data, this is 688 Mb
    path_to_zip_file = directory_to_extract_to + '/PAMAP2_Dataset.zip'
    if os.path.isfile(path_to_zip_file):
        print('The data was previously downloaded and stored in ' + path_to_zip_file )
    else:
        url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00231/PAMAP2_Dataset.zip'
        # Download under a temporary name and rename afterwards, so an
        # interrupted download never leaves a truncated zip file behind
        # under the final name
        partial_zip = path_to_zip_file + '.part'
        urllib.request.urlretrieve(url, filename=partial_zip)
        os.replace(partial_zip, path_to_zip_file)
        print('Download complete and stored in: ' + path_to_zip_file )

    # unzip into a scratch directory first; it only gets its final name
    # once the extraction finished without errors
    partial_dir = targetdir + '.partial'
    with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref:
        zip_ref.extractall(partial_dir)
    os.rename(partial_dir, targetdir)
    return targetdir


def fetch_and_preprocess(directory_to_extract_to, columns_to_use = None):
    """
    High level function to fetch_and_preprocess the PAMAP2 dataset.

    The data is fetched with fetch_data, split into activity segments,
    cut into sliding-window frames and stored as .npy files
    (X_train, y_train, X_val, y_val, X_test, y_test). If X_train.npy is
    already present in the output directory, the pre-processing is
    skipped.

    The protocol files are processed in sorted filename order, because
    split_dataset assigns files to train, validation and test by their
    position in the list.

    directory_to_extract_to: the directory where the data will be stored
    columns_to_use: the columns to use; defaults to the 16g
        accelerometer axes of hand, ankle and chest
    Returns the directory in which the .npy files are stored.
    """
    if columns_to_use is None:
        columns_to_use = ['hand_acc_16g_x', 'hand_acc_16g_y', 'hand_acc_16g_z',
                     'ankle_acc_16g_x', 'ankle_acc_16g_y', 'ankle_acc_16g_z',
                     'chest_acc_16g_x', 'chest_acc_16g_y', 'chest_acc_16g_z']
    targetdir = fetch_data(directory_to_extract_to)
    outdatapath = targetdir + '/PAMAP2_Dataset' + '/slidingwindow512cleaned/'
    if not os.path.exists(outdatapath):
        os.makedirs(outdatapath)
    if os.path.isfile(outdatapath+'X_train.npy'):
        print('Data previously pre-processed and np-files saved to ' + outdatapath)
        return outdatapath

    datadir = targetdir + '/PAMAP2_Dataset/Protocol'
    # listdir returns the names in arbitrary order; sort them so that the
    # position-based train/val/test split is the same on every machine
    filenames = sorted(listdir(datadir))
    print('Start pre-processing all ' + str(len(filenames)) + ' files...')
    # load the files and put them in a list of pandas dataframes:
    datasets = [pd.read_csv(datadir+'/'+fn, header=None, sep=' ') for fn in filenames]
    datasets = addheader(datasets) # add headers to the datasets
    #Interpolate dataset to get same sample rate between channels
    datasets_filled = [d.interpolate() for d in datasets]

    # Create mapping for class labels (activity 0 is excluded);
    # sorted so that the label -> index mapping is deterministic
    ysetall = [set(np.array(data.activityID)) - set([0]) for data in datasets_filled]
    classlabels = sorted(set().union(*ysetall))
    nr_classes = len(classlabels)
    mapclasses = {label: index for index, label in enumerate(classlabels)}

    #Create input (X) and output (y) sets
    Xall = [np.array(data[columns_to_use]) for data in datasets_filled]
    yall = [np.array(data.activityID) for data in datasets_filled]
    Xylists = [split_activities(y, X) for X, y in zip(Xall, yall)]
    Xlists, ylists = zip(*Xylists)
    ybinarylists = [transform_y(y, mapclasses, nr_classes) for y in ylists]

    # Split in train, test and val
    Xtrainlist, Xvallist, Xtestlist, ytrainlist, yvallist, ytestlist = split_dataset(datasets_filled, Xlists, ybinarylists)

    # Take sliding-window frames. Every frame gets the label of the
    # activity segment it was cut from.
    # Data is 100 Hz
    frame_length = int(5.12 * 100)
    step = 1 * 100
    Xtrain = []
    ytrain = []
    Xval = []
    yval = []
    Xtest = []
    ytest = []
    sliding_window(frame_length, step, Xtrain, ytrain, ytrainlist, Xtrainlist)
    sliding_window(frame_length, step, Xval, yval, yvallist, Xvallist)
    sliding_window(frame_length, step, Xtest, ytest, ytestlist, Xtestlist)

    numpify_and_store(Xtrain, ytrain, 'X_train', 'y_train', outdatapath, shuffle=True)
    numpify_and_store(Xval, yval, 'X_val', 'y_val', outdatapath, shuffle=True)
    numpify_and_store(Xtest, ytest, 'X_test', 'y_test', outdatapath, shuffle=True)
    print('Processed data succesfully stored in ' + outdatapath)
    return outdatapath

def load_data(outputpath):
    """
    Load the pre-processed train, validation and test arrays.

    The file names are the ones written by fetch_and_preprocess:
    X_train, y_train, X_val, y_val, X_test and y_test, each with the
    extension .npy.

    outputpath: directory with the .npy files; it is prepended to the
        file names as-is, so it has to end with a path separator
    Returns Xtrain, ytrain_binary, Xval, yval_binary, Xtest, ytest_binary
    """
    ext = '.npy'
    Xtrain = np.load(outputpath+'X_train'+ext)
    ytrain_binary = np.load(outputpath+'y_train'+ext)
    Xval = np.load(outputpath+'X_val'+ext)
    yval_binary = np.load(outputpath+'y_val'+ext)
    Xtest = np.load(outputpath+'X_test'+ext)
    ytest_binary = np.load(outputpath+'y_test'+ext)
    return Xtrain, ytrain_binary, Xval, yval_binary, Xtest, ytest_binary


1			#import required python modules
2			import numpy as np
3			from numpy import genfromtxt
4			import pandas as pd
5			import matplotlib.pyplot as plt
6			from os import listdir
7			import os.path
8			import urllib.request
9			import zipfile
10			import keras
11			from keras.utils.np_utils import to_categorical
12
13			def split_activities(labels, X, borders=10*100):
			0 ignored issues – show Coding Style Naming introduced 2016-09-01 14:23 UTC by Report Bug Copy Issue Report The name `X` does not conform to the argument naming conventions (`[a-z_][a-z0-9_]{1,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
14			"""
15			Splits up the data per activity and exclude activity=0.
16			Also remove borders for each activity.
17			Returns lists with subdatasets
18			"""
19			tot_len = len(labels)
20			startpoints = np.where([1] + [labels[i]!=labels[i-1] for i in range(1, tot_len)])[0]
			0 ignored issues – show Coding Style introduced 2016-09-01 14:23 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (88/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history... Coding Style introduced 2016-09-01 14:23 UTC by Report Bug Copy Issue Report Exactly one space required around comparison startpoints = np.where([1] + [labels[i]!=labels[i-1] for i in range(1, tot_len)])[0] ^^ Loading history...
21			endpoints = np.append(startpoints[1:]-1, tot_len-1)
22			acts = [labels[s] for s,e in zip(startpoints, endpoints)]
			0 ignored issues – show Coding Style introduced 2016-09-01 14:23 UTC by Report Bug Copy Issue Report Exactly one space required after comma acts = [labels[s] for s,e in zip(startpoints, endpoints)] ^ Loading history...
23			#Also split up the data, and only keep the non-zero activities
24			Xysplit = [(X[s+borders:e-borders+1,:], a) for s,e,a in zip(startpoints, endpoints, acts) if a != 0]
			0 ignored issues – show Coding Style introduced 2016-09-01 14:23 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (104/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history... Coding Style introduced 2016-09-02 09:06 UTC by Report Bug Copy Issue Report Exactly one space required after comma Xysplit = [(X[s+borders:e-borders+1,:], a) for s,e,a in zip(startpoints, endpoints, acts) if a != 0] ^ Loading history... Coding Style introduced 2016-09-02 09:06 UTC by Report Bug Copy Issue Report Exactly one space required after comma Xysplit = [(X[s+borders:e-borders+1,:], a) for s,e,a in zip(startpoints, endpoints, acts) if a != 0] ^ Loading history... Coding Style introduced 2016-09-02 09:06 UTC by Report Bug Copy Issue Report Exactly one space required after comma Xysplit = [(X[s+borders:e-borders+1,:], a) for s,e,a in zip(startpoints, endpoints, acts) if a != 0] ^ Loading history... Coding Style Naming introduced 2016-09-02 09:06 UTC by Report Bug Copy Issue Report The name `Xysplit` does not conform to the variable naming conventions (`[a-z_][a-z0-9_]{1,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
25			Xysplit = [(X, y) for X,y in Xysplit if len(X)>0]
			0 ignored issues – show Coding Style introduced 2016-09-02 09:06 UTC by Report Bug Copy Issue Report Exactly one space required after comma Xysplit = [(X, y) for X,y in Xysplit if len(X)>0] ^ Loading history... Coding Style introduced 2016-09-02 09:06 UTC by Report Bug Copy Issue Report Exactly one space required around comparison Xysplit = [(X, y) for X,y in Xysplit if len(X)>0] ^ Loading history... Coding Style Naming introduced 2016-09-02 09:06 UTC by Report Bug Copy Issue Report The name `Xysplit` does not conform to the variable naming conventions (`[a-z_][a-z0-9_]{1,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
26			Xlist = [X for X,y in Xysplit]
			0 ignored issues – show Coding Style introduced 2016-09-02 09:06 UTC by Report Bug Copy Issue Report Exactly one space required after comma Xlist = [X for X,y in Xysplit] ^ Loading history... Coding Style Naming introduced 2016-09-02 09:06 UTC by Report Bug Copy Issue Report The name `Xlist` does not conform to the variable naming conventions (`[a-z_][a-z0-9_]{1,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
27			ylist = [y for X,y in Xysplit]
			0 ignored issues – show Coding Style introduced 2016-09-02 09:06 UTC by Report Bug Copy Issue Report Exactly one space required after comma ylist = [y for X,y in Xysplit] ^ Loading history...
28			return Xlist, ylist
29
30			def sliding_window(frame_length, step, Xsamples, ysamples,ysampleslist,Xsampleslist):
			0 ignored issues – show Coding Style introduced 2016-09-01 15:34 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (85/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history... Coding Style introduced 2016-09-02 09:06 UTC by Report Bug Copy Issue Report Exactly one space required after comma def sliding_window(frame_length, step, Xsamples, ysamples,ysampleslist,Xsampleslist): ^ Loading history... Coding Style introduced 2016-09-02 09:06 UTC by Report Bug Copy Issue Report Exactly one space required after comma def sliding_window(frame_length, step, Xsamples, ysamples,ysampleslist,Xsampleslist): ^ Loading history... Coding Style Naming introduced 2016-09-02 09:06 UTC by Report Bug Copy Issue Report The name `Xsamples` does not conform to the argument naming conventions (`[a-z_][a-z0-9_]{1,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history... Coding Style Naming introduced 2016-09-02 09:06 UTC by Report Bug Copy Issue Report The name `Xsampleslist` does not conform to the argument naming conventions (`[a-z_][a-z0-9_]{1,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
31			"""
32			Splits time series in ysampleslist and Xsampleslist

NLeSC / mcfly

Push — master ( 5f17ca...c9250c )

fetch_data() B

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like