kNN_accuracy() - Code Metrics - Inspection of "Renamed visualization file" - NLeSC/mcfly - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( d10d5f...edd44f )

by Dafne van

created 2017-01-10 14:48 UTC

kNN_accuracy() B

↳ Parent: Project

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	0
CRAP Score	2

Importance

Changes

Metric	Value
cc	1
c	0
b	0
f	0
dl	0
loc	35
ccs	0
cts	7
cp	0
crap	2
rs	8.8571

"""
 Summary:
 This module provides the main functionality of mcfly: searching for an
 optimal model architecture. The work flow is as follows:
 Function generate_models from modelgen.py generates and compiles models.
 Function train_models_on_samples trains those models.
 Function plotTrainingProcess plots the training process.
 Function find_best_architecture is a wrapper function that combines
 these steps.
 Example function calls can be found in the tutorial notebook
 'EvaluateDifferentModels.ipynb'.
"""
import numpy as np
from matplotlib import pyplot as plt
from . import modelgen
from sklearn import neighbors, metrics
import warnings
import json
import os
from keras.callbacks import EarlyStopping

def train_models_on_samples(X_train, y_train, X_val, y_val, models,
                            nr_epochs=5, subset_size=100, verbose=True,
                            outputfile=None, early_stopping=False):
    """
    Train every model in ``models`` on the first ``subset_size`` samples of
    the train data and collect the training results.

    If ``subset_size`` is larger than the number of available train samples,
    the complete train set is used.

    Parameters
    ----------
    X_train : numpy array of shape (num_samples, num_timesteps, num_channels)
        The input dataset for training
    y_train : numpy array of shape (num_samples, num_classes)
        The output classes for the train data, in binary format
    X_val : numpy array of shape (num_samples_val, num_timesteps, num_channels)
        The input dataset for validation
    y_val : numpy array of shape (num_samples_val, num_classes)
        The output classes for the validation data, in binary format
    models : list of (model, params, modeltype) tuples
        Compiled keras models to train, each with its hyperparameters and
        model type
    nr_epochs : int, optional
        Number of epochs to train each model
    subset_size : int, optional
        The number of samples used from the complete train set
    verbose : bool, optional
        Flag for displaying verbose output
    outputfile : str, optional
        File location to store the model results; nothing is stored if None
    early_stopping : bool, optional
        If True, an EarlyStopping callback monitoring 'val_loss' with
        patience 0 is passed to each fit call

    Returns
    ----------
    histories : list of Keras History objects
        Train histories for all models
    val_accuracies : list of floats
        Validation accuracy after the last epoch, one entry per model
    val_losses : list of floats
        Validation loss after the last epoch, one entry per model
    """
    # Slicing beyond the end of an array simply returns everything there is,
    # so an oversized subset_size needs no special handling.
    X_subset = X_train[:subset_size, :, :]
    y_subset = y_train[:subset_size, :]
    validation = (X_val, y_val)

    histories, val_accuracies, val_losses = [], [], []
    for index, (model, params, model_types) in enumerate(models):
        if verbose:
            print('Training model %d' % index, model_types)

        callbacks = []
        if early_stopping:
            callbacks.append(EarlyStopping(monitor='val_loss', patience=0,
                                           verbose=verbose, mode='auto'))

        history = model.fit(X_subset, y_subset,
                            nb_epoch=nr_epochs, batch_size=20,
                            validation_data=validation,
                            verbose=verbose,
                            callbacks=callbacks)
        record = history.history

        histories.append(history)
        # Only the value reached after the final epoch is reported.
        val_accuracies.append(record['val_acc'][-1])
        val_losses.append(record['val_loss'][-1])

        if outputfile is not None:
            store_train_hist_as_json(params, model_types, record, outputfile)
    return histories, val_accuracies, val_losses


def _to_json_compatible(value):
    """Recursively replace numpy arrays and scalars by plain Python objects."""
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, np.generic):
        # numpy scalars such as np.float32 / np.int64 are rejected by json
        return value.item()
    if isinstance(value, dict):
        return {key: _to_json_compatible(val) for key, val in value.items()}
    if isinstance(value, (list, tuple)):
        return [_to_json_compatible(val) for val in value]
    return value


def store_train_hist_as_json(params, model_type, history, outputfile):
    """
    This function stores the model parameters, the loss and accuracy history
    of one model in a JSON file. It appends the model information to the
    existing models in the file.

    Numpy arrays and numpy scalars found in ``params`` or ``history`` are
    converted to plain Python lists and numbers first, because the json
    module cannot serialize them. ``params`` itself is not modified.

    Parameters
    ----------
    params : dict
        parameters for one model
    model_type : str
        type of the model; stored under the key 'modeltype'
    history : dict
        training history from one model; must contain the keys
        'acc', 'loss', 'val_acc' and 'val_loss'
    outputfile : str
        path where the json file needs to be stored
    """
    jsondata = {key: _to_json_compatible(val) for key, val in params.items()}
    jsondata['train_acc'] = _to_json_compatible(history['acc'])
    jsondata['train_loss'] = _to_json_compatible(history['loss'])
    jsondata['val_acc'] = _to_json_compatible(history['val_acc'])
    jsondata['val_loss'] = _to_json_compatible(history['val_loss'])
    jsondata['modeltype'] = model_type

    # The file holds a JSON list with one entry per stored model; start a
    # new list when the file does not exist yet.
    if os.path.isfile(outputfile):
        with open(outputfile, 'r') as infile:
            previousdata = json.load(infile)
    else:
        previousdata = []
    previousdata.append(jsondata)
    with open(outputfile, 'w') as outfile:
        json.dump(previousdata, outfile, sort_keys=True,
                  indent=4, ensure_ascii=False)


def plotTrainingProcess(history, name='Model', ax=None):
    """
    This function plots the loss and accuracy on the train and validation set,
    for each epoch in the history of one model.

    Loss curves are drawn in green on ``ax``; accuracy curves are drawn in
    blue on a twin y-axis that shares the x-axis with ``ax``.

    Parameters
    ----------
    history : keras History object
        The history object of the training process corresponding to one
        model; its ``history`` dict must contain the keys 'loss', 'acc',
        'val_loss' and 'val_acc'
    name : str
        Name of the model, to display in the title
    ax : Axis, optional
        Specific axis to plot on; a new figure is created if None

    """
    if ax is None:
        _, ax = plt.subplots()
    ax2 = ax.twinx()

    logs = history.history
    epochs = range(len(logs['val_loss']))
    val_loss, = ax.plot(epochs, logs['val_loss'], 'g--',
                        label='validation loss')
    train_loss, = ax.plot(epochs, logs['loss'], 'g-',
                          label='train loss')
    val_acc, = ax2.plot(epochs, logs['val_acc'], 'b--',
                        label='validation accuracy')
    train_acc, = ax2.plot(epochs, logs['acc'], 'b-',
                          label='train accuracy')
    ax.set_xlabel('epoch')
    ax.set_ylabel('loss', color='g')
    ax2.set_ylabel('accuracy', color='b')
    # Attach legend and title to the axes used for plotting instead of going
    # through pyplot's implicit "current axes", which can belong to another
    # subplot or figure when the caller supplies ``ax``.
    ax2.legend(handles=[val_loss, train_loss, val_acc, train_acc],
               loc=2, bbox_to_anchor=(1.1, 1))
    ax2.set_title(name)


def find_best_architecture(X_train, y_train, X_val, y_val, verbose=True,
                           number_of_models=5, nr_epochs=5, subset_size=100,
                           outputpath=None, **kwargs
                           ):
    """
    Tries out a number of models on a subsample of the data,
    and outputs the best found architecture and hyperparameters.

    The best model is the one with the highest validation accuracy after
    training. A 1-nearest-neighbor classifier fitted on the same subsample
    serves as a baseline; a warning is issued when the best model scores
    lower than that baseline.

    Parameters
    ----------
    X_train : numpy array
        The input dataset for training of shape
        (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        The output classes for the train data, in binary format of shape
        (num_samples, num_classes)
    X_val : numpy array
        The input dataset for validation of shape
        (num_samples_val, num_timesteps, num_channels)
    y_val : numpy array
        The output classes for the validation data, in binary format of shape
        (num_samples_val, num_classes)
    verbose : bool, optional
        flag for displaying verbose output; when set, the training process
        of every model is plotted and a summary is printed
    number_of_models : int, optional
        The number of models to generate and test
    nr_epochs : int, optional
        The number of epochs that each model is trained
    subset_size : int, optional
        The size of the subset of the data that is used for finding
        the optimal architecture
    outputpath : str, optional
        File location to store the model results
    **kwargs: key-value parameters
        parameters for generating the models
        (see docstring for modelgen.generate_models)

    Returns
    ----------
    best_model : Keras model
        Best performing model, already trained on a small sample data set.
    best_params : dict
        Dictionary containing the hyperparameters for the best model
    best_model_type : str
        Type of the best model
    knn_acc : float
        accuracy for kNN prediction on validation set
    """
    num_classes = y_train.shape[1]
    models = modelgen.generate_models(X_train.shape, num_classes,
                                      number_of_models=number_of_models,
                                      **kwargs)
    histories, val_accuracies, val_losses = train_models_on_samples(
        X_train, y_train, X_val, y_val, models, nr_epochs,
        subset_size=subset_size,
        verbose=verbose,
        outputfile=outputpath)

    best_model_index = np.argmax(val_accuracies)
    best_model, best_params, best_model_type = models[best_model_index]
    best_val_acc = val_accuracies[best_model_index]

    # Baseline: kNN on exactly the subsample the networks were trained on.
    knn_acc = kNN_accuracy(
        X_train[:subset_size, :, :], y_train[:subset_size, :], X_val, y_val)

    if verbose:
        # One plot per model, titled with that model's hyperparameters;
        # ultimately we may want all models in one plot to allow for direct
        # comparison.
        for model_entry, history in zip(models, histories):
            plotTrainingProcess(history, str(model_entry[1]))
        print('Best model: model ', best_model_index)
        print('Model type: ', best_model_type)
        print('Hyperparameters: ', best_params)
        print('Accuracy on validation set: ', best_val_acc)
        print('Accuracy of kNN on validation set', knn_acc)

    if best_val_acc < knn_acc:
        message = ('Best model not better than kNN: ' +
                   str(best_val_acc) + ' vs  ' + str(knn_acc))
        warnings.warn(message)
    return best_model, best_params, best_model_type, knn_acc


def kNN_accuracy(X_train, y_train, X_val, y_val, k=1):
    """
    Performs k-Nearest Neighbors and returns the accuracy score.

    Every sample is flattened from (num_timesteps, num_channels) into a
    single feature vector before the classifier is fitted or applied.

    Parameters
    ----------
    X_train : numpy array
        Train set of shape (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        Class labels for train set
    X_val : numpy array
        Validation set of shape (num_samples, num_timesteps, num_channels)
    y_val : numpy array
        Class labels for validation set
    k : int
        number of neighbors to use for classifying

    Returns
    -------
    accuracy: float
        accuracy score on the validation set
    """
    n_train, train_steps, train_channels = X_train.shape
    flat_train = X_train.reshape(n_train, train_steps * train_channels)

    n_val, val_steps, val_channels = X_val.shape
    flat_val = X_val.reshape(n_val, val_steps * val_channels)

    classifier = neighbors.KNeighborsClassifier(k)
    classifier.fit(flat_train, y_train)
    predicted = classifier.predict(flat_val)
    # accuracy is symmetric in its two arguments; true labels go first to
    # follow the (y_true, y_pred) convention.
    return metrics.accuracy_score(y_val, predicted)


1			"""
2			Summary:
3			This module provides the main functionality of mcfly: searching for an
4			optimal model architecture. The work flow is as follows:
5			Function generate_models from modelgen.py generates and compiles models.
6			Function train_models_on_samples trains those models.
7			Function plotTrainingProcess plots the training process.
8			Function find_best_architecture is wrapper function that combines
9			these steps.
10			Example function calls can be found in the tutorial notebook
11			'EvaluateDifferentModels.ipynb'.
12			"""
13			import numpy as np
14			from matplotlib import pyplot as plt
15			from . import modelgen
16			from sklearn import neighbors, metrics
17			import warnings
18			import json
19			import os
20			from keras.callbacks import EarlyStopping
21
22			def train_models_on_samples(X_train, y_train, X_val, y_val, models,
23			nr_epochs=5, subset_size=100, verbose=True,
24			outputfile=None, early_stopping=False):
25			"""
26			Given a list of compiled models, this function trains
27			them all on a subset of the train data. If the given size of the subset is
28			smaller then the size of the data, the complete data set is used.
29
30			Parameters
31			----------
32			X_train : numpy array of shape (num_samples, num_timesteps, num_channels)
33			The input dataset for training
34			y_train : numpy array of shape (num_samples, num_classes)
35			The output classes for the train data, in binary format
36			X_val : numpy array of shape (num_samples_val, num_timesteps, num_channels)
37			The input dataset for validation
38			y_val : numpy array of shape (num_samples_val, num_classes)
39			The output classes for the validation data, in binary format
40			models : list of model, params, modeltypes
41			List of keras models to train
42			nr_epochs : int, optional
43			nr of epochs to use for training one model
44			subset_size :
45			The number of samples used from the complete train set
46			verbose : bool, optional
47			flag for displaying verbose output
48			outputfile : str, optional
49			File location to store the model results
50			early_stopping: bool
51			Stop when validation loss does not decrease
52
53			Returns
54			----------
55			histories : list of Keras History objects
56			train histories for all models
57			val_accuracies : list of floats
58			validation accuraracies of the models
59			val_losses : list of floats
60			validation losses of the models
61			"""
62			# if subset_size is smaller then X_train, this will work fine
63			X_train_sub = X_train[:subset_size, :, :]
64			y_train_sub = y_train[:subset_size, :]
65
66			histories = []
67			val_accuracies = []
68			val_losses = []
69			for i, (model, params, model_types) in enumerate(models):
70			if verbose:
71			print('Training model %d' % i, model_types)
72			if early_stopping:
73			callbacks = [EarlyStopping(monitor='val_loss', patience=0, verbose=verbose, mode='auto')]
			0 ignored issues – show Coding Style introduced 2016-10-28 15:13 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (101/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
74			else:
75			callbacks = []
76			history = model.fit(X_train_sub, y_train_sub,
77			nb_epoch=nr_epochs, batch_size=20,
78			# see comment on subsize_set
79			validation_data=(X_val, y_val),
80			verbose=verbose,
81			callbacks=callbacks)
82			histories.append(history)
83			val_accuracies.append(history.history['val_acc'][-1])
84			val_losses.append(history.history['val_loss'][-1])
85			if outputfile is not None:
86			store_train_hist_as_json(params, model_types,
87			history.history, outputfile)
88			return histories, val_accuracies, val_losses
89
90
91			def store_train_hist_as_json(params, model_type, history, outputfile):
92			"""
93			This function stores the model parameters, the loss and accuracy history
94			of one model in a JSON file. It appends the model information to the
95			existing models in the file.
96
97			Parameters
98			----------
99			params : dict
100			parameters for one model
101			model_type : Keras model object
102			Keras model object for one model
103			history : dict
104			training history from one model
105			outputfile : str
106			path where the json file needs to be stored
107			"""
108			jsondata = params.copy()
109			for k in jsondata.keys():
110			if isinstance(jsondata[k], np.ndarray):
111			jsondata[k] = jsondata[k].tolist()
112			jsondata['train_acc'] = history['acc']
113			jsondata['train_loss'] = history['loss']
114			jsondata['val_acc'] = history['val_acc']
115			jsondata['val_loss'] = history['val_loss']
116			jsondata['modeltype'] = model_type
117			jsondata['modeltype'] = model_type
118			if os.path.isfile(outputfile):
119			with open(outputfile, 'r') as outfile:
120			previousdata = json.load(outfile)
121			else:
122			previousdata = []
123			previousdata.append(jsondata)
124			with open(outputfile, 'w') as outfile:
125			json.dump(previousdata, outfile, sort_keys=True,
126			indent=4, ensure_ascii=False)
127
128
129			def plotTrainingProcess(history, name='Model', ax=None):
130			"""
131			This function plots the loss and accuracy on the train and validation set,
132			for each epoch in the history of one model.
133
134			Parameters
135			----------
136			history : keras History object
137			The history object of the training process corresponding to one model
138			name : str
139			Name of the model, to display in the title
140			ax : Axis, optional
141			Specific axis to plot on
142
143			"""
144			if ax is None:
145			fig, ax = plt.subplots()
146			ax2 = ax.twinx()
147			LN = len(history.history['val_loss'])
148			val_loss, = ax.plot(range(LN), history.history['val_loss'], 'g--',
149			label='validation loss')
150			train_loss, = ax.plot(range(LN), history.history['loss'], 'g-',
151			label='train loss')
152			val_acc, = ax2.plot(range(LN), history.history['val_acc'], 'b--',
153			label='validation accuracy')
154			train_acc, = ax2.plot(range(LN), history.history['acc'], 'b-',
155			label='train accuracy')
156			ax.set_xlabel('epoch')
157			ax.set_ylabel('loss', color='g')
158			ax2.set_ylabel('accuracy', color='b')
159			plt.legend(handles=[val_loss, train_loss, val_acc, train_acc],
160			loc=2, bbox_to_anchor=(1.1, 1))
161			plt.title(name)
162
163
164			def find_best_architecture(X_train, y_train, X_val, y_val, verbose=True,
165			number_of_models=5, nr_epochs=5, subset_size=100,
166			outputpath=None, **kwargs
167			):
168			"""
169			Tries out a number of models on a subsample of the data,
170			and outputs the best found architecture and hyperparameters.
171
172			Parameters
173			----------
174			X_train : numpy array
175			The input dataset for training of shape
176			(num_samples, num_timesteps, num_channels)
177			y_train : numpy array
178			The output classes for the train data, in binary format of shape
179			(num_samples, num_classes)
180			X_val : numpy array
181			The input dataset for validation of shape
182			(num_samples_val, num_timesteps, num_channels)
183			y_val : numpy array
184			The output classes for the validation data, in binary format of shape
185			(num_samples_val, num_classes)
186			verbose : bool, optional
187			flag for displaying verbose output
188			number_of_models : int, optiona
189			The number of models to generate and test
190			nr_epochs : int, optional
191			The number of epochs that each model is trained
192			subset_size : int, optional
193			The size of the subset of the data that is used for finding
194			the optimal architecture
195			outputpath : str, optional
196			File location to store the model results
197			**kwargs: key-value parameters
198			parameters for generating the models
199			(see docstring for modelgen.generate_models)
200
201			Returns
202			----------
203			best_model : Keras model
204			Best performing model, already trained on a small sample data set.
205			best_params : dict
206			Dictionary containing the hyperparameters for the best model
207			best_model_type : str
208			Type of the best model
209			knn_acc : float
210			accuaracy for kNN prediction on validation set
211			"""
212			models = modelgen.generate_models(X_train.shape, y_train.shape[1],
213			number_of_models=number_of_models,
214			**kwargs)
215			histories, val_accuracies, val_losses = train_models_on_samples(X_train,
216			y_train,
217			X_val,
218			y_val,
219			models,
220			nr_epochs,
221			subset_size=subset_size,
			0 ignored issues – show Coding Style introduced 2016-08-04 09:16 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (92/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
222			verbose=verbose,
			0 ignored issues – show Coding Style introduced 2016-07-07 14:47 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (84/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
223			outputfile=outputpath)
			0 ignored issues – show Coding Style introduced 2016-08-09 14:55 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (90/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
224			best_model_index = np.argmax(val_accuracies)
225			best_model, best_params, best_model_type = models[best_model_index]
226			knn_acc = kNN_accuracy(
227			X_train[:subset_size, :, :], y_train[:subset_size, :], X_val, y_val)
228			if verbose:
229			for i in range(len(models)): # now one plot per model, ultimately we
230			# may want all models in one plot to allow for direct comparison
231			name = str(models[i][1])
232			plotTrainingProcess(histories[i], name)
233			print('Best model: model ', best_model_index)
234			print('Model type: ', best_model_type)
235			print('Hyperparameters: ', best_params)
236			print('Accuracy on validation set: ', val_accuracies[best_model_index])
237			print('Accuracy of kNN on validation set', knn_acc)
238
239			if val_accuracies[best_model_index] < knn_acc:
240			warnings.warn('Best model not better than kNN: ' +
241			str(val_accuracies[best_model_index]) + ' vs ' +
242			str(knn_acc)
243			)
244			return best_model, best_params, best_model_type, knn_acc
245
246
247			def kNN_accuracy(X_train, y_train, X_val, y_val, k=1):
248			"""
249			Performs k-Neigherst Neighbors and returns the accuracy score.
250
251			Parameters
252			----------
253			X_train : numpy array
254			Train set of shape (num_samples, num_timesteps, num_channels)
255			y_train : numpy array
256			Class labels for train set
257			X_val : numpy array
258			Validation set of shape (num_samples, num_timesteps, num_channels)
259			y_val : numpy array
260			Class labels for validation set
261			k : int
262			number of neighbors to use for classifying
263
264			Returns
265			-------
266			accuracy: float
267			accuracy score on the validation set
268			"""
269			num_samples, num_timesteps, num_channels = X_train.shape
270			clf = neighbors.KNeighborsClassifier(k)
271			clf.fit(
272			X_train.reshape(
273			num_samples,
274			num_timesteps *
275			num_channels),
276			y_train)
277			num_samples, num_timesteps, num_channels = X_val.shape
278			val_predict = clf.predict(
279			X_val.reshape(num_samples,
280			num_timesteps * num_channels))
281			return metrics.accuracy_score(val_predict, y_val)
282

NLeSC / mcfly

Push — master ( d10d5f...edd44f )

kNN_accuracy() B

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like