Completed
Push — master ( 95c5a7...9dd26d )
by Dafne van
created 02:23

find_best_architecture()   Rating: B

Complexity: Conditions 3
Size: Total Lines 77
Duplication: Lines 0, Ratio 0 %
Code Coverage: Tests 9, CRAP Score 3.576
Importance: Changes 1, Bugs 0, Features 1
Metric   Value
cc       3
c        1
b        0
f        1
dl       0
loc      77
ccs      9
cts      15
cp       0.6
crap     3.576
rs       8.9342

How to fix: Long Method

Small methods make your code easier to understand, particularly when combined with a good name. And when a method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a sign that the commented part should be extracted into a new method; the comment then makes a good starting point for naming it.

Commonly applied refactorings include extracting parts of the method into smaller, well-named methods; a sketch of such an Extract Method refactoring follows.
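As a sketch of Extract Method (not taken from the reviewed module; the function and variable names below are invented for illustration, and `histories` here is simply a list of dicts mapping metric names to per-epoch values), a commented block inside a long method becomes a small, well-named helper:

# Before: the comment marks a block that wants to be its own method.
def report_results(histories):
    # compute the average validation accuracy over all training histories
    total = 0.0
    for history in histories:
        total += history['val_acc'][-1]
    average = total / len(histories)
    print('Average validation accuracy:', average)


# After: the commented block is extracted; the comment became the name.
def average_validation_accuracy(histories):
    return sum(h['val_acc'][-1] for h in histories) / len(histories)


def report_results(histories):
    print('Average validation accuracy:',
          average_validation_accuracy(histories))

The method name now carries the information the comment used to carry, and the calling method reads at a single level of abstraction.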

"""
 Summary:
 This module provides the main functionality of mcfly: searching for an
 optimal model architecture. The work flow is as follows:
 Function generate_models from modelgen.py generates and compiles models.
 Function train_models_on_samples trains those models.
 Function plotTrainingProcess plots the training process.
 Function find_best_architecture is a wrapper function that combines
 these steps.
 Example function calls can be found in the tutorial notebook
 'EvaluateDifferentModels.ipynb'.
"""
import numpy as np
from . import modelgen
from sklearn import neighbors, metrics
import warnings
import json
import os
from keras.callbacks import EarlyStopping

def train_models_on_samples(X_train, y_train, X_val, y_val, models,
                            nr_epochs=5, subset_size=100, verbose=True,
                            outputfile=None, early_stopping=False):
    """
    Given a list of compiled models, this function trains
    them all on a subset of the train data. If the given size of the subset is
    larger than the size of the data, the complete data set is used.

    Parameters
    ----------
    X_train : numpy array of shape (num_samples, num_timesteps, num_channels)
        The input dataset for training
    y_train : numpy array of shape (num_samples, num_classes)
        The output classes for the train data, in binary format
    X_val : numpy array of shape (num_samples_val, num_timesteps, num_channels)
        The input dataset for validation
    y_val : numpy array of shape (num_samples_val, num_classes)
        The output classes for the validation data, in binary format
    models : list of (model, params, model_type) tuples
        List of Keras models to train
    nr_epochs : int, optional
        number of epochs to use for training one model
    subset_size : int, optional
        The number of samples used from the complete train set
    verbose : bool, optional
        flag for displaying verbose output
    outputfile : str, optional
        File location to store the model results
    early_stopping : bool, optional
        Stop when the validation loss does not decrease

    Returns
    ----------
    histories : list of Keras History objects
        train histories for all models
    val_accuracies : list of floats
        validation accuracies of the models
    val_losses : list of floats
        validation losses of the models
    """
    # if subset_size is larger than X_train, this slice still works fine:
    # the complete training set is used
    X_train_sub = X_train[:subset_size, :, :]
    y_train_sub = y_train[:subset_size, :]

    histories = []
    val_accuracies = []
    val_losses = []
    for i, (model, params, model_types) in enumerate(models):
        if verbose:
            print('Training model %d' % i, model_types)
        if early_stopping:
            callbacks = [EarlyStopping(monitor='val_loss', patience=0,
                                       verbose=verbose, mode='auto')]
        else:
            callbacks = []
        history = model.fit(X_train_sub, y_train_sub,
                            epochs=nr_epochs, batch_size=20,
                            # see comment on subset_size above
                            validation_data=(X_val, y_val),
                            verbose=verbose,
                            callbacks=callbacks)
        histories.append(history)
        val_accuracies.append(history.history['val_acc'][-1])
        val_losses.append(history.history['val_loss'][-1])
        if outputfile is not None:
            store_train_hist_as_json(params, model_types,
                                     history.history, outputfile)
    return histories, val_accuracies, val_losses

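# Usage sketch (illustrative only, not part of the reviewed module): generate
# a couple of models and train them on a small subset, storing the results.
# The wrapper name, data arrays and output file name are hypothetical.
def _example_train_models_on_samples(X_train, y_train, X_val, y_val):
    models = modelgen.generate_models(X_train.shape, y_train.shape[1],
                                      number_of_models=2)
    histories, val_accuracies, val_losses = train_models_on_samples(
        X_train, y_train, X_val, y_val, models,
        nr_epochs=3, subset_size=200, early_stopping=True,
        outputfile='model_results.json')
    return histories, val_accuracies, val_losses
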
def store_train_hist_as_json(params, model_type, history, outputfile):
    """
    This function stores the model parameters, the loss and accuracy history
    of one model in a JSON file. It appends the model information to the
    existing models in the file.

    Parameters
    ----------
    params : dict
        parameters for one model
    model_type : str
        type of the model
    history : dict
        training history from one model
    outputfile : str
        path where the json file needs to be stored
    """
    jsondata = params.copy()
    for k in jsondata.keys():
        if isinstance(jsondata[k], np.ndarray):
            jsondata[k] = jsondata[k].tolist()
    jsondata['train_acc'] = history['acc']
    jsondata['train_loss'] = history['loss']
    jsondata['val_acc'] = history['val_acc']
    jsondata['val_loss'] = history['val_loss']
    jsondata['modeltype'] = model_type
    if os.path.isfile(outputfile):
        with open(outputfile, 'r') as outfile:
            previousdata = json.load(outfile)
    else:
        previousdata = []
    previousdata.append(jsondata)
    with open(outputfile, 'w') as outfile:
        json.dump(previousdata, outfile, sort_keys=True,
                  indent=4, ensure_ascii=False)

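# Usage sketch (illustrative only, not part of the reviewed module): because
# store_train_hist_as_json appends to a JSON list, earlier results can be
# read back for comparison. The file name and helper name are hypothetical.
def _example_read_stored_histories(outputfile='model_results.json'):
    with open(outputfile, 'r') as infile:
        stored = json.load(infile)
    for entry in stored:
        print(entry['modeltype'], 'final val_acc:', entry['val_acc'][-1])
    return stored
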
def find_best_architecture(X_train, y_train, X_val, y_val, verbose=True,
                           number_of_models=5, nr_epochs=5, subset_size=100,
                           outputpath=None, **kwargs):
    """
    Tries out a number of models on a subsample of the data,
    and outputs the best found architecture and hyperparameters.

    Parameters
    ----------
    X_train : numpy array
        The input dataset for training of shape
        (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        The output classes for the train data, in binary format of shape
        (num_samples, num_classes)
    X_val : numpy array
        The input dataset for validation of shape
        (num_samples_val, num_timesteps, num_channels)
    y_val : numpy array
        The output classes for the validation data, in binary format of shape
        (num_samples_val, num_classes)
    verbose : bool, optional
        flag for displaying verbose output
    number_of_models : int, optional
        The number of models to generate and test
    nr_epochs : int, optional
        The number of epochs for which each model is trained
    subset_size : int, optional
        The size of the subset of the data that is used for finding
        the optimal architecture
    outputpath : str, optional
        File location to store the model results
    **kwargs: key-value parameters
        parameters for generating the models
        (see docstring for modelgen.generate_models)

    Returns
    ----------
    best_model : Keras model
        Best performing model, already trained on a small sample data set.
    best_params : dict
        Dictionary containing the hyperparameters for the best model
    best_model_type : str
        Type of the best model
    knn_acc : float
        accuracy of kNN prediction on the validation set
    """
    models = modelgen.generate_models(X_train.shape, y_train.shape[1],
                                      number_of_models=number_of_models,
                                      **kwargs)
    histories, val_accuracies, val_losses = train_models_on_samples(
        X_train, y_train, X_val, y_val, models, nr_epochs,
        subset_size=subset_size, verbose=verbose, outputfile=outputpath)
    best_model_index = np.argmax(val_accuracies)
    best_model, best_params, best_model_type = models[best_model_index]
    knn_acc = kNN_accuracy(
        X_train[:subset_size, :, :], y_train[:subset_size, :], X_val, y_val)
    if verbose:
        print('Best model: model ', best_model_index)
        print('Model type: ', best_model_type)
        print('Hyperparameters: ', best_params)
        print('Accuracy on validation set: ', val_accuracies[best_model_index])
        print('Accuracy of kNN on validation set', knn_acc)

    if val_accuracies[best_model_index] < knn_acc:
        warnings.warn('Best model not better than kNN: ' +
                      str(val_accuracies[best_model_index]) + ' vs ' +
                      str(knn_acc))
    return best_model, best_params, best_model_type, knn_acc

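# Usage sketch (illustrative only, not part of the reviewed module): the
# typical workflow described in the module docstring. Data arrays and the
# output path are hypothetical.
def _example_find_best_architecture(X_train, y_train, X_val, y_val):
    result = find_best_architecture(X_train, y_train, X_val, y_val,
                                    number_of_models=5, nr_epochs=5,
                                    subset_size=100,
                                    outputpath='architecture_search.json')
    best_model, best_params, best_model_type, knn_acc = result
    # The returned model has only been trained on a small subset; it is
    # usually trained further on the full training set afterwards.
    return best_model, best_params, best_model_type
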
def kNN_accuracy(X_train, y_train, X_val, y_val, k=1):
    """
    Performs k-Nearest Neighbors and returns the accuracy score.

    Parameters
    ----------
    X_train : numpy array
        Train set of shape (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        Class labels for train set
    X_val : numpy array
        Validation set of shape (num_samples, num_timesteps, num_channels)
    y_val : numpy array
        Class labels for validation set
    k : int
        number of neighbors to use for classifying

    Returns
    -------
    accuracy: float
        accuracy score on the validation set
    """
    num_samples, num_timesteps, num_channels = X_train.shape
    clf = neighbors.KNeighborsClassifier(k)
    clf.fit(
        X_train.reshape(num_samples, num_timesteps * num_channels),
        y_train)
    num_samples, num_timesteps, num_channels = X_val.shape
    val_predict = clf.predict(
        X_val.reshape(num_samples, num_timesteps * num_channels))
    return metrics.accuracy_score(val_predict, y_val)
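Since kNN_accuracy flattens the (num_samples, num_timesteps, num_channels) arrays before fitting the classifier, it can also serve as a quick stand-alone baseline. A minimal sketch with synthetic random data; the shapes are invented for illustration, and it assumes kNN_accuracy has been imported from this module:

import numpy as np

# Synthetic data: 40 train / 10 validation samples, 30 timesteps, 3 channels,
# and 2 classes in binary (one-hot) format, matching the documented shapes.
rng = np.random.RandomState(0)
X_train = rng.rand(40, 30, 3)
y_train = np.eye(2)[rng.randint(0, 2, 40)]
X_val = rng.rand(10, 30, 3)
y_val = np.eye(2)[rng.randint(0, 2, 10)]

# With k=1 this reports how well a nearest-neighbour baseline does, which is
# the comparison find_best_architecture makes before warning about weak models.
print(kNN_accuracy(X_train, y_train, X_val, y_val, k=1))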