Completed
Push to master (f4f7b5...bb3685) by Christiaan, created 06:26

store_train_hist_as_json()   B

Complexity
    Conditions: 6

Size
    Total Lines: 36

Duplication
    Lines: 0
    Ratio: 0 %

Code Coverage
    Tests: 16
    CRAP Score: 6.0493

Importance
    Changes: 0

Metric   Value
cc       6
dl       0
loc      36
ccs      16
cts      18
cp       0.8889
crap     6.0493
rs       7.5384
c        0
b        0
f        0
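For reference, the crap value above is consistent with the standard CRAP formula, CRAP = cc^2 * (1 - cp)^3 + cc, where cc is the cyclomatic complexity and cp the coverage ratio (ccs / cts = 16 / 18). A quick check in Python, assuming this is the formula the tool applies:

    cc = 6              # cyclomatic complexity ("Conditions" above)
    cp = 16.0 / 18.0    # covered / total statements (ccs / cts)
    crap = cc ** 2 * (1 - cp) ** 3 + cc
    print(round(crap, 4))  # ~6.0494, i.e. the reported 6.0493 up to rounding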
"""
 Summary:
 This module provides the main functionality of mcfly: searching for an
 optimal model architecture. The workflow is as follows:
 Function generate_models from modelgen.py generates and compiles models.
 Function train_models_on_samples trains those models.
 Function plotTrainingProcess plots the training process.
 Function find_best_architecture is a wrapper function that combines
 these steps.
 Example function calls can be found in the tutorial notebook
 'EvaluateDifferentModels.ipynb'.
"""
import numpy as np
from matplotlib import pyplot as plt
from . import modelgen
from sklearn import neighbors, metrics
import warnings
import json
import os


def train_models_on_samples(X_train, y_train, X_val, y_val, models,
                            nr_epochs=5, subset_size=100, verbose=True,
                            outputfile=None):
    """
    Given a list of compiled models, this function trains
    them all on a subset of the train data. If the given subset size is
    larger than the size of the data, the complete data set is used.

    Parameters
    ----------
    X_train : numpy array of shape (num_samples, num_timesteps, num_channels)
        The input dataset for training
    y_train : numpy array of shape (num_samples, num_classes)
        The output classes for the train data, in binary format
    X_val : numpy array of shape (num_samples_val, num_timesteps, num_channels)
        The input dataset for validation
    y_val : numpy array of shape (num_samples_val, num_classes)
        The output classes for the validation data, in binary format
    models : list of (model, params, modeltype) tuples
        List of keras models to train
    nr_epochs : int, optional
        Number of epochs to use for training one model
    subset_size : int, optional
        The number of samples used from the complete train set
    verbose : bool, optional
        Flag for displaying verbose output
    outputfile : str, optional
        File location to store the model results

    Returns
    ----------
    histories : list of Keras History objects
        train histories for all models
    val_accuracies : list of floats
        validation accuracies of the models
    val_losses : list of floats
        validation losses of the models
    """
    # Slicing copes with subset_size larger than X_train: the full set is used
    X_train_sub = X_train[:subset_size, :, :]
    y_train_sub = y_train[:subset_size, :]

    histories = []
    val_accuracies = []
    val_losses = []
    for i, (model, params, model_types) in enumerate(models):
        if verbose:
            print('Training model %d' % i, model_types)
        history = model.fit(X_train_sub, y_train_sub,
                            nb_epoch=nr_epochs, batch_size=20,
                            # see comment on subset_size above
                            validation_data=(X_val, y_val),
                            verbose=verbose)
        histories.append(history)
        val_accuracies.append(history.history['val_acc'][-1])
        val_losses.append(history.history['val_loss'][-1])
        if outputfile is not None:
            store_train_hist_as_json(params, model_types,
                                     history.history, outputfile)
    return histories, val_accuracies, val_losses


def store_train_hist_as_json(params, model_type, history, outputfile):
    """
    This function stores the model parameters, the loss and accuracy history
    of one model in a JSON file. It appends the model information to the
    existing models in the file.

    Parameters
    ----------
    params : dict
        parameters for one model
    model_type : str
        Type of the model
    history : dict
        training history from one model
    outputfile : str
        path where the json file needs to be stored
    """
    jsondata = params.copy()
    for k in jsondata.keys():
        if isinstance(jsondata[k], np.ndarray):
            jsondata[k] = jsondata[k].tolist()
    jsondata['train_acc'] = history['acc']
    jsondata['train_loss'] = history['loss']
    jsondata['val_acc'] = history['val_acc']
    jsondata['val_loss'] = history['val_loss']
    jsondata['modeltype'] = model_type
    if os.path.isfile(outputfile):
        with open(outputfile, 'r') as outfile:
            previousdata = json.load(outfile)
    else:
        previousdata = []
    previousdata.append(jsondata)
    with open(outputfile, 'w') as outfile:
        json.dump(previousdata, outfile, sort_keys=True,
                  indent=4, ensure_ascii=False)
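# Hedged illustration of the file written above: 'outputfile' ends up holding
# a JSON list with one dict per stored model, roughly of the form
# (hypothetical parameter names and values):
#     [{"modeltype": "CNN", "learning_rate": 0.01,
#       "train_acc": [...], "train_loss": [...],
#       "val_acc": [...], "val_loss": [...]}]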


def plotTrainingProcess(history, name='Model', ax=None):
    """
    This function plots the loss and accuracy on the train and validation set,
    for each epoch in the history of one model.

    Parameters
    ----------
    history : keras History object
        The history object of the training process corresponding to one model
    name : str
        Name of the model, to display in the title
    ax : Axis, optional
        Specific axis to plot on

    """
    if ax is None:
        fig, ax = plt.subplots()
    ax2 = ax.twinx()
    LN = len(history.history['val_loss'])
    val_loss, = ax.plot(range(LN), history.history['val_loss'], 'g--',
                        label='validation loss')
    train_loss, = ax.plot(range(LN), history.history['loss'], 'g-',
                          label='train loss')
    val_acc, = ax2.plot(range(LN), history.history['val_acc'], 'b--',
                        label='validation accuracy')
    train_acc, = ax2.plot(range(LN), history.history['acc'], 'b-',
                          label='train accuracy')
    ax.set_xlabel('epoch')
    ax.set_ylabel('loss', color='g')
    ax2.set_ylabel('accuracy', color='b')
    plt.legend(handles=[val_loss, train_loss, val_acc, train_acc],
               loc=2, bbox_to_anchor=(1.1, 1))
    plt.title(name)


def find_best_architecture(X_train, y_train, X_val, y_val, verbose=True,
                           number_of_models=5, nr_epochs=5, subset_size=100,
                           outputpath=None, **kwargs):
    """
    Tries out a number of models on a subsample of the data,
    and outputs the best found architecture and hyperparameters.

    Parameters
    ----------
    X_train : numpy array
        The input dataset for training of shape
        (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        The output classes for the train data, in binary format of shape
        (num_samples, num_classes)
    X_val : numpy array
        The input dataset for validation of shape
        (num_samples_val, num_timesteps, num_channels)
    y_val : numpy array
        The output classes for the validation data, in binary format of shape
        (num_samples_val, num_classes)
    verbose : bool, optional
        flag for displaying verbose output
    number_of_models : int, optional
        The number of models to generate and test
    nr_epochs : int, optional
        The number of epochs that each model is trained
    subset_size : int, optional
        The size of the subset of the data that is used for finding
        the optimal architecture
    outputpath : str, optional
        File location to store the model results
    **kwargs: key-value parameters
        parameters for generating the models
        (see docstring for modelgen.generate_models)

    Returns
    ----------
    best_model : Keras model
        Best performing model, already trained on a small sample data set.
    best_params : dict
        Dictionary containing the hyperparameters for the best model
    best_model_type : str
        Type of the best model
    knn_acc : float
        accuracy of kNN prediction on validation set
    """
    models = modelgen.generate_models(X_train.shape, y_train.shape[1],
                                      number_of_models=number_of_models,
                                      **kwargs)
    histories, val_accuracies, val_losses = train_models_on_samples(
        X_train, y_train, X_val, y_val, models, nr_epochs,
        subset_size=subset_size, verbose=verbose, outputfile=outputpath)
    best_model_index = np.argmax(val_accuracies)
    best_model, best_params, best_model_type = models[best_model_index]
    knn_acc = kNN_accuracy(
        X_train[:subset_size, :, :], y_train[:subset_size, :], X_val, y_val)
    if verbose:
        for i in range(len(models)):  # now one plot per model, ultimately we
            # may want all models in one plot to allow for direct comparison
            name = str(models[i][1])
            plotTrainingProcess(histories[i], name)
        print('Best model: model ', best_model_index)
        print('Model type: ', best_model_type)
        print('Hyperparameters: ', best_params)
        print('Accuracy on validation set: ', val_accuracies[best_model_index])
        print('Accuracy of kNN on validation set', knn_acc)

    if val_accuracies[best_model_index] < knn_acc:
        warnings.warn('Best model not better than kNN: ' +
                      str(val_accuracies[best_model_index]) + ' vs ' +
                      str(knn_acc))
    return best_model, best_params, best_model_type, knn_acc


def kNN_accuracy(X_train, y_train, X_val, y_val, k=1):
    """
    Performs k-Nearest Neighbors and returns the accuracy score.

    Parameters
    ----------
    X_train : numpy array
        Train set of shape (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        Class labels for train set
    X_val : numpy array
        Validation set of shape (num_samples, num_timesteps, num_channels)
    y_val : numpy array
        Class labels for validation set
    k : int
        number of neighbors to use for classifying

    Returns
    -------
    accuracy: float
        accuracy score on the validation set
    """
    num_samples, num_timesteps, num_channels = X_train.shape
    clf = neighbors.KNeighborsClassifier(k)
    clf.fit(X_train.reshape(num_samples, num_timesteps * num_channels),
            y_train)
    num_samples, num_timesteps, num_channels = X_val.shape
    val_predict = clf.predict(
        X_val.reshape(num_samples, num_timesteps * num_channels))
    return metrics.accuracy_score(val_predict, y_val)
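# Shape check for the flattening used above (hypothetical sizes): an array of
# shape (100, 512, 3) is reshaped to (100, 512 * 3) = (100, 1536) before it is
# handed to the scikit-learn classifier, which expects 2-D input:
#     X = np.zeros((100, 512, 3))
#     X.reshape(100, 512 * 3).shape   # -> (100, 1536)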