train_models_on_samples() - Code Metrics - Inspection of "Merge pull request #150 from NLeSC/savemodels" - NLeSC/mcfly - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 681a50...95647a )

by Dafne van

created 2017-08-31 08:10 UTC

train_models_on_samples() B

↳ Parent: Project

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	17
CRAP Score	6.2488

Importance

Changes	4
Bugs	1	Features	0

Metric	Value
cc	6
c	4
b	1
f	0
dl	0
loc	74
ccs	17
cts	21
cp	0.8095
crap	6.2488
rs	7.4965

How to fix Long Method

"""
 Summary:
 This module provides the main functionality of mcfly: searching for an
 optimal model architecture. The work flow is as follows:
 Function generate_models from modelgen.py generates and compiles models.
 Function train_models_on_samples trains those models.
 Function plotTrainingProcess plots the training process.
 Function find_best_architecture is a wrapper function that combines
 these steps.
 Example function calls can be found in the tutorial notebook
 'EvaluateDifferentModels.ipynb'.
"""
import numpy as np
from . import modelgen
from sklearn import neighbors, metrics
import warnings
import json
import os
from keras.callbacks import EarlyStopping

def train_models_on_samples(X_train, y_train, X_val, y_val, models,
                            nr_epochs=5, subset_size=100, verbose=True,
                            outputfile=None, model_path=None,
                            early_stopping=False, batch_size=20):
    """
    Given a list of compiled models, this function trains
    them all on a subset of the train data. If the given size of the subset
    is larger than the size of the data, the complete data set is used.

    Parameters
    ----------
    X_train : numpy array of shape (num_samples, num_timesteps, num_channels)
        The input dataset for training
    y_train : numpy array of shape (num_samples, num_classes)
        The output classes for the train data, in binary format
    X_val : numpy array of shape (num_samples_val, num_timesteps, num_channels)
        The input dataset for validation
    y_val : numpy array of shape (num_samples_val, num_classes)
        The output classes for the validation data, in binary format
    models : list of (model, params, model_type) tuples
        List of keras models to train, each with its hyperparameters
        and its model type
    nr_epochs : int, optional
        nr of epochs to use for training one model
    subset_size : int, optional
        The number of samples used from the complete train set
    verbose : bool, optional
        flag for displaying verbose output
    outputfile : str, optional
        Filename to store the model training results
    model_path : str, optional
        Directory to store the models as HDF5 files
    early_stopping : bool, optional
        Stop when validation loss does not decrease
    batch_size : int, optional
        nr of samples per batch

    Returns
    ----------
    histories : list of Keras History objects
        train histories for all models
    val_accuracies : list of floats
        validation accuracies of the models (value of the last epoch)
    val_losses : list of floats
        validation losses of the models (value of the last epoch)
    """
    # Slicing past the end of an array is safe: if subset_size exceeds the
    # number of available samples, the whole train set is used.
    X_train_sub = X_train[:subset_size, :, :]
    y_train_sub = y_train[:subset_size, :]

    histories = []
    val_accuracies = []
    val_losses = []
    for i, (model, params, model_types) in enumerate(models):
        if verbose:
            print('Training model %d' % i, model_types)
        if early_stopping:
            callbacks = [EarlyStopping(monitor='val_loss', patience=0,
                                       verbose=verbose, mode='auto')]
        else:
            callbacks = []
        history = model.fit(X_train_sub, y_train_sub,
                            epochs=nr_epochs, batch_size=batch_size,
                            validation_data=(X_val, y_val),
                            verbose=verbose,
                            callbacks=callbacks)
        histories.append(history)
        # Newer Keras releases report the metric as 'val_accuracy' instead
        # of 'val_acc'; accept either so the lookup does not raise KeyError.
        if 'val_acc' in history.history:
            val_acc_key = 'val_acc'
        else:
            val_acc_key = 'val_accuracy'
        val_accuracies.append(history.history[val_acc_key][-1])
        val_losses.append(history.history['val_loss'][-1])
        if outputfile is not None:
            store_train_hist_as_json(params, model_types,
                                     history.history, outputfile)
        if model_path is not None:
            # One HDF5 file per model, named after its index in `models`.
            model.save(os.path.join(model_path, 'model_{}.h5'.format(i)))

    return histories, val_accuracies, val_losses


def store_train_hist_as_json(params, model_type, history, outputfile):
    """
    This function stores the model parameters, the loss and accuracy history
    of one model in a JSON file. It appends the model information to the
    existing models in the file.

    Parameters
    ----------
    params : dict
        parameters for one model; numpy array values are converted to
        lists in the stored copy, the dict itself is not modified
    model_type : str
        type of the model, stored under the key 'modeltype'
    history : dict
        training history from one model; must contain 'loss', 'val_loss'
        and the accuracy under either 'acc'/'val_acc' or
        'accuracy'/'val_accuracy'
    outputfile : str
        path where the json file needs to be stored
    """
    jsondata = params.copy()
    for k in jsondata.keys():
        # numpy arrays are not JSON serializable; store them as plain lists
        if isinstance(jsondata[k], np.ndarray):
            jsondata[k] = jsondata[k].tolist()
    # Newer Keras releases name the metric 'accuracy' instead of 'acc';
    # accept either, but keep the keys written to the file unchanged.
    train_acc_key = 'acc' if 'acc' in history else 'accuracy'
    val_acc_key = 'val_acc' if 'val_acc' in history else 'val_accuracy'
    jsondata['train_acc'] = history[train_acc_key]
    jsondata['train_loss'] = history['loss']
    jsondata['val_acc'] = history[val_acc_key]
    jsondata['val_loss'] = history['val_loss']
    jsondata['modeltype'] = model_type
    # The file holds a JSON list with one entry per model: read what is
    # already there, append this model, and rewrite the whole list.
    if os.path.isfile(outputfile):
        with open(outputfile, 'r') as outfile:
            previousdata = json.load(outfile)
    else:
        previousdata = []
    previousdata.append(jsondata)
    with open(outputfile, 'w') as outfile:
        json.dump(previousdata, outfile, sort_keys=True,
                  indent=4, ensure_ascii=False)


def find_best_architecture(X_train, y_train, X_val, y_val, verbose=True,
                           number_of_models=5, nr_epochs=5, subset_size=100,
                           outputpath=None, model_path=None, **kwargs):
    """
    Generates a number of candidate models, trains each of them on a
    subsample of the data and returns the one with the highest validation
    accuracy, together with a kNN baseline score.

    Parameters
    ----------
    X_train : numpy array
        The input dataset for training of shape
        (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        The output classes for the train data, in binary format of shape
        (num_samples, num_classes)
    X_val : numpy array
        The input dataset for validation of shape
        (num_samples_val, num_timesteps, num_channels)
    y_val : numpy array
        The output classes for the validation data, in binary format of shape
        (num_samples_val, num_classes)
    verbose : bool, optional
        flag for displaying verbose output
    number_of_models : int, optional
        The number of models to generate and test
    nr_epochs : int, optional
        The number of epochs that each model is trained
    subset_size : int, optional
        The size of the subset of the data that is used for finding
        the optimal architecture
    outputpath : str, optional
        Filename to store the model training history
    model_path : str, optional
        Directory to save the models as HDF5 files
    **kwargs : key-value parameters
        parameters for generating the models
        (see docstring for modelgen.generate_models)

    Returns
    ----------
    best_model : Keras model
        Best performing model, already trained on a small sample data set.
    best_params : dict
        Dictionary containing the hyperparameters for the best model
    best_model_type : str
        Type of the best model
    knn_acc : float
        accuracy for kNN prediction on validation set
    """
    candidates = modelgen.generate_models(
        X_train.shape, y_train.shape[1],
        number_of_models=number_of_models, **kwargs)

    # Only the validation accuracies are needed to rank the candidates.
    _, val_accuracies, _ = train_models_on_samples(
        X_train, y_train, X_val, y_val, candidates, nr_epochs,
        subset_size=subset_size,
        verbose=verbose,
        outputfile=outputpath,
        model_path=model_path)

    best_model_index = np.argmax(val_accuracies)
    best_model, best_params, best_model_type = candidates[best_model_index]

    # Baseline: kNN fitted on the same subset the networks were trained on.
    X_subset = X_train[:subset_size, :, :]
    y_subset = y_train[:subset_size, :]
    knn_acc = kNN_accuracy(X_subset, y_subset, X_val, y_val)

    best_val_acc = val_accuracies[best_model_index]
    if verbose:
        print('Best model: model ', best_model_index)
        print('Model type: ', best_model_type)
        print('Hyperparameters: ', best_params)
        print('Accuracy on validation set: ', best_val_acc)
        print('Accuracy of kNN on validation set', knn_acc)

    if best_val_acc < knn_acc:
        message = ('Best model not better than kNN: ' +
                   str(best_val_acc) + ' vs  ' +
                   str(knn_acc))
        warnings.warn(message)
    return best_model, best_params, best_model_type, knn_acc


def kNN_accuracy(X_train, y_train, X_val, y_val, k=1):
    """
    Performs k-Nearest Neighbors and returns the accuracy score.

    Every sample is flattened from (num_timesteps, num_channels) into a
    single feature vector before it is passed to the classifier.

    Parameters
    ----------
    X_train : numpy array
        Train set of shape (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        Class labels for train set
    X_val : numpy array
        Validation set of shape (num_samples, num_timesteps, num_channels)
    y_val : numpy array
        Class labels for validation set
    k : int
        number of neighbors to use for classifying

    Returns
    -------
    accuracy: float
        accuracy score on the validation set
    """
    def _flatten(X):
        # (samples, timesteps, channels) -> (samples, timesteps * channels)
        n_samples, n_timesteps, n_channels = X.shape
        return X.reshape(n_samples, n_timesteps * n_channels)

    train_features = _flatten(X_train)
    classifier = neighbors.KNeighborsClassifier(k)
    classifier.fit(train_features, y_train)

    predictions = classifier.predict(_flatten(X_val))
    return metrics.accuracy_score(predictions, y_val)


1		"""
2		Summary:
3		This module provides the main functionality of mcfly: searching for an
4		optimal model architecture. The work flow is as follows:
5		Function generate_models from modelgen.py generates and compiles models.
6		Function train_models_on_samples trains those models.
7		Function plotTrainingProcess plots the training process.
8		Function find_best_architecture is wrapper function that combines
9		these steps.
10		Example function calls can be found in the tutorial notebook
11		'EvaluateDifferentModels.ipynb'.
12		"""
13	1	import numpy as np
14	1	from . import modelgen
15	1	from sklearn import neighbors, metrics
16	1	import warnings
17	1	import json
18	1	import os
19	1	from keras.callbacks import EarlyStopping
20
21	1	def train_models_on_samples(X_train, y_train, X_val, y_val, models,
22		nr_epochs=5, subset_size=100, verbose=True, outputfile=None,
		0 ignored issues – show Coding Style introduced 2017-07-27 12:18 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (88/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
23		model_path=None, early_stopping=False,
24		batch_size=20):
25		"""
26		Given a list of compiled models, this function trains
27		them all on a subset of the train data. If the given size of the subset is
28		smaller then the size of the data, the complete data set is used.
29
30		Parameters
31		----------
32		X_train : numpy array of shape (num_samples, num_timesteps, num_channels)
33		The input dataset for training
34		y_train : numpy array of shape (num_samples, num_classes)
35		The output classes for the train data, in binary format
36		X_val : numpy array of shape (num_samples_val, num_timesteps, num_channels)
37		The input dataset for validation
38		y_val : numpy array of shape (num_samples_val, num_classes)
39		The output classes for the validation data, in binary format
40		models : list of model, params, modeltypes
41		List of keras models to train
42		nr_epochs : int, optional
43		nr of epochs to use for training one model
44		subset_size :
45		The number of samples used from the complete train set
46		verbose : bool, optional
47		flag for displaying verbose output
48		outputfile: str, optional
49		Filename to store the model training results
50		model_path : str, optional
51		Directory to store the models as HDF5 files
52		early_stopping: bool
53		Stop when validation loss does not decrease
54		batch_size : int
55		nr of samples per batch
56
57		Returns
58		----------
59		histories : list of Keras History objects
60		train histories for all models
61		val_accuracies : list of floats
62		validation accuraracies of the models
63		val_losses : list of floats
64		validation losses of the models
65		"""
66		# if subset_size is smaller then X_train, this will work fine
67	1	X_train_sub = X_train[:subset_size, :, :]
68	1	y_train_sub = y_train[:subset_size, :]
69
70	1	histories = []
71	1	val_accuracies = []
72	1	val_losses = []
73	1	for i, (model, params, model_types) in enumerate(models):
74	1	if verbose:
75		print('Training model %d' % i, model_types)
76	1	if early_stopping:
77		callbacks = [EarlyStopping(monitor='val_loss', patience=0, verbose=verbose, mode='auto')]
		0 ignored issues – show Coding Style introduced 2016-10-28 15:13 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (101/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
78		else:
79	1	callbacks = []
80	1	history = model.fit(X_train_sub, y_train_sub,
81		epochs=nr_epochs, batch_size=batch_size,
82		# see comment on subsize_set
83		validation_data=(X_val, y_val),
84		verbose=verbose,
85		callbacks=callbacks)
86	1	histories.append(history)
87	1	val_accuracies.append(history.history['val_acc'][-1])
88	1	val_losses.append(history.history['val_loss'][-1])
89	1	if outputfile is not None:
90		store_train_hist_as_json(params, model_types,
91		history.history, outputfile)
92	1	if model_path is not None:
93		model.save(os.path.join(model_path, 'model_{}.h5'.format(i)))
		0 ignored issues – show Coding Style introduced 2017-07-27 12:18 UTC by Report Bug Copy Issue Report The indentation here looks off. 12 spaces were expected, but 16 were found. Loading history...
94	1	return histories, val_accuracies, val_losses
95
96
97	1	def store_train_hist_as_json(params, model_type, history, outputfile):
98		"""
99		This function stores the model parameters, the loss and accuracy history
100		of one model in a JSON file. It appends the model information to the
101		existing models in the file.
102
103		Parameters
104		----------
105		params : dict
106		parameters for one model
107		model_type : Keras model object
108		Keras model object for one model
109		history : dict
110		training history from one model
111		outputfile : str
112		path where the json file needs to be stored
113		"""
114	1	jsondata = params.copy()
115	1	for k in jsondata.keys():
116	1	if isinstance(jsondata[k], np.ndarray):
117	1	jsondata[k] = jsondata[k].tolist()
118	1	jsondata['train_acc'] = history['acc']
119	1	jsondata['train_loss'] = history['loss']
120	1	jsondata['val_acc'] = history['val_acc']
121	1	jsondata['val_loss'] = history['val_loss']
122	1	jsondata['modeltype'] = model_type
123	1	jsondata['modeltype'] = model_type
124	1	if os.path.isfile(outputfile):
125		with open(outputfile, 'r') as outfile:
126		previousdata = json.load(outfile)
127		else:
128	1	previousdata = []
129	1	previousdata.append(jsondata)
130	1	with open(outputfile, 'w') as outfile:
131	1	json.dump(previousdata, outfile, sort_keys=True,
132		indent=4, ensure_ascii=False)
133
134
135	1	def find_best_architecture(X_train, y_train, X_val, y_val, verbose=True,
136		number_of_models=5, nr_epochs=5, subset_size=100,
137		outputpath=None, model_path=None, **kwargs
138		):
139		"""
140		Tries out a number of models on a subsample of the data,
141		and outputs the best found architecture and hyperparameters.
142
143		Parameters
144		----------
145		X_train : numpy array
146		The input dataset for training of shape
147		(num_samples, num_timesteps, num_channels)
148		y_train : numpy array
149		The output classes for the train data, in binary format of shape
150		(num_samples, num_classes)
151		X_val : numpy array
152		The input dataset for validation of shape
153		(num_samples_val, num_timesteps, num_channels)
154		y_val : numpy array
155		The output classes for the validation data, in binary format of shape
156		(num_samples_val, num_classes)
157		verbose : bool, optional
158		flag for displaying verbose output
159		number_of_models : int, optiona
160		The number of models to generate and test
161		nr_epochs : int, optional
162		The number of epochs that each model is trained
163		subset_size : int, optional
164		The size of the subset of the data that is used for finding
165		the optimal architecture
166		outputpath : str, optional
167		Filename to store the model training history
168		model_path: str, optional
169		Directory to save the models as HDF5 files
170		**kwargs: key-value parameters
171		parameters for generating the models
172		(see docstring for modelgen.generate_models)
173
174		Returns
175		----------
176		best_model : Keras model
177		Best performing model, already trained on a small sample data set.
178		best_params : dict
179		Dictionary containing the hyperparameters for the best model
180		best_model_type : str
181		Type of the best model
182		knn_acc : float
183		accuaracy for kNN prediction on validation set
184		"""
185	1	models = modelgen.generate_models(X_train.shape, y_train.shape[1],
186		number_of_models=number_of_models,
187		**kwargs)
188	1	histories, val_accuracies, val_losses = train_models_on_samples(X_train,
189		y_train,
190		X_val,
191		y_val,
192		models,
193		nr_epochs,
194		subset_size=subset_size,
		0 ignored issues – show Coding Style introduced 2016-08-04 09:16 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (92/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
195		verbose=verbose,
		0 ignored issues – show Coding Style introduced 2016-07-07 14:47 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (84/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
196		outputfile=outputpath,
		0 ignored issues – show Coding Style introduced 2017-06-29 15:18 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (90/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
197		model_path=model_path)
		0 ignored issues – show Coding Style introduced 2017-07-27 12:18 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (90/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
198	1	best_model_index = np.argmax(val_accuracies)
199	1	best_model, best_params, best_model_type = models[best_model_index]
200	1	knn_acc = kNN_accuracy(
201		X_train[:subset_size, :, :], y_train[:subset_size, :], X_val, y_val)
202	1	if verbose:
203		print('Best model: model ', best_model_index)
204		print('Model type: ', best_model_type)
205		print('Hyperparameters: ', best_params)
206		print('Accuracy on validation set: ', val_accuracies[best_model_index])
207		print('Accuracy of kNN on validation set', knn_acc)
208
209	1	if val_accuracies[best_model_index] < knn_acc:
210		warnings.warn('Best model not better than kNN: ' +
211		str(val_accuracies[best_model_index]) + ' vs ' +
212		str(knn_acc)
213		)
214	1	return best_model, best_params, best_model_type, knn_acc
215
216
217	1	def kNN_accuracy(X_train, y_train, X_val, y_val, k=1):
218		"""
219		Performs k-Neigherst Neighbors and returns the accuracy score.
220
221		Parameters
222		----------
223		X_train : numpy array
224		Train set of shape (num_samples, num_timesteps, num_channels)
225		y_train : numpy array
226		Class labels for train set
227		X_val : numpy array
228		Validation set of shape (num_samples, num_timesteps, num_channels)
229		y_val : numpy array
230		Class labels for validation set
231		k : int
232		number of neighbors to use for classifying
233
234		Returns
235		-------
236		accuracy: float
237		accuracy score on the validation set
238		"""
239	1	num_samples, num_timesteps, num_channels = X_train.shape
240	1	clf = neighbors.KNeighborsClassifier(k)
241	1	clf.fit(
242		X_train.reshape(
243		num_samples,
244		num_timesteps *
245		num_channels),
246		y_train)
247	1	num_samples, num_timesteps, num_channels = X_val.shape
248	1	val_predict = clf.predict(
249		X_val.reshape(num_samples,
250		num_timesteps * num_channels))
251		return metrics.accuracy_score(val_predict, y_val)
252

NLeSC / mcfly

Push — master ( 681a50...95647a )

train_models_on_samples() B

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like