"""
Summary:
This module provides the main functionality of mcfly: searching for an
optimal model architecture. The workflow is as follows:
Function generate_models from modelgen.py generates and compiles models.
Function train_models_on_samples trains those models.
Function plotTrainingProcess plots the training process.
Function find_best_architecture is a wrapper function that combines
these steps.
Example function calls can be found in the tutorial notebook
'EvaluateDifferentModels.ipynb'.
"""
import numpy as np
from . import modelgen
from sklearn import neighbors, metrics as sklearnmetrics
import warnings
import json
import os
from keras.callbacks import EarlyStopping
from keras import metrics
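
# Workflow sketch (illustrative only: the data arrays are assumed to be
# prepared by the user; see the tutorial notebook
# 'EvaluateDifferentModels.ipynb' for runnable examples):
#
#     models = modelgen.generate_models(X_train.shape, y_train.shape[1],
#                                       number_of_models=5)
#     histories, val_metrics, val_losses = train_models_on_samples(
#         X_train, y_train, X_val, y_val, models)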


def train_models_on_samples(X_train, y_train, X_val, y_val, models,
                            nr_epochs=5, subset_size=100, verbose=True,
                            outputfile=None, model_path=None,
                            early_stopping=False, batch_size=20,
                            metric='accuracy'):
    """
    Given a list of compiled models, this function trains
    them all on a subset of the train data. If the given size of the subset is
    smaller than the size of the data, the complete data set is used.

    Parameters
    ----------
    X_train : numpy array of shape (num_samples, num_timesteps, num_channels)
        The input dataset for training
    y_train : numpy array of shape (num_samples, num_classes)
        The output classes for the train data, in binary format
    X_val : numpy array of shape (num_samples_val, num_timesteps, num_channels)
        The input dataset for validation
    y_val : numpy array of shape (num_samples_val, num_classes)
        The output classes for the validation data, in binary format
    models : list of (model, params, modeltype) tuples
        List of Keras models to train
    nr_epochs : int, optional
        Number of epochs to use for training one model
    subset_size : int, optional
        The number of samples used from the complete train set
    verbose : bool, optional
        flag for displaying verbose output
    outputfile : str, optional
        Filename to store the model training results
    model_path : str, optional
        Directory to store the models as HDF5 files
    early_stopping : bool, optional
        Stop when validation loss does not decrease
    batch_size : int, optional
        Number of samples per batch
    metric : str, optional
        Metric to store in the history object

    Returns
    -------
    histories : list of Keras History objects
        train histories for all models
    val_metrics : list of floats
        validation accuracies of the models
    val_losses : list of floats
        validation losses of the models
    """
    # if subset_size is smaller than X_train, this will work fine
    X_train_sub = X_train[:subset_size, :, :]
    y_train_sub = y_train[:subset_size, :]

    metric_name = get_metric_name(metric)

    histories = []
    val_metrics = []
    val_losses = []
    for i, (model, params, model_types) in enumerate(models):
        if verbose:
            print('Training model %d' % i, model_types)
        model_metrics = [get_metric_name(name) for name in model.metrics]
        if metric_name not in model_metrics:
            raise ValueError(
                'Invalid metric. The model was not compiled '
                'with {} as metric'.format(metric_name))
        if early_stopping:
            callbacks = [EarlyStopping(monitor='val_loss', patience=0,
                                       verbose=verbose, mode='auto')]
        else:
            callbacks = []
        history = model.fit(X_train_sub, y_train_sub,
                            epochs=nr_epochs, batch_size=batch_size,
                            # see comment above on subset_size
                            validation_data=(X_val, y_val),
                            verbose=verbose,
                            callbacks=callbacks)
        histories.append(history)

        val_metrics.append(history.history['val_' + metric_name][-1])
        val_losses.append(history.history['val_loss'][-1])
        if outputfile is not None:
            store_train_hist_as_json(params, model_types,
                                     history.history, outputfile)
        if model_path is not None:
            model.save(os.path.join(model_path, 'model_{}.h5'.format(i)))

    return histories, val_metrics, val_losses
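
# Example call (a sketch; `models` is assumed to come from
# modelgen.generate_models, the arrays from the user's own preprocessing):
#
#     histories, val_metrics, val_losses = train_models_on_samples(
#         X_train, y_train, X_val, y_val, models,
#         nr_epochs=5, subset_size=100, early_stopping=True)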


def store_train_hist_as_json(params, model_type, history, outputfile,
                             metric_name='acc'):
    """
    This function stores the model parameters, the loss and accuracy history
    of one model in a JSON file. It appends the model information to the
    existing models in the file.

    Parameters
    ----------
    params : dict
        parameters for one model
    model_type : str
        type of the model, as a JSON-serializable name
    history : dict
        training history from one model
    outputfile : str
        path where the json file needs to be stored
    metric_name : str, optional
        name of metric from history to store
    """
    jsondata = params.copy()
    for k in jsondata.keys():
        if isinstance(jsondata[k], np.ndarray):
            jsondata[k] = jsondata[k].tolist()
    jsondata['train_metric'] = history[metric_name]
    jsondata['train_loss'] = history['loss']
    jsondata['val_metric'] = history['val_' + metric_name]
    jsondata['val_loss'] = history['val_loss']
    jsondata['modeltype'] = model_type
    jsondata['metric'] = metric_name
    if os.path.isfile(outputfile):
        with open(outputfile, 'r') as outfile:
            previousdata = json.load(outfile)
    else:
        previousdata = []
    previousdata.append(jsondata)
    with open(outputfile, 'w') as outfile:
        json.dump(previousdata, outfile, sort_keys=True,
                  indent=4, ensure_ascii=False)
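
# The output file then holds a JSON list with one entry per stored model,
# roughly of this shape (illustrative values; real entries also contain the
# model parameters from `params`):
#
#     [{"modeltype": "CNN", "metric": "acc",
#       "train_loss": [2.1, 1.4], "val_loss": [2.0, 1.5],
#       "train_metric": [0.3, 0.6], "val_metric": [0.4, 0.5]}]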


def find_best_architecture(X_train, y_train, X_val, y_val, verbose=True,
                           number_of_models=5, nr_epochs=5, subset_size=100,
                           outputpath=None, model_path=None,
                           metric='accuracy', **kwargs):
    """
    Tries out a number of models on a subsample of the data,
    and outputs the best found architecture and hyperparameters.

    Parameters
    ----------
    X_train : numpy array
        The input dataset for training of shape
        (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        The output classes for the train data, in binary format of shape
        (num_samples, num_classes)
    X_val : numpy array
        The input dataset for validation of shape
        (num_samples_val, num_timesteps, num_channels)
    y_val : numpy array
        The output classes for the validation data, in binary format of shape
        (num_samples_val, num_classes)
    verbose : bool, optional
        flag for displaying verbose output
    number_of_models : int, optional
        The number of models to generate and test
    nr_epochs : int, optional
        The number of epochs that each model is trained
    subset_size : int, optional
        The size of the subset of the data that is used for finding
        the optimal architecture
    outputpath : str, optional
        File location to store the model results
    model_path : str, optional
        Directory to save the models as HDF5 files
    metric : str, optional
        metric that is used to evaluate the model on the validation set.
        See https://keras.io/metrics/ for possible metrics
    **kwargs : key-value parameters
        parameters for generating the models
        (see docstring for modelgen.generate_models)

    Returns
    -------
    best_model : Keras model
        Best performing model, already trained on a small sample data set.
    best_params : dict
        Dictionary containing the hyperparameters for the best model
    best_model_type : str
        Type of the best model
    knn_acc : float
        accuracy for kNN prediction on validation set
    """
    models = modelgen.generate_models(X_train.shape, y_train.shape[1],
                                      number_of_models=number_of_models,
                                      metrics=[metric],
                                      **kwargs)
    histories, val_accuracies, val_losses = train_models_on_samples(
        X_train, y_train, X_val, y_val, models, nr_epochs,
        subset_size=subset_size, verbose=verbose, outputfile=outputpath,
        model_path=model_path, metric=metric)
    best_model_index = np.argmax(val_accuracies)
    best_model, best_params, best_model_type = models[best_model_index]
    knn_acc = kNN_accuracy(
        X_train[:subset_size, :, :], y_train[:subset_size, :], X_val, y_val)
    if verbose:
        print('Best model: model ', best_model_index)
        print('Model type: ', best_model_type)
        print('Hyperparameters: ', best_params)
        print(str(metric) + ' on validation set: ',
              val_accuracies[best_model_index])
        print('Accuracy of kNN on validation set', knn_acc)

    if val_accuracies[best_model_index] < knn_acc:
        warnings.warn('Best model not better than kNN: ' +
                      str(val_accuracies[best_model_index]) + ' vs ' +
                      str(knn_acc))
    return best_model, best_params, best_model_type, knn_acc
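
# Typical follow-up (a sketch): the returned model has only been trained on
# a small subset, so one would usually retrain it on the full training set:
#
#     best_model, best_params, best_model_type, knn_acc = \
#         find_best_architecture(X_train, y_train, X_val, y_val)
#     best_model.fit(X_train, y_train, epochs=25,
#                    validation_data=(X_val, y_val))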


def get_metric_name(name):
    """
    Gives the Keras name for a metric.

    Parameters
    ----------
    name : str
        original name of the metric

    Returns
    -------
    str
        name of the metric as used by Keras
    """
    if name == 'acc' or name == 'accuracy':
        return 'acc'
    try:
        metric_fn = metrics.get(name)
        return metric_fn.__name__
    except Exception:
        pass
    return name
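
# For example (illustrative; the exact names depend on the installed Keras
# version):
#
#     get_metric_name('accuracy')            # -> 'acc'
#     get_metric_name('mean_squared_error')  # -> 'mean_squared_error'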


def kNN_accuracy(X_train, y_train, X_val, y_val, k=1):
    """
    Performs k-Nearest Neighbors classification and returns the accuracy
    score on the validation set.

    Parameters
    ----------
    X_train : numpy array
        Train set of shape (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        Class labels for train set
    X_val : numpy array
        Validation set of shape (num_samples, num_timesteps, num_channels)
    y_val : numpy array
        Class labels for validation set
    k : int
        number of neighbors to use for classifying

    Returns
    -------
    accuracy : float
        accuracy score on the validation set
    """
    num_samples, num_timesteps, num_channels = X_train.shape
    clf = neighbors.KNeighborsClassifier(k)
    # kNN expects 2D input, so flatten the time and channel dimensions
    clf.fit(
        X_train.reshape(num_samples, num_timesteps * num_channels),
        y_train)
    num_samples, num_timesteps, num_channels = X_val.shape
    val_predict = clf.predict(
        X_val.reshape(num_samples, num_timesteps * num_channels))
    return sklearnmetrics.accuracy_score(val_predict, y_val)
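
# Example with random data (a sketch, just to show the expected shapes):
#
#     X = np.random.rand(20, 100, 3)              # 20 samples, 100 steps, 3 channels
#     y = np.eye(2)[np.random.randint(0, 2, 20)]  # one-hot labels
#     kNN_accuracy(X[:15], y[:15], X[15:], y[15:], k=1)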