get_metric_name() - Code Metrics - Inspection of "Other metrics than accuracy" - NLeSC/mcfly - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#147)

by Dafne van

created 2017-06-29 14:27 UTC

get_metric_name() A

↳ Parent: Project

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	6
CRAP Score	4.5923

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
cc	4
c	1
b	0
f	0
dl	0
loc	20
ccs	6
cts	9
cp	0.6667
crap	4.5923
rs	9.2

"""
 Summary:
 This module provides the main functionality of mcfly: searching for an
 optimal model architecture. The work flow is as follows:
 Function generate_models from modelgen.py generates and compiles models.
 Function train_models_on_samples trains those models.
 Function plotTrainingProcess plots the training process.
 Function find_best_architecture is a wrapper function that combines
 these steps.
 Example function calls can be found in the tutorial notebook
 'EvaluateDifferentModels.ipynb'.
"""
import numpy as np
from . import modelgen
from sklearn import neighbors, metrics as sklearnmetrics
import warnings
import json
import os
from keras.callbacks import EarlyStopping
from keras import metrics

def train_models_on_samples(X_train, y_train, X_val, y_val, models,
                            nr_epochs=5, subset_size=100, verbose=True,
                            outputfile=None, early_stopping=False,
                            batch_size=20, metric='accuracy'):
    """
    Train every compiled model in `models` on the leading part of the
    train data and collect the validation scores of each model.

    Only the first `subset_size` samples of the train set are used. If
    `subset_size` is larger than the number of available samples, the
    slice covers the complete train set.

    Parameters
    ----------
    X_train : numpy array of shape (num_samples, num_timesteps, num_channels)
        The input dataset for training
    y_train : numpy array of shape (num_samples, num_classes)
        The output classes for the train data, in binary format
    X_val : numpy array of shape (num_samples_val, num_timesteps, num_channels)
        The input dataset for validation
    y_val : numpy array of shape (num_samples_val, num_classes)
        The output classes for the validation data, in binary format
    models : list of (model, params, modeltype) tuples
        The compiled keras models to train, each with its hyperparameters
        and model type
    nr_epochs : int, optional
        Number of epochs used to train each model
    subset_size : int, optional
        The number of samples used from the complete train set
    verbose : bool, optional
        Flag for displaying verbose output
    outputfile : str, optional
        File location to store the model results; nothing is stored
        when None
    early_stopping : bool, optional
        Stop training when the validation loss does not decrease
    batch_size : int, optional
        Number of samples per batch
    metric : str, optional
        Metric to read from the history object

    Returns
    -------
    histories : list of Keras History objects
        Train histories for all models
    val_metrics : list of floats
        Value of the metric on the validation set after the last epoch,
        one entry per model
    val_losses : list of floats
        Validation loss after the last epoch, one entry per model

    Raises
    ------
    ValueError
        If `metric` is not among the metrics of a model.
    """
    # Slicing beyond the end of an array is harmless, so a subset_size
    # larger than the train set just selects all samples.
    train_inputs = X_train[:subset_size, :, :]
    train_targets = y_train[:subset_size, :]

    metric_name = get_metric_name(metric)
    val_metric_key = 'val_' + metric_name

    histories, val_metrics, val_losses = [], [], []
    for index, (model, params, model_types) in enumerate(models):
        if verbose:
            print('Training model %d' % index, model_types)

        # The requested metric can only be read from the history when the
        # model itself tracks it.
        compiled_metrics = [get_metric_name(entry) for entry in model.metrics]
        if metric_name not in compiled_metrics:
            raise ValueError(
                'Invalid metric, the model should be compiled with the same metric!')

        callbacks = []
        if early_stopping:
            callbacks.append(
                EarlyStopping(monitor='val_loss', patience=0,
                              verbose=verbose, mode='auto'))

        history = model.fit(train_inputs, train_targets,
                            epochs=nr_epochs,
                            batch_size=batch_size,
                            validation_data=(X_val, y_val),
                            verbose=verbose,
                            callbacks=callbacks)
        histories.append(history)

        # Keep only the value reached after the final epoch.
        epoch_log = history.history
        val_metrics.append(epoch_log[val_metric_key][-1])
        val_losses.append(epoch_log['val_loss'][-1])

        if outputfile is not None:
            store_train_hist_as_json(params, model_types, epoch_log,
                                     outputfile, metric_name)
    return histories, val_metrics, val_losses


def _to_json_compatible(value):
    """Recursively replace numpy arrays and scalars by plain Python objects."""
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, np.generic):
        # numpy scalars such as np.int64 / np.float32 are rejected by json
        return value.item()
    if isinstance(value, dict):
        return {key: _to_json_compatible(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_to_json_compatible(item) for item in value]
    return value


def store_train_hist_as_json(params, model_type, history, outputfile, metric_name='acc'):
    """
    This function stores the model parameters, the loss and metric history
    of one model in a JSON file. It appends the model information to the
    existing models in the file.

    Numpy arrays and numpy scalars found in `params` or `history` are
    converted to plain Python lists and numbers before serialisation.
    Neither `params` nor `history` is modified.

    Parameters
    ----------
    params : dict
        parameters for one model
    model_type : str
        type of the model; stored as-is under the key 'modeltype', so it
        has to be JSON serialisable
    history : dict
        training history from one model; must contain the keys
        `metric_name`, 'val_' + `metric_name`, 'loss' and 'val_loss'
    outputfile : str
        path where the json file needs to be stored
    metric_name : str, optional
        name of metric from history to store

    Raises
    ------
    KeyError
        If one of the required keys is missing from `history`.
    """
    jsondata = params.copy()
    jsondata['train_metric'] = history[metric_name]
    jsondata['train_loss'] = history['loss']
    jsondata['val_metric'] = history['val_' + metric_name]
    jsondata['val_loss'] = history['val_loss']
    jsondata['modeltype'] = model_type
    jsondata['metric'] = metric_name
    jsondata = _to_json_compatible(jsondata)

    if os.path.isfile(outputfile):
        with open(outputfile, 'r') as infile:
            previousdata = json.load(infile)
    else:
        previousdata = []
    previousdata.append(jsondata)

    # Serialise before the file is opened for writing: opening with 'w'
    # truncates it, so a serialisation error after that point would wipe
    # the results stored by earlier calls.
    serialized = json.dumps(previousdata, sort_keys=True,
                            indent=4, ensure_ascii=False)
    with open(outputfile, 'w') as outfile:
        outfile.write(serialized)


def find_best_architecture(X_train, y_train, X_val, y_val, verbose=True,
                           number_of_models=5, nr_epochs=5, subset_size=100,
                           outputpath=None, metric='accuracy', **kwargs
                           ):
    """
    Generate a number of models, train them on a subsample of the data
    and return the model with the highest validation score, together
    with a kNN reference accuracy.

    Parameters
    ----------
    X_train : numpy array
        The input dataset for training of shape
        (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        The output classes for the train data, in binary format of shape
        (num_samples, num_classes)
    X_val : numpy array
        The input dataset for validation of shape
        (num_samples_val, num_timesteps, num_channels)
    y_val : numpy array
        The output classes for the validation data, in binary format of shape
        (num_samples_val, num_classes)
    verbose : bool, optional
        Flag for displaying verbose output
    number_of_models : int, optional
        The number of models to generate and test
    nr_epochs : int, optional
        The number of epochs that each model is trained
    subset_size : int, optional
        The size of the subset of the data that is used for finding
        the optimal architecture
    outputpath : str, optional
        File location to store the model results
    metric : str, optional
        Metric that is used to evaluate the model on the validation set.
        See https://keras.io/metrics/ for possible metrics
    **kwargs : key-value parameters
        Parameters for generating the models
        (see docstring for modelgen.generate_models)

    Returns
    -------
    best_model : Keras model
        Best performing model, already trained on a small sample data set.
    best_params : dict
        Dictionary containing the hyperparameters for the best model
    best_model_type : str
        Type of the best model
    knn_acc : float
        Accuracy for kNN prediction on validation set
    """
    candidates = modelgen.generate_models(
        X_train.shape, y_train.shape[1],
        number_of_models=number_of_models,
        metrics=[metric],
        **kwargs)

    # Only the validation scores are needed to rank the candidates.
    _, val_scores, _ = train_models_on_samples(
        X_train, y_train, X_val, y_val, candidates,
        nr_epochs=nr_epochs,
        subset_size=subset_size,
        verbose=verbose,
        outputfile=outputpath,
        metric=metric)

    # The candidate with the highest validation score is selected.
    # NOTE(review): this presumes a higher value of `metric` is better -
    # confirm before using loss-like metrics.
    best_model_index = np.argmax(val_scores)
    best_score = val_scores[best_model_index]
    best_model, best_params, best_model_type = candidates[best_model_index]

    # Reference score: kNN on the same train subset the networks saw.
    knn_acc = kNN_accuracy(X_train[:subset_size, :, :],
                           y_train[:subset_size, :],
                           X_val, y_val)

    if verbose:
        print('Best model: model ', best_model_index)
        print('Model type: ', best_model_type)
        print('Hyperparameters: ', best_params)
        print(str(metric) + ' on validation set: ', best_score)
        print('Accuracy of kNN on validation set', knn_acc)

    if best_score < knn_acc:
        message = ('Best model not better than kNN: ' +
                   str(best_score) + ' vs  ' + str(knn_acc))
        warnings.warn(message)
    return best_model, best_params, best_model_type, knn_acc


def get_metric_name(name):
    """
    Gives the keras name for a metric

    Parameters
    ----------
    name : str
        original name of the metric. Entries of ``model.metrics`` are
        passed in as well by train_models_on_samples.

    Returns
    -------
    str
        'acc' for the aliases 'acc' and 'accuracy'. Otherwise the
        ``__name__`` of the object that ``metrics.get`` resolves `name`
        to. When that lookup fails, `name` is returned unchanged.
    """
    if name in ('acc', 'accuracy'):
        return 'acc'
    try:
        return metrics.get(name).__name__
    except Exception:
        # Best effort: a metric that cannot be resolved keeps its original
        # name. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        return name


def kNN_accuracy(X_train, y_train, X_val, y_val, k=1):
    """
    Fit a k-Nearest Neighbors classifier on the train set and return its
    accuracy score on the validation set.

    Each sample is flattened to a single feature vector of length
    num_timesteps * num_channels before it is handed to the classifier.

    Parameters
    ----------
    X_train : numpy array
        Train set of shape (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        Class labels for train set
    X_val : numpy array
        Validation set of shape (num_samples, num_timesteps, num_channels)
    y_val : numpy array
        Class labels for validation set
    k : int
        Number of neighbors to use for classifying

    Returns
    -------
    accuracy : float
        Accuracy score on the validation set
    """
    classifier = neighbors.KNeighborsClassifier(k)

    # Flatten (samples, timesteps, channels) -> (samples, features).
    train_count, train_steps, train_channels = X_train.shape
    flat_train = X_train.reshape(train_count, train_steps * train_channels)
    classifier.fit(flat_train, y_train)

    val_count, val_steps, val_channels = X_val.shape
    flat_val = X_val.reshape(val_count, val_steps * val_channels)
    val_predict = classifier.predict(flat_val)

    return sklearnmetrics.accuracy_score(val_predict, y_val)


1		"""
2		Summary:
3		This module provides the main functionality of mcfly: searching for an
4		optimal model architecture. The work flow is as follows:
5		Function generate_models from modelgen.py generates and compiles models.
6		Function train_models_on_samples trains those models.
7		Function plotTrainingProcess plots the training process.
8		Function find_best_architecture is wrapper function that combines
9		these steps.
10		Example function calls can be found in the tutorial notebook
11		'EvaluateDifferentModels.ipynb'.
12		"""
13	1	import numpy as np
14	1	from . import modelgen
15	1	from sklearn import neighbors, metrics as sklearnmetrics
16	1	import warnings
17	1	import json
18	1	import os
19	1	from keras.callbacks import EarlyStopping
20	1	from keras import metrics
21
22	1	def train_models_on_samples(X_train, y_train, X_val, y_val, models,
23		nr_epochs=5, subset_size=100, verbose=True,
24		outputfile=None, early_stopping=False,
25		batch_size=20, metric='accuracy'):
26		"""
27		Given a list of compiled models, this function trains
28		them all on a subset of the train data. If the given size of the subset is
29		smaller then the size of the data, the complete data set is used.
30
31		Parameters
32		----------
33		X_train : numpy array of shape (num_samples, num_timesteps, num_channels)
34		The input dataset for training
35		y_train : numpy array of shape (num_samples, num_classes)
36		The output classes for the train data, in binary format
37		X_val : numpy array of shape (num_samples_val, num_timesteps, num_channels)
38		The input dataset for validation
39		y_val : numpy array of shape (num_samples_val, num_classes)
40		The output classes for the validation data, in binary format
41		models : list of model, params, modeltypes
42		List of keras models to train
43		nr_epochs : int, optional
44		nr of epochs to use for training one model
45		subset_size :
46		The number of samples used from the complete train set
47		verbose : bool, optional
48		flag for displaying verbose output
49		outputfile : str, optional
50		File location to store the model results
51		early_stopping: bool
52		Stop when validation loss does not decrease
53		batch_size : int
54		nr of samples per batch
55		metric : str
56		metric to store in the history object
57
58		Returns
59		----------
60		histories : list of Keras History objects
61		train histories for all models
62		val_metrics : list of floats
63		validation accuraracies of the models
64		val_losses : list of floats
65		validation losses of the models
66		"""
67		# if subset_size is smaller then X_train, this will work fine
68	1	X_train_sub = X_train[:subset_size, :, :]
69	1	y_train_sub = y_train[:subset_size, :]
70
71	1	metric_name = get_metric_name(metric)
72
73	1	histories = []
74	1	val_metrics = []
75	1	val_losses = []
76	1	for i, (model, params, model_types) in enumerate(models):
77	1	if verbose:
78		print('Training model %d' % i, model_types)
79	1	model_metrics = [get_metric_name(name) for name in model.metrics]
80	1	if metric_name not in model_metrics:
81		raise ValueError('Invalid metric, the model should be compiled with the same metric!')
		0 ignored issues – show Coding Style introduced 2017-06-29 15:18 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (98/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
82	1	if early_stopping:
83		callbacks = [EarlyStopping(monitor='val_loss', patience=0, verbose=verbose, mode='auto')]
		0 ignored issues – show Coding Style introduced 2016-10-28 15:13 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (101/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
84		else:
85	1	callbacks = []
86	1	history = model.fit(X_train_sub, y_train_sub,
87		epochs=nr_epochs, batch_size=batch_size,
88		# see comment on subsize_set
89		validation_data=(X_val, y_val),
90		verbose=verbose,
91		callbacks=callbacks)
92	1	histories.append(history)
93
94	1	val_metrics.append(history.history['val_'+metric_name][-1])
95	1	val_losses.append(history.history['val_loss'][-1])
96	1	if outputfile is not None:
97		store_train_hist_as_json(params, model_types,
98		history.history, outputfile,
99		metric_name)
100	1	return histories, val_metrics, val_losses
101
102
103	1	def store_train_hist_as_json(params, model_type, history, outputfile, metric_name='acc'):
		0 ignored issues – show Coding Style introduced 2017-06-29 15:18 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (89/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
104		"""
105		This function stores the model parameters, the loss and accuracy history
106		of one model in a JSON file. It appends the model information to the
107		existing models in the file.
108
109		Parameters
110		----------
111		params : dict
112		parameters for one model
113		model_type : Keras model object
114		Keras model object for one model
115		history : dict
116		training history from one model
117		outputfile : str
118		path where the json file needs to be stored
119		metric_name : str, optional
120		name of metric from history to store
121		"""
122	1	jsondata = params.copy()
123	1	for k in jsondata.keys():
124	1	if isinstance(jsondata[k], np.ndarray):
125	1	jsondata[k] = jsondata[k].tolist()
126	1	jsondata['train_metric'] = history[metric_name]
127	1	jsondata['train_loss'] = history['loss']
128	1	jsondata['val_metric'] = history['val_'+metric_name]
129	1	jsondata['val_loss'] = history['val_loss']
130	1	jsondata['modeltype'] = model_type
131	1	jsondata['metric'] = metric_name
132	1	if os.path.isfile(outputfile):
133		with open(outputfile, 'r') as outfile:
134		previousdata = json.load(outfile)
135		else:
136	1	previousdata = []
137	1	previousdata.append(jsondata)
138	1	with open(outputfile, 'w') as outfile:
139	1	json.dump(previousdata, outfile, sort_keys=True,
140		indent=4, ensure_ascii=False)
141
142
143	1	def find_best_architecture(X_train, y_train, X_val, y_val, verbose=True,
144		number_of_models=5, nr_epochs=5, subset_size=100,
145		outputpath=None, metric='accuracy', **kwargs
146		):
147		"""
148		Tries out a number of models on a subsample of the data,
149		and outputs the best found architecture and hyperparameters.
150
151		Parameters
152		----------
153		X_train : numpy array
154		The input dataset for training of shape
155		(num_samples, num_timesteps, num_channels)
156		y_train : numpy array
157		The output classes for the train data, in binary format of shape
158		(num_samples, num_classes)
159		X_val : numpy array
160		The input dataset for validation of shape
161		(num_samples_val, num_timesteps, num_channels)
162		y_val : numpy array
163		The output classes for the validation data, in binary format of shape
164		(num_samples_val, num_classes)
165		verbose : bool, optional
166		flag for displaying verbose output
167		number_of_models : int, optiona
168		The number of models to generate and test
169		nr_epochs : int, optional
170		The number of epochs that each model is trained
171		subset_size : int, optional
172		The size of the subset of the data that is used for finding
173		the optimal architecture
174		outputpath : str, optional
175		File location to store the model results
176		metric: str, optional
177		metric that is used to evaluate the model on the validation set.
178		See https://keras.io/metrics/ for possible metrics
179		**kwargs: key-value parameters
180		parameters for generating the models
181		(see docstring for modelgen.generate_models)
182
183		Returns
184		----------
185		best_model : Keras model
186		Best performing model, already trained on a small sample data set.
187		best_params : dict
188		Dictionary containing the hyperparameters for the best model
189		best_model_type : str
190		Type of the best model
191		knn_acc : float
192		accuaracy for kNN prediction on validation set
193		"""
194	1	models = modelgen.generate_models(X_train.shape, y_train.shape[1],
195		number_of_models=number_of_models,
196		metrics=[metric],
197		**kwargs)
198	1	histories, val_accuracies, val_losses = train_models_on_samples(X_train,
199		y_train,
200		X_val,
201		y_val,
202		models,
203		nr_epochs,
204		subset_size=subset_size,
		0 ignored issues – show Coding Style introduced 2016-08-04 09:16 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (92/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
205		verbose=verbose,
		0 ignored issues – show Coding Style introduced 2016-07-07 14:47 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (84/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
206		outputfile=outputpath,
		0 ignored issues – show Coding Style introduced 2017-06-29 15:18 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (90/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
207		metric=metric)
		0 ignored issues – show Coding Style introduced 2017-06-29 15:18 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (82/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
208	1	best_model_index = np.argmax(val_accuracies)
209	1	best_model, best_params, best_model_type = models[best_model_index]
210	1	knn_acc = kNN_accuracy(
211		X_train[:subset_size, :, :], y_train[:subset_size, :], X_val, y_val)
212	1	if verbose:
213		print('Best model: model ', best_model_index)
214		print('Model type: ', best_model_type)
215		print('Hyperparameters: ', best_params)
216		print(str(metric) + ' on validation set: ', val_accuracies[best_model_index])
		0 ignored issues – show Coding Style introduced 2017-06-29 15:18 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (85/79). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
217		print('Accuracy of kNN on validation set', knn_acc)
218
219	1	if val_accuracies[best_model_index] < knn_acc:
220		warnings.warn('Best model not better than kNN: ' +
221		str(val_accuracies[best_model_index]) + ' vs ' +
222		str(knn_acc)
223		)
224	1	return best_model, best_params, best_model_type, knn_acc
225
226
227	1	def get_metric_name(name):
228		"""
229		Gives the keras name for a metric
230
231		Parameters
232		----------
233		name : str
234		original name of the metric
235		Returns
236		-------
237
238		"""
239	1	if name=='acc' or name=='accuracy':
		0 ignored issues – show Coding Style introduced 2017-06-29 15:18 UTC by Report Bug Copy Issue Report Exactly one space required around comparison if name=='acc' or name=='accuracy': ^^ Loading history... Coding Style introduced 2017-06-29 15:18 UTC by Report Bug Copy Issue Report Exactly one space required around comparison if name=='acc' or name=='accuracy': ^^ Loading history...
240	1	return 'acc'
241	1	try:
242	1	metric_fn = metrics.get(name)
243	1	return metric_fn.__name__
244		except:
245		pass
246		return name
247
248
249	1	def kNN_accuracy(X_train, y_train, X_val, y_val, k=1):
250		"""
251		Performs k-Neigherst Neighbors and returns the accuracy score.
252
253		Parameters
254		----------
255		X_train : numpy array
256		Train set of shape (num_samples, num_timesteps, num_channels)
257		y_train : numpy array
258		Class labels for train set
259		X_val : numpy array
260		Validation set of shape (num_samples, num_timesteps, num_channels)
261		y_val : numpy array
262		Class labels for validation set
263		k : int
264		number of neighbors to use for classifying
265
266		Returns
267		-------
268		accuracy: float
269		accuracy score on the validation set
270		"""
271	1	num_samples, num_timesteps, num_channels = X_train.shape
272	1	clf = neighbors.KNeighborsClassifier(k)
273	1	clf.fit(
274		X_train.reshape(
275		num_samples,
276		num_timesteps *
277		num_channels),
278		y_train)
279	1	num_samples, num_timesteps, num_channels = X_val.shape
280	1	val_predict = clf.predict(
281		X_val.reshape(num_samples,
282		num_timesteps * num_channels))
283		return sklearnmetrics.accuracy_score(val_predict, y_val)
284

NLeSC / mcfly

Pull Request — master (#147)

get_metric_name() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like