Completed
Pull Request — master (#147)
by Dafne van
50:16
created

get_metric_name()   A

Complexity

Conditions 4

Size

Total Lines 20

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 4.5923

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 4
c 1
b 0
f 0
dl 0
loc 20
ccs 6
cts 9
cp 0.6667
crap 4.5923
rs 9.2
1
"""
2
 Summary:
3
 This module provides the main functionality of mcfly: searching for an
4
 optimal model architecture. The work flow is as follows:
5
 Function generate_models from modelgen.py generates and compiles models.
6
 Function train_models_on_samples trains those models.
7
 Function plotTrainingProcess plots the training process.
8
 Function find_best_architecture is a wrapper function that combines
9
 these steps.
10
 Example function calls can be found in the tutorial notebook
11
 'EvaluateDifferentModels.ipynb'.
12
"""
13 1
import numpy as np
14 1
from . import modelgen
15 1
from sklearn import neighbors, metrics as sklearnmetrics
16 1
import warnings
17 1
import json
18 1
import os
19 1
from keras.callbacks import EarlyStopping
20 1
from keras import metrics
21
22 1
def train_models_on_samples(X_train, y_train, X_val, y_val, models,
                            nr_epochs=5, subset_size=100, verbose=True,
                            outputfile=None, early_stopping=False,
                            batch_size=20, metric='accuracy'):
    """
    Train every model in `models` on the first `subset_size` train samples.

    The subset is taken by slicing, so a `subset_size` larger than the
    number of available samples simply selects the complete train set.

    Parameters
    ----------
    X_train : numpy array of shape (num_samples, num_timesteps, num_channels)
        The input dataset for training
    y_train : numpy array of shape (num_samples, num_classes)
        The output classes for the train data, in binary format
    X_val : numpy array of shape (num_samples_val, num_timesteps, num_channels)
        The input dataset for validation
    y_val : numpy array of shape (num_samples_val, num_classes)
        The output classes for the validation data, in binary format
    models : list of (model, params, model_types) tuples
        Compiled keras models together with their hyperparameters and type
    nr_epochs : int, optional
        nr of epochs to use for training one model
    subset_size : int, optional
        The number of samples used from the complete train set
    verbose : bool, optional
        flag for displaying verbose output
    outputfile : str, optional
        File location to store the model results; nothing is stored
        when it is None
    early_stopping : bool, optional
        Stop training as soon as the validation loss does not decrease
    batch_size : int, optional
        nr of samples per batch
    metric : str, optional
        metric to read from the history object

    Returns
    -------
    histories : list of Keras History objects
        train histories for all models
    val_metrics : list of floats
        value of the validation metric after the last epoch, per model
    val_losses : list of floats
        validation loss after the last epoch, per model

    Raises
    ------
    ValueError
        If `metric` is not among the metrics listed in `model.metrics`.
    """
    # Slicing beyond the end of the arrays is harmless, so no bounds check
    # on subset_size is needed here.
    X_subset = X_train[:subset_size, :, :]
    y_subset = y_train[:subset_size, :]

    wanted_metric = get_metric_name(metric)

    histories, val_metrics, val_losses = [], [], []
    for index, (model, params, model_types) in enumerate(models):
        if verbose:
            print('Training model %d' % index, model_types)

        # The history only contains metrics the model was compiled with.
        compiled_metrics = [get_metric_name(name) for name in model.metrics]
        if wanted_metric not in compiled_metrics:
            raise ValueError(
                'Invalid metric, the model should be compiled '
                'with the same metric!')

        callbacks = []
        if early_stopping:
            # patience=0: stop at the first epoch without improvement.
            callbacks.append(EarlyStopping(monitor='val_loss',
                                           patience=0,
                                           verbose=verbose,
                                           mode='auto'))

        history = model.fit(X_subset, y_subset,
                            epochs=nr_epochs,
                            batch_size=batch_size,
                            validation_data=(X_val, y_val),
                            verbose=verbose,
                            callbacks=callbacks)
        histories.append(history)

        # Keep only the values reached after the final epoch.
        records = history.history
        val_metrics.append(records['val_' + wanted_metric][-1])
        val_losses.append(records['val_loss'][-1])

        if outputfile is not None:
            store_train_hist_as_json(params, model_types, records,
                                     outputfile, wanted_metric)
    return histories, val_metrics, val_losses
101
102
103 1
def store_train_hist_as_json(params, model_type, history, outputfile,
                             metric_name='acc'):
    """
    Append the parameters and training history of one model to a JSON file.

    The file holds a JSON list with one entry per stored model. If the file
    already exists, its content is loaded and the new entry is appended;
    otherwise a new list is started. The input `params` is not modified.

    Parameters
    ----------
    params : dict
        parameters for one model; numpy arrays and numpy scalars among the
        values are converted to plain Python lists and numbers
    model_type : str
        type of the model, stored under the key 'modeltype'
    history : dict
        training history from one model; must contain the keys
        `metric_name`, 'val_' + `metric_name`, 'loss' and 'val_loss'
    outputfile : str
        path where the json file needs to be stored
    metric_name : str, optional
        name of metric from history to store

    Raises
    ------
    KeyError
        If one of the required keys is missing from `history`.
    TypeError
        If the data still contains values that json cannot encode. The
        existing file is left untouched in that case.
    """
    def _to_jsonable(value):
        # json cannot encode numpy arrays or numpy scalar types
        # (e.g. np.float32, np.int64), so map them to builtin equivalents.
        if isinstance(value, np.ndarray):
            return value.tolist()
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, (list, tuple)):
            return [_to_jsonable(item) for item in value]
        return value

    jsondata = {key: _to_jsonable(value) for key, value in params.items()}
    jsondata['train_metric'] = _to_jsonable(history[metric_name])
    jsondata['train_loss'] = _to_jsonable(history['loss'])
    jsondata['val_metric'] = _to_jsonable(history['val_' + metric_name])
    jsondata['val_loss'] = _to_jsonable(history['val_loss'])
    jsondata['modeltype'] = model_type
    jsondata['metric'] = metric_name

    if os.path.isfile(outputfile):
        with open(outputfile, 'r') as infile:
            previousdata = json.load(infile)
    else:
        previousdata = []
    previousdata.append(jsondata)

    # Serialize before opening the file for writing: opening with 'w'
    # truncates it, so an encoding error afterwards would destroy the
    # results that were stored earlier.
    serialized = json.dumps(previousdata, sort_keys=True,
                            indent=4, ensure_ascii=False)
    with open(outputfile, 'w') as outfile:
        outfile.write(serialized)
141
142
143 1
def find_best_architecture(X_train, y_train, X_val, y_val, verbose=True,
                           number_of_models=5, nr_epochs=5, subset_size=100,
                           outputpath=None, metric='accuracy', **kwargs
                           ):
    """
    Generate a number of models, train them on a subsample of the data and
    return the one with the highest validation metric.

    A k-nearest-neighbors classifier is fitted on the same subsample as a
    baseline; a warning is issued when the best model scores below it.

    Parameters
    ----------
    X_train : numpy array
        The input dataset for training of shape
        (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        The output classes for the train data, in binary format of shape
        (num_samples, num_classes)
    X_val : numpy array
        The input dataset for validation of shape
        (num_samples_val, num_timesteps, num_channels)
    y_val : numpy array
        The output classes for the validation data, in binary format of shape
        (num_samples_val, num_classes)
    verbose : bool, optional
        flag for displaying verbose output
    number_of_models : int, optional
        The number of models to generate and test
    nr_epochs : int, optional
        The number of epochs that each model is trained
    subset_size : int, optional
        The size of the subset of the data that is used for finding
        the optimal architecture
    outputpath : str, optional
        File location to store the model results
    metric : str, optional
        metric that is used to evaluate the model on the validation set.
        See https://keras.io/metrics/ for possible metrics
    **kwargs : key-value parameters
        parameters for generating the models
        (see docstring for modelgen.generate_models)

    Returns
    -------
    best_model : Keras model
        Best performing model, already trained on a small sample data set.
    best_params : dict
        Dictionary containing the hyperparameters for the best model
    best_model_type : str
        Type of the best model
    knn_acc : float
        accuracy for kNN prediction on validation set
    """
    candidates = modelgen.generate_models(X_train.shape, y_train.shape[1],
                                          number_of_models=number_of_models,
                                          metrics=[metric],
                                          **kwargs)
    # Only the validation metric is needed to rank the candidates.
    _, val_accuracies, _ = train_models_on_samples(
        X_train, y_train, X_val, y_val, candidates,
        nr_epochs=nr_epochs,
        subset_size=subset_size,
        verbose=verbose,
        outputfile=outputpath,
        metric=metric)

    # NOTE(review): argmax assumes a higher metric value is better, and the
    # kNN baseline below is always an accuracy -- confirm this is intended
    # for metrics other than accuracy.
    best_model_index = np.argmax(val_accuracies)
    best_model, best_params, best_model_type = candidates[best_model_index]
    best_score = val_accuracies[best_model_index]

    # Baseline: kNN fitted on the same subsample the models were trained on.
    knn_acc = kNN_accuracy(X_train[:subset_size, :, :],
                           y_train[:subset_size, :],
                           X_val, y_val)

    if verbose:
        print('Best model: model ', best_model_index)
        print('Model type: ', best_model_type)
        print('Hyperparameters: ', best_params)
        print(str(metric) + ' on validation set: ', best_score)
        print('Accuracy of kNN on validation set', knn_acc)

    if best_score < knn_acc:
        message = ('Best model not better than kNN: ' +
                   str(best_score) + ' vs  ' + str(knn_acc))
        warnings.warn(message)
    return best_model, best_params, best_model_type, knn_acc
225
226
227 1
def get_metric_name(name):
    """
    Gives the keras name for a metric

    Parameters
    ----------
    name : str
        original name of the metric

    Returns
    -------
    str
        'acc' when `name` is 'acc' or 'accuracy'. Otherwise the `__name__`
        of the object returned by `keras.metrics.get(name)`. If that lookup
        fails, `name` is returned unchanged.
    """
    if name in ('acc', 'accuracy'):
        return 'acc'
    try:
        metric_fn = metrics.get(name)
        return metric_fn.__name__
    except Exception:
        # Best effort: fall back to the given name when keras does not know
        # the metric. `Exception` (instead of a bare except) keeps
        # KeyboardInterrupt and SystemExit propagating.
        return name
247
248
249 1
def kNN_accuracy(X_train, y_train, X_val, y_val, k=1):
    """
    Fit a k-Nearest Neighbors classifier and return its accuracy score
    on the validation set.

    Each sample is flattened from (num_timesteps, num_channels) to a single
    feature vector before it is passed to the classifier.

    Parameters
    ----------
    X_train : numpy array
        Train set of shape (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        Class labels for train set
    X_val : numpy array
        Validation set of shape (num_samples, num_timesteps, num_channels)
    y_val : numpy array
        Class labels for validation set
    k : int
        number of neighbors to use for classifying

    Returns
    -------
    accuracy : float
        accuracy score on the validation set
    """
    n_train, train_steps, train_channels = X_train.shape
    flat_train = X_train.reshape(n_train, train_steps * train_channels)
    classifier = neighbors.KNeighborsClassifier(k)
    classifier.fit(flat_train, y_train)

    n_val, val_steps, val_channels = X_val.shape
    flat_val = X_val.reshape(n_val, val_steps * val_channels)
    predictions = classifier.predict(flat_val)
    # NOTE(review): sklearn documents the signature as
    # accuracy_score(y_true, y_pred); the arguments are passed in the
    # opposite order here -- confirm this has no effect on the score.
    return sklearnmetrics.accuracy_score(predictions, y_val)
284