Completed
Pull Request — master (#168)
by
unknown
05:26
created

train_model()   A

Complexity

Conditions 1

Size

Total Lines 18

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
dl 0
loc 18
ccs 3
cts 3
cp 1
crap 1
rs 9.4285
c 0
b 0
f 0
1
"""
2
 Summary:
3
 This module provides the main functionality of mcfly: searching for an
4
 optimal model architecture. The work flow is as follows:
5
 Function generate_models from modelgen.py generates and compiles models.
6
 Function train_models_on_samples trains those models.
7
 Function plotTrainingProcess plots the training process.
8
 Function find_best_architecture is a wrapper function that combines
9
 these steps.
10
 Example function calls can be found in the tutorial notebook
11
 'EvaluateDifferentModels.ipynb'.
12
"""
13 1
import numpy as np
14 1
from . import modelgen
15 1
from .storage import TrainedModel
16
17 1
try:
18 1
    import noodles
19
    from .storage import serial_registry
20 1
except ImportError:
21 1
    has_noodles = False
22
else:
23
    has_noodles = True
24
25 1
from sklearn import neighbors, metrics as sklearnmetrics
26 1
import warnings
27 1
import json
28 1
import os
29 1
from keras.callbacks import EarlyStopping
30 1
from keras import metrics
31
32
33 1
def train_model(
        model, X_train_sub, y_train_sub, epochs, batch_size,
        validation_data, verbose, callbacks):
    """
    Fit a single model and bundle the outcome with the model itself.

    Parameters
    ----------
    model : object exposing a Keras-style ``fit`` method
        The compiled model to train
    X_train_sub : array
        Training inputs handed to ``model.fit``
    y_train_sub : array
        Training targets handed to ``model.fit``
    epochs : int
        Forwarded to ``model.fit`` as ``epochs``
    batch_size : int
        Forwarded to ``model.fit`` as ``batch_size``
    validation_data : tuple
        Forwarded to ``model.fit`` as ``validation_data``
    verbose : bool
        Forwarded to ``model.fit`` as ``verbose``
    callbacks : list
        Forwarded to ``model.fit`` as ``callbacks``

    Returns
    -------
    TrainedModel
        Holds ``history`` (the ``history`` attribute of the object returned
        by ``model.fit``) together with the fitted ``model``.
    """
    fit_options = {
        'epochs': epochs,
        'batch_size': batch_size,
        'validation_data': validation_data,
        'verbose': verbose,
        'callbacks': callbacks,
    }
    fit_result = model.fit(X_train_sub, y_train_sub, **fit_options)
    return TrainedModel(history=fit_result.history, model=model)
51
52
53 1
def train_models_on_samples(X_train, y_train, X_val, y_val, models,
                            nr_epochs=5, subset_size=100, verbose=True,
                            outputfile=None, model_path=None,
                            early_stopping=False, batch_size=20,
                            metric='accuracy', use_noodles=None):
    """
    Given a list of compiled models, this function trains
    them all on a subset of the train data. If the given size of the subset
    is larger than the size of the data, the complete data set is used.

    Parameters
    ----------
    X_train : numpy array of shape (num_samples, num_timesteps, num_channels)
        The input dataset for training
    y_train : numpy array of shape (num_samples, num_classes)
        The output classes for the train data, in binary format
    X_val : numpy array of shape (num_samples_val, num_timesteps, num_channels)
        The input dataset for validation
    y_val : numpy array of shape (num_samples_val, num_classes)
        The output classes for the validation data, in binary format
    models : list of (model, params, modeltype) tuples
        List of keras models to train
    nr_epochs : int, optional
        nr of epochs to use for training one model
    subset_size : int, optional
        The number of samples used from the complete train set
    verbose : bool, optional
        flag for displaying verbose output
    outputfile : str, optional
        Filename to store the model training results
    model_path : str, optional
        Directory to store the models as HDF5 files
    early_stopping : bool, optional
        Stop when validation loss does not decrease
    batch_size : int, optional
        nr of samples per batch
    metric : str, optional
        metric to store in the history object
    use_noodles : callable, optional
        When None (default) the models are trained one after another in
        this process. Otherwise the trainings are scheduled as a noodles
        workflow and `use_noodles` is called with that workflow; it must
        return the trained models in the order of `models`.

    Returns
    -------
    histories : list of dict
        train histories for all models (the ``history`` of each trained
        model)
    val_metrics : list of floats
        validation metric of each model after its last trained epoch
    val_losses : list of floats
        validation loss of each model after its last trained epoch

    Raises
    ------
    ValueError
        If a model was not compiled with the requested metric.
    """
    # slicing past the end is harmless: if subset_size exceeds the number
    # of samples, the complete train set is used
    X_train_sub = X_train[:subset_size, :, :]
    y_train_sub = y_train[:subset_size, :]

    metric_name = get_metric_name(metric)

    def store_results(index, history, saveable):
        # Persist one training result. The metric name is passed along
        # explicitly; otherwise the history would always be looked up
        # under the default 'acc' key and fail for any other metric.
        if outputfile is not None:
            store_train_hist_as_json(models[index][1], models[index][2],
                                     history, outputfile,
                                     metric_name=metric_name)
        if model_path is not None:
            saveable.save(
                os.path.join(model_path, 'model_{}.h5'.format(index)))

    def make_history(model, i=None):
        model_metrics = [get_metric_name(name) for name in model.metrics]
        if metric_name not in model_metrics:
            raise ValueError(
                'Invalid metric. The model was not compiled with {} as metric'
                .format(metric_name))
        if early_stopping:
            callbacks = [
                EarlyStopping(monitor='val_loss', patience=0,
                              verbose=verbose, mode='auto')]
        else:
            callbacks = []

        args = (model, X_train_sub, y_train_sub)
        kwargs = {'epochs': nr_epochs,
                  'batch_size': batch_size,
                  'validation_data': (X_val, y_val),
                  'verbose': verbose,
                  'callbacks': callbacks}

        if use_noodles is None:
            # if not using noodles, save every nugget when it comes
            trained_model = train_model(*args, **kwargs)
            # NOTE(review): relies on TrainedModel providing save();
            # confirm against the storage module.
            store_results(i, trained_model.history, trained_model)
            return trained_model

        assert has_noodles, \
            "Noodles is not installed, or could not be imported."
        scheduled_train = noodles.schedule_hint(
            call_by_ref=['model'])(train_model)
        return scheduled_train(*args, **kwargs)

    if use_noodles is None:
        trained_models = [
            make_history(model[0], i)
            for i, model in enumerate(models)]
    else:
        assert has_noodles, \
            "Noodles is not installed, or could not be imported."

        # in case of noodles, first run everything
        training_wf = noodles.gather_all(
            [make_history(model[0]) for model in models])
        trained_models = use_noodles(training_wf)

        # then save everything
        for i, (history, model) in enumerate(trained_models):
            store_results(i, history, model)

    # accumulate results: each history entry is a per-epoch list, so take
    # the value after the last epoch to get one float per model
    histories = [tm.history for tm in trained_models]
    val_metrics = [hist['val_' + metric_name][-1] for hist in histories]
    val_losses = [hist['val_loss'][-1] for hist in histories]
    return histories, val_metrics, val_losses
169
170
171 1
def store_train_hist_as_json(params, model_type, history, outputfile,
                             metric_name='acc'):
    """
    This function stores the model parameters, the loss and metric history
    of one model in a JSON file. It appends the model information to the
    existing models in the file.

    Parameters
    ----------
    params : dict
        parameters for one model; numpy arrays and numpy scalars among the
        values are converted to plain Python values when written
    model_type : str
        type of the model, stored under the key 'modeltype'
    history : dict
        training history from one model; must contain the keys
        `metric_name`, 'loss', 'val_' + `metric_name` and 'val_loss'
    outputfile : str
        path where the json file needs to be stored
    metric_name : str, optional
        name of metric from history to store

    Raises
    ------
    KeyError
        If `history` lacks one of the required keys.
    TypeError
        If a value cannot be represented in JSON.
    """
    # shallow copy so that the caller's dict is left untouched
    jsondata = params.copy()
    jsondata['train_metric'] = history[metric_name]
    jsondata['train_loss'] = history['loss']
    jsondata['val_metric'] = history['val_' + metric_name]
    jsondata['val_loss'] = history['val_loss']
    jsondata['modeltype'] = model_type
    jsondata['metric'] = metric_name

    if os.path.isfile(outputfile):
        with open(outputfile, 'r') as infile:
            previousdata = json.load(infile)
    else:
        previousdata = []
    previousdata.append(jsondata)

    # Serialize before opening the file for writing: if a value turns out
    # not to be serializable, the results stored so far stay intact.
    serialized = json.dumps(previousdata, sort_keys=True, indent=4,
                            ensure_ascii=False, default=_json_default)
    with open(outputfile, 'w') as outfile:
        outfile.write(serialized)


def _json_default(value):
    """Turn numpy arrays and scalars into values the json module accepts."""
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError('{!r} is not JSON serializable'.format(value))
209
210
211 1
def find_best_architecture(X_train, y_train, X_val, y_val, verbose=True,
                           number_of_models=5, nr_epochs=5, subset_size=100,
                           outputpath=None, model_path=None, metric='accuracy',
                           use_noodles=None, **kwargs):
    """
    Generate a number of candidate models, train each of them on a
    subsample of the data and report the architecture and hyperparameters
    that score best on the validation set. A kNN classifier fitted on the
    same subsample serves as a baseline.

    Parameters
    ----------
    X_train : numpy array
        The input dataset for training of shape
        (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        The output classes for the train data, in binary format of shape
        (num_samples, num_classes)
    X_val : numpy array
        The input dataset for validation of shape
        (num_samples_val, num_timesteps, num_channels)
    y_val : numpy array
        The output classes for the validation data, in binary format of shape
        (num_samples_val, num_classes)
    verbose : bool, optional
        flag for displaying verbose output
    number_of_models : int, optional
        The number of models to generate and test
    nr_epochs : int, optional
        The number of epochs that each model is trained
    subset_size : int, optional
        The size of the subset of the data that is used for finding
        the optimal architecture
    outputpath : str, optional
        File location to store the model results
    model_path : str, optional
        Directory to save the models as HDF5 files
    metric : str, optional
        metric that is used to evaluate the model on the validation set.
        See https://keras.io/metrics/ for possible metrics
    use_noodles : optional
        Passed on unchanged to train_models_on_samples
    **kwargs : key-value parameters
        parameters for generating the models
        (see docstring for modelgen.generate_models)

    Returns
    -------
    best_model : Keras model
        Best performing model, already trained on a small sample data set.
    best_params : dict
        Dictionary containing the hyperparameters for the best model
    best_model_type : str
        Type of the best model
    knn_acc : float
        accuracy for kNN prediction on validation set
    """
    models = modelgen.generate_models(
        X_train.shape, y_train.shape[1],
        number_of_models=number_of_models,
        metrics=[metric],
        **kwargs)

    training_output = train_models_on_samples(
        X_train, y_train, X_val, y_val, models,
        nr_epochs=nr_epochs,
        subset_size=subset_size,
        verbose=verbose,
        outputfile=outputpath,
        model_path=model_path,
        metric=metric,
        use_noodles=use_noodles)
    _histories, val_accuracies, _val_losses = training_output

    # the candidate with the highest validation score wins
    best_model_index = np.argmax(val_accuracies)
    best_model, best_params, best_model_type = models[best_model_index]

    # baseline: nearest-neighbour classifier on the same training subsample
    X_subsample = X_train[:subset_size, :, :]
    y_subsample = y_train[:subset_size, :]
    knn_acc = kNN_accuracy(X_subsample, y_subsample, X_val, y_val)

    best_score = val_accuracies[best_model_index]
    if verbose:
        print('Best model: model ', best_model_index)
        print('Model type: ', best_model_type)
        print('Hyperparameters: ', best_params)
        print(str(metric) + ' on validation set: ', best_score)
        print('Accuracy of kNN on validation set', knn_acc)

    if best_score < knn_acc:
        message = ('Best model not better than kNN: ' +
                   str(best_score) + ' vs  ' + str(knn_acc))
        warnings.warn(message)
    return best_model, best_params, best_model_type, knn_acc
298
299
300 1
def get_metric_name(name):
    """
    Gives the keras name for a metric

    Parameters
    ----------
    name : str
        original name of the metric

    Returns
    -------
    str
        'acc' for both 'acc' and 'accuracy'. Otherwise the ``__name__`` of
        whatever ``keras.metrics.get`` resolves `name` to. If that lookup
        fails, `name` is returned unchanged.
    """
    if name in ('acc', 'accuracy'):
        return 'acc'
    try:
        return metrics.get(name).__name__
    except Exception:
        # Best effort: metrics keras cannot resolve keep their given name.
        # Unlike a bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return name
320
321
322 1
def kNN_accuracy(X_train, y_train, X_val, y_val, k=1):
    """
    Fits a k-Nearest Neighbors classifier on the train set and returns its
    accuracy score on the validation set. Each sample is flattened to a
    single feature vector of length num_timesteps * num_channels.

    Parameters
    ----------
    X_train : numpy array
        Train set of shape (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        Class labels for train set
    X_val : numpy array
        Validation set of shape (num_samples, num_timesteps, num_channels)
    y_val : numpy array
        Class labels for validation set
    k : int
        number of neighbors to use for classifying

    Returns
    -------
    accuracy: float
        accuracy score on the validation set
    """
    n_train, train_steps, train_channels = X_train.shape
    classifier = neighbors.KNeighborsClassifier(k)
    flat_train = X_train.reshape(n_train, train_steps * train_channels)
    classifier.fit(flat_train, y_train)

    n_val, val_steps, val_channels = X_val.shape
    flat_val = X_val.reshape(n_val, val_steps * val_channels)
    predictions = classifier.predict(flat_val)
    return sklearnmetrics.accuracy_score(predictions, y_val)
357