Completed
Pull Request — master (#168)
by
unknown
03:32
created

make_history()   B

Complexity

Conditions 6

Size

Total Lines 23

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 42

Importance

Changes 0
Metric Value
cc 6
c 0
b 0
f 0
dl 0
loc 23
ccs 0
cts 13
cp 0
crap 42
rs 7.6949
1
"""
2
 Summary:
3
 This module provides the main functionality of mcfly: searching for an
4
 optimal model architecture. The work flow is as follows:
5
 Function generate_models from modelgen.py generates and compiles models.
6
 Function train_models_on_samples trains those models.
7
 Function plotTrainingProcess plots the training process.
8
 Function find_best_architecture is a wrapper function that combines
9
 these steps.
10
 Example function calls can be found in the tutorial notebook
11
 'EvaluateDifferentModels.ipynb'.
12
"""
13 1
import numpy as np
14 1
from . import modelgen
15 1
from .storage import TrainedModel
16
17 1
try:
18 1
    import noodles
19
    from .storage import serial_registry
20 1
except ImportError:
21 1
    has_noodles = False
22
else:
23
    has_noodles = True
24
25 1
from sklearn import neighbors, metrics as sklearnmetrics
26
import warnings
27
import json
28
import os
29
from keras.callbacks import EarlyStopping
30
from keras import metrics
31
32
33
def train_model(model, X_train_sub, y_train_sub, epochs, batch_size,
                validation_data, verbose, callbacks):
    """
    Fit a single compiled model and wrap the outcome in a TrainedModel.

    Parameters
    ----------
    model : object with a Keras-style ``fit`` method
        The compiled model to train
    X_train_sub : array
        Training inputs handed to ``model.fit``
    y_train_sub : array
        Training targets handed to ``model.fit``
    epochs : int
        Forwarded to ``model.fit`` as ``epochs``
    batch_size : int
        Forwarded to ``model.fit`` as ``batch_size``
    validation_data : tuple
        Forwarded to ``model.fit`` as ``validation_data``
    verbose : bool
        Forwarded to ``model.fit`` as ``verbose``
    callbacks : list
        Forwarded to ``model.fit`` as ``callbacks``

    Returns
    -------
    TrainedModel
        Constructed with ``history`` set to the ``history`` attribute of
        the value returned by ``model.fit`` and ``model`` set to the model
        that was passed in.
    """
    fit_options = dict(
        epochs=epochs,
        batch_size=batch_size,
        validation_data=validation_data,
        verbose=verbose,
        callbacks=callbacks,
    )
    fit_result = model.fit(X_train_sub, y_train_sub, **fit_options)
    return TrainedModel(history=fit_result.history, model=model)
51
52
53
def train_models_on_samples(X_train, y_train, X_val, y_val, models,
                            nr_epochs=5, subset_size=100, verbose=True,
                            outputfile=None, model_path=None,
                            early_stopping=False, batch_size=20,
                            metric='accuracy', use_noodles=None):
    """
    Given a list of compiled models, this function trains
    them all on a subset of the train data. If the given size of the subset
    is larger than the size of the data, the complete data set is used.

    Parameters
    ----------
    X_train : numpy array of shape (num_samples, num_timesteps, num_channels)
        The input dataset for training
    y_train : numpy array of shape (num_samples, num_classes)
        The output classes for the train data, in binary format
    X_val : numpy array of shape (num_samples_val, num_timesteps, num_channels)
        The input dataset for validation
    y_val : numpy array of shape (num_samples_val, num_classes)
        The output classes for the validation data, in binary format
    models : list of (model, params, modeltype) tuples
        List of keras models to train
    nr_epochs : int, optional
        nr of epochs to use for training one model
    subset_size : int, optional
        The number of samples used from the complete train set
    verbose : bool, optional
        flag for displaying verbose output
    outputfile : str, optional
        Filename to store the model training results
    model_path : str, optional
        Directory to store the models as HDF5 files
    early_stopping : bool, optional
        Stop when validation loss does not decrease
    batch_size : int, optional
        nr of samples per batch
    metric : str, optional
        metric to store in the history object
    use_noodles : callable, optional
        When not None, every training call is wrapped with
        ``noodles.schedule_hint``, the calls are combined with
        ``noodles.gather_all`` and the resulting workflow is passed to this
        callable; its return value is used as the list of trained models.
        Requires noodles to be importable.

    Returns
    ----------
    histories : list of dict
        train histories (the ``history`` of each trained model)
    val_metrics : list of floats
        validation metric of each model after its last trained epoch
    val_losses : list of floats
        validation loss of each model after its last trained epoch

    Raises
    ------
    ValueError
        If a model was not compiled with the requested metric.
    """
    # Slicing past the end of an array is harmless, so a subset_size larger
    # than the data simply selects every sample.
    X_train_sub = X_train[:subset_size, :, :]
    y_train_sub = y_train[:subset_size, :]

    metric_name = get_metric_name(metric)
    val_metric_key = 'val_' + metric_name

    def make_history(model):
        model_metrics = [get_metric_name(name) for name in model.metrics]
        if metric_name not in model_metrics:
            raise ValueError(
                'Invalid metric. The model was not compiled with {} as metric'
                .format(metric_name))
        if early_stopping:
            callbacks = [
                EarlyStopping(monitor='val_loss', patience=0,
                              verbose=verbose, mode='auto')]
        else:
            callbacks = []

        args = (model, X_train_sub, y_train_sub)
        kwargs = {'epochs': nr_epochs,
                  'batch_size': batch_size,
                  'validation_data': (X_val, y_val),
                  'verbose': verbose,
                  'callbacks': callbacks}

        if use_noodles is None:
            return train_model(*args, **kwargs)
        scheduled = noodles.schedule_hint(call_by_ref=['model'])(train_model)
        return scheduled(*args, **kwargs)

    if use_noodles is None:
        trained_models = [make_history(model[0]) for model in models]
    else:
        # Checked once here, before any training call is scheduled.
        assert has_noodles, \
            "Noodles is not installed, or could not be imported."
        training_wf = noodles.gather_all(
            [make_history(model[0]) for model in models])
        trained_models = use_noodles(training_wf)

    # Each history entry is a per-epoch series; report the value reached
    # after the last trained epoch so callers get one float per model.
    val_metrics = [tm.history[val_metric_key][-1] for tm in trained_models]
    val_losses = [tm.history['val_loss'][-1] for tm in trained_models]

    for i, (history, model) in enumerate(trained_models):
        if outputfile is not None:
            # Pass the metric explicitly; the default of the storing
            # function only fits models trained with accuracy.
            store_train_hist_as_json(models[i][1], models[i][2],
                                     history, outputfile,
                                     metric_name=metric_name)
        if model_path is not None:
            model.save(os.path.join(model_path, 'model_{}.h5'.format(i)))

    return [tm.history for tm in trained_models], val_metrics, val_losses
154
155
156
def store_train_hist_as_json(params, model_type, history, outputfile,
                             metric_name='acc'):
    """
    This function stores the model parameters, the loss and metric history
    of one model in a JSON file. It appends the model information to the
    existing models in the file.

    Numpy arrays and numpy scalars in ``params`` and ``history`` are
    converted to plain Python values first, because the json module cannot
    serialize them.

    Parameters
    ----------
    params : dict
        parameters for one model
    model_type : str
        type of the model, stored under the key 'modeltype'
    history : dict
        training history from one model; must contain the keys
        ``metric_name``, ``'val_' + metric_name``, 'loss' and 'val_loss'
    outputfile : str
        path where the json file needs to be stored
    metric_name : str, optional
        name of metric from history to store

    Raises
    ------
    KeyError
        If ``history`` lacks one of the required keys.
    TypeError
        If a value still cannot be serialized to JSON; in that case the
        existing file is left untouched.
    """
    jsondata = {key: _to_json_compatible(value)
                for key, value in params.items()}
    jsondata['train_metric'] = _to_json_compatible(history[metric_name])
    jsondata['train_loss'] = _to_json_compatible(history['loss'])
    jsondata['val_metric'] = _to_json_compatible(
        history['val_' + metric_name])
    jsondata['val_loss'] = _to_json_compatible(history['val_loss'])
    jsondata['modeltype'] = model_type
    jsondata['metric'] = metric_name

    if os.path.isfile(outputfile):
        with open(outputfile, 'r') as infile:
            previousdata = json.load(infile)
    else:
        previousdata = []
    previousdata.append(jsondata)

    # Serialize before opening the file for writing: opening with 'w'
    # truncates it, so a failure during serialization would otherwise
    # destroy the results that were stored earlier.
    serialized = json.dumps(previousdata, sort_keys=True,
                            indent=4, ensure_ascii=False)
    with open(outputfile, 'w') as outfile:
        outfile.write(serialized)


def _to_json_compatible(value):
    """Recursively replace numpy arrays and scalars by plain Python values."""
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, dict):
        return {key: _to_json_compatible(item)
                for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_to_json_compatible(item) for item in value]
    return value
194
195
196
def find_best_architecture(X_train, y_train, X_val, y_val, verbose=True,
                           number_of_models=5, nr_epochs=5, subset_size=100,
                           outputpath=None, model_path=None, metric='accuracy',
                           use_noodles=None, **kwargs):
    """
    Tries out a number of models on a subsample of the data,
    and outputs the best found architecture and hyperparameters.

    Parameters
    ----------
    X_train : numpy array
        The input dataset for training of shape
        (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        The output classes for the train data, in binary format of shape
        (num_samples, num_classes)
    X_val : numpy array
        The input dataset for validation of shape
        (num_samples_val, num_timesteps, num_channels)
    y_val : numpy array
        The output classes for the validation data, in binary format of shape
        (num_samples_val, num_classes)
    verbose : bool, optional
        flag for displaying verbose output
    number_of_models : int, optional
        The number of models to generate and test
    nr_epochs : int, optional
        The number of epochs that each model is trained
    subset_size : int, optional
        The size of the subset of the data that is used for finding
        the optimal architecture
    outputpath : str, optional
        File location to store the model results
    model_path: str, optional
        Directory to save the models as HDF5 files
    metric: str, optional
        metric that is used to evaluate the model on the validation set.
        See https://keras.io/metrics/ for possible metrics
    use_noodles : optional
        Passed unchanged to train_models_on_samples
    **kwargs: key-value parameters
        parameters for generating the models
        (see docstring for modelgen.generate_models)

    Returns
    ----------
    best_model : Keras model
        Best performing model, already trained on a small sample data set.
    best_params : dict
        Dictionary containing the hyperparameters for the best model
    best_model_type : str
        Type of the best model
    knn_acc : float
        accuracy for kNN prediction on validation set
    """
    models = modelgen.generate_models(X_train.shape, y_train.shape[1],
                                      number_of_models=number_of_models,
                                      metrics=[metric],
                                      **kwargs)
    _, val_accuracies, _ = train_models_on_samples(
        X_train, y_train, X_val, y_val, models, nr_epochs,
        subset_size=subset_size,
        verbose=verbose,
        outputfile=outputpath,
        model_path=model_path,
        metric=metric,
        use_noodles=use_noodles)

    # An entry may be a single number or a per-epoch series. Compare the
    # models on the value reached at the end of training; taking argmax
    # over nested series would yield an index into the flattened values
    # rather than a model index.
    final_scores = [np.ravel(score)[-1] for score in val_accuracies]
    best_model_index = np.argmax(final_scores)
    best_score = final_scores[best_model_index]
    best_model, best_params, best_model_type = models[best_model_index]

    knn_acc = kNN_accuracy(
        X_train[:subset_size, :, :], y_train[:subset_size, :], X_val, y_val)
    if verbose:
        print('Best model: model ', best_model_index)
        print('Model type: ', best_model_type)
        print('Hyperparameters: ', best_params)
        print(str(metric) + ' on validation set: ', best_score)
        print('Accuracy of kNN on validation set', knn_acc)

    if best_score < knn_acc:
        warnings.warn('Best model not better than kNN: ' +
                      str(best_score) + ' vs  ' +
                      str(knn_acc)
                      )
    return best_model, best_params, best_model_type, knn_acc
283
284
285
def get_metric_name(name):
    """
    Gives the keras name for a metric

    Parameters
    ----------
    name : str
        original name of the metric

    Returns
    -------
    str
        'acc' when ``name`` is 'acc' or 'accuracy'; otherwise the
        ``__name__`` of the object returned by ``metrics.get(name)``.
        If that lookup fails, ``name`` is returned unchanged.
    """
    if name in ('acc', 'accuracy'):
        return 'acc'
    try:
        return metrics.get(name).__name__
    except Exception:
        # Best effort: an unknown metric keeps its original name. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return name
305
306
307
def kNN_accuracy(X_train, y_train, X_val, y_val, k=1):
    """
    Fits a k-nearest-neighbours classifier and scores it on validation data.

    Every sample is flattened from (num_timesteps, num_channels) into one
    feature vector before it is given to the classifier.

    Parameters
    ----------
    X_train : numpy array
        Train set of shape (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        Class labels for train set
    X_val : numpy array
        Validation set of shape (num_samples, num_timesteps, num_channels)
    y_val : numpy array
        Class labels for validation set
    k : int
        number of neighbors to use for classifying

    Returns
    -------
    accuracy: float
        accuracy score on the validation set
    """
    n_train, train_steps, train_channels = X_train.shape
    classifier = neighbors.KNeighborsClassifier(k)
    flat_train = X_train.reshape(n_train, train_steps * train_channels)
    classifier.fit(flat_train, y_train)

    n_val, val_steps, val_channels = X_val.shape
    flat_val = X_val.reshape(n_val, val_steps * val_channels)
    predictions = classifier.predict(flat_val)
    return sklearnmetrics.accuracy_score(predictions, y_val)
342