Completed
Push to master (f4f7b5...bb3685) by Christiaan, created 06:26

store_train_hist_as_json()   B

Complexity
    Conditions: 6

Size
    Total Lines: 36

Duplication
    Lines: 0
    Ratio: 0 %

Code Coverage
    Tests: 16
    CRAP Score: 6.0493

Importance
    Changes: 0

Metric   Value
cc       6
dl       0
loc      36
ccs      16
cts      18
cp       0.8889
crap     6.0493
rs       7.5384
c        0
b        0
f        0
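For reference, the crap value above is consistent with the standard CRAP formula, CRAP = cc^2 * (1 - cp)^3 + cc, where cc is the cyclomatic complexity and cp the coverage ratio (ccs / cts = 16 / 18). A quick check in Python, assuming this is the formula the tool applies:

    cc = 6              # cyclomatic complexity ("Conditions" above)
    cp = 16.0 / 18.0    # covered / total statements (ccs / cts)
    crap = cc ** 2 * (1 - cp) ** 3 + cc
    print(round(crap, 4))  # ~6.0494, i.e. the reported 6.0493 up to rounding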
"""
 Summary:
 This module provides the main functionality of mcfly: searching for an
 optimal model architecture. The workflow is as follows:
 Function generate_models from modelgen.py generates and compiles models.
 Function train_models_on_samples trains those models.
 Function plotTrainingProcess plots the training process.
 Function find_best_architecture is a wrapper function that combines
 these steps.
 Example function calls can be found in the tutorial notebook
 'EvaluateDifferentModels.ipynb'.
"""
import numpy as np
from matplotlib import pyplot as plt
from . import modelgen
from sklearn import neighbors, metrics
import warnings
import json
import os


def train_models_on_samples(X_train, y_train, X_val, y_val, models,
                            nr_epochs=5, subset_size=100, verbose=True,
                            outputfile=None):
    """
    Given a list of compiled models, this function trains
    them all on a subset of the train data. If the given subset size is
    larger than the size of the data, the complete data set is used.

    Parameters
    ----------
    X_train : numpy array of shape (num_samples, num_timesteps, num_channels)
        The input dataset for training
    y_train : numpy array of shape (num_samples, num_classes)
        The output classes for the train data, in binary format
    X_val : numpy array of shape (num_samples_val, num_timesteps, num_channels)
        The input dataset for validation
    y_val : numpy array of shape (num_samples_val, num_classes)
        The output classes for the validation data, in binary format
    models : list of (model, params, modeltype) tuples
        List of keras models to train
    nr_epochs : int, optional
        Number of epochs to use for training one model
    subset_size : int, optional
        The number of samples used from the complete train set
    verbose : bool, optional
        Flag for displaying verbose output
    outputfile : str, optional
        File location to store the model results

    Returns
    ----------
    histories : list of Keras History objects
        train histories for all models
    val_accuracies : list of floats
        validation accuracies of the models
    val_losses : list of floats
        validation losses of the models
    """
    # Slicing copes with subset_size larger than X_train: the full set is used
    X_train_sub = X_train[:subset_size, :, :]
    y_train_sub = y_train[:subset_size, :]

    histories = []
    val_accuracies = []
    val_losses = []
    for i, (model, params, model_types) in enumerate(models):
        if verbose:
            print('Training model %d' % i, model_types)
        history = model.fit(X_train_sub, y_train_sub,
                            nb_epoch=nr_epochs, batch_size=20,
                            # see comment on subset_size above
                            validation_data=(X_val, y_val),
                            verbose=verbose)
        histories.append(history)
        val_accuracies.append(history.history['val_acc'][-1])
        val_losses.append(history.history['val_loss'][-1])
        if outputfile is not None:
            store_train_hist_as_json(params, model_types,
                                     history.history, outputfile)
    return histories, val_accuracies, val_losses


def store_train_hist_as_json(params, model_type, history, outputfile):
    """
    This function stores the model parameters, the loss and accuracy history
    of one model in a JSON file. It appends the model information to the
    existing models in the file.

    Parameters
    ----------
    params : dict
        parameters for one model
    model_type : str
        Type of the model
    history : dict
        training history from one model
    outputfile : str
        path where the json file needs to be stored
    """
    jsondata = params.copy()
    for k in jsondata.keys():
        if isinstance(jsondata[k], np.ndarray):
            jsondata[k] = jsondata[k].tolist()
    jsondata['train_acc'] = history['acc']
    jsondata['train_loss'] = history['loss']
    jsondata['val_acc'] = history['val_acc']
    jsondata['val_loss'] = history['val_loss']
    jsondata['modeltype'] = model_type
    if os.path.isfile(outputfile):
        with open(outputfile, 'r') as outfile:
            previousdata = json.load(outfile)
    else:
        previousdata = []
    previousdata.append(jsondata)
    with open(outputfile, 'w') as outfile:
        json.dump(previousdata, outfile, sort_keys=True,
                  indent=4, ensure_ascii=False)
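# Hedged illustration of the file written above: 'outputfile' ends up holding
# a JSON list with one dict per stored model, roughly of the form
# (hypothetical parameter names and values):
#     [{"modeltype": "CNN", "learning_rate": 0.01,
#       "train_acc": [...], "train_loss": [...],
#       "val_acc": [...], "val_loss": [...]}]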


def plotTrainingProcess(history, name='Model', ax=None):
    """
    This function plots the loss and accuracy on the train and validation set,
    for each epoch in the history of one model.

    Parameters
    ----------
    history : keras History object
        The history object of the training process corresponding to one model
    name : str
        Name of the model, to display in the title
    ax : Axis, optional
        Specific axis to plot on

    """
    if ax is None:
        fig, ax = plt.subplots()
    ax2 = ax.twinx()
    LN = len(history.history['val_loss'])
    val_loss, = ax.plot(range(LN), history.history['val_loss'], 'g--',
                        label='validation loss')
    train_loss, = ax.plot(range(LN), history.history['loss'], 'g-',
                          label='train loss')
    val_acc, = ax2.plot(range(LN), history.history['val_acc'], 'b--',
                        label='validation accuracy')
    train_acc, = ax2.plot(range(LN), history.history['acc'], 'b-',
                          label='train accuracy')
    ax.set_xlabel('epoch')
    ax.set_ylabel('loss', color='g')
    ax2.set_ylabel('accuracy', color='b')
    plt.legend(handles=[val_loss, train_loss, val_acc, train_acc],
               loc=2, bbox_to_anchor=(1.1, 1))
    plt.title(name)


def find_best_architecture(X_train, y_train, X_val, y_val, verbose=True,
                           number_of_models=5, nr_epochs=5, subset_size=100,
                           outputpath=None, **kwargs):
    """
    Tries out a number of models on a subsample of the data,
    and outputs the best found architecture and hyperparameters.

    Parameters
    ----------
    X_train : numpy array
        The input dataset for training of shape
        (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        The output classes for the train data, in binary format of shape
        (num_samples, num_classes)
    X_val : numpy array
        The input dataset for validation of shape
        (num_samples_val, num_timesteps, num_channels)
    y_val : numpy array
        The output classes for the validation data, in binary format of shape
        (num_samples_val, num_classes)
    verbose : bool, optional
        flag for displaying verbose output
    number_of_models : int, optional
        The number of models to generate and test
    nr_epochs : int, optional
        The number of epochs that each model is trained
    subset_size : int, optional
        The size of the subset of the data that is used for finding
        the optimal architecture
    outputpath : str, optional
        File location to store the model results
    **kwargs: key-value parameters
        parameters for generating the models
        (see docstring for modelgen.generate_models)

    Returns
    ----------
    best_model : Keras model
        Best performing model, already trained on a small sample data set.
    best_params : dict
        Dictionary containing the hyperparameters for the best model
    best_model_type : str
        Type of the best model
    knn_acc : float
        accuracy of kNN prediction on validation set
    """
    models = modelgen.generate_models(X_train.shape, y_train.shape[1],
                                      number_of_models=number_of_models,
                                      **kwargs)
    histories, val_accuracies, val_losses = train_models_on_samples(
        X_train, y_train, X_val, y_val, models, nr_epochs,
        subset_size=subset_size, verbose=verbose, outputfile=outputpath)
    best_model_index = np.argmax(val_accuracies)
    best_model, best_params, best_model_type = models[best_model_index]
    knn_acc = kNN_accuracy(
        X_train[:subset_size, :, :], y_train[:subset_size, :], X_val, y_val)
    if verbose:
        for i in range(len(models)):  # now one plot per model, ultimately we
            # may want all models in one plot to allow for direct comparison
            name = str(models[i][1])
            plotTrainingProcess(histories[i], name)
        print('Best model: model ', best_model_index)
        print('Model type: ', best_model_type)
        print('Hyperparameters: ', best_params)
        print('Accuracy on validation set: ', val_accuracies[best_model_index])
        print('Accuracy of kNN on validation set', knn_acc)

    if val_accuracies[best_model_index] < knn_acc:
        warnings.warn('Best model not better than kNN: ' +
                      str(val_accuracies[best_model_index]) + ' vs ' +
                      str(knn_acc))
    return best_model, best_params, best_model_type, knn_acc


def kNN_accuracy(X_train, y_train, X_val, y_val, k=1):
    """
    Performs k-Nearest Neighbors and returns the accuracy score.

    Parameters
    ----------
    X_train : numpy array
        Train set of shape (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        Class labels for train set
    X_val : numpy array
        Validation set of shape (num_samples, num_timesteps, num_channels)
    y_val : numpy array
        Class labels for validation set
    k : int
        number of neighbors to use for classifying

    Returns
    -------
    accuracy: float
        accuracy score on the validation set
    """
    num_samples, num_timesteps, num_channels = X_train.shape
    clf = neighbors.KNeighborsClassifier(k)
    clf.fit(X_train.reshape(num_samples, num_timesteps * num_channels),
            y_train)
    num_samples, num_timesteps, num_channels = X_val.shape
    val_predict = clf.predict(
        X_val.reshape(num_samples, num_timesteps * num_channels))
    return metrics.accuracy_score(val_predict, y_val)
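# Shape check for the flattening used above (hypothetical sizes): an array of
# shape (100, 512, 3) is reshaped to (100, 512 * 3) = (100, 1536) before it is
# handed to the scikit-learn classifier, which expects 2-D input:
#     X = np.zeros((100, 512, 3))
#     X.reshape(100, 512 * 3).shape   # -> (100, 1536)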