Completed
Push — master ( d10d5f...edd44f )
by Dafne van
03:09
created

kNN_accuracy()   B

Complexity

Conditions 1

Size

Total Lines 35

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
cc 1
c 0
b 0
f 0
dl 0
loc 35
ccs 0
cts 7
cp 0
crap 2
rs 8.8571
1
"""
2
 Summary:
3
 This module provides the main functionality of mcfly: searching for an
4
 optimal model architecture. The work flow is as follows:
5
 Function generate_models from modelgen.py generates and compiles models.
6
 Function train_models_on_samples trains those models.
7
 Function plotTrainingProcess plots the training process.
8
 Function find_best_architecture is a wrapper function that combines
9
 these steps.
10
 Example function calls can be found in the tutorial notebook
11
 'EvaluateDifferentModels.ipynb'.
12
"""
13
import numpy as np
14
from matplotlib import pyplot as plt
15
from . import modelgen
16
from sklearn import neighbors, metrics
17
import warnings
18
import json
19
import os
20
from keras.callbacks import EarlyStopping
21
22
def train_models_on_samples(X_train, y_train, X_val, y_val, models,
                            nr_epochs=5, subset_size=100, verbose=True,
                            outputfile=None, early_stopping=False,
                            batch_size=20):
    """
    Given a list of compiled models, this function trains
    them all on a subset of the train data. If the given size of the subset
    is larger than the size of the data, the complete data set is used.

    Parameters
    ----------
    X_train : numpy array of shape (num_samples, num_timesteps, num_channels)
        The input dataset for training
    y_train : numpy array of shape (num_samples, num_classes)
        The output classes for the train data, in binary format
    X_val : numpy array of shape (num_samples_val, num_timesteps, num_channels)
        The input dataset for validation
    y_val : numpy array of shape (num_samples_val, num_classes)
        The output classes for the validation data, in binary format
    models : list of (model, params, modeltype) tuples
        List of keras models to train, each with its hyperparameters and
        model type
    nr_epochs : int, optional
        nr of epochs to use for training one model
    subset_size : int, optional
        The number of samples used from the complete train set
    verbose : bool, optional
        flag for displaying verbose output
    outputfile : str, optional
        File location to store the model results; nothing is stored when
        None
    early_stopping : bool, optional
        Stop when validation loss does not decrease
    batch_size : int, optional
        nr of samples per batch, passed on to the fit method of each model

    Returns
    ----------
    histories : list of Keras History objects
        train histories for all models
    val_accuracies : list of floats
        validation accuracies of the models (value of the last epoch)
    val_losses : list of floats
        validation losses of the models (value of the last epoch)
    """
    # Slicing beyond the end of an array is safe: when subset_size exceeds
    # the number of samples, the complete train set is selected.
    X_train_sub = X_train[:subset_size, :, :]
    y_train_sub = y_train[:subset_size, :]

    histories = []
    val_accuracies = []
    val_losses = []
    for i, (model, params, model_types) in enumerate(models):
        if verbose:
            print('Training model %d' % i, model_types)
        if early_stopping:
            callbacks = [EarlyStopping(monitor='val_loss', patience=0,
                                       verbose=verbose, mode='auto')]
        else:
            callbacks = []
        history = model.fit(X_train_sub, y_train_sub,
                            nb_epoch=nr_epochs, batch_size=batch_size,
                            # only the train data is subsampled; validation
                            # always uses the complete validation set
                            validation_data=(X_val, y_val),
                            verbose=verbose,
                            callbacks=callbacks)
        histories.append(history)
        # keep the metrics reached after the final epoch
        val_accuracies.append(history.history['val_acc'][-1])
        val_losses.append(history.history['val_loss'][-1])
        if outputfile is not None:
            store_train_hist_as_json(params, model_types,
                                     history.history, outputfile)
    return histories, val_accuracies, val_losses
89
90
91
def store_train_hist_as_json(params, model_type, history, outputfile):
    """
    This function stores the model parameters, the loss and accuracy history
    of one model in a JSON file. It appends the model information to the
    existing models in the file.

    Parameters
    ----------
    params : dict
        parameters for one model; numpy arrays and numpy scalars among the
        values are converted to plain Python types before writing
    model_type : str
        type of the model; it is written to the file as-is, so it has to be
        JSON serializable (presumably a string -- confirm against modelgen)
    history : dict
        training history from one model, with the keys 'acc', 'loss',
        'val_acc' and 'val_loss'
    outputfile : str
        path where the json file needs to be stored
    """
    def to_builtin(value):
        # The json module cannot serialize numpy arrays or numpy scalars.
        if isinstance(value, np.ndarray):
            return value.tolist()
        if isinstance(value, np.generic):
            return value.item()
        return value

    # Build a new dict so that the caller's params are left untouched.
    jsondata = {key: to_builtin(value) for key, value in params.items()}
    for json_key, history_key in (('train_acc', 'acc'),
                                  ('train_loss', 'loss'),
                                  ('val_acc', 'val_acc'),
                                  ('val_loss', 'val_loss')):
        values = to_builtin(history[history_key])
        if isinstance(values, (list, tuple)):
            # per-epoch values may themselves be numpy scalars
            values = [to_builtin(value) for value in values]
        jsondata[json_key] = values
    jsondata['modeltype'] = model_type

    # Read what is already stored, so that this model is appended to it.
    if os.path.isfile(outputfile):
        with open(outputfile, 'r') as outfile:
            previousdata = json.load(outfile)
    else:
        previousdata = []
    previousdata.append(jsondata)
    with open(outputfile, 'w') as outfile:
        json.dump(previousdata, outfile, sort_keys=True,
                  indent=4, ensure_ascii=False)
127
128
129
def plotTrainingProcess(history, name='Model', ax=None):
    """
    This function plots the loss and accuracy on the train and validation set,
    for each epoch in the history of one model.

    Loss is drawn in green against the left y-axis, accuracy in blue against
    a second y-axis on the right. Dashed lines show the validation set,
    solid lines the train set.

    Parameters
    ----------
    history : keras History object
        The history object of the training process corresponding to one model
    name : str
        Name of the model, to display in the title
    ax : Axis, optional
        Specific axis to plot on; a new figure is created when omitted

    """
    if ax is None:
        _, ax = plt.subplots()
    # second y-axis for the accuracy, sharing the epoch axis with the loss
    ax2 = ax.twinx()
    epochs = range(len(history.history['val_loss']))
    val_loss, = ax.plot(epochs, history.history['val_loss'], 'g--',
                        label='validation loss')
    train_loss, = ax.plot(epochs, history.history['loss'], 'g-',
                          label='train loss')
    val_acc, = ax2.plot(epochs, history.history['val_acc'], 'b--',
                        label='validation accuracy')
    train_acc, = ax2.plot(epochs, history.history['acc'], 'b-',
                          label='train accuracy')
    ax.set_xlabel('epoch')
    ax.set_ylabel('loss', color='g')
    ax2.set_ylabel('accuracy', color='b')
    # Use the axes' own methods rather than pyplot's implicit "current axes",
    # so that legend and title end up on the plot drawn here, also when `ax`
    # belongs to a figure that is not the current one.
    ax2.legend(handles=[val_loss, train_loss, val_acc, train_acc],
               loc=2, bbox_to_anchor=(1.1, 1))
    ax2.set_title(name)
162
163
164
def find_best_architecture(X_train, y_train, X_val, y_val, verbose=True,
                           number_of_models=5, nr_epochs=5, subset_size=100,
                           outputpath=None, **kwargs
                           ):
    """
    Generates a number of models, trains each of them on a subsample of the
    data, and returns the one with the highest validation accuracy.
    A kNN classifier fitted on the same subsample serves as a benchmark;
    a warning is issued if the best model scores below it.

    Parameters
    ----------
    X_train : numpy array
        The input dataset for training of shape
        (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        The output classes for the train data, in binary format of shape
        (num_samples, num_classes)
    X_val : numpy array
        The input dataset for validation of shape
        (num_samples_val, num_timesteps, num_channels)
    y_val : numpy array
        The output classes for the validation data, in binary format of shape
        (num_samples_val, num_classes)
    verbose : bool, optional
        flag for displaying verbose output; when set, the training process
        of every model is plotted and a summary is printed
    number_of_models : int, optional
        The number of models to generate and test
    nr_epochs : int, optional
        The number of epochs that each model is trained
    subset_size : int, optional
        The size of the subset of the data that is used for finding
        the optimal architecture
    outputpath : str, optional
        File location to store the model results
    **kwargs: key-value parameters
        parameters for generating the models
        (see docstring for modelgen.generate_models)

    Returns
    ----------
    best_model : Keras model
        Best performing model, already trained on a small sample data set.
    best_params : dict
        Dictionary containing the hyperparameters for the best model
    best_model_type : str
        Type of the best model
    knn_acc : float
        accuracy for kNN prediction on validation set
    """
    models = modelgen.generate_models(
        X_train.shape, y_train.shape[1],
        number_of_models=number_of_models, **kwargs)
    histories, val_accuracies, val_losses = train_models_on_samples(
        X_train, y_train, X_val, y_val, models, nr_epochs,
        subset_size=subset_size, verbose=verbose, outputfile=outputpath)

    # the winner is the model with the highest validation accuracy
    best_model_index = np.argmax(val_accuracies)
    best_model, best_params, best_model_type = models[best_model_index]
    best_accuracy = val_accuracies[best_model_index]

    # benchmark: kNN on the same train subset the models were trained on
    knn_acc = kNN_accuracy(
        X_train[:subset_size, :, :], y_train[:subset_size, :], X_val, y_val)

    if verbose:
        # one plot per model for now; ultimately all models could share one
        # plot to allow for direct comparison
        for model_history, model_entry in zip(histories, models):
            plotTrainingProcess(model_history, str(model_entry[1]))
        print('Best model: model ', best_model_index)
        print('Model type: ', best_model_type)
        print('Hyperparameters: ', best_params)
        print('Accuracy on validation set: ', best_accuracy)
        print('Accuracy of kNN on validation set', knn_acc)

    if best_accuracy < knn_acc:
        warnings.warn('Best model not better than kNN: %s vs  %s'
                      % (str(best_accuracy), str(knn_acc)))
    return best_model, best_params, best_model_type, knn_acc
245
246
247
def kNN_accuracy(X_train, y_train, X_val, y_val, k=1):
    """
    Performs k-Nearest Neighbors and returns the accuracy score.

    Each sample is flattened into a single feature vector of length
    num_timesteps * num_channels before the classifier is fitted on the
    train set and applied to the validation set.

    Parameters
    ----------
    X_train : numpy array
        Train set of shape (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        Class labels for train set
    X_val : numpy array
        Validation set of shape (num_samples, num_timesteps, num_channels)
    y_val : numpy array
        Class labels for validation set
    k : int
        number of neighbors to use for classifying

    Returns
    -------
    accuracy: float
        accuracy score on the validation set
    """
    # fit on the train set, one flat feature vector per sample
    n_train, train_steps, train_channels = X_train.shape
    classifier = neighbors.KNeighborsClassifier(k)
    flat_train = X_train.reshape(n_train, train_steps * train_channels)
    classifier.fit(flat_train, y_train)

    # predict the validation set, flattened in the same way
    n_val, val_steps, val_channels = X_val.shape
    flat_val = X_val.reshape(n_val, val_steps * val_channels)
    predictions = classifier.predict(flat_val)
    return metrics.accuracy_score(predictions, y_val)
282