Completed
Push — master ( 3e0ec6...69dab7 )
by Christiaan
04:45
created

get_metric_name()   A

Complexity

Conditions 4

Size

Total Lines 20

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 4.5923

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 4
c 1
b 0
f 0
dl 0
loc 20
ccs 6
cts 9
cp 0.6667
crap 4.5923
rs 9.2
1
"""
2
 Summary:
3
 This module provides the main functionality of mcfly: searching for an
4
 optimal model architecture. The work flow is as follows:
5
 Function generate_models from modelgen.py generates and compiles models.
6
 Function train_models_on_samples trains those models.
7
 Function plotTrainingProcess plots the training process.
8
 Function find_best_architecture is a wrapper function that combines
9
 these steps.
10
 Example function calls can be found in the tutorial notebook
11
 'EvaluateDifferentModels.ipynb'.
12
"""
13 1
import numpy as np
14 1
from . import modelgen
15 1
from sklearn import neighbors, metrics as sklearnmetrics
16 1
import warnings
17 1
import json
18 1
import os
19 1
from keras.callbacks import EarlyStopping
20 1
from keras import metrics
21
22
23 1
def train_models_on_samples(X_train, y_train, X_val, y_val, models,
                            nr_epochs=5, subset_size=100, verbose=True,
                            outputfile=None, model_path=None,
                            early_stopping=False,
                            batch_size=20, metric='accuracy'):
    """
    Given a list of compiled models, this function trains
    them all on a subset of the train data. If the given size of the subset
    is larger than the size of the data, the complete data set is used.

    Parameters
    ----------
    X_train : numpy array of shape (num_samples, num_timesteps, num_channels)
        The input dataset for training
    y_train : numpy array of shape (num_samples, num_classes)
        The output classes for the train data, in binary format
    X_val : numpy array of shape (num_samples_val, num_timesteps, num_channels)
        The input dataset for validation
    y_val : numpy array of shape (num_samples_val, num_classes)
        The output classes for the validation data, in binary format
    models : list of (model, params, modeltype) tuples
        List of compiled keras models to train, each with its
        hyperparameters and model type
    nr_epochs : int, optional
        nr of epochs to use for training one model
    subset_size : int, optional
        The number of samples used from the complete train set
    verbose : bool, optional
        flag for displaying verbose output
    outputfile : str, optional
        Filename to store the model training results
    model_path : str, optional
        Directory to store the models as HDF5 files
    early_stopping : bool, optional
        Stop training as soon as the validation loss does not decrease
        (the EarlyStopping callback is used with patience 0)
    batch_size : int, optional
        nr of samples per batch
    metric : str, optional
        metric to read from the history object

    Returns
    -------
    histories : list of Keras History objects
        train histories for all models
    val_metrics : list of floats
        value of the validation metric after the last epoch, per model
    val_losses : list of floats
        validation loss after the last epoch, per model

    Raises
    ------
    ValueError
        If a model was not compiled with the requested metric.
    """
    # Slicing past the end of an array is safe: when subset_size exceeds the
    # number of samples, the complete train set is used.
    X_train_sub = X_train[:subset_size, :, :]
    y_train_sub = y_train[:subset_size, :]

    metric_name = get_metric_name(metric)

    histories = []
    val_metrics = []
    val_losses = []
    for i, (model, params, model_types) in enumerate(models):
        if verbose:
            print('Training model %d' % i, model_types)
        # Fail early instead of hitting a KeyError on the history afterwards.
        model_metrics = [get_metric_name(name) for name in model.metrics]
        if metric_name not in model_metrics:
            raise ValueError(
                'Invalid metric. The model was not compiled with {} as '
                'metric'.format(metric_name))
        if early_stopping:
            callbacks = [
                EarlyStopping(monitor='val_loss', patience=0,
                              verbose=verbose, mode='auto')]
        else:
            callbacks = []
        history = model.fit(X_train_sub, y_train_sub,
                            epochs=nr_epochs, batch_size=batch_size,
                            validation_data=(X_val, y_val),
                            verbose=verbose,
                            callbacks=callbacks)
        histories.append(history)

        # Only the value after the final epoch is reported per model.
        val_metrics.append(history.history['val_' + metric_name][-1])
        val_losses.append(history.history['val_loss'][-1])
        if outputfile is not None:
            # Pass the metric name explicitly; the default ('acc') is not
            # present in the history when another metric was requested.
            store_train_hist_as_json(params, model_types,
                                     history.history, outputfile,
                                     metric_name=metric_name)
        if model_path is not None:
            model.save(os.path.join(model_path, 'model_{}.h5'.format(i)))

    return histories, val_metrics, val_losses
108
109
110 1
def _json_default(obj):
    """Convert numpy values that the json module cannot encode natively."""
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        return obj.item()
    raise TypeError(
        'Object of type {} is not JSON serializable'.format(
            type(obj).__name__))


def store_train_hist_as_json(params, model_type, history, outputfile,
                             metric_name='acc'):
    """
    This function stores the model parameters, the loss and metric history
    of one model in a JSON file. It appends the model information to the
    existing models in the file.

    Parameters
    ----------
    params : dict
        parameters for one model; numpy arrays and numpy scalars are
        converted to plain Python values when written
    model_type : str
        type of the model; it is written to the JSON file as-is, so it
        must be JSON serializable
    history : dict
        training history from one model; must contain the keys 'loss',
        'val_loss', `metric_name` and 'val_' + `metric_name`
    outputfile : str
        path where the json file needs to be stored
    metric_name : str, optional
        name of metric from history to store

    Raises
    ------
    KeyError
        If `history` lacks one of the required keys.
    TypeError
        If a value cannot be serialized to JSON.
    """
    # Shallow copy so the caller's params dict does not get the extra keys.
    jsondata = params.copy()
    jsondata['train_metric'] = history[metric_name]
    jsondata['train_loss'] = history['loss']
    jsondata['val_metric'] = history['val_' + metric_name]
    jsondata['val_loss'] = history['val_loss']
    jsondata['modeltype'] = model_type
    jsondata['metric'] = metric_name
    if os.path.isfile(outputfile):
        with open(outputfile, 'r') as outfile:
            previousdata = json.load(outfile)
    else:
        previousdata = []
    previousdata.append(jsondata)
    # Serialize before opening the file for writing: opening with 'w'
    # truncates it, so a serialization error would otherwise destroy the
    # previously stored results.
    serialized = json.dumps(previousdata, sort_keys=True,
                            indent=4, ensure_ascii=False,
                            default=_json_default)
    with open(outputfile, 'w') as outfile:
        outfile.write(serialized)
148
149
150 1
def find_best_architecture(X_train, y_train, X_val, y_val, verbose=True,
                           number_of_models=5, nr_epochs=5, subset_size=100,
                           outputpath=None, model_path=None, metric='accuracy',
                           **kwargs):
    """
    Tries out a number of models on a subsample of the data,
    and outputs the best found architecture and hyperparameters.

    The model with the highest value of the validation metric is selected,
    so `metric` is expected to be one where larger is better.

    Parameters
    ----------
    X_train : numpy array
        The input dataset for training of shape
        (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        The output classes for the train data, in binary format of shape
        (num_samples, num_classes)
    X_val : numpy array
        The input dataset for validation of shape
        (num_samples_val, num_timesteps, num_channels)
    y_val : numpy array
        The output classes for the validation data, in binary format of shape
        (num_samples_val, num_classes)
    verbose : bool, optional
        flag for displaying verbose output
    number_of_models : int, optional
        The number of models to generate and test
    nr_epochs : int, optional
        The number of epochs that each model is trained
    subset_size : int, optional
        The size of the subset of the data that is used for finding
        the optimal architecture
    outputpath : str, optional
        File location to store the model results
    model_path : str, optional
        Directory to save the models as HDF5 files
    metric : str, optional
        metric that is used to evaluate the model on the validation set.
        See https://keras.io/metrics/ for possible metrics
    **kwargs : key-value parameters
        parameters for generating the models
        (see docstring for modelgen.generate_models)

    Returns
    -------
    best_model : Keras model
        Best performing model, already trained on a small sample data set.
    best_params : dict
        Dictionary containing the hyperparameters for the best model
    best_model_type : str
        Type of the best model
    knn_acc : float
        accuracy for kNN prediction on validation set
    """
    models = modelgen.generate_models(X_train.shape, y_train.shape[1],
                                      number_of_models=number_of_models,
                                      metrics=[metric],
                                      **kwargs)
    histories, val_accuracies, val_losses = train_models_on_samples(
        X_train, y_train, X_val, y_val, models,
        nr_epochs=nr_epochs,
        subset_size=subset_size,
        verbose=verbose,
        outputfile=outputpath,
        model_path=model_path,
        metric=metric)
    # np.argmax would return the index of a NaN entry, i.e. it would pick a
    # model whose training diverged; nanargmax skips those models.
    best_model_index = np.nanargmax(val_accuracies)
    best_model, best_params, best_model_type = models[best_model_index]
    # kNN baseline on the same training subset the models were trained on.
    knn_acc = kNN_accuracy(
        X_train[:subset_size, :, :], y_train[:subset_size, :], X_val, y_val)
    if verbose:
        print('Best model: model ', best_model_index)
        print('Model type: ', best_model_type)
        print('Hyperparameters: ', best_params)
        print(str(metric) + ' on validation set: ',
              val_accuracies[best_model_index])
        print('Accuracy of kNN on validation set', knn_acc)

    # NOTE(review): this compares the chosen metric against kNN *accuracy*;
    # the comparison is only meaningful when `metric` is an accuracy.
    if val_accuracies[best_model_index] < knn_acc:
        warnings.warn('Best model not better than kNN: ' +
                      str(val_accuracies[best_model_index]) + ' vs  ' +
                      str(knn_acc)
                      )
    return best_model, best_params, best_model_type, knn_acc
236
237
238 1
def get_metric_name(name):
    """
    Gives the keras name for a metric

    Parameters
    ----------
    name : str
        original name of the metric

    Returns
    -------
    str
        'acc' if `name` is 'acc' or 'accuracy'; otherwise the ``__name__``
        of the object returned by ``keras.metrics.get(name)``. If that
        lookup fails, `name` is returned unchanged.
    """
    if name in ('acc', 'accuracy'):
        return 'acc'
    try:
        return metrics.get(name).__name__
    except Exception:
        # Best effort: fall back to the given name when keras cannot
        # resolve it. `Exception` (rather than a bare except) keeps
        # KeyboardInterrupt and SystemExit propagating.
        return name
258
259
260 1
def kNN_accuracy(X_train, y_train, X_val, y_val, k=1):
    """
    Performs k-Nearest Neighbors and returns the accuracy score.

    Each sample is flattened from (num_timesteps, num_channels) into a
    single feature vector before fitting and predicting.

    Parameters
    ----------
    X_train : numpy array
        Train set of shape (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        Class labels for train set
    X_val : numpy array
        Validation set of shape (num_samples, num_timesteps, num_channels)
    y_val : numpy array
        Class labels for validation set
    k : int
        number of neighbors to use for classifying

    Returns
    -------
    accuracy: float
        accuracy score on the validation set
    """
    num_train, num_timesteps, num_channels = X_train.shape
    clf = neighbors.KNeighborsClassifier(k)
    clf.fit(
        X_train.reshape(num_train, num_timesteps * num_channels),
        y_train)
    num_val, num_timesteps, num_channels = X_val.shape
    val_predict = clf.predict(
        X_val.reshape(num_val, num_timesteps * num_channels))
    # accuracy_score expects (y_true, y_pred).
    return sklearnmetrics.accuracy_score(y_val, val_predict)
295