Completed
Pull Request — master (#147), created 02:39 by Dafne van

get_metric_name()   A

Complexity:    Conditions 4
Size:          Total Lines 20
Duplication:   Lines 0, Ratio 0 %
Code Coverage: Tests 6, CRAP Score 4.5923
Importance:    Changes 1, Bugs 0, Features 0
Metric Value
cc 4
c 1
b 0
f 0
dl 0
loc 20
ccs 6
cts 9
cp 0.6667
crap 4.5923
rs 9.2
"""
Summary:
This module provides the main functionality of mcfly: searching for an
optimal model architecture. The workflow is as follows:
Function generate_models from modelgen.py generates and compiles models.
Function train_models_on_samples trains those models.
Function plotTrainingProcess plots the training process.
Function find_best_architecture is a wrapper function that combines
these steps.
Example function calls can be found in the tutorial notebook
'EvaluateDifferentModels.ipynb'.
"""
import numpy as np
from . import modelgen
from sklearn import neighbors, metrics as sklearnmetrics
import warnings
import json
import os
from keras.callbacks import EarlyStopping
from keras import metrics


def train_models_on_samples(X_train, y_train, X_val, y_val, models,
                            nr_epochs=5, subset_size=100, verbose=True,
                            outputfile=None, model_path=None,
                            early_stopping=False,
                            batch_size=20, metric='accuracy'):
    """
    Given a list of compiled models, this function trains
    them all on a subset of the train data. If the given subset size is
    larger than the size of the data set, the complete data set is used.

    Parameters
    ----------
    X_train : numpy array of shape (num_samples, num_timesteps, num_channels)
        The input dataset for training
    y_train : numpy array of shape (num_samples, num_classes)
        The output classes for the train data, in binary format
    X_val : numpy array of shape (num_samples_val, num_timesteps, num_channels)
        The input dataset for validation
    y_val : numpy array of shape (num_samples_val, num_classes)
        The output classes for the validation data, in binary format
    models : list of model, params, modeltypes
        List of keras models to train
    nr_epochs : int, optional
        Number of epochs to use for training one model
    subset_size : int, optional
        The number of samples used from the complete train set
    verbose : bool, optional
        flag for displaying verbose output
    outputfile : str, optional
        Filename to store the model training results
    model_path : str, optional
        Directory to store the models as HDF5 files
    early_stopping : bool, optional
        Stop when validation loss does not decrease
    batch_size : int, optional
        Number of samples per batch
    metric : str, optional
        Metric to store in the history object

    Returns
    ----------
    histories : list of Keras History objects
        train histories for all models
    val_metrics : list of floats
        validation accuracies of the models
    val_losses : list of floats
        validation losses of the models
    """
    # if subset_size exceeds the number of samples, the slice uses all data
    X_train_sub = X_train[:subset_size, :, :]
    y_train_sub = y_train[:subset_size, :]

    metric_name = get_metric_name(metric)

    histories = []
    val_metrics = []
    val_losses = []
    for i, (model, params, model_types) in enumerate(models):
        if verbose:
            print('Training model %d' % i, model_types)
        model_metrics = [get_metric_name(name) for name in model.metrics]
        if metric_name not in model_metrics:
            raise ValueError(
                'Invalid metric, the model should be compiled '
                'with the same metric!')
        if early_stopping:
            callbacks = [EarlyStopping(monitor='val_loss', patience=0,
                                       verbose=verbose, mode='auto')]
        else:
            callbacks = []
        history = model.fit(X_train_sub, y_train_sub,
                            epochs=nr_epochs, batch_size=batch_size,
                            # see comment on subset_size above
                            validation_data=(X_val, y_val),
                            verbose=verbose,
                            callbacks=callbacks)
        histories.append(history)

        val_metrics.append(history.history['val_' + metric_name][-1])
        val_losses.append(history.history['val_loss'][-1])
        if outputfile is not None:
            store_train_hist_as_json(params, model_types,
                                     history.history, outputfile)
        if model_path is not None:
            model.save(os.path.join(model_path, 'model_{}.h5'.format(i)))

    return histories, val_metrics, val_losses
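
A short sketch of how the returned values are typically used (editor's addition, not part of the reviewed file; `models` is assumed to come from `modelgen.generate_models` and to be compiled with the same metric):

    histories, val_metrics, val_losses = train_models_on_samples(
        X_train, y_train, X_val, y_val, models,
        nr_epochs=2, subset_size=50, verbose=False)

    best = int(np.argmax(val_metrics))            # index of the best candidate
    print('Best candidate so far:', best)
    print('Its training loss curve:', histories[best].history['loss'])
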


def store_train_hist_as_json(params, model_type, history, outputfile,
                             metric_name='acc'):
    """
    This function stores the model parameters, the loss and accuracy history
    of one model in a JSON file. It appends the model information to the
    existing models in the file.

    Parameters
    ----------
    params : dict
        parameters for one model
    model_type : str
        type of the model, as generated by modelgen.generate_models
    history : dict
        training history from one model
    outputfile : str
        path where the json file needs to be stored
    metric_name : str, optional
        name of metric from history to store
    """
    jsondata = params.copy()
    for k in jsondata.keys():
        if isinstance(jsondata[k], np.ndarray):
            jsondata[k] = jsondata[k].tolist()
    jsondata['train_metric'] = history[metric_name]
    jsondata['train_loss'] = history['loss']
    jsondata['val_metric'] = history['val_' + metric_name]
    jsondata['val_loss'] = history['val_loss']
    jsondata['modeltype'] = model_type
    jsondata['metric'] = metric_name
    if os.path.isfile(outputfile):
        with open(outputfile, 'r') as outfile:
            previousdata = json.load(outfile)
    else:
        previousdata = []
    previousdata.append(jsondata)
    with open(outputfile, 'w') as outfile:
        json.dump(previousdata, outfile, sort_keys=True,
                  indent=4, ensure_ascii=False)
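
Because the function appends to whatever list is already in `outputfile`, calling it once per model builds up a JSON array with one entry per model. A small round-trip sketch (editor's addition; the `history` dict, the 'CNN' label and the file name are hypothetical stand-ins for a real Keras `History.history`, model type and output path):

    history = {'acc': [0.60, 0.72], 'loss': [1.20, 0.85],
               'val_acc': [0.55, 0.66], 'val_loss': [1.30, 0.95]}
    store_train_hist_as_json({'learning_rate': 0.01}, 'CNN',
                             history, 'train_results.json')

    with open('train_results.json') as fp:
        results = json.load(fp)            # list with one dict per stored model
    print(results[-1]['val_metric'])       # [0.55, 0.66]
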


def find_best_architecture(X_train, y_train, X_val, y_val, verbose=True,
                           number_of_models=5, nr_epochs=5, subset_size=100,
                           outputpath=None, model_path=None, metric='accuracy',
                           **kwargs):
    """
    Tries out a number of models on a subsample of the data,
    and outputs the best found architecture and hyperparameters.

    Parameters
    ----------
    X_train : numpy array
        The input dataset for training of shape
        (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        The output classes for the train data, in binary format of shape
        (num_samples, num_classes)
    X_val : numpy array
        The input dataset for validation of shape
        (num_samples_val, num_timesteps, num_channels)
    y_val : numpy array
        The output classes for the validation data, in binary format of shape
        (num_samples_val, num_classes)
    verbose : bool, optional
        flag for displaying verbose output
    number_of_models : int, optional
        The number of models to generate and test
    nr_epochs : int, optional
        The number of epochs for which each model is trained
    subset_size : int, optional
        The size of the subset of the data that is used for finding
        the optimal architecture
    outputpath : str, optional
        File location to store the model results
    model_path : str, optional
        Directory to save the models as HDF5 files
    metric : str, optional
        metric that is used to evaluate the model on the validation set.
        See https://keras.io/metrics/ for possible metrics
    **kwargs : key-value parameters
        parameters for generating the models
        (see docstring for modelgen.generate_models)

    Returns
    ----------
    best_model : Keras model
        Best performing model, already trained on a small sample data set.
    best_params : dict
        Dictionary containing the hyperparameters for the best model
    best_model_type : str
        Type of the best model
    knn_acc : float
        accuracy for kNN prediction on validation set
    """
    models = modelgen.generate_models(X_train.shape, y_train.shape[1],
                                      number_of_models=number_of_models,
                                      metrics=[metric],
                                      **kwargs)
    histories, val_accuracies, val_losses = train_models_on_samples(
        X_train, y_train, X_val, y_val, models, nr_epochs,
        subset_size=subset_size, verbose=verbose, outputfile=outputpath,
        model_path=model_path, metric=metric)
    best_model_index = np.argmax(val_accuracies)
    best_model, best_params, best_model_type = models[best_model_index]
    knn_acc = kNN_accuracy(
        X_train[:subset_size, :, :], y_train[:subset_size, :], X_val, y_val)
    if verbose:
        print('Best model: model ', best_model_index)
        print('Model type: ', best_model_type)
        print('Hyperparameters: ', best_params)
        print(str(metric) + ' on validation set: ',
              val_accuracies[best_model_index])
        print('Accuracy of kNN on validation set', knn_acc)

    if val_accuracies[best_model_index] < knn_acc:
        warnings.warn('Best model not better than kNN: ' +
                      str(val_accuracies[best_model_index]) + ' vs  ' +
                      str(knn_acc)
                      )
    return best_model, best_params, best_model_type, knn_acc
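
The returned model has only seen a small subset of the data, so in practice one would keep training it on the full training set, roughly as in the sketch below (editor's addition; epoch counts and the output path are placeholders):

    best_model, best_params, best_model_type, knn_acc = find_best_architecture(
        X_train, y_train, X_val, y_val,
        number_of_models=3, nr_epochs=2, subset_size=200,
        outputpath='model_comparison.json')

    # continue training the winning architecture on the full training set
    best_model.fit(X_train, y_train, epochs=10,
                   validation_data=(X_val, y_val))
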


def get_metric_name(name):
    """
    Gives the keras name for a metric

    Parameters
    ----------
    name : str
        original name of the metric

    Returns
    -------
    metric_name : str
        Keras name of the metric
    """
    if name == 'acc' or name == 'accuracy':
        return 'acc'
    try:
        metric_fn = metrics.get(name)
        return metric_fn.__name__
    except Exception:
        pass
    return name
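
For illustration (editor's sketch; the exact names returned by `metrics.get` depend on the installed Keras version):

    get_metric_name('accuracy')            # -> 'acc'
    get_metric_name('mean_squared_error')  # -> resolved via keras.metrics.get
    get_metric_name('some_custom_metric')  # unknown names are returned unchanged
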


def kNN_accuracy(X_train, y_train, X_val, y_val, k=1):
    """
    Performs k-Nearest Neighbors and returns the accuracy score.

    Parameters
    ----------
    X_train : numpy array
        Train set of shape (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        Class labels for train set
    X_val : numpy array
        Validation set of shape (num_samples, num_timesteps, num_channels)
    y_val : numpy array
        Class labels for validation set
    k : int
        number of neighbors to use for classifying

    Returns
    -------
    accuracy : float
        accuracy score on the validation set
    """
    num_samples, num_timesteps, num_channels = X_train.shape
    clf = neighbors.KNeighborsClassifier(k)
    clf.fit(X_train.reshape(num_samples, num_timesteps * num_channels),
            y_train)
    num_samples, num_timesteps, num_channels = X_val.shape
    val_predict = clf.predict(
        X_val.reshape(num_samples, num_timesteps * num_channels))
    return sklearnmetrics.accuracy_score(val_predict, y_val)
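
A self-contained sketch of this baseline computation on random data (editor's addition; the shapes mirror the (num_samples, num_timesteps, num_channels) convention and the labels are one-hot encoded, so the reported accuracy will be near chance level):

    rng = np.random.RandomState(0)
    X_tr = rng.rand(20, 30, 3)                    # 20 windows, 30 timesteps, 3 channels
    y_tr = np.eye(2)[rng.randint(0, 2, size=20)]  # one-hot class labels
    X_va = rng.rand(10, 30, 3)
    y_va = np.eye(2)[rng.randint(0, 2, size=10)]

    print('kNN baseline accuracy:', kNN_accuracy(X_tr, y_tr, X_va, y_va, k=1))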