Completed
Push — master ( 95c5a7...9dd26d )
by Dafne van
created 02:23

find_best_architecture()   Rating: B

Complexity: Conditions 3
Size: Total Lines 77
Duplication: Lines 0, Ratio 0 %
Code Coverage: Tests 9, CRAP Score 3.576
Importance: Changes 1, Bugs 0, Features 1
Metric   Value
cc       3
c        1
b        0
f        1
dl       0
loc      77
ccs      9
cts      15
cp       0.6
crap     3.576
rs       8.9342

How to fix: Long Method

Small methods make your code easier to understand, particularly when combined with a good name. And when a method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a sign that the commented part should be extracted into a new method; the comment then makes a good starting point for naming it.

Commonly applied refactorings include extracting parts of the method into smaller, well-named methods; a sketch of such an Extract Method refactoring follows.
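As a sketch of Extract Method (not taken from the reviewed module; the function and variable names below are invented for illustration, and `histories` here is simply a list of dicts mapping metric names to per-epoch values), a commented block inside a long method becomes a small, well-named helper:

# Before: the comment marks a block that wants to be its own method.
def report_results(histories):
    # compute the average validation accuracy over all training histories
    total = 0.0
    for history in histories:
        total += history['val_acc'][-1]
    average = total / len(histories)
    print('Average validation accuracy:', average)


# After: the commented block is extracted; the comment became the name.
def average_validation_accuracy(histories):
    return sum(h['val_acc'][-1] for h in histories) / len(histories)


def report_results(histories):
    print('Average validation accuracy:',
          average_validation_accuracy(histories))

The method name now carries the information the comment used to carry, and the calling method reads at a single level of abstraction.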

"""
 Summary:
 This module provides the main functionality of mcfly: searching for an
 optimal model architecture. The work flow is as follows:
 Function generate_models from modelgen.py generates and compiles models.
 Function train_models_on_samples trains those models.
 Function plotTrainingProcess plots the training process.
 Function find_best_architecture is a wrapper function that combines
 these steps.
 Example function calls can be found in the tutorial notebook
 'EvaluateDifferentModels.ipynb'.
"""
import numpy as np
from . import modelgen
from sklearn import neighbors, metrics
import warnings
import json
import os
from keras.callbacks import EarlyStopping

def train_models_on_samples(X_train, y_train, X_val, y_val, models,
                            nr_epochs=5, subset_size=100, verbose=True,
                            outputfile=None, early_stopping=False):
    """
    Given a list of compiled models, this function trains
    them all on a subset of the train data. If the given size of the subset is
    larger than the size of the data, the complete data set is used.

    Parameters
    ----------
    X_train : numpy array of shape (num_samples, num_timesteps, num_channels)
        The input dataset for training
    y_train : numpy array of shape (num_samples, num_classes)
        The output classes for the train data, in binary format
    X_val : numpy array of shape (num_samples_val, num_timesteps, num_channels)
        The input dataset for validation
    y_val : numpy array of shape (num_samples_val, num_classes)
        The output classes for the validation data, in binary format
    models : list of (model, params, model_type) tuples
        List of Keras models to train
    nr_epochs : int, optional
        number of epochs to use for training one model
    subset_size : int, optional
        The number of samples used from the complete train set
    verbose : bool, optional
        flag for displaying verbose output
    outputfile : str, optional
        File location to store the model results
    early_stopping : bool, optional
        Stop when the validation loss does not decrease

    Returns
    ----------
    histories : list of Keras History objects
        train histories for all models
    val_accuracies : list of floats
        validation accuracies of the models
    val_losses : list of floats
        validation losses of the models
    """
    # if subset_size is larger than X_train, this slice still works fine:
    # the complete training set is used
    X_train_sub = X_train[:subset_size, :, :]
    y_train_sub = y_train[:subset_size, :]

    histories = []
    val_accuracies = []
    val_losses = []
    for i, (model, params, model_types) in enumerate(models):
        if verbose:
            print('Training model %d' % i, model_types)
        if early_stopping:
            callbacks = [EarlyStopping(monitor='val_loss', patience=0,
                                       verbose=verbose, mode='auto')]
        else:
            callbacks = []
        history = model.fit(X_train_sub, y_train_sub,
                            epochs=nr_epochs, batch_size=20,
                            # see comment on subset_size above
                            validation_data=(X_val, y_val),
                            verbose=verbose,
                            callbacks=callbacks)
        histories.append(history)
        val_accuracies.append(history.history['val_acc'][-1])
        val_losses.append(history.history['val_loss'][-1])
        if outputfile is not None:
            store_train_hist_as_json(params, model_types,
                                     history.history, outputfile)
    return histories, val_accuracies, val_losses

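# Usage sketch (illustrative only, not part of the reviewed module): generate
# a couple of models and train them on a small subset, storing the results.
# The wrapper name, data arrays and output file name are hypothetical.
def _example_train_models_on_samples(X_train, y_train, X_val, y_val):
    models = modelgen.generate_models(X_train.shape, y_train.shape[1],
                                      number_of_models=2)
    histories, val_accuracies, val_losses = train_models_on_samples(
        X_train, y_train, X_val, y_val, models,
        nr_epochs=3, subset_size=200, early_stopping=True,
        outputfile='model_results.json')
    return histories, val_accuracies, val_losses
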
def store_train_hist_as_json(params, model_type, history, outputfile):
    """
    This function stores the model parameters, the loss and accuracy history
    of one model in a JSON file. It appends the model information to the
    existing models in the file.

    Parameters
    ----------
    params : dict
        parameters for one model
    model_type : str
        type of the model
    history : dict
        training history from one model
    outputfile : str
        path where the json file needs to be stored
    """
    jsondata = params.copy()
    for k in jsondata.keys():
        if isinstance(jsondata[k], np.ndarray):
            jsondata[k] = jsondata[k].tolist()
    jsondata['train_acc'] = history['acc']
    jsondata['train_loss'] = history['loss']
    jsondata['val_acc'] = history['val_acc']
    jsondata['val_loss'] = history['val_loss']
    jsondata['modeltype'] = model_type
    if os.path.isfile(outputfile):
        with open(outputfile, 'r') as outfile:
            previousdata = json.load(outfile)
    else:
        previousdata = []
    previousdata.append(jsondata)
    with open(outputfile, 'w') as outfile:
        json.dump(previousdata, outfile, sort_keys=True,
                  indent=4, ensure_ascii=False)

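# Usage sketch (illustrative only, not part of the reviewed module): because
# store_train_hist_as_json appends to a JSON list, earlier results can be
# read back for comparison. The file name and helper name are hypothetical.
def _example_read_stored_histories(outputfile='model_results.json'):
    with open(outputfile, 'r') as infile:
        stored = json.load(infile)
    for entry in stored:
        print(entry['modeltype'], 'final val_acc:', entry['val_acc'][-1])
    return stored
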
def find_best_architecture(X_train, y_train, X_val, y_val, verbose=True,
                           number_of_models=5, nr_epochs=5, subset_size=100,
                           outputpath=None, **kwargs):
    """
    Tries out a number of models on a subsample of the data,
    and outputs the best found architecture and hyperparameters.

    Parameters
    ----------
    X_train : numpy array
        The input dataset for training of shape
        (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        The output classes for the train data, in binary format of shape
        (num_samples, num_classes)
    X_val : numpy array
        The input dataset for validation of shape
        (num_samples_val, num_timesteps, num_channels)
    y_val : numpy array
        The output classes for the validation data, in binary format of shape
        (num_samples_val, num_classes)
    verbose : bool, optional
        flag for displaying verbose output
    number_of_models : int, optional
        The number of models to generate and test
    nr_epochs : int, optional
        The number of epochs for which each model is trained
    subset_size : int, optional
        The size of the subset of the data that is used for finding
        the optimal architecture
    outputpath : str, optional
        File location to store the model results
    **kwargs: key-value parameters
        parameters for generating the models
        (see docstring for modelgen.generate_models)

    Returns
    ----------
    best_model : Keras model
        Best performing model, already trained on a small sample data set.
    best_params : dict
        Dictionary containing the hyperparameters for the best model
    best_model_type : str
        Type of the best model
    knn_acc : float
        accuracy of kNN prediction on the validation set
    """
    models = modelgen.generate_models(X_train.shape, y_train.shape[1],
                                      number_of_models=number_of_models,
                                      **kwargs)
    histories, val_accuracies, val_losses = train_models_on_samples(
        X_train, y_train, X_val, y_val, models, nr_epochs,
        subset_size=subset_size, verbose=verbose, outputfile=outputpath)
    best_model_index = np.argmax(val_accuracies)
    best_model, best_params, best_model_type = models[best_model_index]
    knn_acc = kNN_accuracy(
        X_train[:subset_size, :, :], y_train[:subset_size, :], X_val, y_val)
    if verbose:
        print('Best model: model ', best_model_index)
        print('Model type: ', best_model_type)
        print('Hyperparameters: ', best_params)
        print('Accuracy on validation set: ', val_accuracies[best_model_index])
        print('Accuracy of kNN on validation set', knn_acc)

    if val_accuracies[best_model_index] < knn_acc:
        warnings.warn('Best model not better than kNN: ' +
                      str(val_accuracies[best_model_index]) + ' vs ' +
                      str(knn_acc))
    return best_model, best_params, best_model_type, knn_acc

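# Usage sketch (illustrative only, not part of the reviewed module): the
# typical workflow described in the module docstring. Data arrays and the
# output path are hypothetical.
def _example_find_best_architecture(X_train, y_train, X_val, y_val):
    result = find_best_architecture(X_train, y_train, X_val, y_val,
                                    number_of_models=5, nr_epochs=5,
                                    subset_size=100,
                                    outputpath='architecture_search.json')
    best_model, best_params, best_model_type, knn_acc = result
    # The returned model has only been trained on a small subset; it is
    # usually trained further on the full training set afterwards.
    return best_model, best_params, best_model_type
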
def kNN_accuracy(X_train, y_train, X_val, y_val, k=1):
    """
    Performs k-Nearest Neighbors and returns the accuracy score.

    Parameters
    ----------
    X_train : numpy array
        Train set of shape (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        Class labels for train set
    X_val : numpy array
        Validation set of shape (num_samples, num_timesteps, num_channels)
    y_val : numpy array
        Class labels for validation set
    k : int
        number of neighbors to use for classifying

    Returns
    -------
    accuracy: float
        accuracy score on the validation set
    """
    num_samples, num_timesteps, num_channels = X_train.shape
    clf = neighbors.KNeighborsClassifier(k)
    clf.fit(
        X_train.reshape(num_samples, num_timesteps * num_channels),
        y_train)
    num_samples, num_timesteps, num_channels = X_val.shape
    val_predict = clf.predict(
        X_val.reshape(num_samples, num_timesteps * num_channels))
    return metrics.accuracy_score(val_predict, y_val)
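Since kNN_accuracy flattens the (num_samples, num_timesteps, num_channels) arrays before fitting the classifier, it can also serve as a quick stand-alone baseline. A minimal sketch with synthetic random data; the shapes are invented for illustration, and it assumes kNN_accuracy has been imported from this module:

import numpy as np

# Synthetic data: 40 train / 10 validation samples, 30 timesteps, 3 channels,
# and 2 classes in binary (one-hot) format, matching the documented shapes.
rng = np.random.RandomState(0)
X_train = rng.rand(40, 30, 3)
y_train = np.eye(2)[rng.randint(0, 2, 40)]
X_val = rng.rand(10, 30, 3)
y_val = np.eye(2)[rng.randint(0, 2, 10)]

# With k=1 this reports how well a nearest-neighbour baseline does, which is
# the comparison find_best_architecture makes before warning about weak models.
print(kNN_accuracy(X_train, y_train, X_val, y_val, k=1))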