Completed
Push — master ( 681a50...95647a )
by Dafne van
13s
created

train_models_on_samples()   B

Complexity

Conditions 6

Size

Total Lines 74

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 17
CRAP Score 6.2488

Importance

Changes 4
Bugs 1 Features 0
Metric Value
cc 6
c 4
b 1
f 0
dl 0
loc 74
ccs 17
cts 21
cp 0.8095
crap 6.2488
rs 7.4965

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
"""
2
 Summary:
3
 This module provides the main functionality of mcfly: searching for an
4
 optimal model architecture. The workflow is as follows:
5
 Function generate_models from modelgen.py generates and compiles models.
6
 Function train_models_on_samples trains those models.
7
 Function plotTrainingProcess plots the training process.
8
 Function find_best_architecture is a wrapper function that combines
9
 these steps.
10
 Example function calls can be found in the tutorial notebook
11
 'EvaluateDifferentModels.ipynb'.
12
"""
13 1
import numpy as np
14 1
from . import modelgen
15 1
from sklearn import neighbors, metrics
16 1
import warnings
17 1
import json
18 1
import os
19 1
from keras.callbacks import EarlyStopping
20
21 1
def train_models_on_samples(X_train, y_train, X_val, y_val, models,
                            nr_epochs=5, subset_size=100, verbose=True,
                            outputfile=None, model_path=None,
                            early_stopping=False, batch_size=20):
    """
    Given a list of compiled models, this function trains
    them all on a subset of the train data. If the given size of the subset
    is larger than the size of the data, the complete data set is used.

    Parameters
    ----------
    X_train : numpy array of shape (num_samples, num_timesteps, num_channels)
        The input dataset for training
    y_train : numpy array of shape (num_samples, num_classes)
        The output classes for the train data, in binary format
    X_val : numpy array of shape (num_samples_val, num_timesteps, num_channels)
        The input dataset for validation
    y_val : numpy array of shape (num_samples_val, num_classes)
        The output classes for the validation data, in binary format
    models : list of (model, params, model_type) tuples
        List of keras models to train
    nr_epochs : int, optional
        nr of epochs to use for training one model
    subset_size : int, optional
        The number of samples used from the complete train set
    verbose : bool, optional
        flag for displaying verbose output
    outputfile : str, optional
        Filename to store the model training results
    model_path : str, optional
        Directory to store the models as HDF5 files
    early_stopping : bool, optional
        Stop when validation loss does not decrease
    batch_size : int, optional
        nr of samples per batch

    Returns
    ----------
    histories : list of Keras History objects
        train histories for all models
    val_accuracies : list of floats
        validation accuracies of the models (value after the last epoch)
    val_losses : list of floats
        validation losses of the models (value after the last epoch)
    """
    # Slicing past the end of an array is harmless, so a subset_size larger
    # than the number of samples simply selects the complete train set.
    X_train_sub = X_train[:subset_size, :, :]
    y_train_sub = y_train[:subset_size, :]

    histories = []
    val_accuracies = []
    val_losses = []
    for i, (model, params, model_types) in enumerate(models):
        if verbose:
            print('Training model %d' % i, model_types)
        if early_stopping:
            # A fresh callback per model, so no state leaks between runs.
            callbacks = [EarlyStopping(monitor='val_loss', patience=0,
                                       verbose=verbose, mode='auto')]
        else:
            callbacks = []
        history = model.fit(X_train_sub, y_train_sub,
                            epochs=nr_epochs, batch_size=batch_size,
                            validation_data=(X_val, y_val),
                            verbose=verbose,
                            callbacks=callbacks)
        histories.append(history)

        metrics_log = history.history
        # Older Keras releases log the validation accuracy as 'val_acc',
        # newer ones as 'val_accuracy'; accept whichever is present.
        if 'val_acc' in metrics_log:
            acc_key = 'val_acc'
        else:
            acc_key = 'val_accuracy'
        val_accuracies.append(metrics_log[acc_key][-1])
        val_losses.append(metrics_log['val_loss'][-1])

        if outputfile is not None:
            store_train_hist_as_json(params, model_types,
                                     metrics_log, outputfile)
        if model_path is not None:
            model.save(os.path.join(model_path, 'model_{}.h5'.format(i)))
    return histories, val_accuracies, val_losses
95
96
97 1
def store_train_hist_as_json(params, model_type, history, outputfile):
    """
    This function stores the model parameters, the loss and accuracy history
    of one model in a JSON file. It appends the model information to the
    existing models in the file.

    Parameters
    ----------
    params : dict
        parameters for one model
    model_type : str
        type of the model; written to the file under the key 'modeltype'
    history : dict
        training history from one model; must contain 'loss', 'val_loss'
        and the accuracy series under either 'acc'/'val_acc' or
        'accuracy'/'val_accuracy'
    outputfile : str
        path where the json file needs to be stored

    Raises
    ------
    KeyError
        if a required series is missing from `history`
    """
    def _series(*names):
        # Return the first series found under any of the given names, as
        # plain floats so numpy scalars do not break json.dump.
        for name in names:
            if name in history:
                return [float(value) for value in history[name]]
        raise KeyError(names[0])

    # numpy arrays are not JSON serializable; convert them to nested lists.
    jsondata = {
        key: value.tolist() if isinstance(value, np.ndarray) else value
        for key, value in params.items()
    }
    # The accuracy key name differs between Keras releases.
    jsondata['train_acc'] = _series('acc', 'accuracy')
    jsondata['train_loss'] = _series('loss')
    jsondata['val_acc'] = _series('val_acc', 'val_accuracy')
    jsondata['val_loss'] = _series('val_loss')
    jsondata['modeltype'] = model_type

    # Append to whatever is already stored in the file.
    if os.path.isfile(outputfile):
        with open(outputfile, 'r') as infile:
            previousdata = json.load(infile)
    else:
        previousdata = []
    previousdata.append(jsondata)
    with open(outputfile, 'w') as outfile:
        json.dump(previousdata, outfile, sort_keys=True,
                  indent=4, ensure_ascii=False)
133
134
135 1
def find_best_architecture(X_train, y_train, X_val, y_val, verbose=True,
                           number_of_models=5, nr_epochs=5, subset_size=100,
                           outputpath=None, model_path=None, **kwargs
                           ):
    """
    Generate a number of candidate models, train each of them on a subsample
    of the data and report the architecture with the highest validation
    accuracy. A kNN classifier fitted on the same subsample serves as a
    baseline; a warning is issued when the best model scores below it.

    Parameters
    ----------
    X_train : numpy array
        The input dataset for training of shape
        (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        The output classes for the train data, in binary format of shape
        (num_samples, num_classes)
    X_val : numpy array
        The input dataset for validation of shape
        (num_samples_val, num_timesteps, num_channels)
    y_val : numpy array
        The output classes for the validation data, in binary format of shape
        (num_samples_val, num_classes)
    verbose : bool, optional
        flag for displaying verbose output
    number_of_models : int, optional
        The number of models to generate and test
    nr_epochs : int, optional
        The number of epochs that each model is trained
    subset_size : int, optional
        The size of the subset of the data that is used for finding
        the optimal architecture
    outputpath : str, optional
        Filename to store the model training history
    model_path : str, optional
        Directory to save the models as HDF5 files
    **kwargs : key-value parameters
        parameters for generating the models
        (see docstring for modelgen.generate_models)

    Returns
    ----------
    best_model : Keras model
        Best performing model, already trained on a small sample data set.
    best_params : dict
        Dictionary containing the hyperparameters for the best model
    best_model_type : str
        Type of the best model
    knn_acc : float
        accuracy for kNN prediction on validation set
    """
    num_classes = y_train.shape[1]
    models = modelgen.generate_models(X_train.shape, num_classes,
                                      number_of_models=number_of_models,
                                      **kwargs)

    # Only the validation accuracies are needed to rank the candidates.
    _, val_accuracies, _ = train_models_on_samples(
        X_train, y_train, X_val, y_val, models, nr_epochs,
        subset_size=subset_size,
        verbose=verbose,
        outputfile=outputpath,
        model_path=model_path)

    best_model_index = np.argmax(val_accuracies)
    best_model, best_params, best_model_type = models[best_model_index]

    # Baseline: kNN fitted on the same subsample the networks were trained on.
    X_subset = X_train[:subset_size, :, :]
    y_subset = y_train[:subset_size, :]
    knn_acc = kNN_accuracy(X_subset, y_subset, X_val, y_val)

    if verbose:
        print('Best model: model ', best_model_index)
        print('Model type: ', best_model_type)
        print('Hyperparameters: ', best_params)
        print('Accuracy on validation set: ', val_accuracies[best_model_index])
        print('Accuracy of kNN on validation set', knn_acc)

    if val_accuracies[best_model_index] < knn_acc:
        message = ('Best model not better than kNN: ' +
                   str(val_accuracies[best_model_index]) + ' vs  ' +
                   str(knn_acc))
        warnings.warn(message)
    return best_model, best_params, best_model_type, knn_acc
215
216
217 1
def kNN_accuracy(X_train, y_train, X_val, y_val, k=1):
    """
    Performs k-Nearest Neighbors and returns the accuracy score.

    Each sample is flattened from (num_timesteps, num_channels) into a
    single feature vector before fitting and predicting.

    Parameters
    ----------
    X_train : numpy array
        Train set of shape (num_samples, num_timesteps, num_channels)
    y_train : numpy array
        Class labels for train set
    X_val : numpy array
        Validation set of shape (num_samples, num_timesteps, num_channels)
    y_val : numpy array
        Class labels for validation set
    k : int
        number of neighbors to use for classifying

    Returns
    -------
    accuracy: float
        accuracy score on the validation set
    """
    num_train, num_timesteps, num_channels = X_train.shape
    clf = neighbors.KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train.reshape(num_train, num_timesteps * num_channels),
            y_train)

    num_val, num_timesteps, num_channels = X_val.shape
    val_predict = clf.predict(
        X_val.reshape(num_val, num_timesteps * num_channels))
    # accuracy_score takes the ground truth first, then the predictions.
    return metrics.accuracy_score(y_val, val_predict)
252