Pull Request — master (#356) · created by Osma at 07:25 · Completed

annif.backend.nn_ensemble — Rating: A

Complexity

Total Complexity 13

Size/Duplication

Total Lines 137
Duplicated Lines 0 %

Importance

Changes 0

Metric  Value
wmc     13
eloc    103
dl      0
loc     137
rs      10
c       0
b       0
f       0

8 Methods

Rating  Name                                          Duplication  Size  Complexity
A       NNEnsembleBackend._create_model()             0            28    2
A       NNEnsembleBackend.default_params()            0            5     1
A       NNEnsembleBackend.learn()                     0            3     1
A       NNEnsembleBackend._corpus_to_vectors()        0            22    3
A       NNEnsembleBackend.initialize()                0            10    3
A       NNEnsembleBackend.train()                     0            4     1
A       NNEnsembleBackend._merge_hits_from_sources()  0            7     1
A       NNEnsembleBackend._learn()                    0            11    1
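
For orientation before the listing: the backend consumes one weighted score vector per source project and stacks them into a matrix before calling the Keras model, as _merge_hits_from_sources below does. A minimal numpy sketch of that data layout (the vectors, sizes, and weights here are hypothetical, not from the module):

import numpy as np

# hypothetical: two source projects, each scoring 4 subjects; (vector, weight) pairs
hits_from_sources = [
    (np.array([0.9, 0.1, 0.0, 0.3], dtype=np.float32), 0.5),
    (np.array([0.7, 0.2, 0.1, 0.0], dtype=np.float32), 0.5),
]
# same stacking as _merge_hits_from_sources: one row per source project
score_vector = np.array([vec * weight for vec, weight in hits_from_sources],
                        dtype=np.float32)
# transposed to (n_subjects, n_sources) and given a batch axis of one,
# matching the model's Input(shape=(n_subjects, n_sources))
model_input = np.expand_dims(score_vector.transpose(), 0)
print(model_input.shape)  # (1, 4, 2)
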
"""Neural network based ensemble backend that combines results from multiple
projects."""


import os.path
import numpy as np
from tensorflow.keras.layers import Input, Dense, Add, Flatten, Lambda, Dropout
from tensorflow.keras.models import Model, load_model
import tensorflow.keras.backend as K
import annif.corpus
import annif.project
import annif.util
from annif.exception import NotInitializedException
from annif.suggestion import VectorSuggestionResult
from . import backend
from . import ensemble

class NNEnsembleBackend(
        backend.AnnifLearningBackend,
        ensemble.EnsembleBackend):
    """Neural network ensemble backend that combines results from multiple
    projects"""

    name = "nn_ensemble"

    MODEL_FILE = "nn-model.h5"

    DEFAULT_PARAMS = {
        'nodes': 100,           # size of the hidden layer
        'dropout_rate': 0.2,
        'optimizer': 'adam',
        'epochs': 10,           # epochs for full training
        'learn-epochs': 1,      # epochs for incremental (online) learning
    }

    # defaults for uninitialized instances
    _model = None

    def default_params(self):
        params = {}
        params.update(super().default_params())
        params.update(self.DEFAULT_PARAMS)
        return params

    def initialize(self):
        if self._model is not None:
            return  # already initialized
        model_filename = os.path.join(self.datadir, self.MODEL_FILE)
        if not os.path.exists(model_filename):
            raise NotInitializedException(
                'model file {} not found'.format(model_filename),
                backend_id=self.backend_id)
        self.debug('loading Keras model from {}'.format(model_filename))
        self._model = load_model(model_filename)

    def _merge_hits_from_sources(self, hits_from_sources, project, params):
        # stack the weighted source vectors into a (n_sources, n_subjects)
        # matrix, then transpose and add a batch axis so the input matches
        # the model's expected shape (1, n_subjects, n_sources)
        score_vector = np.array([hits.vector * weight
                                 for hits, weight in hits_from_sources],
                                dtype=np.float32)
        results = self._model.predict(
            np.expand_dims(score_vector.transpose(), 0))
        return VectorSuggestionResult(results[0], project.subjects)

    def _create_model(self, sources, project):
        self.info("creating NN ensemble model")

        # input: one score per (subject, source project) pair
        inputs = Input(shape=(len(project.subjects), len(sources)))

        flat_input = Flatten()(inputs)
        drop_input = Dropout(
            rate=float(
                self.params['dropout_rate']))(flat_input)
        hidden = Dense(int(self.params['nodes']),
                       activation="relu")(drop_input)
        drop_hidden = Dropout(rate=float(self.params['dropout_rate']))(hidden)
        # learned correction on top of the raw mean; zero-initialized, so the
        # untrained model starts out as a plain averaging ensemble
        delta = Dense(len(project.subjects),
                      kernel_initializer='zeros',
                      bias_initializer='zeros')(drop_hidden)

        # per-subject mean of the source scores, taken across the source axis
        mean = Lambda(lambda x: K.mean(x, axis=2))(inputs)

        predictions = Add()([mean, delta])

        self._model = Model(inputs=inputs, outputs=predictions)
        self._model.compile(optimizer=self.params['optimizer'],
                            loss='binary_crossentropy',
                            metrics=['top_k_categorical_accuracy'])

        summary = []
        self._model.summary(print_fn=summary.append)
        self.debug("Created model: \n" + "\n".join(summary))

    def train(self, corpus, project):
        sources = annif.util.parse_sources(self.params['sources'])
        self._create_model(sources, project)
        self._learn(corpus, project, epochs=int(self.params['epochs']))

    def _corpus_to_vectors(self, corpus, project):
        # pass corpus through all source projects
        sources = [(annif.project.get_project(project_id), weight)
                   for project_id, weight
                   in annif.util.parse_sources(self.params['sources'])]

        score_vectors = []
        true_vectors = []
        for doc in corpus.documents:
            doc_scores = []
            for source_project, weight in sources:
                hits = source_project.suggest(doc.text)
                doc_scores.append(hits.vector * weight)
            score_vectors.append(np.array(doc_scores,
                                          dtype=np.float32).transpose())
            subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
            true_vectors.append(subjects.as_vector(project.subjects))
        # stack the weighted scores into a (n_docs, n_subjects, n_sources) array
        scores = np.array(score_vectors, dtype=np.float32)
        # stack the gold standard values into a (n_docs, n_subjects) array
        true = np.array(true_vectors, dtype=np.float32)
        return (scores, true)

    def _learn(self, corpus, project, epochs):
        scores, true = self._corpus_to_vectors(corpus, project)

        # fit the model
        self._model.fit(scores, true, batch_size=32, verbose=True,
                        epochs=epochs)

        annif.util.atomic_save(
            self._model,
            self.datadir,
            self.MODEL_FILE)

    def learn(self, corpus, project):
        self.initialize()
        self._learn(corpus, project, int(self.params['learn-epochs']))
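
A note on the architecture built in _create_model: because the delta layer is zero-initialized, the freshly created network computes exactly the per-subject mean of the (weighted) source scores, and training then learns a correction on top of that average. A minimal standalone sketch verifying this, assuming TensorFlow 2.x is available (the sizes n_subjects and n_sources are made up; the layer structure mirrors the model above):

import numpy as np
from tensorflow.keras.layers import Input, Dense, Add, Flatten, Lambda, Dropout
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K

n_subjects, n_sources = 4, 2   # hypothetical sizes

inputs = Input(shape=(n_subjects, n_sources))
hidden = Dense(100, activation="relu")(Dropout(0.2)(Flatten()(inputs)))
delta = Dense(n_subjects, kernel_initializer='zeros',
              bias_initializer='zeros')(Dropout(0.2)(hidden))
mean = Lambda(lambda x: K.mean(x, axis=2))(inputs)
model = Model(inputs=inputs, outputs=Add()([mean, delta]))

scores = np.random.rand(1, n_subjects, n_sources).astype(np.float32)
# the zero-initialized delta contributes nothing before training (and dropout
# is inactive at predict time), so the prediction equals the per-subject mean
assert np.allclose(model.predict(scores), scores.mean(axis=2), atol=1e-6)

This also explains the choice of binary_crossentropy as the loss: subject assignment is a multi-label problem, where each subject is an independent yes/no decision rather than a single-class pick.
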