Passed
Pull Request — master (#597)
by Osma
02:50
created

MLLMModel._prepare_train_index()   A

Complexity

Conditions 2

Size

Total Lines 24
Code Lines 17

Duplication

Lines 24
Ratio 100 %

Importance

Changes 0
Metric Value
cc 2
eloc 17
nop 4
dl 24
loc 24
rs 9.55
c 0
b 0
f 0
1
"""MLLM (Maui-like Lexical Matching) model for Annif"""
2
3
import collections
4
import math
5
import joblib
6
from statistics import mean
7
from enum import IntEnum
8
import numpy as np
9
from rdflib.namespace import SKOS
10
from sklearn.feature_extraction.text import CountVectorizer
11
from sklearn.ensemble import BaggingClassifier
12
from sklearn.tree import DecisionTreeClassifier
13
import annif.util
14
import annif.parallel
15
from annif.exception import OperationFailedException
16
from annif.lexical.tokenset import TokenSet, TokenSetIndex
17
from annif.lexical.util import get_subject_labels
18
from annif.lexical.util import make_relation_matrix, make_collection_matrix
19
20
21
# A single vocabulary label attached to a subject; is_pref tells whether the
# label came from the preferred-label properties.
Term = collections.namedtuple('Term', 'subject_id label is_pref')


# One hit of an indexed label in a document; pos is the index of the
# sentence in which the hit was found.
Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')


# All matches of one subject within one document, aggregated into
# per-document statistics by conflate_matches().
Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')


# Vocabulary relation matrices and training-corpus statistics that are needed
# to turn Candidates into feature vectors.
ModelData = collections.namedtuple(
    'ModelData',
    'broader narrower related collection ' +
    'doc_freq subj_freq idf')


# Column indices of the feature matrix; numbering starts at 0 so the members
# can be used directly as NumPy column indices.
Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related collection',
    start=0)
42
43
44 View Code Duplication
def conflate_matches(matches, doc_length):
    """Aggregate individual matches into one Candidate per subject.

    matches -- iterable of Match tuples. The first and last match collected
        for a subject supply first_occ and last_occ, so the matches are
        expected to arrive ordered by position.
    doc_length -- divisor used to normalize the frequency and the position
        values (generate_candidates() passes the number of sentences).

    Returns a list of Candidate tuples, one per distinct subject_id.
    """

    # group the matches by subject, preserving their order of arrival
    subj_matches = collections.defaultdict(list)
    for match in matches:
        subj_matches[match.subject_id].append(match)

    # the loop variable is deliberately not called "matches" so that it does
    # not shadow the function parameter
    return [
        Candidate(
            doc_length=doc_length,
            subject_id=subject_id,
            freq=len(subj_hits) / doc_length,
            is_pref=mean(float(m.is_pref) for m in subj_hits),
            n_tokens=mean(m.n_tokens for m in subj_hits),
            ambiguity=mean(m.ambiguity for m in subj_hits),
            first_occ=subj_hits[0].pos / doc_length,
            last_occ=subj_hits[-1].pos / doc_length,
            spread=(subj_hits[-1].pos - subj_hits[0].pos) / doc_length
        )
        for subject_id, subj_hits in subj_matches.items()]
61
62
63 View Code Duplication
def generate_candidates(text, analyzer, vectorizer, index):
    """Find the subject Candidates that occur in a text.

    text -- the document text
    analyzer -- object whose tokenize_sentences() splits the text into
        sentences
    vectorizer -- fitted vectorizer whose transform() yields one row of
        token counts per sentence
    index -- token set index whose search() yields (token set, ambiguity)
        pairs for a given TokenSet

    Returns the list of Candidates produced by conflate_matches(), using the
    number of sentences as the document length.
    """

    sentences = analyzer.tokenize_sentences(text)
    sent_tokens = vectorizer.transform(sentences)
    matches = []

    for sent_idx, token_matrix in enumerate(sent_tokens):
        # column indices of the nonzero entries = ids of the tokens present
        tset = TokenSet(token_matrix.nonzero()[1])
        for ts, ambiguity in index.search(tset):
            matches.append(Match(subject_id=ts.subject_id,
                                 is_pref=ts.is_pref,
                                 n_tokens=len(ts),
                                 pos=sent_idx,
                                 ambiguity=ambiguity))

    return conflate_matches(matches, len(sentences))
78
79
80 View Code Duplication
def candidates_to_features(candidates, mdata):
    """Convert a list of Candidates to a NumPy feature matrix

    candidates -- list of Candidate tuples belonging to one document
    mdata -- ModelData tuple with the relation matrices and the doc_freq,
        subj_freq and idf lookups

    Returns a float32 array with one row per candidate and one column per
    member of the Feature enumeration.
    """

    matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
    c_ids = [c.subject_id for c in candidates]
    n_candidates = len(c_ids)

    # boolean mask over all subjects, True for the candidates of this document
    c_vec = np.zeros(mdata.related.shape[0], dtype=bool)
    c_vec[c_ids] = True

    # per subject, the sum of relation matrix entries that fall on candidate
    # columns, i.e. how strongly it is connected to the other candidates
    broader = mdata.broader.multiply(c_vec).sum(axis=1)
    narrower = mdata.narrower.multiply(c_vec).sum(axis=1)
    related = mdata.related.multiply(c_vec).sum(axis=1)
    collection = mdata.collection.multiply(c_vec).T.dot(
        mdata.collection).sum(axis=0)

    for idx, c in enumerate(candidates):
        subj = c.subject_id
        matrix[idx, Feature.freq] = c.freq
        matrix[idx, Feature.doc_freq] = mdata.doc_freq[subj]
        # stored count minus one; subjects missing from subj_freq yield 0
        # NOTE(review): the -1 presumably discounts the document's own
        # assignment during training - confirm
        matrix[idx, Feature.subj_freq] = mdata.subj_freq.get(subj, 1) - 1
        matrix[idx, Feature.tfidf] = c.freq * mdata.idf[subj]
        matrix[idx, Feature.is_pref] = c.is_pref
        matrix[idx, Feature.n_tokens] = c.n_tokens
        matrix[idx, Feature.ambiguity] = c.ambiguity
        matrix[idx, Feature.first_occ] = c.first_occ
        matrix[idx, Feature.last_occ] = c.last_occ
        matrix[idx, Feature.spread] = c.spread
        matrix[idx, Feature.doc_length] = c.doc_length
        # relation features are normalized by the number of candidates
        matrix[idx, Feature.broader] = broader[subj, 0] / n_candidates
        matrix[idx, Feature.narrower] = narrower[subj, 0] / n_candidates
        matrix[idx, Feature.related] = related[subj, 0] / n_candidates
        matrix[idx, Feature.collection] = collection[0, subj] / n_candidates
    return matrix
110
111
112
class MLLMCandidateGenerator(annif.parallel.BaseWorker):
    """Pool worker that generates candidates for training documents.

    The keyword arguments for generate_candidates() are read from cls.args
    (presumably stored there by BaseWorker.init, which is passed as the pool
    initializer - verify against annif.parallel).
    """

    @classmethod
    def generate_candidates(cls, doc_subject_ids, text):
        """Generate the candidates for one document.

        The subject ids are passed through unchanged so that the caller can
        pair them with the candidates of the same document.
        """
        candidates = generate_candidates(text, **cls.args)  # pragma: no cover
        return doc_subject_ids, candidates  # pragma: no cover
118
119
120
class MLLMFeatureConverter(annif.parallel.BaseWorker):
    """Pool worker that converts candidates into feature matrices.

    The keyword arguments for candidates_to_features() are read from
    cls.args (presumably stored there by BaseWorker.init, which is passed as
    the pool initializer - verify against annif.parallel).
    """

    @classmethod
    def candidates_to_features(cls, candidates):
        """Return the feature matrix for the candidates of one document."""
        return candidates_to_features(candidates,
                                      **cls.args)  # pragma: no cover
126
127
128 View Code Duplication
class MLLMModel:
    """Maui-like Lexical Matching model"""

    def generate_candidates(self, text, analyzer):
        """Return the Candidates found in text, using the vectorizer and
        token set index of this model."""
        return generate_candidates(text, analyzer,
                                   self._vectorizer, self._index)

    @property
    def _model_data(self):
        """The relation matrices and frequency statistics of this model
        bundled into a ModelData tuple."""
        return ModelData(broader=self._broader_matrix,
                         narrower=self._narrower_matrix,
                         related=self._related_matrix,
                         collection=self._collection_matrix,
                         doc_freq=self._doc_freq,
                         subj_freq=self._subj_freq,
                         idf=self._idf)

    def _candidates_to_features(self, candidates):
        """Convert Candidates into a feature matrix using this model's
        data."""
        return candidates_to_features(candidates, self._model_data)

    @staticmethod
    def _get_label_props(params):
        """Return a (preferred, non-preferred) tuple of label property
        lists; hidden labels are included among the non-preferred ones only
        when params['use_hidden_labels'] is true."""
        pref_label_props = [SKOS.prefLabel]

        if annif.util.boolean(params['use_hidden_labels']):
            nonpref_label_props = [SKOS.altLabel, SKOS.hiddenLabel]
        else:
            nonpref_label_props = [SKOS.altLabel]

        return (pref_label_props, nonpref_label_props)

    def _prepare_terms(self, graph, vocab, params):
        """Collect the labels of all active subjects.

        Returns a (terms, subject_ids) tuple: the Term tuples for every
        preferred and non-preferred label in params['language'], and the ids
        of all active subjects.
        """
        pref_label_props, nonpref_label_props = self._get_label_props(params)

        terms = []
        subject_ids = []
        for subj_id, uri, _, _ in vocab.subjects.active:
            subject_ids.append(subj_id)

            for label in get_subject_labels(graph, uri, pref_label_props,
                                            params['language']):
                terms.append(Term(subject_id=subj_id,
                                  label=label,
                                  is_pref=True))

            for label in get_subject_labels(graph, uri, nonpref_label_props,
                                            params['language']):
                terms.append(Term(subject_id=subj_id,
                                  label=label,
                                  is_pref=False))

        return (terms, subject_ids)

    def _prepare_relations(self, graph, vocab):
        """Build and store the broader, narrower, related and collection
        matrices of the vocabulary."""
        self._broader_matrix = make_relation_matrix(
            graph, vocab, SKOS.broader)
        self._narrower_matrix = make_relation_matrix(
            graph, vocab, SKOS.narrower)
        self._related_matrix = make_relation_matrix(
            graph, vocab, SKOS.related)
        self._collection_matrix = make_collection_matrix(graph, vocab)

    def _prepare_train_index(self, vocab, analyzer, params):
        """Fit the vectorizer on the vocabulary labels and build the token
        set index and relation matrices.

        Returns the list of active subject ids.
        """
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        # frequency of each token used in labels - how rare each word is
        token_freq = np.bincount(label_corpus.indices,
                                 minlength=label_corpus.shape[1])

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            # sort tokens by frequency - use the rarest token as index key
            tokens = sorted(tokens, key=token_freq.__getitem__)
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

    def _prepare_train_data(self, corpus, vocab, analyzer, n_jobs):
        """Generate candidates for every document of the corpus.

        Resets and fills the _doc_freq and _subj_freq counters as a side
        effect. Returns a (train_x, train_y) tuple: train_x has one list of
        Candidates per document, train_y is a flat list with one boolean per
        candidate telling whether that subject was assigned to the document.
        """
        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        train_x = []
        train_y = []

        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        cg_args = {
            'analyzer': analyzer,
            'vectorizer': self._vectorizer,
            'index': self._index
        }

        with pool_class(jobs,
                        initializer=MLLMCandidateGenerator.init,
                        initargs=(cg_args,)) as pool:
            params = (([vocab.subjects.by_uri(uri) for uri in doc.uris],
                       doc.text)
                      for doc in corpus.documents)
            for doc_subject_ids, candidates in pool.starmap(
                    MLLMCandidateGenerator.generate_candidates, params, 10):

                # the Counter gets the list as it is, duplicates included
                self._subj_freq.update(doc_subject_ids)
                self._doc_freq.update([c.subject_id for c in candidates])
                train_x.append(candidates)
                # a set makes each membership test O(1) instead of a scan
                # of the list for every candidate
                assigned = set(doc_subject_ids)
                train_y += [(c.subject_id in assigned)
                            for c in candidates]

        return (train_x, train_y)

    def _calculate_idf(self, subject_ids, doc_count):
        """Return a mapping from subject id to a smoothed inverse document
        frequency, log((doc_count + 1) / (doc_freq + 1)) + 1.

        The mapping is a defaultdict, so ids that are not in subject_ids
        yield 0.0.
        """
        idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            idf[subj_id] = math.log((doc_count + 1) /
                                    (self._doc_freq[subj_id] + 1)) + 1

        return idf

    def _prepare_features(self, train_x, n_jobs):
        """Convert the per-document candidate lists into a list of feature
        matrices, using a worker pool."""
        fc_args = {'mdata': self._model_data}
        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        with pool_class(jobs,
                        initializer=MLLMFeatureConverter.init,
                        initargs=(fc_args,)) as pool:
            features = pool.map(
                MLLMFeatureConverter.candidates_to_features, train_x, 10)

        return features

    def prepare_train(self, corpus, vocab, analyzer, params, n_jobs):
        """Prepare the model for training.

        Returns a (features, labels) tuple of NumPy arrays: the stacked
        feature matrices of all documents and the matching boolean labels.
        """
        # create an index from the vocabulary terms
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # convert the corpus into train data
        train_x, train_y = self._prepare_train_data(
            corpus, vocab, analyzer, n_jobs)

        # precalculate idf values for all candidate subjects
        self._idf = self._calculate_idf(subject_ids, len(train_x))

        # convert the train data into feature values
        features = self._prepare_features(train_x, n_jobs)

        return (np.vstack(features), np.array(train_y))

    def _create_classifier(self, params):
        """Return an unfitted bagging ensemble of decision trees configured
        from the min_samples_leaf, max_leaf_nodes and max_samples params."""
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))

    def train(self, train_x, train_y, params):
        """Fit the classifier on the given features and labels.

        Raises OperationFailedException if the fitted classifier has not
        seen exactly two classes.
        """
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)
        # sanity check: verify that the classifier has seen both classes
        if self._classifier.n_classes_ != 2:
            raise OperationFailedException(
                "Unable to create classifier: " +
                "Not enough positive and negative examples " +
                "in the training data. Please check that your training " +
                "data matches your vocabulary.")

    def _prediction_to_list(self, scores, candidates):
        """Pair the second column of each score row (the probability of the
        positive class) with the candidate's subject id and return the
        (score, subject_id) tuples sorted in descending order."""
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
        """Return (score, subject_id) tuples for the candidates, best
        first; an empty candidate list yields an empty list."""
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)

    def save(self, filename):
        """Serialize this model to filename with joblib and return the
        result of joblib.dump()."""
        return joblib.dump(self, filename)

    @staticmethod
    def load(filename):
        """Load a model that was stored with save().

        NOTE: joblib.load() unpickles the file and can therefore execute
        arbitrary code; only load model files from a trusted source.
        """
        return joblib.load(filename)
321