Passed
Pull Request — master (#462)
by Osma
01:51
created

annif.backend.mllm.MLLMModel._prepare_terms()   B

Complexity

Conditions 7

Size

Total Lines 22
Code Lines 19

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric  Value
cc      7
eloc    19
nop     4
dl      0
loc     22
rs      8
c       0
b       0
f       0
"""Maui-like Lexical Matching backend"""

import collections
import math
from enum import IntEnum
from statistics import mean
import os.path
import joblib
import numpy as np
from rdflib import URIRef
from rdflib.namespace import SKOS
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.util
from annif.exception import NotInitializedException
from annif.suggestion import VectorSuggestionResult
from . import backend
from . import hyperopt

Term = collections.namedtuple('Term', 'subject_id label is_pref')
Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')
Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related',
    start=0)


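# Note on the Feature enum above: with start=0 its members double as column
# indices of the candidate feature matrix built in
# MLLMModel._candidates_to_features below, e.g. Feature.freq == 0,
# Feature.tfidf == 3 and Feature.related == 13, so len(Feature) == 14 is the
# matrix width.

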
class TokenSet:
    """Represents a set of tokens (expressed as integer token IDs) that can
    be matched with another set of tokens. A TokenSet can optionally
    be associated with a subject from the vocabulary."""

    def __init__(self, tokens, subject_id=None, is_pref=False):
        self._tokens = set(tokens)
        self.subject_id = subject_id
        self.is_pref = is_pref

    def __len__(self):
        return len(self._tokens)

    def __iter__(self):
        return iter(self._tokens)

    def contains(self, other):
        """Returns True iff the tokens in the other TokenSet are all
        included within this TokenSet."""

        return other._tokens.issubset(self._tokens)

    def sample(self):
        """Return an arbitrary token from this TokenSet, or None if empty"""
        try:
            return next(iter(self._tokens))
        except StopIteration:
            return None


class TokenSetIndex:
    """A searchable index of TokenSets (representing vocabulary terms)"""

    def __init__(self):
        self._index = collections.defaultdict(set)

    def __len__(self):
        return len(self._index)

    def add(self, tset):
        """Add a TokenSet into this index"""
        token = tset.sample()
        if token is not None:
            self._index[token].add(tset)

    def search(self, tset):
        """Return the TokenSets that are contained in the given TokenSet.
        The matches are returned as a list of (TokenSet, ambiguity) pairs
        where ambiguity is an integer indicating the number of other TokenSets
        that also match the same tokens."""

        subj_tsets = {}
        subj_ambiguity = collections.Counter()

        for token in tset:
            for ts in self._index[token]:
                if not tset.contains(ts):
                    continue
                if ts.subject_id not in subj_tsets or \
                   not subj_tsets[ts.subject_id].is_pref:
                    subj_tsets[ts.subject_id] = ts

        for ts in subj_tsets.values():
            for other in subj_tsets.values():
                if ts == other:
                    continue
                if other.contains(ts):
                    subj_ambiguity.update([ts.subject_id])

        return [(ts, subj_ambiguity[ts.subject_id])
                for ts in subj_tsets.values()]


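# Illustration of TokenSetIndex.search above (hypothetical token IDs): with an
# index holding TokenSet({1, 2}, subject_id=0, is_pref=True) and
# TokenSet({1, 2, 3}, subject_id=1, is_pref=True), searching with a document
# TokenSet({1, 2, 3, 4}) matches both subjects; subject 0 is returned with
# ambiguity 1, because its tokens are also covered by subject 1's TokenSet,
# while subject 1 is returned with ambiguity 0.

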
class MLLMModel:
    """Maui-like Lexical Matching model"""

    def _conflate_matches(self, matches, doc_length):
        subj_matches = collections.defaultdict(list)
        for match in matches:
            subj_matches[match.subject_id].append(match)
        return [
            Candidate(
                doc_length=doc_length,
                subject_id=subject_id,
                freq=len(matches) / doc_length,
                is_pref=mean((float(m.is_pref) for m in matches)),
                n_tokens=mean((m.n_tokens for m in matches)),
                ambiguity=mean((m.ambiguity for m in matches)),
                first_occ=matches[0].pos / doc_length,
                last_occ=matches[-1].pos / doc_length,
                spread=(matches[-1].pos - matches[0].pos) / doc_length
            )
            for subject_id, matches in subj_matches.items()]

    def generate_candidates(self, text, analyzer):
        sentences = analyzer.tokenize_sentences(text)
        sent_tokens = self._vectorizer.transform(sentences)
        matches = []

        for sent_idx, token_matrix in enumerate(sent_tokens):
            tset = TokenSet(token_matrix.nonzero()[1])
            for ts, ambiguity in self._index.search(tset):
                matches.append(Match(subject_id=ts.subject_id,
                                     is_pref=ts.is_pref,
                                     n_tokens=len(ts),
                                     pos=sent_idx,
                                     ambiguity=ambiguity))

        return self._conflate_matches(matches, len(sentences))

    def _candidates_to_features(self, candidates):
        """Convert a list of Candidates to a NumPy feature matrix"""
        matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
        c_ids = [c.subject_id for c in candidates]
        c_vec = np.zeros(self._related_matrix.shape[0], dtype=bool)
        c_vec[c_ids] = True
        broader = self._broader_matrix.multiply(c_vec).sum(axis=1)
        narrower = self._narrower_matrix.multiply(c_vec).sum(axis=1)
        related = self._related_matrix.multiply(c_vec).sum(axis=1)
        for idx, c in enumerate(candidates):
            subj = c.subject_id
            matrix[idx, Feature.freq] = c.freq
            matrix[idx, Feature.doc_freq] = self._doc_freq[subj]
            matrix[idx, Feature.subj_freq] = self._subj_freq.get(subj, 1) - 1
            matrix[idx, Feature.tfidf] = c.freq * self._idf[subj]
            matrix[idx, Feature.is_pref] = c.is_pref
            matrix[idx, Feature.n_tokens] = c.n_tokens
            matrix[idx, Feature.ambiguity] = c.ambiguity
            matrix[idx, Feature.first_occ] = c.first_occ
            matrix[idx, Feature.last_occ] = c.last_occ
            matrix[idx, Feature.spread] = c.spread
            matrix[idx, Feature.doc_length] = c.doc_length
            matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
            matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
            matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
        return matrix

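    # Illustration of _candidates_to_features above (hypothetical numbers):
    # the broader/narrower/related features measure how strongly a candidate
    # is supported by the other candidates of the same document. For example,
    # if 2 of a document's 10 candidate subjects are SKOS broader concepts of
    # candidate c, then matrix[idx, Feature.broader] == 2 / 10 == 0.2 for
    # that row.
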
    def _prepare_terms(self, graph, vocab, params):
        terms = []
        subject_ids = []
        for subj_id, (uri, pref, _) in enumerate(vocab.subjects):
            if pref is None:
                continue  # deprecated subject
            subject_ids.append(subj_id)
            terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))

            if annif.util.boolean(params['use_hidden_labels']):
                label_props = [SKOS.altLabel, SKOS.hiddenLabel]
            else:
                label_props = [SKOS.altLabel]

            for prop in label_props:
                for label in graph.objects(URIRef(uri), prop):
                    if label.language != params['language']:
                        continue
                    terms.append(Term(subject_id=subj_id,
                                      label=str(label),
                                      is_pref=False))
        return (terms, subject_ids)

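    # Illustration of _prepare_terms above (hypothetical subject): a concept
    # with prefLabel "cats" and altLabel "felines" in the project language
    # yields Term(subject_id=i, label='cats', is_pref=True) and
    # Term(subject_id=i, label='felines', is_pref=False); deprecated subjects
    # (pref label None) are skipped and left out of subject_ids.
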
    def _prepare_relations(self, graph, vocab):
        n_subj = len(vocab.subjects)
        self._broader_matrix = lil_matrix((n_subj, n_subj), dtype=bool)
        self._narrower_matrix = lil_matrix((n_subj, n_subj), dtype=bool)
        self._related_matrix = lil_matrix((n_subj, n_subj), dtype=bool)

        prop_matrix = [
            (SKOS.broader, self._broader_matrix),
            (SKOS.narrower, self._narrower_matrix),
            (SKOS.related, self._related_matrix)
        ]

        for subj_id, (uri, pref, _) in enumerate(vocab.subjects):
            if pref is None:
                continue  # deprecated subject

            for prop, matrix in prop_matrix:
                for other in graph.objects(URIRef(uri), prop):
                    other_id = vocab.subjects.by_uri(str(other),
                                                     warnings=False)
                    if other_id is not None:
                        matrix[subj_id, other_id] = True

    def _prepare_train_index(self, vocab, analyzer, params):
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

    def prepare_train(self, corpus, vocab, analyzer, params):
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        doc_count = 0
        train_x = []
        train_y = []
        for idx, doc in enumerate(corpus.documents):
            doc_subject_ids = [vocab.subjects.by_uri(uri)
                               for uri in doc.uris]
            self._subj_freq.update(doc_subject_ids)
            candidates = self.generate_candidates(doc.text, analyzer)
            self._doc_freq.update([c.subject_id for c in candidates])
            train_x.append(candidates)
            train_y += [(c.subject_id in doc_subject_ids) for c in candidates]
            doc_count += 1

        # precalculate idf values for candidate subjects
        self._idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            self._idf[subj_id] = math.log((doc_count + 1) /
                                          (self._doc_freq[subj_id] + 1)) + 1
        return (np.vstack([self._candidates_to_features(candidates)
                           for candidates in train_x]), np.array(train_y))

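    # Illustration of the idf calculation above (hypothetical counts): the
    # smoothed formula log((1 + N) / (1 + df)) + 1 means that with 999
    # training documents, a subject that appeared as a candidate in 9 of them
    # gets idf = log(1000 / 10) + 1, about 5.61 (natural log), one that was a
    # candidate in every document gets exactly 1.0, and the tfidf feature is
    # the candidate's within-document freq multiplied by this value.
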
    def _create_classifier(self, params):
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))

    def train(self, train_x, train_y, params):
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)

    def _prediction_to_list(self, scores, candidates):
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)


class MLLMOptimizer(hyperopt.HyperparameterOptimizer):
    """Hyperparameter optimizer for the MLLM backend"""

    def _prepare(self, n_jobs=1):
        self._backend.initialize()
        self._train_x, self._train_y = self._backend._load_train_data()
        self._candidates = []
        self._gold_subjects = []

        # TODO parallelize generation of candidates
        for doc in self._corpus.documents:
            candidates = self._backend._generate_candidates(doc.text)
            self._candidates.append(candidates)
            self._gold_subjects.append(
                annif.corpus.SubjectSet((doc.uris, doc.labels)))

    def _objective(self, trial):
        params = {
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 5, 30),
            'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 100, 2000),
            'max_samples': trial.suggest_float('max_samples', 0.5, 1.0),
            'use_hidden_labels':
                trial.suggest_categorical('use_hidden_labels', [True, False]),
            'limit': 100
        }
        model = self._backend._model._create_classifier(params)
        model.fit(self._train_x, self._train_y)

        batch = annif.eval.EvaluationBatch(self._backend.project.subjects)
        for goldsubj, candidates in zip(self._gold_subjects, self._candidates):
            if candidates:
                features = \
                    self._backend._model._candidates_to_features(candidates)
                scores = model.predict_proba(features)
                ranking = self._backend._model._prediction_to_list(
                    scores, candidates)
            else:
                ranking = []
            results = self._backend._prediction_to_result(ranking, params)
            batch.evaluate(results, goldsubj)
        results = batch.results(metrics=[self._metric])
        return results[self._metric]

    def _postprocess(self, study):
        bp = study.best_params
        lines = [
            f"min_samples_leaf={bp['min_samples_leaf']}",
            f"max_leaf_nodes={bp['max_leaf_nodes']}",
            f"max_samples={bp['max_samples']:.4f}",
            f"use_hidden_labels={bp['use_hidden_labels']}"
        ]
        return hyperopt.HPRecommendation(lines=lines, score=study.best_value)


class MLLMBackend(hyperopt.AnnifHyperoptBackend):
    """Maui-like Lexical Matching backend for Annif"""
    name = "mllm"
    needs_subject_index = True

    # defaults for uninitialized instances
    _model = None

    MODEL_FILE = 'mllm-model.gz'
    TRAIN_FILE = 'mllm-train.gz'

    DEFAULT_PARAMETERS = {
        'min_samples_leaf': 20,
        'max_leaf_nodes': 1000,
        'max_samples': 0.9,
        'use_hidden_labels': False
    }

    def get_hp_optimizer(self, corpus, metric):
        return MLLMOptimizer(self, corpus, metric)

    def default_params(self):
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

    def _load_model(self):
        path = os.path.join(self.datadir, self.MODEL_FILE)
        self.debug('loading model from {}'.format(path))
        if os.path.exists(path):
            return joblib.load(path)
        else:
            raise NotInitializedException(
                'model {} not found'.format(path),
                backend_id=self.backend_id)

    def _load_train_data(self):
        path = os.path.join(self.datadir, self.TRAIN_FILE)
        if os.path.exists(path):
            return joblib.load(path)
        else:
            raise NotInitializedException(
                'train data file {} not found'.format(path),
                backend_id=self.backend_id)

    def initialize(self):
        if self._model is None:
            self._model = self._load_model()

    def _train(self, corpus, params):
        self.info('starting train')
        if corpus != 'cached':
            self.info("preparing training data")
            self._model = MLLMModel()
            train_data = self._model.prepare_train(corpus,
                                                   self.project.vocab,
                                                   self.project.analyzer,
                                                   params)
            annif.util.atomic_save(train_data,
                                   self.datadir,
                                   self.TRAIN_FILE,
                                   method=joblib.dump)
        else:
            self.info("reusing cached training data from previous run")
            self._model = self._load_model()
            train_data = self._load_train_data()

        self.info("training model")
        self._model.train(train_data[0], train_data[1], params)

        self.info('saving model')
        annif.util.atomic_save(
            self._model,
            self.datadir,
            self.MODEL_FILE,
            method=joblib.dump)

    def _generate_candidates(self, text):
        return self._model.generate_candidates(text, self.project.analyzer)

    def _prediction_to_result(self, prediction, params):
        vector = np.zeros(len(self.project.subjects), dtype=np.float32)
        for score, subject_id in prediction:
            vector[subject_id] = score
        result = VectorSuggestionResult(vector)
        return result.filter(self.project.subjects,
                             limit=int(params['limit']))

    def _suggest(self, text, params):
        candidates = self._generate_candidates(text)
        prediction = self._model.predict(candidates)
        return self._prediction_to_result(prediction, params)
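

# Sketch of how the pieces above fit together, with hypothetical project,
# corpus and params objects standing in for what the backend receives from
# Annif:
#
#   model = MLLMModel()
#   train_x, train_y = model.prepare_train(corpus, project.vocab,
#                                          project.analyzer, params)
#   model.train(train_x, train_y, params)
#   candidates = model.generate_candidates("text of a new document",
#                                          project.analyzer)
#   ranking = model.predict(candidates)  # [(score, subject_id), ...] best first
#
# MLLMBackend._train and _suggest wrap this flow, adding persistence with
# joblib and conversion of the ranking into a VectorSuggestionResult.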