Passed
Pull Request — master (#462), by Osma, created 03:40

annif.backend.mllm.TokenSet.__iter__()   A

Complexity:  1 condition
Size:        2 total lines, 2 code lines
Duplication: 0 lines (0 %)
Importance:  0 changes

Metric  Value
eloc    2
dl      0
loc     2
rs      10
c       0
b       0
f       0
cc      1
nop     1
"""Maui-like Lexical Matching backend"""

import collections
import math
from enum import IntEnum
from statistics import mean
import os.path
import joblib
import numpy as np
from rdflib import URIRef
from rdflib.namespace import SKOS
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.corpus
import annif.eval
import annif.util
from annif.exception import NotInitializedException
from annif.suggestion import VectorSuggestionResult
from . import backend
from . import hyperopt

Term = collections.namedtuple('Term', 'subject_id label is_pref')
Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')
Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'related',
    start=0)
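
# Illustrative note (an addition, not part of the original listing): because
# the enum starts at 0, Feature members double as column indices into the
# feature matrix built by MLLMModel._candidates_to_features() below, e.g.
#
#   >>> int(Feature.freq), int(Feature.related), len(Feature)
#   (0, 11, 12)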


class TokenSet:
    """Represents a set of tokens (expressed as integer token IDs) that can
    be matched with another set of tokens. A TokenSet can optionally
    be associated with a subject from the vocabulary."""

    def __init__(self, tokens, subject_id=None, is_pref=False):
        self._tokens = set(tokens)
        self.subject_id = subject_id
        self.is_pref = is_pref

    def __len__(self):
        return len(self._tokens)

    def __iter__(self):
        return iter(self._tokens)

    def contains(self, other):
        """Returns True iff the tokens in the other TokenSet are all
        included within this TokenSet."""

        return other._tokens.issubset(self._tokens)

    def sample(self):
        """Return an arbitrary token from this TokenSet, or None if empty"""
        try:
            return next(iter(self._tokens))
        except StopIteration:
            return None
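
    # Illustrative usage sketch (an addition, not from the PR), assuming
    # small integer token IDs:
    #
    #   >>> doc = TokenSet([1, 2, 3, 4])
    #   >>> term = TokenSet([2, 3], subject_id=42, is_pref=True)
    #   >>> doc.contains(term)
    #   True
    #   >>> term.contains(doc)
    #   False
    #   >>> TokenSet([]).sample() is None
    #   True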


class TokenSetIndex:
    """A searchable index of TokenSets (representing vocabulary terms)"""

    def __init__(self):
        self._index = collections.defaultdict(set)

    def __len__(self):
        return len(self._index)

    def add(self, tset):
        """Add a TokenSet into this index"""
        token = tset.sample()
        if token is not None:
            self._index[token].add(tset)

    def search(self, tset):
        """Return the TokenSets that are contained in the given TokenSet.
        The matches are returned as a list of (TokenSet, ambiguity) pairs
        where ambiguity is an integer indicating the number of other TokenSets
        that also match the same tokens."""

        subj_tsets = {}
        subj_ambiguity = collections.Counter()

        for token in tset:
            for ts in self._index[token]:
                if not tset.contains(ts):
                    continue
                if ts.subject_id not in subj_tsets or \
                   not subj_tsets[ts.subject_id].is_pref:
                    subj_tsets[ts.subject_id] = ts

        for ts in subj_tsets.values():
            for other in subj_tsets.values():
                if ts == other:
                    continue
                if other.contains(ts):
                    subj_ambiguity.update([ts.subject_id])

        return [(ts, subj_ambiguity[ts.subject_id])
                for ts in subj_tsets.values()]
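
    # Illustrative search sketch (an addition, not from the PR):
    #
    #   >>> index = TokenSetIndex()
    #   >>> index.add(TokenSet([1, 2], subject_id=7, is_pref=True))
    #   >>> index.add(TokenSet([1, 2, 3], subject_id=8, is_pref=True))
    #   >>> index.search(TokenSet([1, 2, 3, 4]))
    #
    # Both vocabulary TokenSets are contained in the document TokenSet, so
    # both are returned; the subject-7 set is itself contained in the
    # subject-8 set, so it gets ambiguity 1 while the subject-8 set gets 0.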


class MLLMModel:
    """Maui-like Lexical Matching model"""

    def _conflate_matches(self, matches, doc_length):
        subj_matches = collections.defaultdict(list)
        for match in matches:
            subj_matches[match.subject_id].append(match)
        return [
            Candidate(
                doc_length=doc_length,
                subject_id=subject_id,
                freq=len(matches) / doc_length,
                is_pref=mean((float(m.is_pref) for m in matches)),
                n_tokens=mean((m.n_tokens for m in matches)),
                ambiguity=mean((m.ambiguity for m in matches)),
                first_occ=matches[0].pos / doc_length,
                last_occ=matches[-1].pos / doc_length,
                spread=(matches[-1].pos - matches[0].pos) / doc_length
            )
            for subject_id, matches in subj_matches.items()]
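
    # Illustrative conflation sketch (an addition, not from the PR): with
    # doc_length=10 and two matches for subject 42 at sentence positions
    # 1 and 4, e.g.
    #
    #   Match(subject_id=42, is_pref=True, n_tokens=2, pos=1, ambiguity=0)
    #   Match(subject_id=42, is_pref=False, n_tokens=1, pos=4, ambiguity=2)
    #
    # the resulting Candidate has freq=0.2, is_pref=0.5, n_tokens=1.5,
    # ambiguity=1, first_occ=0.1, last_occ=0.4 and spread=0.3.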

    def generate_candidates(self, text, analyzer):
        sentences = analyzer.tokenize_sentences(text)
        sent_tokens = self._vectorizer.transform(sentences)
        matches = []

        for sent_idx, token_matrix in enumerate(sent_tokens):
            tset = TokenSet(token_matrix.nonzero()[1])
            for ts, ambiguity in self._index.search(tset):
                matches.append(Match(subject_id=ts.subject_id,
                                     is_pref=ts.is_pref,
                                     n_tokens=len(ts),
                                     pos=sent_idx,
                                     ambiguity=ambiguity))

        return self._conflate_matches(matches, len(sentences))

    def _candidates_to_features(self, candidates):
        """Convert a list of Candidates to a NumPy feature matrix"""
        matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
        c_ids = [c.subject_id for c in candidates]
        c_vec = np.zeros(self._related_matrix.shape[0], dtype=np.bool)
        c_vec[c_ids] = True
        # for each subject, count how many of the candidate subjects it is
        # skos:related to
        rels = self._related_matrix.multiply(c_vec).sum(axis=1)
        for idx, c in enumerate(candidates):
            subj = c.subject_id
            matrix[idx, Feature.freq] = c.freq
            matrix[idx, Feature.doc_freq] = self._doc_freq[subj]
            matrix[idx, Feature.subj_freq] = self._subj_freq.get(subj, 1) - 1
            matrix[idx, Feature.tfidf] = c.freq * self._idf[subj]
            matrix[idx, Feature.is_pref] = c.is_pref
            matrix[idx, Feature.n_tokens] = c.n_tokens
            matrix[idx, Feature.ambiguity] = c.ambiguity
            matrix[idx, Feature.first_occ] = c.first_occ
            matrix[idx, Feature.last_occ] = c.last_occ
            matrix[idx, Feature.spread] = c.spread
            matrix[idx, Feature.doc_length] = c.doc_length
            matrix[idx, Feature.related] = rels[subj, 0] / len(c_ids)
        return matrix

    def prepare_train(self, corpus, vocab, analyzer, params):
        graph = vocab.as_graph()
        terms = []
        subject_ids = []
        n_subj = len(vocab.subjects)
        self._related_matrix = lil_matrix((n_subj, n_subj), dtype=np.bool)
        for subj_id, (uri, pref, _) in enumerate(vocab.subjects):
            if pref is None:
                continue  # deprecated subject
            subject_ids.append(subj_id)
            terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))

            if annif.util.boolean(params['use_hidden_labels']):
                props = [SKOS.altLabel, SKOS.hiddenLabel]
            else:
                props = [SKOS.altLabel]

            non_pref = graph.preferredLabel(URIRef(uri),
                                            lang=params['language'],
                                            labelProperties=props)
            # preferredLabel returns (labelProperty, label) pairs
            for _, label in non_pref:
                terms.append(Term(subject_id=subj_id,
                                  label=str(label),
                                  is_pref=False))

            for related in graph.objects(URIRef(uri), SKOS.related):
                related_id = vocab.subjects.by_uri(str(related),
                                                   warnings=False)
                if related_id is not None:
                    self._related_matrix[subj_id, related_id] = True

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        doc_count = 0
        train_x = []
        train_y = []
        for idx, doc in enumerate(corpus.documents):
            doc_subject_ids = [vocab.subjects.by_uri(uri)
                               for uri in doc.uris]
            self._subj_freq.update(doc_subject_ids)
            candidates = self.generate_candidates(doc.text, analyzer)
            self._doc_freq.update([c.subject_id for c in candidates])
            train_x.append(candidates)
            train_y += [(c.subject_id in doc_subject_ids) for c in candidates]
            doc_count += 1

        # precalculate idf values for candidate subjects
        self._idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            self._idf[subj_id] = math.log((doc_count + 1) /
                                          (self._doc_freq[subj_id] + 1)) + 1
        return (np.vstack([self._candidates_to_features(candidates)
                           for candidates in train_x]), np.array(train_y))
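
    # Illustrative note (an addition, not from the PR): the returned pair is
    # (train_x, train_y), where train_x has one float32 row per candidate
    # with len(Feature) == 12 columns, and train_y is a parallel boolean
    # array that is True when the candidate subject was manually assigned
    # to the document.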

    def _create_classifier(self, params):
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))
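
    # Illustrative note (an addition, not from the PR): with the defaults in
    # MLLMBackend.DEFAULT_PARAMETERS further below, this builds roughly
    #
    #   BaggingClassifier(
    #       DecisionTreeClassifier(min_samples_leaf=20, max_leaf_nodes=1000),
    #       max_samples=0.9)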

    def train(self, train_x, train_y, params):
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)

    def _prediction_to_list(self, scores, candidates):
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)
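
    # Illustrative note (an addition, not from the PR): predict_proba returns
    # one (negative, positive) probability row per candidate, so scores of
    # [[0.9, 0.1], [0.2, 0.8]] for candidate subjects 5 and 7 yield
    # [(0.8, 7), (0.1, 5)], i.e. highest positive-class score first.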


class MLLMOptimizer(hyperopt.HyperparameterOptimizer):
    """Hyperparameter optimizer for the MLLM backend"""

    def _prepare(self, n_jobs=1):
        self._backend.initialize()
        self._train_x, self._train_y = self._backend._load_train_data()
        self._candidates = []
        self._gold_subjects = []

        # TODO parallelize generation of candidates
        for doc in self._corpus.documents:
            candidates = self._backend._generate_candidates(doc.text)
            self._candidates.append(candidates)
            self._gold_subjects.append(
                annif.corpus.SubjectSet((doc.uris, doc.labels)))

    def _objective(self, trial):
        params = {
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 5, 30),
            'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 100, 2000),
            'max_samples': trial.suggest_float('max_samples', 0.5, 1.0),
            'use_hidden_labels':
                trial.suggest_categorical('use_hidden_labels', [True, False]),
            'limit': 100
        }
        model = self._backend._model._create_classifier(params)
        model.fit(self._train_x, self._train_y)

        batch = annif.eval.EvaluationBatch(self._backend.project.subjects)
        for goldsubj, candidates in zip(self._gold_subjects, self._candidates):
            if candidates:
                features = \
                    self._backend._model._candidates_to_features(candidates)
                scores = model.predict_proba(features)
                ranking = self._backend._model._prediction_to_list(
                    scores, candidates)
            else:
                ranking = []
            results = self._backend._prediction_to_result(ranking, params)
            batch.evaluate(results, goldsubj)
        results = batch.results(metrics=[self._metric])
        return results[self._metric]

    def _postprocess(self, study):
        bp = study.best_params
        lines = [
            f"min_samples_leaf={bp['min_samples_leaf']}",
            f"max_leaf_nodes={bp['max_leaf_nodes']}",
            f"max_samples={bp['max_samples']:.4f}",
            f"use_hidden_labels={bp['use_hidden_labels']}"
        ]
        return hyperopt.HPRecommendation(lines=lines, score=study.best_value)


class MLLMBackend(hyperopt.AnnifHyperoptBackend):
    """Maui-like Lexical Matching backend for Annif"""
    name = "mllm"
    needs_subject_index = True

    # defaults for uninitialized instances
    _model = None

    MODEL_FILE = 'mllm-model.gz'
    TRAIN_FILE = 'mllm-train.gz'

    DEFAULT_PARAMETERS = {
        'min_samples_leaf': 20,
        'max_leaf_nodes': 1000,
        'max_samples': 0.9,
        'use_hidden_labels': False
    }
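
    # Illustrative projects.cfg sketch (an assumption, not part of the PR;
    # the project id and vocab name below are hypothetical):
    #
    #   [yso-mllm-en]
    #   name=YSO MLLM English
    #   language=en
    #   backend=mllm
    #   vocab=yso-en
    #   analyzer=snowball(english)
    #   limit=100
    #   use_hidden_labels=False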

    def get_hp_optimizer(self, corpus, metric):
        return MLLMOptimizer(self, corpus, metric)

    def default_params(self):
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

    def _load_model(self):
        path = os.path.join(self.datadir, self.MODEL_FILE)
        self.debug('loading model from {}'.format(path))
        if os.path.exists(path):
            return joblib.load(path)
        else:
            raise NotInitializedException(
                'model {} not found'.format(path),
                backend_id=self.backend_id)

    def _load_train_data(self):
        path = os.path.join(self.datadir, self.TRAIN_FILE)
        if os.path.exists(path):
            return joblib.load(path)
        else:
            raise NotInitializedException(
                'train data file {} not found'.format(path),
                backend_id=self.backend_id)

    def initialize(self):
        if self._model is None:
            self._model = self._load_model()

    def _train(self, corpus, params):
        self.info('starting train')
        if corpus != 'cached':
            self.info("preparing training data")
            self._model = MLLMModel()
            train_data = self._model.prepare_train(corpus,
                                                   self.project.vocab,
                                                   self.project.analyzer,
                                                   params)
            annif.util.atomic_save(train_data,
                                   self.datadir,
                                   self.TRAIN_FILE,
                                   method=joblib.dump)
        else:
            self.info("reusing cached training data from previous run")
            self._model = self._load_model()
            train_data = self._load_train_data()

        self.info("training model")
        self._model.train(train_data[0], train_data[1], params)

        self.info('saving model')
        annif.util.atomic_save(
            self._model,
            self.datadir,
            self.MODEL_FILE,
            method=joblib.dump)

    def _generate_candidates(self, text):
        return self._model.generate_candidates(text, self.project.analyzer)

    def _prediction_to_result(self, prediction, params):
        vector = np.zeros(len(self.project.subjects), dtype=np.float32)
        for score, subject_id in prediction:
            vector[subject_id] = score
        result = VectorSuggestionResult(vector)
        return result.filter(self.project.subjects,
                             limit=int(params['limit']))

    def _suggest(self, text, params):
        candidates = self._generate_candidates(text)
        prediction = self._model.predict(candidates)
        return self._prediction_to_result(prediction, params)
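
# Illustrative CLI sketch (an assumption, not part of the PR; the project id
# is hypothetical): once a project is configured with backend=mllm, it is
# trained and queried through the usual Annif commands, e.g.
#
#   annif train yso-mllm-en /path/to/training/corpus
#   annif suggest yso-mllm-en < document.txt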