"""Maui-like Lexical Matching backend"""

import collections
import math
from enum import IntEnum
from statistics import mean
import os.path
import joblib
import numpy as np
from rdflib import URIRef
from rdflib.namespace import SKOS
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.corpus
import annif.eval
import annif.util
from annif.exception import NotInitializedException
from annif.suggestion import VectorSuggestionResult
from . import backend
from . import hyperopt

Term = collections.namedtuple('Term', 'subject_id label is_pref')
Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')
Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related',
    start=0)
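
# Each Feature member gives the column index of that feature in the
# per-candidate feature matrix assembled in MLLMModel._candidates_to_features.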


class TokenSet:
    """Represents a set of tokens (expressed as integer token IDs) that can
    be matched with another set of tokens. A TokenSet can optionally
    be associated with a subject from the vocabulary."""

    def __init__(self, tokens, subject_id=None, is_pref=False):
        self._tokens = set(tokens)
        self.subject_id = subject_id
        self.is_pref = is_pref

    def __len__(self):
        return len(self._tokens)

    def __iter__(self):
        return iter(self._tokens)

    def contains(self, other):
        """Returns True iff the tokens in the other TokenSet are all
        included within this TokenSet."""

        return other._tokens.issubset(self._tokens)

    def sample(self):
        """Return an arbitrary token from this TokenSet, or None if empty"""
        try:
            return next(iter(self._tokens))
        except StopIteration:
            return None
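
    # Illustrative usage (a sketch; the token IDs are arbitrary integers, in
    # practice the column indices assigned by the CountVectorizer vocabulary):
    #
    #     ts = TokenSet([1, 2, 3], subject_id=42, is_pref=True)
    #     TokenSet([1, 2, 3, 4]).contains(ts)   # True
    #     TokenSet([1, 2]).contains(ts)         # False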


class TokenSetIndex:
    """A searchable index of TokenSets (representing vocabulary terms)"""

    def __init__(self):
        self._index = collections.defaultdict(set)

    def __len__(self):
        return len(self._index)

    def add(self, tset):
        """Add a TokenSet into this index"""
        token = tset.sample()
        if token is not None:
            self._index[token].add(tset)
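
    # Indexing each TokenSet under a single sampled token is sufficient for
    # search() below: a TokenSet can only match a document if *all* of its
    # tokens occur there, including the sampled one, so scanning the buckets
    # of the document's tokens cannot miss a valid match.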

    def search(self, tset):
        """Return the TokenSets that are contained in the given TokenSet.
        The matches are returned as a list of (TokenSet, ambiguity) pairs
        where ambiguity is an integer indicating the number of other TokenSets
        that also match the same tokens."""

        subj_tsets = {}
        subj_ambiguity = collections.Counter()

        for token in tset:
            for ts in self._index[token]:
                if not tset.contains(ts):
                    continue
                if ts.subject_id not in subj_tsets or \
                        not subj_tsets[ts.subject_id].is_pref:
                    subj_tsets[ts.subject_id] = ts

        for ts in subj_tsets.values():
            for other in subj_tsets.values():
                if ts == other:
                    continue
                if other.contains(ts):
                    subj_ambiguity.update([ts.subject_id])

        return [(ts, subj_ambiguity[ts.subject_id])
                for ts in subj_tsets.values()]
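
    # Example of the ambiguity count: if the vocabulary contains terms for
    # both "machine learning" and "learning" and a sentence mentions
    # "machine learning", both TokenSets match; "learning" gets ambiguity 1
    # because the larger "machine learning" TokenSet also covers its tokens,
    # while "machine learning" itself gets ambiguity 0.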


class MLLMModel:
    """Maui-like Lexical Matching model"""

    def _conflate_matches(self, matches, doc_length):
        subj_matches = collections.defaultdict(list)
        for match in matches:
            subj_matches[match.subject_id].append(match)
        return [
            Candidate(
                doc_length=doc_length,
                subject_id=subject_id,
                freq=len(matches) / doc_length,
                is_pref=mean((float(m.is_pref) for m in matches)),
                n_tokens=mean((m.n_tokens for m in matches)),
                ambiguity=mean((m.ambiguity for m in matches)),
                first_occ=matches[0].pos / doc_length,
                last_occ=matches[-1].pos / doc_length,
                spread=(matches[-1].pos - matches[0].pos) / doc_length
            )
            for subject_id, matches in subj_matches.items()]
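
    # _conflate_matches merges all Matches for a subject into one Candidate;
    # doc_length is the number of sentences, so freq, first_occ, last_occ and
    # spread are all expressed as fractions of the document length.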

    def generate_candidates(self, text, analyzer):
        sentences = analyzer.tokenize_sentences(text)
        sent_tokens = self._vectorizer.transform(sentences)
        matches = []

        for sent_idx, token_matrix in enumerate(sent_tokens):
            tset = TokenSet(token_matrix.nonzero()[1])
            for ts, ambiguity in self._index.search(tset):
                matches.append(Match(subject_id=ts.subject_id,
                                     is_pref=ts.is_pref,
                                     n_tokens=len(ts),
                                     pos=sent_idx,
                                     ambiguity=ambiguity))

        return self._conflate_matches(matches, len(sentences))
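
    # Candidate generation: split the text into sentences, map each sentence
    # to a bag of token IDs with the fitted CountVectorizer, look up matching
    # vocabulary TokenSets per sentence (pos is the sentence index), and
    # conflate the resulting Matches into one Candidate per subject.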

    def _candidates_to_features(self, candidates):
        """Convert a list of Candidates to a NumPy feature matrix"""
        matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
        c_ids = [c.subject_id for c in candidates]
        c_vec = np.zeros(self._related_matrix.shape[0], dtype=bool)
        c_vec[c_ids] = True
        broader = self._broader_matrix.multiply(c_vec).sum(axis=1)
        narrower = self._narrower_matrix.multiply(c_vec).sum(axis=1)
        related = self._related_matrix.multiply(c_vec).sum(axis=1)
        for idx, c in enumerate(candidates):
            subj = c.subject_id
            matrix[idx, Feature.freq] = c.freq
            matrix[idx, Feature.doc_freq] = self._doc_freq[subj]
            matrix[idx, Feature.subj_freq] = self._subj_freq.get(subj, 1) - 1
            matrix[idx, Feature.tfidf] = c.freq * self._idf[subj]
            matrix[idx, Feature.is_pref] = c.is_pref
            matrix[idx, Feature.n_tokens] = c.n_tokens
            matrix[idx, Feature.ambiguity] = c.ambiguity
            matrix[idx, Feature.first_occ] = c.first_occ
            matrix[idx, Feature.last_occ] = c.last_occ
            matrix[idx, Feature.spread] = c.spread
            matrix[idx, Feature.doc_length] = c.doc_length
            matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
            matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
            matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
        return matrix
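
    # One row per Candidate, one column per Feature. The broader, narrower
    # and related columns hold the fraction of all candidate subjects that
    # are SKOS broader/narrower/related concepts of the row's subject,
    # computed from the sparse relation matrices.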

    def _prepare_terms(self, graph, vocab, params):
        terms = []
        subject_ids = []
        for subj_id, (uri, pref, _) in enumerate(vocab.subjects):
            if pref is None:
                continue  # deprecated subject
            subject_ids.append(subj_id)
            terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))

            if annif.util.boolean(params['use_hidden_labels']):
                label_props = [SKOS.altLabel, SKOS.hiddenLabel]
            else:
                label_props = [SKOS.altLabel]

            for prop in label_props:
                for label in graph.objects(URIRef(uri), prop):
                    if label.language != params['language']:
                        continue
                    terms.append(Term(subject_id=subj_id,
                                      label=str(label),
                                      is_pref=False))
        return (terms, subject_ids)

    def _prepare_relations(self, graph, vocab):
        n_subj = len(vocab.subjects)
        self._broader_matrix = lil_matrix((n_subj, n_subj), dtype=bool)
        self._narrower_matrix = lil_matrix((n_subj, n_subj), dtype=bool)
        self._related_matrix = lil_matrix((n_subj, n_subj), dtype=bool)

        prop_matrix = [
            (SKOS.broader, self._broader_matrix),
            (SKOS.narrower, self._narrower_matrix),
            (SKOS.related, self._related_matrix)
        ]

        for subj_id, (uri, pref, _) in enumerate(vocab.subjects):
            if pref is None:
                continue  # deprecated subject

            for prop, matrix in prop_matrix:
                for other in graph.objects(URIRef(uri), prop):
                    other_id = vocab.subjects.by_uri(str(other),
                                                     warnings=False)
                    if other_id is not None:
                        matrix[subj_id, other_id] = True
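
    # The three matrices are boolean adjacency matrices over the subject
    # vocabulary: cell [s, o] is True when subject s has the corresponding
    # SKOS relation (broader, narrower or related) to subject o.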

    def _prepare_train_index(self, vocab, analyzer, params):
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform(t.label for t in terms)

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids
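
    # The CountVectorizer is fitted on the vocabulary labels only; the same
    # vectorizer is reused in generate_candidates(), so document sentences
    # are mapped into the same token ID space as the indexed terms.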

    def prepare_train(self, corpus, vocab, analyzer, params):
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        doc_count = 0
        train_x = []
        train_y = []
        for doc in corpus.documents:
            doc_subject_ids = [vocab.subjects.by_uri(uri)
                               for uri in doc.uris]
            self._subj_freq.update(doc_subject_ids)
            candidates = self.generate_candidates(doc.text, analyzer)
            self._doc_freq.update([c.subject_id for c in candidates])
            train_x.append(candidates)
            train_y += [(c.subject_id in doc_subject_ids) for c in candidates]
            doc_count += 1

        # precalculate idf values for candidate subjects
        self._idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            self._idf[subj_id] = math.log((doc_count + 1) /
                                          (self._doc_freq[subj_id] + 1)) + 1
        return (np.vstack([self._candidates_to_features(candidates)
                           for candidates in train_x]), np.array(train_y))
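
    # The idf values use the smoothed formula log((N + 1) / (df + 1)) + 1,
    # where N is the number of training documents and df the number of
    # documents where the subject appeared as a candidate; this matches the
    # smooth_idf behaviour of scikit-learn's TfidfTransformer and avoids
    # division by zero for unseen subjects.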

    def _create_classifier(self, params):
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))
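
    # The classifier is an ensemble of bagged decision trees:
    # min_samples_leaf and max_leaf_nodes regularize the individual trees,
    # while max_samples sets the fraction of training samples drawn for each
    # tree in the ensemble.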

    def train(self, train_x, train_y, params):
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)

    def _prediction_to_list(self, scores, candidates):
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)
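
    # predict() returns (score, subject_id) pairs sorted by descending score,
    # where score is the positive-class probability from predict_proba, i.e.
    # the estimated probability that the candidate is a correct subject.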


class MLLMOptimizer(hyperopt.HyperparameterOptimizer):
    """Hyperparameter optimizer for the MLLM backend"""

    def _prepare(self, n_jobs=1):
        self._backend.initialize()
        self._train_x, self._train_y = self._backend._load_train_data()
        self._candidates = []
        self._gold_subjects = []

        # TODO parallelize generation of candidates
        for doc in self._corpus.documents:
            candidates = self._backend._generate_candidates(doc.text)
            self._candidates.append(candidates)
            self._gold_subjects.append(
                annif.corpus.SubjectSet((doc.uris, doc.labels)))

    def _objective(self, trial):
        params = {
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 5, 30),
            'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 100, 2000),
            'max_samples': trial.suggest_float('max_samples', 0.5, 1.0),
            'use_hidden_labels':
                trial.suggest_categorical('use_hidden_labels', [True, False]),
            'limit': 100
        }
        model = self._backend._model._create_classifier(params)
        model.fit(self._train_x, self._train_y)

        batch = annif.eval.EvaluationBatch(self._backend.project.subjects)
        for goldsubj, candidates in zip(self._gold_subjects,
                                        self._candidates):
            if candidates:
                features = \
                    self._backend._model._candidates_to_features(candidates)
                scores = model.predict_proba(features)
                ranking = self._backend._model._prediction_to_list(
                    scores, candidates)
            else:
                ranking = []
            results = self._backend._prediction_to_result(ranking, params)
            batch.evaluate(results, goldsubj)
        results = batch.results(metrics=[self._metric])
        return results[self._metric]

    def _postprocess(self, study):
        bp = study.best_params
        lines = [
            f"min_samples_leaf={bp['min_samples_leaf']}",
            f"max_leaf_nodes={bp['max_leaf_nodes']}",
            f"max_samples={bp['max_samples']:.4f}",
            f"use_hidden_labels={bp['use_hidden_labels']}"
        ]
        return hyperopt.HPRecommendation(lines=lines, score=study.best_value)


class MLLMBackend(hyperopt.AnnifHyperoptBackend):
    """Maui-like Lexical Matching backend for Annif"""
    name = "mllm"
    needs_subject_index = True

    # defaults for uninitialized instances
    _model = None

    MODEL_FILE = 'mllm-model.gz'
    TRAIN_FILE = 'mllm-train.gz'

    DEFAULT_PARAMETERS = {
        'min_samples_leaf': 20,
        'max_leaf_nodes': 1000,
        'max_samples': 0.9,
        'use_hidden_labels': False
    }
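
    # Example project configuration using this backend (an illustrative
    # sketch; the project ID, vocabulary and analyzer are placeholders):
    #
    #   [mllm-english]
    #   name=MLLM English
    #   language=en
    #   backend=mllm
    #   vocab=my-vocab
    #   analyzer=snowball(english)
    #   min_samples_leaf=20
    #   max_leaf_nodes=1000
    #   max_samples=0.9
    #   use_hidden_labels=False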

    def get_hp_optimizer(self, corpus, metric):
        return MLLMOptimizer(self, corpus, metric)

    def default_params(self):
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

    def _load_model(self):
        path = os.path.join(self.datadir, self.MODEL_FILE)
        self.debug('loading model from {}'.format(path))
        if os.path.exists(path):
            return joblib.load(path)
        else:
            raise NotInitializedException(
                'model {} not found'.format(path),
                backend_id=self.backend_id)

    def _load_train_data(self):
        path = os.path.join(self.datadir, self.TRAIN_FILE)
        if os.path.exists(path):
            return joblib.load(path)
        else:
            raise NotInitializedException(
                'train data file {} not found'.format(path),
                backend_id=self.backend_id)

    def initialize(self):
        if self._model is None:
            self._model = self._load_model()

    def _train(self, corpus, params):
        self.info('starting train')
        if corpus != 'cached':
            self.info("preparing training data")
            self._model = MLLMModel()
            train_data = self._model.prepare_train(corpus,
                                                   self.project.vocab,
                                                   self.project.analyzer,
                                                   params)
            annif.util.atomic_save(train_data,
                                   self.datadir,
                                   self.TRAIN_FILE,
                                   method=joblib.dump)
        else:
            self.info("reusing cached training data from previous run")
            self._model = self._load_model()
            train_data = self._load_train_data()

        self.info("training model")
        self._model.train(train_data[0], train_data[1], params)

        self.info('saving model')
        annif.util.atomic_save(
            self._model,
            self.datadir,
            self.MODEL_FILE,
            method=joblib.dump)
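
    # When corpus is the string 'cached' (e.g. when training with Annif's
    # --cached option), candidate generation is skipped and the feature data
    # saved in mllm-train.gz by a previous run is reused; only the classifier
    # is retrained.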

    def _generate_candidates(self, text):
        return self._model.generate_candidates(text, self.project.analyzer)

    def _prediction_to_result(self, prediction, params):
        vector = np.zeros(len(self.project.subjects), dtype=np.float32)
        for score, subject_id in prediction:
            vector[subject_id] = score
        result = VectorSuggestionResult(vector)
        return result.filter(self.project.subjects,
                             limit=int(params['limit']))

    def _suggest(self, text, params):
        candidates = self._generate_candidates(text)
        prediction = self._model.predict(candidates)
        return self._prediction_to_result(prediction, params)