"""MLLM (Maui-like Lexical Matching) model for Annif"""

import collections
import math
import joblib
from statistics import mean
from enum import IntEnum
import numpy as np
from rdflib import URIRef
from rdflib.namespace import SKOS
from scipy.sparse import lil_matrix, csc_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.util
from annif.lexical.tokenset import TokenSet, TokenSetIndex


Term = collections.namedtuple('Term', 'subject_id label is_pref')

Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')

Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related collection',
    start=0)


class MLLMModel:
    """Maui-like Lexical Matching model"""

    def _conflate_matches(self, matches, doc_length):
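        """Combine the Match objects of a document into one Candidate per
        distinct subject, averaging and normalizing the per-match fields."""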
        subj_matches = collections.defaultdict(list)
        for match in matches:
            subj_matches[match.subject_id].append(match)
        return [
            Candidate(
                doc_length=doc_length,
                subject_id=subject_id,
                freq=len(matches) / doc_length,
                is_pref=mean((float(m.is_pref) for m in matches)),
                n_tokens=mean((m.n_tokens for m in matches)),
                ambiguity=mean((m.ambiguity for m in matches)),
                first_occ=matches[0].pos / doc_length,
                last_occ=matches[-1].pos / doc_length,
                spread=(matches[-1].pos - matches[0].pos) / doc_length
            )
            for subject_id, matches in subj_matches.items()]

    def generate_candidates(self, text, analyzer):
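        """Generate subject Candidates for a text by matching the token sets
        of its sentences against the vocabulary term index."""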
        sentences = analyzer.tokenize_sentences(text)
        sent_tokens = self._vectorizer.transform(sentences)
        matches = []

        for sent_idx, token_matrix in enumerate(sent_tokens):
            tset = TokenSet(token_matrix.nonzero()[1])
            for ts, ambiguity in self._index.search(tset):
                matches.append(Match(subject_id=ts.subject_id,
                                     is_pref=ts.is_pref,
                                     n_tokens=len(ts),
                                     pos=sent_idx,
                                     ambiguity=ambiguity))

        return self._conflate_matches(matches, len(sentences))

    def _candidates_to_features(self, candidates):
        """Convert a list of Candidates to a NumPy feature matrix"""
        matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
        c_ids = [c.subject_id for c in candidates]
        c_vec = np.zeros(self._related_matrix.shape[0], dtype=bool)
        c_vec[c_ids] = True
        # count, for each subject, how many of its broader/narrower/related
        # subjects (and fellow collection members) are among the candidates
        broader = self._broader_matrix.multiply(c_vec).sum(axis=1)
        narrower = self._narrower_matrix.multiply(c_vec).sum(axis=1)
        related = self._related_matrix.multiply(c_vec).sum(axis=1)
        collection = self._collection_matrix.multiply(c_vec).T.dot(
            self._collection_matrix).sum(axis=0)
        for idx, c in enumerate(candidates):
            subj = c.subject_id
            matrix[idx, Feature.freq] = c.freq
            matrix[idx, Feature.doc_freq] = self._doc_freq[subj]
            matrix[idx, Feature.subj_freq] = self._subj_freq.get(subj, 1) - 1
            matrix[idx, Feature.tfidf] = c.freq * self._idf[subj]
            matrix[idx, Feature.is_pref] = c.is_pref
            matrix[idx, Feature.n_tokens] = c.n_tokens
            matrix[idx, Feature.ambiguity] = c.ambiguity
            matrix[idx, Feature.first_occ] = c.first_occ
            matrix[idx, Feature.last_occ] = c.last_occ
            matrix[idx, Feature.spread] = c.spread
            matrix[idx, Feature.doc_length] = c.doc_length
            matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
            matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
            matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
            matrix[idx, Feature.collection] = collection[0, subj] / len(c_ids)
        return matrix

    def _prepare_terms(self, graph, vocab, params):
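        """Collect the preferred and alternative labels (plus hidden labels
        when use_hidden_labels is enabled) of all active vocabulary subjects
        as Term objects. Returns the terms and the active subject IDs."""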
105
|
|
|
terms = [] |
106
|
|
|
subject_ids = [] |
107
|
|
|
for subj_id, uri, pref, _ in vocab.subjects.active: |
108
|
|
|
subject_ids.append(subj_id) |
109
|
|
|
terms.append(Term(subject_id=subj_id, label=pref, is_pref=True)) |
110
|
|
|
|
111
|
|
|
if annif.util.boolean(params['use_hidden_labels']): |
112
|
|
|
label_props = [SKOS.altLabel, SKOS.hiddenLabel] |
113
|
|
|
else: |
114
|
|
|
label_props = [SKOS.altLabel] |
115
|
|
|
|
116
|
|
|
for prop in label_props: |
117
|
|
|
for label in graph.objects(URIRef(uri), prop): |
118
|
|
|
if label.language != params['language']: |
119
|
|
|
continue |
120
|
|
|
terms.append(Term(subject_id=subj_id, |
121
|
|
|
label=str(label), |
122
|
|
|
is_pref=False)) |
123
|
|
|
return (terms, subject_ids) |
124
|
|
|
|
125
|
|
|
def _make_relation_matrix(self, graph, vocab, property): |
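        """Build a sparse boolean subject-by-subject matrix where cell
        (i, j) is True if subject i has the given SKOS relation (e.g.
        skos:broader) to subject j."""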
        n_subj = len(vocab.subjects)
        matrix = lil_matrix((n_subj, n_subj), dtype=bool)

        for subj_id, uri, pref, _ in vocab.subjects.active:
            for other in graph.objects(URIRef(uri), property):
                other_id = vocab.subjects.by_uri(str(other),
                                                 warnings=False)
                if other_id is not None:
                    matrix[subj_id, other_id] = True

        return csc_matrix(matrix)

    def _make_collection_matrix(self, graph, vocab):
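        """Build a sparse boolean collection-by-subject matrix where cell
        (i, j) is True if subject j is a skos:member of collection i."""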
        # make an index with all collection members
        c_members = collections.defaultdict(list)
        for coll, member in graph.subject_objects(SKOS.member):
            member_id = vocab.subjects.by_uri(str(member), warnings=False)
            if member_id is not None:
                c_members[str(coll)].append(member_id)

        c_matrix = lil_matrix((len(c_members), len(vocab.subjects)),
                              dtype=bool)

        # populate the matrix for collection -> subject_id
        for c_id, members in enumerate(c_members.values()):
            c_matrix[c_id, members] = True

        return csc_matrix(c_matrix)

    def _prepare_relations(self, graph, vocab):
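        """Precompute the broader, narrower, related and collection
        membership matrices from the vocabulary graph."""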
        self._broader_matrix = self._make_relation_matrix(
            graph, vocab, SKOS.broader)
        self._narrower_matrix = self._make_relation_matrix(
            graph, vocab, SKOS.narrower)
        self._related_matrix = self._make_relation_matrix(
            graph, vocab, SKOS.related)
        self._collection_matrix = self._make_collection_matrix(graph, vocab)

    def _prepare_train_index(self, vocab, analyzer, params):
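        """Set up the vocabulary-based structures needed for training: the
        relation matrices, the label vectorizer and the token set index.
        Returns the IDs of all active subjects."""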
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

    def _calculate_idf(self, subject_ids, doc_count):
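        """Compute smoothed inverse document frequency values for the given
        subjects from their document frequencies in the training corpus."""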
        idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            idf[subj_id] = math.log((doc_count + 1) /
                                    (self._doc_freq[subj_id] + 1)) + 1

        return idf

    def prepare_train(self, corpus, vocab, analyzer, params):
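        """Build the training data: generate candidates for each document in
        the corpus, collect frequency statistics, and return a feature matrix
        together with a boolean vector of gold-standard labels."""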
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        doc_count = 0
        train_x = []
        train_y = []
        for doc in corpus.documents:
            doc_subject_ids = [vocab.subjects.by_uri(uri)
                               for uri in doc.uris]
            self._subj_freq.update(doc_subject_ids)
            candidates = self.generate_candidates(doc.text, analyzer)
            self._doc_freq.update([c.subject_id for c in candidates])
            train_x.append(candidates)
            train_y += [(c.subject_id in doc_subject_ids) for c in candidates]
            doc_count += 1

        # precalculate idf values for all candidate subjects
        self._idf = self._calculate_idf(subject_ids, doc_count)

        return (np.vstack([self._candidates_to_features(candidates)
                           for candidates in train_x]), np.array(train_y))

    def _create_classifier(self, params):
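        """Create the bagged decision tree classifier, configured from the
        min_samples_leaf, max_leaf_nodes and max_samples hyperparameters."""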
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))

    def train(self, train_x, train_y, params):
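        """Fit the classifier on a feature matrix and label vector produced
        by prepare_train."""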
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)

    def _prediction_to_list(self, scores, candidates):
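        """Pair the positive-class probabilities with candidate subject IDs
        and sort them by descending score."""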
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
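        """Score the given candidates with the trained classifier and return
        (score, subject_id) pairs, best first."""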
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)

    def save(self, filename):
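        """Serialize the model to a file using joblib."""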
        return joblib.dump(self, filename)

    @staticmethod
    def load(filename):
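        """Load a previously saved model from a file."""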
        return joblib.load(filename)