Passed
Pull Request — master (#511) by Osma
01:55 · created

MLLMFeatureConverter.candidates_to_features()   A

Complexity
    Conditions: 1

Size
    Total Lines: 3
    Code Lines: 3

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric   Value
cc       1
eloc     3
nop      2
dl       0
loc      3
rs       10
c        0
b        0
f        0
"""MLLM (Maui-like Lexical Matching) model for Annif"""

import collections
import math
import joblib
from statistics import mean
from enum import IntEnum
import numpy as np
from rdflib.namespace import SKOS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.util
import annif.parallel
from annif.lexical.tokenset import TokenSet, TokenSetIndex
from annif.lexical.util import get_subject_labels
from annif.lexical.util import make_relation_matrix, make_collection_matrix


Term = collections.namedtuple('Term', 'subject_id label is_pref')

Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')

Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related collection',
    start=0)
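The Feature enum above fixes the column order of the per-candidate feature vectors built later in candidates_to_features(). A quick sanity check of the resulting indices (a sketch, assuming the module above is importable; not part of the original file):

# Column indices implied by the IntEnum above (start=0, fifteen members).
assert Feature.freq == 0
assert Feature.tfidf == 3
assert Feature.collection == 14
assert len(Feature) == 15   # width of each feature row in candidates_to_features()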
def conflate_matches(matches, doc_length):
    """Combine the sentence-level Matches into one Candidate per subject."""
    subj_matches = collections.defaultdict(list)
    for match in matches:
        subj_matches[match.subject_id].append(match)
    return [
        Candidate(
            doc_length=doc_length,
            subject_id=subject_id,
            freq=len(matches) / doc_length,
            is_pref=mean((float(m.is_pref) for m in matches)),
            n_tokens=mean((m.n_tokens for m in matches)),
            ambiguity=mean((m.ambiguity for m in matches)),
            first_occ=matches[0].pos / doc_length,
            last_occ=matches[-1].pos / doc_length,
            spread=(matches[-1].pos - matches[0].pos) / doc_length
        )
        for subject_id, matches in subj_matches.items()]
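As an illustration of the conflation above (a sketch, not part of the module): two matches for the same subject at sentence positions 1 and 4 of a ten-sentence document collapse into a single Candidate.

matches = [
    Match(subject_id=42, is_pref=True, n_tokens=2, pos=1, ambiguity=1),
    Match(subject_id=42, is_pref=False, n_tokens=1, pos=4, ambiguity=2),
]
[cand] = conflate_matches(matches, doc_length=10)
assert cand.freq == 0.2        # 2 matches / 10 sentences
assert cand.is_pref == 0.5     # mean of (1.0, 0.0)
assert (cand.first_occ, cand.last_occ, cand.spread) == (0.1, 0.4, 0.3)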
def generate_candidates(text, analyzer, vectorizer, index):
    """Find subject candidates for a document by matching its sentences
    against the token set index built from the vocabulary labels."""
    sentences = analyzer.tokenize_sentences(text)
    sent_tokens = vectorizer.transform(sentences)
    matches = []

    for sent_idx, token_matrix in enumerate(sent_tokens):
        tset = TokenSet(token_matrix.nonzero()[1])
        for ts, ambiguity in index.search(tset):
            matches.append(Match(subject_id=ts.subject_id,
                                 is_pref=ts.is_pref,
                                 n_tokens=len(ts),
                                 pos=sent_idx,
                                 ambiguity=ambiguity))

    return conflate_matches(matches, len(sentences))


def candidates_to_features(candidates,
                           related_matrix, broader_matrix,
                           narrower_matrix, collection_matrix,
                           doc_freq, subj_freq, idf):
    """Convert a list of Candidates to a NumPy feature matrix"""

    matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
    c_ids = [c.subject_id for c in candidates]
    # boolean mask marking which subjects occur among the candidates
    c_vec = np.zeros(related_matrix.shape[0], dtype=bool)
    c_vec[c_ids] = True
    broader = broader_matrix.multiply(c_vec).sum(axis=1)
    narrower = narrower_matrix.multiply(c_vec).sum(axis=1)
    related = related_matrix.multiply(c_vec).sum(axis=1)
    collection = collection_matrix.multiply(c_vec).T.dot(
        collection_matrix).sum(axis=0)
    for idx, c in enumerate(candidates):
        subj = c.subject_id
        matrix[idx, Feature.freq] = c.freq
        matrix[idx, Feature.doc_freq] = doc_freq[subj]
        matrix[idx, Feature.subj_freq] = subj_freq.get(subj, 1) - 1
        matrix[idx, Feature.tfidf] = c.freq * idf[subj]
        matrix[idx, Feature.is_pref] = c.is_pref
        matrix[idx, Feature.n_tokens] = c.n_tokens
        matrix[idx, Feature.ambiguity] = c.ambiguity
        matrix[idx, Feature.first_occ] = c.first_occ
        matrix[idx, Feature.last_occ] = c.last_occ
        matrix[idx, Feature.spread] = c.spread
        matrix[idx, Feature.doc_length] = c.doc_length
        matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
        matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
        matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
        matrix[idx, Feature.collection] = collection[0, subj] / len(c_ids)
    return matrix
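The broader/narrower/related/collection features above count, for each subject, how many of its related concepts are themselves among the current candidates, normalized by the number of candidates. A toy illustration with plain NumPy standing in for the sparse relation matrices (the names and values below are made up for the example):

import numpy as np

# 3 x 3 "broader" relation matrix: row i marks the broader concepts of subject i.
broader_matrix = np.array([[0, 1, 1],   # subject 0 has subjects 1 and 2 as broader concepts
                           [0, 0, 1],   # subject 1 has subject 2
                           [0, 0, 0]])  # subject 2 has none
c_vec = np.zeros(3, dtype=bool)
c_vec[[0, 2]] = True                    # the current candidates are subjects 0 and 2
# dense equivalent of broader_matrix.multiply(c_vec).sum(axis=1)
counts = (broader_matrix * c_vec).sum(axis=1)
print(counts)                           # [1 1 0] -> divided by len(c_ids) in the loop above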
class MLLMCandidateGenerator(annif.parallel.BaseWorker):

    @classmethod
    def generate_candidates(cls, doc_subject_ids, text):
        candidates = generate_candidates(text, **cls.args)
        return doc_subject_ids, candidates


class MLLMFeatureConverter(annif.parallel.BaseWorker):

    @classmethod
    def candidates_to_features(cls, candidates):
        return candidates_to_features(candidates, **cls.args)


class MLLMModel:
    """Maui-like Lexical Matching model"""

    def generate_candidates(self, text, analyzer):
        return generate_candidates(text, analyzer,
                                   self._vectorizer, self._index)

    def _candidates_to_features(self, candidates):
        return candidates_to_features(candidates,
                                      self._related_matrix,
                                      self._broader_matrix,
                                      self._narrower_matrix,
                                      self._collection_matrix,
                                      self._doc_freq,
                                      self._subj_freq,
                                      self._idf)

    def _prepare_terms(self, graph, vocab, params):
        if annif.util.boolean(params['use_hidden_labels']):
            label_props = [SKOS.altLabel, SKOS.hiddenLabel]
        else:
            label_props = [SKOS.altLabel]

        terms = []
        subject_ids = []
        for subj_id, uri, pref, _ in vocab.subjects.active:
            subject_ids.append(subj_id)
            terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))

            for label in get_subject_labels(graph, uri, label_props,
                                            params['language']):
                terms.append(Term(subject_id=subj_id,
                                  label=label,
                                  is_pref=False))

        return (terms, subject_ids)

    def _prepare_relations(self, graph, vocab):
        self._broader_matrix = make_relation_matrix(
            graph, vocab, SKOS.broader)
        self._narrower_matrix = make_relation_matrix(
            graph, vocab, SKOS.narrower)
        self._related_matrix = make_relation_matrix(
            graph, vocab, SKOS.related)
        self._collection_matrix = make_collection_matrix(graph, vocab)

    def _prepare_train_index(self, vocab, analyzer, params):
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        # frequency of each token used in labels - how rare each word is
        token_freq = np.bincount(label_corpus.indices,
                                 minlength=label_corpus.shape[1])

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            # sort tokens by frequency - use the rarest token as index key
            tokens = sorted(tokens, key=token_freq.__getitem__)
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

    def _prepare_train_data(self, corpus, vocab, analyzer, n_jobs):
        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        train_x = []
        train_y = []

        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        cg_args = {
            'analyzer': analyzer,
            'vectorizer': self._vectorizer,
            'index': self._index
        }

        with pool_class(jobs,
                        initializer=MLLMCandidateGenerator.init,
                        initargs=(cg_args,)) as pool:
            params = (([vocab.subjects.by_uri(uri) for uri in doc.uris],
                       doc.text)
                      for doc in corpus.documents)
            for doc_subject_ids, candidates in pool.starmap(
                    MLLMCandidateGenerator.generate_candidates, params, 10):

                self._subj_freq.update(doc_subject_ids)
                self._doc_freq.update([c.subject_id for c in candidates])
                train_x.append(candidates)
                train_y += [(c.subject_id in doc_subject_ids)
                            for c in candidates]

        return (train_x, train_y)

    def _calculate_idf(self, subject_ids, doc_count):
        idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            idf[subj_id] = math.log((doc_count + 1) /
                                    (self._doc_freq[subj_id] + 1)) + 1

        return idf
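The loop above uses a smoothed inverse document frequency, idf(s) = ln((doc_count + 1) / (doc_freq(s) + 1)) + 1, so subjects that turn up as candidates in few training documents get a higher weight for the tfidf feature. A small worked example (illustrative numbers only):

import math

# 100 training documents; the subject appeared as a candidate in 9 of them.
doc_count, subj_doc_freq = 100, 9
idf = math.log((doc_count + 1) / (subj_doc_freq + 1)) + 1
print(round(idf, 2))   # 3.31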
    def _prepare_features(self, train_x, n_jobs):
        fc_args = {
            'related_matrix': self._related_matrix,
            'broader_matrix': self._broader_matrix,
            'narrower_matrix': self._narrower_matrix,
            'collection_matrix': self._collection_matrix,
            'doc_freq': self._doc_freq,
            'subj_freq': self._subj_freq,
            'idf': self._idf
        }

        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        with pool_class(jobs,
                        initializer=MLLMFeatureConverter.init,
                        initargs=(fc_args,)) as pool:
            features = pool.map(
                MLLMFeatureConverter.candidates_to_features, train_x, 10)

        return features

    def prepare_train(self, corpus, vocab, analyzer, params, n_jobs):
        # create an index from the vocabulary terms
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # convert the corpus into train data
        train_x, train_y = self._prepare_train_data(
            corpus, vocab, analyzer, n_jobs)

        # precalculate idf values for all candidate subjects
        self._idf = self._calculate_idf(subject_ids, len(train_x))

        # convert the train data into feature values
        features = self._prepare_features(train_x, n_jobs)

        return (np.vstack(features), np.array(train_y))

    def _create_classifier(self, params):
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))

    def train(self, train_x, train_y, params):
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)

    def _prediction_to_list(self, scores, candidates):
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)

    def save(self, filename):
        return joblib.dump(self, filename)

    @staticmethod
    def load(filename):
        return joblib.load(filename)
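To tie the pieces together, a usage sketch based only on the public methods above (corpus, vocab, analyzer, params and n_jobs are assumed to be supplied by the surrounding Annif backend; the filename is made up):

model = MLLMModel()

# training: build the index, extract features from the corpus, fit the classifier
train_x, train_y = model.prepare_train(corpus, vocab, analyzer, params, n_jobs)
model.train(train_x, train_y, params)
model.save('mllm-model.gz')

# prediction: generate candidates for a new document and rank them
model = MLLMModel.load('mllm-model.gz')
candidates = model.generate_candidates("text of the document to index", analyzer)
for score, subject_id in model.predict(candidates):
    print(subject_id, score)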
303