Passed
Pull Request — master (#511), by Osma, created 01:38

annif.lexical.mllm.MLLMModel._prepare_features()   A

Complexity: Conditions 2
Size: Total Lines 11, Code Lines 9
Duplication: Lines 0, Ratio 0 %
Importance: Changes 0
Metric  Value
cc      2
eloc    9
nop     3
dl      0
loc     11
rs      9.95
c       0
b       0
f       0
"""MLLM (Maui-like Lexical Matching) model for Annif"""

import collections
import math
import joblib
from statistics import mean
from enum import IntEnum
import numpy as np
from rdflib.namespace import SKOS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.util
import annif.parallel
from annif.lexical.tokenset import TokenSet, TokenSetIndex
from annif.lexical.util import get_subject_labels
from annif.lexical.util import make_relation_matrix, make_collection_matrix


Term = collections.namedtuple('Term', 'subject_id label is_pref')

Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')

Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

ModelData = collections.namedtuple(
    'ModelData',
    'broader narrower related collection ' +
    'doc_freq subj_freq idf')

Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related collection',
    start=0)


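# Collapse per-sentence Matches into one Candidate per subject: match-level
# values are averaged and positions are expressed as fractions of the
# document length (number of sentences).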
def conflate_matches(matches, doc_length):
    subj_matches = collections.defaultdict(list)
    for match in matches:
        subj_matches[match.subject_id].append(match)
    return [
        Candidate(
            doc_length=doc_length,
            subject_id=subject_id,
            freq=len(matches) / doc_length,
            is_pref=mean((float(m.is_pref) for m in matches)),
            n_tokens=mean((m.n_tokens for m in matches)),
            ambiguity=mean((m.ambiguity for m in matches)),
            first_occ=matches[0].pos / doc_length,
            last_occ=matches[-1].pos / doc_length,
            spread=(matches[-1].pos - matches[0].pos) / doc_length
        )
        for subject_id, matches in subj_matches.items()]


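# Tokenize the text into sentences, vectorize each sentence with the label
# vocabulary and look up matching subject labels in the TokenSetIndex; the
# sentence index serves as the match position.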
def generate_candidates(text, analyzer, vectorizer, index):
    sentences = analyzer.tokenize_sentences(text)
    sent_tokens = vectorizer.transform(sentences)
    matches = []

    for sent_idx, token_matrix in enumerate(sent_tokens):
        tset = TokenSet(token_matrix.nonzero()[1])
        for ts, ambiguity in index.search(tset):
            matches.append(Match(subject_id=ts.subject_id,
                                 is_pref=ts.is_pref,
                                 n_tokens=len(ts),
                                 pos=sent_idx,
                                 ambiguity=ambiguity))

    return conflate_matches(matches, len(sentences))


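# The broader/narrower/related/collection features below measure, for each
# candidate, roughly how many of the document's other candidate subjects are
# linked to it via the corresponding SKOS relation (or share a collection
# with it), normalized by the total number of candidates.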
def candidates_to_features(candidates, mdata):
    """Convert a list of Candidates to a NumPy feature matrix"""

    matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
    c_ids = [c.subject_id for c in candidates]
    c_vec = np.zeros(mdata.related.shape[0], dtype=bool)
    c_vec[c_ids] = True
    broader = mdata.broader.multiply(c_vec).sum(axis=1)
    narrower = mdata.narrower.multiply(c_vec).sum(axis=1)
    related = mdata.related.multiply(c_vec).sum(axis=1)
    collection = mdata.collection.multiply(c_vec).T.dot(
        mdata.collection).sum(axis=0)
    for idx, c in enumerate(candidates):
        subj = c.subject_id
        matrix[idx, Feature.freq] = c.freq
        matrix[idx, Feature.doc_freq] = mdata.doc_freq[subj]
        matrix[idx, Feature.subj_freq] = mdata.subj_freq.get(subj, 1) - 1
        matrix[idx, Feature.tfidf] = c.freq * mdata.idf[subj]
        matrix[idx, Feature.is_pref] = c.is_pref
        matrix[idx, Feature.n_tokens] = c.n_tokens
        matrix[idx, Feature.ambiguity] = c.ambiguity
        matrix[idx, Feature.first_occ] = c.first_occ
        matrix[idx, Feature.last_occ] = c.last_occ
        matrix[idx, Feature.spread] = c.spread
        matrix[idx, Feature.doc_length] = c.doc_length
        matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
        matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
        matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
        matrix[idx, Feature.collection] = collection[0, subj] / len(c_ids)
    return matrix


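# Multiprocessing workers: the pool initializer stores the shared arguments
# (analyzer, vectorizer and index, or the model data) as cls.args in each
# worker process, so the heavy objects are set up once per process instead of
# being passed along with every task.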
class MLLMCandidateGenerator(annif.parallel.BaseWorker):

    @classmethod
    def generate_candidates(cls, doc_subject_ids, text):
        candidates = generate_candidates(text, **cls.args)
        return doc_subject_ids, candidates


class MLLMFeatureConverter(annif.parallel.BaseWorker):

    @classmethod
    def candidates_to_features(cls, candidates):
        return candidates_to_features(candidates,
                                      **cls.args)  # pragma: no cover


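# Typical lifecycle: prepare_train() turns a vocabulary and a training corpus
# into a feature matrix and boolean labels, train() fits a bagged decision
# tree classifier on them, and generate_candidates() followed by predict()
# scores candidate subjects for new text.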
class MLLMModel:
    """Maui-like Lexical Matching model"""

    def generate_candidates(self, text, analyzer):
        return generate_candidates(text, analyzer,
                                   self._vectorizer, self._index)

    @property
    def _model_data(self):
        return ModelData(broader=self._broader_matrix,
                         narrower=self._narrower_matrix,
                         related=self._related_matrix,
                         collection=self._collection_matrix,
                         doc_freq=self._doc_freq,
                         subj_freq=self._subj_freq,
                         idf=self._idf)

    def _candidates_to_features(self, candidates):
        return candidates_to_features(candidates, self._model_data)

    def _prepare_terms(self, graph, vocab, params):
        if annif.util.boolean(params['use_hidden_labels']):
            label_props = [SKOS.altLabel, SKOS.hiddenLabel]
        else:
            label_props = [SKOS.altLabel]

        terms = []
        subject_ids = []
        for subj_id, uri, pref, _ in vocab.subjects.active:
            subject_ids.append(subj_id)
            terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))

            for label in get_subject_labels(graph, uri, label_props,
                                            params['language']):
                terms.append(Term(subject_id=subj_id,
                                  label=label,
                                  is_pref=False))

        return (terms, subject_ids)

    def _prepare_relations(self, graph, vocab):
        self._broader_matrix = make_relation_matrix(
            graph, vocab, SKOS.broader)
        self._narrower_matrix = make_relation_matrix(
            graph, vocab, SKOS.narrower)
        self._related_matrix = make_relation_matrix(
            graph, vocab, SKOS.related)
        self._collection_matrix = make_collection_matrix(graph, vocab)

    def _prepare_train_index(self, vocab, analyzer, params):
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        # frequency of each token used in labels - how rare each word is
        token_freq = np.bincount(label_corpus.indices,
                                 minlength=label_corpus.shape[1])

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            # sort tokens by frequency - use the rarest token as index key
            tokens = sorted(tokens, key=token_freq.__getitem__)
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

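    # Generate candidates for every document of the training corpus in
    # parallel; each candidate gets a boolean label telling whether its
    # subject was among the document's manually assigned subjects.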
    def _prepare_train_data(self, corpus, vocab, analyzer, n_jobs):
        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        train_x = []
        train_y = []

        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        cg_args = {
            'analyzer': analyzer,
            'vectorizer': self._vectorizer,
            'index': self._index
        }

        with pool_class(jobs,
                        initializer=MLLMCandidateGenerator.init,
                        initargs=(cg_args,)) as pool:
            params = (([vocab.subjects.by_uri(uri) for uri in doc.uris],
                       doc.text)
                      for doc in corpus.documents)
            for doc_subject_ids, candidates in pool.starmap(
                    MLLMCandidateGenerator.generate_candidates, params, 10):

                self._subj_freq.update(doc_subject_ids)
                self._doc_freq.update([c.subject_id for c in candidates])
                train_x.append(candidates)
                train_y += [(c.subject_id in doc_subject_ids)
                            for c in candidates]

        return (train_x, train_y)

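    # Smoothed inverse document frequency over the candidate sets:
    # idf(s) = ln((doc_count + 1) / (doc_freq(s) + 1)) + 1, the same smoothing
    # as scikit-learn's TfidfTransformer. For example, with 1000 training
    # documents and a subject appearing as a candidate in 9 of them,
    # idf = ln(1001 / 10) + 1 ~= 5.61.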
    def _calculate_idf(self, subject_ids, doc_count):
        idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            idf[subj_id] = math.log((doc_count + 1) /
                                    (self._doc_freq[subj_id] + 1)) + 1

        return idf

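    # Convert each document's candidate list into a feature matrix in
    # parallel; prepare_train() below stacks the per-document matrices into a
    # single training matrix.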
    def _prepare_features(self, train_x, n_jobs):
        fc_args = {'mdata': self._model_data}
        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        with pool_class(jobs,
                        initializer=MLLMFeatureConverter.init,
                        initargs=(fc_args,)) as pool:
            features = pool.map(
                MLLMFeatureConverter.candidates_to_features, train_x, 10)

        return features

    def prepare_train(self, corpus, vocab, analyzer, params, n_jobs):
        # create an index from the vocabulary terms
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # convert the corpus into train data
        train_x, train_y = self._prepare_train_data(
            corpus, vocab, analyzer, n_jobs)

        # precalculate idf values for all candidate subjects
        self._idf = self._calculate_idf(subject_ids, len(train_x))

        # convert the train data into feature values
        features = self._prepare_features(train_x, n_jobs)

        return (np.vstack(features), np.array(train_y))

    def _create_classifier(self, params):
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))

    def train(self, train_x, train_y, params):
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)

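    # predict_proba() yields a (negative, positive) probability pair per
    # candidate; the positive-class probability (score[1]) becomes the
    # subject score, and results are returned sorted by descending score.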
    def _prediction_to_list(self, scores, candidates):
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)

    def save(self, filename):
        return joblib.dump(self, filename)

    @staticmethod
    def load(filename):
        return joblib.load(filename)
300