Passed
Pull Request — master (#511) by Osma
created 01:59

annif.lexical.mllm.MLLMModel._prepare_train_data() (rated A)

Complexity: Conditions 3
Size: Total Lines 32, Code Lines 24
Duplication: Lines 0, Ratio 0 %
Importance: Changes 0
Metric Value
cc 3
eloc 24
nop 5
dl 0
loc 32
rs 9.304
c 0
b 0
f 0
"""MLLM (Maui-like Lexical Matching) model for Annif"""

import collections
import math
import joblib
from statistics import mean
from enum import IntEnum
import numpy as np
from rdflib.namespace import SKOS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.util
import annif.parallel
from annif.lexical.tokenset import TokenSet, TokenSetIndex
from annif.lexical.util import get_subject_labels
from annif.lexical.util import make_relation_matrix, make_collection_matrix


Term = collections.namedtuple('Term', 'subject_id label is_pref')

Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')

Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

ModelData = collections.namedtuple(
    'ModelData',
    'broader narrower related collection ' +
    'doc_freq subj_freq idf')

Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related collection',
    start=0)
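# Note: Feature is a zero-based IntEnum, so its members can be used directly
# as column indices into the feature matrix built by candidates_to_features();
# len(Feature) is 15, one column per feature named above.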
def conflate_matches(matches, doc_length):
    subj_matches = collections.defaultdict(list)
    for match in matches:
        subj_matches[match.subject_id].append(match)
    return [
        Candidate(
            doc_length=doc_length,
            subject_id=subject_id,
            freq=len(matches) / doc_length,
            is_pref=mean((float(m.is_pref) for m in matches)),
            n_tokens=mean((m.n_tokens for m in matches)),
            ambiguity=mean((m.ambiguity for m in matches)),
            first_occ=matches[0].pos / doc_length,
            last_occ=matches[-1].pos / doc_length,
            spread=(matches[-1].pos - matches[0].pos) / doc_length
        )
        for subject_id, matches in subj_matches.items()]
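# A worked example for conflate_matches() with assumed values: three matches
# for the same subject in a 10-sentence document, at sentence positions 1, 4
# and 9, are conflated into one Candidate with freq=0.3, first_occ=0.1,
# last_occ=0.9 and spread=0.8; is_pref, n_tokens and ambiguity are averaged
# over the three matches.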
def generate_candidates(text, analyzer, vectorizer, index):
    sentences = analyzer.tokenize_sentences(text)
    sent_tokens = vectorizer.transform(sentences)
    matches = []

    for sent_idx, token_matrix in enumerate(sent_tokens):
        tset = TokenSet(token_matrix.nonzero()[1])
        for ts, ambiguity in index.search(tset):
            matches.append(Match(subject_id=ts.subject_id,
                                 is_pref=ts.is_pref,
                                 n_tokens=len(ts),
                                 pos=sent_idx,
                                 ambiguity=ambiguity))

    return conflate_matches(matches, len(sentences))


def candidates_to_features(candidates, mdata):
    """Convert a list of Candidates to a NumPy feature matrix"""

    matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
    c_ids = [c.subject_id for c in candidates]
    # boolean mask over all subjects, marking those present among the candidates
    c_vec = np.zeros(mdata.related.shape[0], dtype=bool)
    c_vec[c_ids] = True
    broader = mdata.broader.multiply(c_vec).sum(axis=1)
    narrower = mdata.narrower.multiply(c_vec).sum(axis=1)
    related = mdata.related.multiply(c_vec).sum(axis=1)
    collection = mdata.collection.multiply(c_vec).T.dot(
        mdata.collection).sum(axis=0)
    for idx, c in enumerate(candidates):
        subj = c.subject_id
        matrix[idx, Feature.freq] = c.freq
        matrix[idx, Feature.doc_freq] = mdata.doc_freq[subj]
        matrix[idx, Feature.subj_freq] = mdata.subj_freq.get(subj, 1) - 1
        matrix[idx, Feature.tfidf] = c.freq * mdata.idf[subj]
        matrix[idx, Feature.is_pref] = c.is_pref
        matrix[idx, Feature.n_tokens] = c.n_tokens
        matrix[idx, Feature.ambiguity] = c.ambiguity
        matrix[idx, Feature.first_occ] = c.first_occ
        matrix[idx, Feature.last_occ] = c.last_occ
        matrix[idx, Feature.spread] = c.spread
        matrix[idx, Feature.doc_length] = c.doc_length
        matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
        matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
        matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
        matrix[idx, Feature.collection] = collection[0, subj] / len(c_ids)
    return matrix
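# The two worker classes below wrap the module-level functions for use with
# the multiprocessing pools created via annif.parallel.get_pool(); the shared
# arguments are apparently stored on the class by BaseWorker.init(), which is
# passed as the pool initializer, and picked up here through cls.args.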
class MLLMCandidateGenerator(annif.parallel.BaseWorker):

    @classmethod
    def generate_candidates(cls, doc_subject_ids, text):
        candidates = generate_candidates(text, **cls.args)
        return doc_subject_ids, candidates


class MLLMFeatureConverter(annif.parallel.BaseWorker):

    @classmethod
    def candidates_to_features(cls, candidates):
        return candidates_to_features(candidates, **cls.args)


class MLLMModel:
    """Maui-like Lexical Matching model"""

    def generate_candidates(self, text, analyzer):
        return generate_candidates(text, analyzer,
                                   self._vectorizer, self._index)

    @property
    def _model_data(self):
        return ModelData(broader=self._broader_matrix,
                         narrower=self._narrower_matrix,
                         related=self._related_matrix,
                         collection=self._collection_matrix,
                         doc_freq=self._doc_freq,
                         subj_freq=self._subj_freq,
                         idf=self._idf)

    def _candidates_to_features(self, candidates):
        return candidates_to_features(candidates, self._model_data)

    def _prepare_terms(self, graph, vocab, params):
        if annif.util.boolean(params['use_hidden_labels']):
            label_props = [SKOS.altLabel, SKOS.hiddenLabel]
        else:
            label_props = [SKOS.altLabel]

        terms = []
        subject_ids = []
        for subj_id, uri, pref, _ in vocab.subjects.active:
            subject_ids.append(subj_id)
            terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))

            for label in get_subject_labels(graph, uri, label_props,
                                            params['language']):
                terms.append(Term(subject_id=subj_id,
                                  label=label,
                                  is_pref=False))

        return (terms, subject_ids)

    def _prepare_relations(self, graph, vocab):
        self._broader_matrix = make_relation_matrix(
            graph, vocab, SKOS.broader)
        self._narrower_matrix = make_relation_matrix(
            graph, vocab, SKOS.narrower)
        self._related_matrix = make_relation_matrix(
            graph, vocab, SKOS.related)
        self._collection_matrix = make_collection_matrix(graph, vocab)

    def _prepare_train_index(self, vocab, analyzer, params):
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        # frequency of each token used in labels - how rare each word is
        token_freq = np.bincount(label_corpus.indices,
                                 minlength=label_corpus.shape[1])

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            # sort tokens by frequency - use the rarest token as index key
            tokens = sorted(tokens, key=token_freq.__getitem__)
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

    def _prepare_train_data(self, corpus, vocab, analyzer, n_jobs):
        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        train_x = []
        train_y = []

        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        cg_args = {
            'analyzer': analyzer,
            'vectorizer': self._vectorizer,
            'index': self._index
        }

        with pool_class(jobs,
                        initializer=MLLMCandidateGenerator.init,
                        initargs=(cg_args,)) as pool:
            params = (([vocab.subjects.by_uri(uri) for uri in doc.uris],
                       doc.text)
                      for doc in corpus.documents)
            for doc_subject_ids, candidates in pool.starmap(
                    MLLMCandidateGenerator.generate_candidates, params, 10):

                self._subj_freq.update(doc_subject_ids)
                self._doc_freq.update([c.subject_id for c in candidates])
                train_x.append(candidates)
                train_y += [(c.subject_id in doc_subject_ids)
                            for c in candidates]

        return (train_x, train_y)
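    # In _prepare_train_data() above, train_x collects one candidate list per
    # document while train_y is a flat list of booleans, one per candidate,
    # marking whether the candidate subject was among the document's manually
    # assigned subjects; this alignment is what lets prepare_train() later
    # stack the per-document feature blocks against np.array(train_y).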
    def _calculate_idf(self, subject_ids, doc_count):
        idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            idf[subj_id] = math.log((doc_count + 1) /
                                    (self._doc_freq[subj_id] + 1)) + 1

        return idf
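    # A worked example for _calculate_idf() with assumed counts: a subject that
    # appeared as a candidate in 99 of 1000 training documents gets
    # idf = ln(1001 / 100) + 1, roughly 3.30, while a subject seen in every
    # document gets idf = ln(1001 / 1001) + 1 = 1.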
    def _prepare_features(self, train_x, n_jobs):
        fc_args = {'mdata': self._model_data}
        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        with pool_class(jobs,
                        initializer=MLLMFeatureConverter.init,
                        initargs=(fc_args,)) as pool:
            features = pool.map(
                MLLMFeatureConverter.candidates_to_features, train_x, 10)

        return features

    def prepare_train(self, corpus, vocab, analyzer, params, n_jobs):
        # create an index from the vocabulary terms
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # convert the corpus into train data
        train_x, train_y = self._prepare_train_data(
            corpus, vocab, analyzer, n_jobs)

        # precalculate idf values for all candidate subjects
        self._idf = self._calculate_idf(subject_ids, len(train_x))

        # convert the train data into feature values
        features = self._prepare_features(train_x, n_jobs)

        return (np.vstack(features), np.array(train_y))

    def _create_classifier(self, params):
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))

    def train(self, train_x, train_y, params):
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)

    def _prediction_to_list(self, scores, candidates):
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)
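    # In predict() above, predict_proba() returns one (P(False), P(True)) pair
    # per candidate, so score[1] is the probability that the candidate subject
    # applies to the document; the result is a list of (score, subject_id)
    # tuples sorted with the highest-scoring subjects first.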
    def save(self, filename):
        return joblib.dump(self, filename)

    @staticmethod
    def load(filename):
        return joblib.load(filename)
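For context, a rough sketch of how the MLLM backend presumably drives this model
during training and prediction; corpus, vocab, analyzer, n_jobs and the params
values below are illustrative placeholders, not configuration taken from this
pull request:

    model = MLLMModel()
    params = {'use_hidden_labels': 'false', 'language': 'en',
              'min_samples_leaf': '20', 'max_leaf_nodes': '1000',
              'max_samples': '0.9'}
    train_x, train_y = model.prepare_train(corpus, vocab, analyzer, params, n_jobs=1)
    model.train(train_x, train_y, params)
    model.save('mllm-model.gz')

    model = MLLMModel.load('mllm-model.gz')
    candidates = model.generate_candidates('Text of a new document...', analyzer)
    suggestions = model.predict(candidates)  # [(score, subject_id), ...], best first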