Passed — Pull Request on master (#597) by Osma, created 02:48

annif.lexical.mllm.MLLMModel._prepare_terms() — rating B

Complexity: Conditions 5
Size: Total Lines 25, Code Lines 20
Duplication: Lines 25, Ratio 100 %
Importance: Changes 0

Metric   Value
cc       5
eloc     20
nop      4
dl       25
loc      25
rs       8.9332
c        0
b        0
f        0
"""MLLM (Maui-like Lexical Matching) model for Annif"""

import collections
import math
import joblib
from statistics import mean
from enum import IntEnum
import numpy as np
from rdflib.namespace import SKOS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.util
import annif.parallel
from annif.exception import OperationFailedException
from annif.lexical.tokenset import TokenSet, TokenSetIndex
from annif.lexical.util import get_subject_labels
from annif.lexical.util import make_relation_matrix, make_collection_matrix


Term = collections.namedtuple('Term', 'subject_id label is_pref')

Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')

Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

ModelData = collections.namedtuple(
    'ModelData',
    'broader narrower related collection ' +
    'doc_freq subj_freq idf')

Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related collection',
    start=0)

def conflate_matches(matches, doc_length):
    """Combine the Match objects of each subject into a single Candidate"""
    subj_matches = collections.defaultdict(list)
    for match in matches:
        subj_matches[match.subject_id].append(match)
    return [
        Candidate(
            doc_length=doc_length,
            subject_id=subject_id,
            freq=len(matches) / doc_length,
            is_pref=mean((float(m.is_pref) for m in matches)),
            n_tokens=mean((m.n_tokens for m in matches)),
            ambiguity=mean((m.ambiguity for m in matches)),
            first_occ=matches[0].pos / doc_length,
            last_occ=matches[-1].pos / doc_length,
            spread=(matches[-1].pos - matches[0].pos) / doc_length
        )
        for subject_id, matches in subj_matches.items()]

def generate_candidates(text, analyzer, vectorizer, index):
    """Generate subject Candidates for a document by matching its sentences
    against the token set index"""
    sentences = analyzer.tokenize_sentences(text)
    sent_tokens = vectorizer.transform(sentences)
    matches = []

    for sent_idx, token_matrix in enumerate(sent_tokens):
        tset = TokenSet(token_matrix.nonzero()[1])
        for ts, ambiguity in index.search(tset):
            matches.append(Match(subject_id=ts.subject_id,
                                 is_pref=ts.is_pref,
                                 n_tokens=len(ts),
                                 pos=sent_idx,
                                 ambiguity=ambiguity))

    return conflate_matches(matches, len(sentences))

def candidates_to_features(candidates, mdata):
    """Convert a list of Candidates to a NumPy feature matrix"""

    matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
    c_ids = [c.subject_id for c in candidates]
    c_vec = np.zeros(mdata.related.shape[0], dtype=bool)
    c_vec[c_ids] = True
    broader = mdata.broader.multiply(c_vec).sum(axis=1)
    narrower = mdata.narrower.multiply(c_vec).sum(axis=1)
    related = mdata.related.multiply(c_vec).sum(axis=1)
    collection = mdata.collection.multiply(c_vec).T.dot(
        mdata.collection).sum(axis=0)
    for idx, c in enumerate(candidates):
        subj = c.subject_id
        matrix[idx, Feature.freq] = c.freq
        matrix[idx, Feature.doc_freq] = mdata.doc_freq[subj]
        matrix[idx, Feature.subj_freq] = mdata.subj_freq.get(subj, 1) - 1
        matrix[idx, Feature.tfidf] = c.freq * mdata.idf[subj]
        matrix[idx, Feature.is_pref] = c.is_pref
        matrix[idx, Feature.n_tokens] = c.n_tokens
        matrix[idx, Feature.ambiguity] = c.ambiguity
        matrix[idx, Feature.first_occ] = c.first_occ
        matrix[idx, Feature.last_occ] = c.last_occ
        matrix[idx, Feature.spread] = c.spread
        matrix[idx, Feature.doc_length] = c.doc_length
        matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
        matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
        matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
        matrix[idx, Feature.collection] = collection[0, subj] / len(c_ids)
    return matrix


class MLLMCandidateGenerator(annif.parallel.BaseWorker):

    @classmethod
    def generate_candidates(cls, doc_subject_ids, text):
        candidates = generate_candidates(text, **cls.args)  # pragma: no cover
        return doc_subject_ids, candidates  # pragma: no cover


class MLLMFeatureConverter(annif.parallel.BaseWorker):

    @classmethod
    def candidates_to_features(cls, candidates):
        return candidates_to_features(candidates,
                                      **cls.args)  # pragma: no cover

class MLLMModel:
    """Maui-like Lexical Matching model"""

    def generate_candidates(self, text, analyzer):
        return generate_candidates(text, analyzer,
                                   self._vectorizer, self._index)

    @property
    def _model_data(self):
        return ModelData(broader=self._broader_matrix,
                         narrower=self._narrower_matrix,
                         related=self._related_matrix,
                         collection=self._collection_matrix,
                         doc_freq=self._doc_freq,
                         subj_freq=self._subj_freq,
                         idf=self._idf)

    def _candidates_to_features(self, candidates):
        return candidates_to_features(candidates, self._model_data)

    def _prepare_terms(self, graph, vocab, params):
        pref_label_props = [SKOS.prefLabel]
        if annif.util.boolean(params['use_hidden_labels']):
            nonpref_label_props = [SKOS.altLabel, SKOS.hiddenLabel]
        else:
            nonpref_label_props = [SKOS.altLabel]

        terms = []
        subject_ids = []
        for subj_id, uri, _, _ in vocab.subjects.active:
            subject_ids.append(subj_id)

            for label in get_subject_labels(graph, uri, pref_label_props,
                                            params['language']):
                terms.append(Term(subject_id=subj_id,
                                  label=label,
                                  is_pref=True))

            for label in get_subject_labels(graph, uri, nonpref_label_props,
                                            params['language']):
                terms.append(Term(subject_id=subj_id,
                                  label=label,
                                  is_pref=False))

        return (terms, subject_ids)

    def _prepare_relations(self, graph, vocab):
        self._broader_matrix = make_relation_matrix(
            graph, vocab, SKOS.broader)
        self._narrower_matrix = make_relation_matrix(
            graph, vocab, SKOS.narrower)
        self._related_matrix = make_relation_matrix(
            graph, vocab, SKOS.related)
        self._collection_matrix = make_collection_matrix(graph, vocab)

    def _prepare_train_index(self, vocab, analyzer, params):
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        # frequency of each token used in labels - how rare each word is
        token_freq = np.bincount(label_corpus.indices,
                                 minlength=label_corpus.shape[1])

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            # sort tokens by frequency - use the rarest token as index key
            tokens = sorted(tokens, key=token_freq.__getitem__)
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

    def _prepare_train_data(self, corpus, vocab, analyzer, n_jobs):
        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        train_x = []
        train_y = []

        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        cg_args = {
            'analyzer': analyzer,
            'vectorizer': self._vectorizer,
            'index': self._index
        }

        with pool_class(jobs,
                        initializer=MLLMCandidateGenerator.init,
                        initargs=(cg_args,)) as pool:
            params = (([vocab.subjects.by_uri(uri) for uri in doc.uris],
                       doc.text)
                      for doc in corpus.documents)
            for doc_subject_ids, candidates in pool.starmap(
                    MLLMCandidateGenerator.generate_candidates, params, 10):

                self._subj_freq.update(doc_subject_ids)
                self._doc_freq.update([c.subject_id for c in candidates])
                train_x.append(candidates)
                train_y += [(c.subject_id in doc_subject_ids)
                            for c in candidates]

        return (train_x, train_y)

    def _calculate_idf(self, subject_ids, doc_count):
        idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            idf[subj_id] = math.log((doc_count + 1) /
                                    (self._doc_freq[subj_id] + 1)) + 1

        return idf

    def _prepare_features(self, train_x, n_jobs):
        fc_args = {'mdata': self._model_data}
        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        with pool_class(jobs,
                        initializer=MLLMFeatureConverter.init,
                        initargs=(fc_args,)) as pool:
            features = pool.map(
                MLLMFeatureConverter.candidates_to_features, train_x, 10)

        return features

    def prepare_train(self, corpus, vocab, analyzer, params, n_jobs):
        # create an index from the vocabulary terms
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # convert the corpus into train data
        train_x, train_y = self._prepare_train_data(
            corpus, vocab, analyzer, n_jobs)

        # precalculate idf values for all candidate subjects
        self._idf = self._calculate_idf(subject_ids, len(train_x))

        # convert the train data into feature values
        features = self._prepare_features(train_x, n_jobs)

        return (np.vstack(features), np.array(train_y))

    def _create_classifier(self, params):
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))

    def train(self, train_x, train_y, params):
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)
        # sanity check: verify that the classifier has seen both classes
        if self._classifier.n_classes_ != 2:
            raise OperationFailedException(
                "Unable to create classifier: " +
                "Not enough positive and negative examples " +
                "in the training data. Please check that your training " +
                "data matches your vocabulary.")

    def _prediction_to_list(self, scores, candidates):
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)

    def save(self, filename):
        return joblib.dump(self, filename)

    @staticmethod
    def load(filename):
        return joblib.load(filename)
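
For orientation, here is a minimal usage sketch of the model above. It is not part of the pull request: the corpus, vocab and analyzer objects are hypothetical placeholders for what the surrounding Annif project normally supplies, and the parameter values are illustrative only.

# Minimal usage sketch (assumes corpus, vocab and analyzer are provided by
# the surrounding Annif project; parameter values are illustrative only).
from annif.lexical.mllm import MLLMModel

params = {
    'language': 'en',              # vocabulary language used for labels
    'use_hidden_labels': 'False',  # whether skos:hiddenLabel is matched too
    'min_samples_leaf': '20',      # DecisionTreeClassifier setting
    'max_leaf_nodes': '1000',      # DecisionTreeClassifier setting
    'max_samples': '0.9',          # BaggingClassifier setting
}

model = MLLMModel()
train_x, train_y = model.prepare_train(corpus, vocab, analyzer, params,
                                       n_jobs=1)
model.train(train_x, train_y, params)
model.save('mllm-model.gz')

# At suggestion time: reload the model, generate candidates and rank them.
model = MLLMModel.load('mllm-model.gz')
candidates = model.generate_candidates('Text of the document to be indexed',
                                       analyzer)
for score, subject_id in model.predict(candidates):
    print(subject_id, score)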