Passed
Pull Request — master (#511) by Osma, created 01:55

annif.lexical.mllm.candidates_to_features()   Grade: A

Complexity
  Conditions: 2

Size
  Total Lines: 33
  Code Lines: 31

Duplication
  Lines: 0
  Ratio: 0 %

Importance
  Changes: 0

Metric  Value
cc      2        (cyclomatic complexity)
eloc    31       (effective lines of code)
nop     8        (number of parameters)
dl      0        (duplicated lines)
loc     33       (total lines)
rs      9.1359
c       0
b       0
f       0

How to fix: Many Parameters

Methods with many parameters are not only hard to understand; their parameter lists also tend to become inconsistent as more, or different, data is needed.

There are several approaches to avoid long parameter lists:

- Introduce Parameter Object: group parameters that naturally belong together into a single object and pass that instead (sketched below for this function).
- Preserve Whole Object: if several parameters are extracted from the same object, pass the object itself.
- Replace Parameter with Query: if the method can obtain a value on its own, do not pass it in as a parameter.
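As an illustration of the first approach, the seven vocabulary and statistics arguments of candidates_to_features() (nop 8 above, counting the candidates list) could be bundled into one container. This is only a sketch; the FeatureData name, the fdata parameter and this particular grouping are illustrative assumptions, not part of Annif.

# Illustrative sketch only: a hypothetical parameter object bundling the
# seven vocabulary/statistics arguments of candidates_to_features().
# The FeatureData name and this grouping are assumptions, not Annif API.
import collections

FeatureData = collections.namedtuple(
    'FeatureData',
    'related_matrix broader_matrix narrower_matrix collection_matrix '
    'doc_freq subj_freq idf')


def candidates_to_features(candidates, fdata):
    """Convert a list of Candidates to a NumPy feature matrix.

    All vocabulary matrices and corpus statistics are read from a single
    FeatureData object, so the function needs only two parameters."""
    # body as in the original function below, with e.g. doc_freq
    # replaced by fdata.doc_freq and related_matrix by fdata.related_matrix


# Callers would build the object once and pass it around, e.g.:
# fdata = FeatureData(
#     related_matrix=self._related_matrix,
#     broader_matrix=self._broader_matrix,
#     narrower_matrix=self._narrower_matrix,
#     collection_matrix=self._collection_matrix,
#     doc_freq=self._doc_freq,
#     subj_freq=self._subj_freq,
#     idf=self._idf)
# features = candidates_to_features(candidates, fdata)

A single container object like this should also still fit the cls.args mechanism used by MLLMFeatureConverter below, since it can be passed through the multiprocessing initializer in place of the current argument dict.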

"""MLLM (Maui-like Lexical Matching) model for Annif"""

import collections
import math
import joblib
from statistics import mean
from enum import IntEnum
import numpy as np
from rdflib.namespace import SKOS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.util
import annif.parallel
from annif.lexical.tokenset import TokenSet, TokenSetIndex
from annif.lexical.util import get_subject_labels
from annif.lexical.util import make_relation_matrix, make_collection_matrix


Term = collections.namedtuple('Term', 'subject_id label is_pref')

Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')

Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related collection',
    start=0)

def conflate_matches(matches, doc_length):
    """Combine the Matches found for each subject into a single Candidate
    with aggregated statistics."""
    subj_matches = collections.defaultdict(list)
    for match in matches:
        subj_matches[match.subject_id].append(match)
    return [
        Candidate(
            doc_length=doc_length,
            subject_id=subject_id,
            freq=len(matches) / doc_length,
            is_pref=mean((float(m.is_pref) for m in matches)),
            n_tokens=mean((m.n_tokens for m in matches)),
            ambiguity=mean((m.ambiguity for m in matches)),
            first_occ=matches[0].pos / doc_length,
            last_occ=matches[-1].pos / doc_length,
            spread=(matches[-1].pos - matches[0].pos) / doc_length
        )
        for subject_id, matches in subj_matches.items()]


def generate_candidates(text, analyzer, vectorizer, index):
    """Find candidate subjects for a document by matching its sentences
    against the token set index of vocabulary terms."""
    sentences = analyzer.tokenize_sentences(text)
    sent_tokens = vectorizer.transform(sentences)
    matches = []

    for sent_idx, token_matrix in enumerate(sent_tokens):
        tset = TokenSet(token_matrix.nonzero()[1])
        for ts, ambiguity in index.search(tset):
            matches.append(Match(subject_id=ts.subject_id,
                                 is_pref=ts.is_pref,
                                 n_tokens=len(ts),
                                 pos=sent_idx,
                                 ambiguity=ambiguity))

    return conflate_matches(matches, len(sentences))

def candidates_to_features(candidates,
                           related_matrix, broader_matrix,
                           narrower_matrix, collection_matrix,
                           doc_freq, subj_freq, idf):
    """Convert a list of Candidates to a NumPy feature matrix"""

    matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
    c_ids = [c.subject_id for c in candidates]
    # boolean vector marking which subjects occur among the candidates
    c_vec = np.zeros(related_matrix.shape[0], dtype=bool)
    c_vec[c_ids] = True
    # for each subject, count its broader/narrower/related links and
    # shared collection memberships with the candidate set
    broader = broader_matrix.multiply(c_vec).sum(axis=1)
    narrower = narrower_matrix.multiply(c_vec).sum(axis=1)
    related = related_matrix.multiply(c_vec).sum(axis=1)
    collection = collection_matrix.multiply(c_vec).T.dot(
        collection_matrix).sum(axis=0)
    for idx, c in enumerate(candidates):
        subj = c.subject_id
        matrix[idx, Feature.freq] = c.freq
        matrix[idx, Feature.doc_freq] = doc_freq[subj]
        matrix[idx, Feature.subj_freq] = subj_freq.get(subj, 1) - 1
        matrix[idx, Feature.tfidf] = c.freq * idf[subj]
        matrix[idx, Feature.is_pref] = c.is_pref
        matrix[idx, Feature.n_tokens] = c.n_tokens
        matrix[idx, Feature.ambiguity] = c.ambiguity
        matrix[idx, Feature.first_occ] = c.first_occ
        matrix[idx, Feature.last_occ] = c.last_occ
        matrix[idx, Feature.spread] = c.spread
        matrix[idx, Feature.doc_length] = c.doc_length
        matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
        matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
        matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
        matrix[idx, Feature.collection] = collection[0, subj] / len(c_ids)
    return matrix

class MLLMCandidateGenerator(annif.parallel.BaseWorker):
    """Worker class for generating candidate subjects in parallel"""

    @classmethod
    def generate_candidates(cls, doc_subject_ids, text):
        candidates = generate_candidates(text, **cls.args)
        return doc_subject_ids, candidates


class MLLMFeatureConverter(annif.parallel.BaseWorker):
    """Worker class for converting candidates to features in parallel"""

    @classmethod
    def candidates_to_features(cls, candidates):
        return candidates_to_features(candidates, **cls.args)

class MLLMModel:
    """Maui-like Lexical Matching model"""

    def generate_candidates(self, text, analyzer):
        return generate_candidates(text, analyzer,
                                   self._vectorizer, self._index)

    def _candidates_to_features(self, candidates):
        return candidates_to_features(candidates,
                                      self._related_matrix,
                                      self._broader_matrix,
                                      self._narrower_matrix,
                                      self._collection_matrix,
                                      self._doc_freq,
                                      self._subj_freq,
                                      self._idf)

    def _prepare_terms(self, graph, vocab, params):
        if annif.util.boolean(params['use_hidden_labels']):
            label_props = [SKOS.altLabel, SKOS.hiddenLabel]
        else:
            label_props = [SKOS.altLabel]

        terms = []
        subject_ids = []
        for subj_id, uri, pref, _ in vocab.subjects.active:
            subject_ids.append(subj_id)
            terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))

            for label in get_subject_labels(graph, uri, label_props,
                                            params['language']):
                terms.append(Term(subject_id=subj_id,
                                  label=label,
                                  is_pref=False))

        return (terms, subject_ids)

    def _prepare_relations(self, graph, vocab):
        self._broader_matrix = make_relation_matrix(
            graph, vocab, SKOS.broader)
        self._narrower_matrix = make_relation_matrix(
            graph, vocab, SKOS.narrower)
        self._related_matrix = make_relation_matrix(
            graph, vocab, SKOS.related)
        self._collection_matrix = make_collection_matrix(graph, vocab)

    def _prepare_train_index(self, vocab, analyzer, params):
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        # frequency of each token used in labels - how rare each word is
        token_freq = np.bincount(label_corpus.indices,
                                 minlength=label_corpus.shape[1])

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            # sort tokens by frequency - use the rarest token as index key
            tokens = sorted(tokens, key=token_freq.__getitem__)
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

    def _prepare_train_data(self, corpus, vocab, analyzer, n_jobs):
        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        train_x = []
        train_y = []

        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        cg_args = {
            'analyzer': analyzer,
            'vectorizer': self._vectorizer,
            'index': self._index
        }

        with pool_class(jobs,
                        initializer=MLLMCandidateGenerator.init,
                        initargs=(cg_args,)) as pool:
            params = (([vocab.subjects.by_uri(uri) for uri in doc.uris],
                       doc.text)
                      for doc in corpus.documents)
            for doc_subject_ids, candidates in pool.starmap(
                    MLLMCandidateGenerator.generate_candidates, params, 10):

                self._subj_freq.update(doc_subject_ids)
                self._doc_freq.update([c.subject_id for c in candidates])
                train_x.append(candidates)
                train_y += [(c.subject_id in doc_subject_ids)
                            for c in candidates]

        return (train_x, train_y)

    def _calculate_idf(self, subject_ids, doc_count):
        idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            idf[subj_id] = math.log((doc_count + 1) /
                                    (self._doc_freq[subj_id] + 1)) + 1

        return idf

    def _prepare_features(self, train_x, n_jobs):
        fc_args = {
            'related_matrix': self._related_matrix,
            'broader_matrix': self._broader_matrix,
            'narrower_matrix': self._narrower_matrix,
            'collection_matrix': self._collection_matrix,
            'doc_freq': self._doc_freq,
            'subj_freq': self._subj_freq,
            'idf': self._idf
        }

        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        with pool_class(jobs,
                        initializer=MLLMFeatureConverter.init,
                        initargs=(fc_args,)) as pool:
            features = pool.map(
                MLLMFeatureConverter.candidates_to_features, train_x, 10)

        return features

    def prepare_train(self, corpus, vocab, analyzer, params, n_jobs):
        # create an index from the vocabulary terms
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # convert the corpus into train data
        train_x, train_y = self._prepare_train_data(
            corpus, vocab, analyzer, n_jobs)

        # precalculate idf values for all candidate subjects
        self._idf = self._calculate_idf(subject_ids, len(train_x))

        # convert the train data into feature values
        features = self._prepare_features(train_x, n_jobs)

        return (np.vstack(features), np.array(train_y))

    def _create_classifier(self, params):
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))

    def train(self, train_x, train_y, params):
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)

    def _prediction_to_list(self, scores, candidates):
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)

    def save(self, filename):
        return joblib.dump(self, filename)

    @staticmethod
    def load(filename):
        return joblib.load(filename)
303