Passed
Pull Request — master (#511), by Osma, created 01:52

annif.lexical.mllm.candidates_to_features() — grade A

Complexity
    Conditions: 2

Size
    Total Lines: 33
    Code Lines: 31

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric                            Value
cc (cyclomatic complexity)        2
eloc (effective lines of code)    31
nop (number of parameters)        8
dl (duplicated lines)             0
loc (lines of code)               33
rs                                9.1359
c                                 0
b                                 0
f                                 0

How to fix: Many Parameters

Methods with many parameters are not only hard to understand; their parameters also tend to become inconsistent when you need more, or different, data.

There are several approaches to avoiding long parameter lists: introduce a parameter object that groups related values, pass a whole existing object instead of unpacking its fields into separate arguments, or split the method into smaller ones that each need fewer inputs. The parameter-object approach is sketched below.
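For illustration only, here is a minimal sketch of the parameter-object approach applied to the flagged function. The FeatureData dataclass is hypothetical and not part of Annif; it bundles the seven model-side arguments of candidates_to_features() so the parameter count (nop) would drop from eight to two:

from dataclasses import dataclass
from typing import Any


@dataclass(frozen=True)
class FeatureData:
    """Hypothetical parameter object bundling the model-side inputs of
    candidates_to_features(); not part of Annif itself."""
    related_matrix: Any      # scipy.sparse relation matrix
    broader_matrix: Any
    narrower_matrix: Any
    collection_matrix: Any
    doc_freq: Any            # Counter of candidate subject frequencies
    subj_freq: Any           # Counter of assigned subject frequencies
    idf: Any                 # mapping of subject_id to idf value


def candidates_to_features(candidates, data):
    """Convert a list of Candidates to a NumPy feature matrix."""
    # The body would stay as in the module below, except that bare names
    # such as doc_freq and related_matrix become attribute reads such as
    # data.doc_freq and data.related_matrix.
    ...

MLLMModel could build one FeatureData instance once its matrices and counters are ready, then pass it both to _candidates_to_features() and, in place of the fc_args dict, to the MLLMFeatureConverter worker.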

"""MLLM (Maui-like Lexical Matching) model for Annif"""

import collections
import math
import joblib
from statistics import mean
from enum import IntEnum
import numpy as np
from rdflib.namespace import SKOS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.util
import annif.parallel
from annif.lexical.tokenset import TokenSet, TokenSetIndex
from annif.lexical.util import get_subject_labels
from annif.lexical.util import make_relation_matrix, make_collection_matrix


# a preferred or non-preferred label for one subject in the vocabulary
Term = collections.namedtuple('Term', 'subject_id label is_pref')

# a single occurrence of a term in a document; pos is the sentence index
Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')

# all matches of one subject in a document, conflated into a single record
Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

# feature names; start=0 so the enum values double as column indices
Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related collection',
    start=0)


def conflate_matches(matches, doc_length):
    """Conflate the matches of a document into one Candidate per subject"""
    subj_matches = collections.defaultdict(list)
    for match in matches:
        subj_matches[match.subject_id].append(match)
    return [
        # here 'matches' is the per-subject list bound by the for-clause
        # below, not the function argument
        Candidate(
            doc_length=doc_length,
            subject_id=subject_id,
            freq=len(matches) / doc_length,
            is_pref=mean((float(m.is_pref) for m in matches)),
            n_tokens=mean((m.n_tokens for m in matches)),
            ambiguity=mean((m.ambiguity for m in matches)),
            first_occ=matches[0].pos / doc_length,
            last_occ=matches[-1].pos / doc_length,
            spread=(matches[-1].pos - matches[0].pos) / doc_length
        )
        for subject_id, matches in subj_matches.items()]


def generate_candidates(text, analyzer, vectorizer, index):
    """Find all vocabulary term matches in a text and conflate them into
    per-subject Candidates"""
    sentences = analyzer.tokenize_sentences(text)
    sent_tokens = vectorizer.transform(sentences)
    matches = []

    for sent_idx, token_matrix in enumerate(sent_tokens):
        tset = TokenSet(token_matrix.nonzero()[1])
        for ts, ambiguity in index.search(tset):
            matches.append(Match(subject_id=ts.subject_id,
                                 is_pref=ts.is_pref,
                                 n_tokens=len(ts),
                                 pos=sent_idx,
                                 ambiguity=ambiguity))

    return conflate_matches(matches, len(sentences))


def candidates_to_features(candidates,
                           related_matrix, broader_matrix,
                           narrower_matrix, collection_matrix,
                           doc_freq, subj_freq, idf):
    """Convert a list of Candidates to a NumPy feature matrix"""

    matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
    c_ids = [c.subject_id for c in candidates]
    # boolean mask over all subjects, True for the candidate subjects
    c_vec = np.zeros(related_matrix.shape[0], dtype=bool)
    c_vec[c_ids] = True
    # per subject, count the candidates it is linked to by each relation
    broader = broader_matrix.multiply(c_vec).sum(axis=1)
    narrower = narrower_matrix.multiply(c_vec).sum(axis=1)
    related = related_matrix.multiply(c_vec).sum(axis=1)
    # co-membership of candidates in SKOS collections, aggregated per subject
    collection = collection_matrix.multiply(c_vec).T.dot(
        collection_matrix).sum(axis=0)
    for idx, c in enumerate(candidates):
        subj = c.subject_id
        matrix[idx, Feature.freq] = c.freq
        matrix[idx, Feature.doc_freq] = doc_freq[subj]
        matrix[idx, Feature.subj_freq] = subj_freq.get(subj, 1) - 1
        matrix[idx, Feature.tfidf] = c.freq * idf[subj]
        matrix[idx, Feature.is_pref] = c.is_pref
        matrix[idx, Feature.n_tokens] = c.n_tokens
        matrix[idx, Feature.ambiguity] = c.ambiguity
        matrix[idx, Feature.first_occ] = c.first_occ
        matrix[idx, Feature.last_occ] = c.last_occ
        matrix[idx, Feature.spread] = c.spread
        matrix[idx, Feature.doc_length] = c.doc_length
        matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
        matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
        matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
        matrix[idx, Feature.collection] = collection[0, subj] / len(c_ids)
    return matrix


class MLLMCandidateGenerator(annif.parallel.BaseWorker):
    """Worker for generating candidates in parallel; the fixed keyword
    arguments are stored in cls.args by BaseWorker.init"""

    @classmethod
    def generate_candidates(cls, doc_subject_ids, text):
        candidates = generate_candidates(text, **cls.args)
        return doc_subject_ids, candidates


class MLLMFeatureConverter(annif.parallel.BaseWorker):
    """Worker for converting candidates to features in parallel"""

    @classmethod
    def candidates_to_features(cls, candidates):
        return candidates_to_features(candidates, **cls.args)


class MLLMModel:
    """Maui-like Lexical Matching model"""

    def generate_candidates(self, text, analyzer):
        return generate_candidates(text, analyzer,
                                   self._vectorizer, self._index)

    def _candidates_to_features(self, candidates):
        return candidates_to_features(candidates,
                                      self._related_matrix,
                                      self._broader_matrix,
                                      self._narrower_matrix,
                                      self._collection_matrix,
                                      self._doc_freq,
                                      self._subj_freq,
                                      self._idf)

    def _prepare_terms(self, graph, vocab, params):
        if annif.util.boolean(params['use_hidden_labels']):
            label_props = [SKOS.altLabel, SKOS.hiddenLabel]
        else:
            label_props = [SKOS.altLabel]

        terms = []
        subject_ids = []
        for subj_id, uri, pref, _ in vocab.subjects.active:
            subject_ids.append(subj_id)
            terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))

            for label in get_subject_labels(graph, uri, label_props,
                                            params['language']):
                terms.append(Term(subject_id=subj_id,
                                  label=label,
                                  is_pref=False))

        return (terms, subject_ids)

    def _prepare_relations(self, graph, vocab):
        self._broader_matrix = make_relation_matrix(
            graph, vocab, SKOS.broader)
        self._narrower_matrix = make_relation_matrix(
            graph, vocab, SKOS.narrower)
        self._related_matrix = make_relation_matrix(
            graph, vocab, SKOS.related)
        self._collection_matrix = make_collection_matrix(graph, vocab)

    def _prepare_train_index(self, vocab, analyzer, params):
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

    def _prepare_train_data(self, corpus, vocab, analyzer, n_jobs):
        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        train_x = []
        train_y = []

        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        cg_args = {
            'analyzer': analyzer,
            'vectorizer': self._vectorizer,
            'index': self._index
        }

        with pool_class(jobs,
                        initializer=MLLMCandidateGenerator.init,
                        initargs=(cg_args,)) as pool:
            params = (([vocab.subjects.by_uri(uri) for uri in doc.uris],
                       doc.text)
                      for doc in corpus.documents)
            for doc_subject_ids, candidates in pool.starmap(
                    MLLMCandidateGenerator.generate_candidates, params, 10):

                self._subj_freq.update(doc_subject_ids)
                self._doc_freq.update([c.subject_id for c in candidates])
                train_x.append(candidates)
                train_y += [(c.subject_id in doc_subject_ids)
                            for c in candidates]

        return (train_x, train_y)

    def _calculate_idf(self, subject_ids, doc_count):
        # smoothed idf, as in scikit-learn: log((N + 1) / (df + 1)) + 1
        idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            idf[subj_id] = math.log((doc_count + 1) /
                                    (self._doc_freq[subj_id] + 1)) + 1

        return idf

    def _prepare_features(self, train_x, n_jobs):
        fc_args = {
            'related_matrix': self._related_matrix,
            'broader_matrix': self._broader_matrix,
            'narrower_matrix': self._narrower_matrix,
            'collection_matrix': self._collection_matrix,
            'doc_freq': self._doc_freq,
            'subj_freq': self._subj_freq,
            'idf': self._idf
        }

        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        with pool_class(jobs,
                        initializer=MLLMFeatureConverter.init,
                        initargs=(fc_args,)) as pool:
            features = pool.map(
                MLLMFeatureConverter.candidates_to_features, train_x, 10)

        return features

    def prepare_train(self, corpus, vocab, analyzer, params, n_jobs):
        # create an index from the vocabulary terms
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # convert the corpus into train data
        train_x, train_y = self._prepare_train_data(
            corpus, vocab, analyzer, n_jobs)

        # precalculate idf values for all candidate subjects
        self._idf = self._calculate_idf(subject_ids, len(train_x))

        # convert the train data into feature values
        features = self._prepare_features(train_x, n_jobs)

        return (np.vstack(features), np.array(train_y))

    def _create_classifier(self, params):
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))

    def train(self, train_x, train_y, params):
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)

    def _prediction_to_list(self, scores, candidates):
        # pair each positive-class probability with its subject id and
        # sort by descending score
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)

    def save(self, filename):
        return joblib.dump(self, filename)

    @staticmethod
    def load(filename):
        return joblib.load(filename)
297