Passed
Pull Request — master (#511) by Osma, created 02:22

annif.lexical.mllm.candidates_to_features() (rated A)

Complexity
  Conditions: 2

Size
  Total lines: 33
  Code lines: 31

Duplication
  Duplicated lines: 0
  Duplication ratio: 0 %

Importance
  Changes: 0
Metric  Value
cc      2        (cyclomatic complexity)
eloc    31       (effective lines of code)
nop     8        (number of parameters)
dl      0        (duplicated lines)
loc     33       (total lines)
rs      9.1359
c       0
b       0
f       0

How to fix: Many Parameters

Methods with many parameters are not only hard to understand; their parameter lists also tend to grow inconsistent as the method comes to need more, or different, data.

There are several approaches to avoiding long parameter lists (a sketch of the first follows below):

- Introduce a parameter object: group parameters that belong together into a single object and pass that instead.
- Preserve the whole object: if several parameters are attributes of one object, pass the object itself.
- Replace a parameter with a method call: let the method obtain the value itself instead of receiving it.

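As a minimal sketch (not the project's actual refactoring): the eight parameters of candidates_to_features() split into one per-document argument, candidates, and seven per-model values that could travel together in a parameter object. The FeatureData name and its fields below are hypothetical.

from dataclasses import dataclass

@dataclass(frozen=True)
class FeatureData:
    """Hypothetical parameter object bundling the seven per-model
    inputs of candidates_to_features()."""
    related_matrix: object      # scipy sparse matrix
    broader_matrix: object      # scipy sparse matrix
    narrower_matrix: object     # scipy sparse matrix
    collection_matrix: object   # scipy sparse matrix
    doc_freq: object            # collections.Counter
    subj_freq: object           # collections.Counter
    idf: object                 # mapping subject_id -> idf weight

def candidates_to_features(candidates, data):
    """Same computation as in the module below, but reading the shared
    inputs from one object (data.idf, data.doc_freq, ...)."""
    ...

The call sites would then build one FeatureData instance (for example in MLLMModel, or as the args dict handed to MLLMFeatureConverter) instead of forwarding seven values through every layer. The full module source, with the flagged function in context: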
"""MLLM (Maui-like Lexical Matching) model for Annif"""

import collections
import math
import joblib
from statistics import mean
from enum import IntEnum
import numpy as np
from rdflib.namespace import SKOS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.util
import annif.parallel
from annif.lexical.tokenset import TokenSet, TokenSetIndex
from annif.lexical.util import get_subject_labels
from annif.lexical.util import make_relation_matrix, make_collection_matrix


# one (preferred or alternative) label of a vocabulary subject
Term = collections.namedtuple('Term', 'subject_id label is_pref')

# a single occurrence of a term in a document, with its sentence position
Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')

# all matches of one subject in a document, conflated into a single record
Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

# column indices of the feature matrix passed to the classifier
Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related collection',
    start=0)


def conflate_matches(matches, doc_length):
    """Conflate the Matches for each subject into a single Candidate."""
    subj_matches = collections.defaultdict(list)
    for match in matches:
        subj_matches[match.subject_id].append(match)
    return [
        Candidate(
            doc_length=doc_length,
            subject_id=subject_id,
            freq=len(matches) / doc_length,
            is_pref=mean((float(m.is_pref) for m in matches)),
            n_tokens=mean((m.n_tokens for m in matches)),
            ambiguity=mean((m.ambiguity for m in matches)),
            first_occ=matches[0].pos / doc_length,
            last_occ=matches[-1].pos / doc_length,
            spread=(matches[-1].pos - matches[0].pos) / doc_length
        )
        for subject_id, matches in subj_matches.items()]


def generate_candidates(text, analyzer, vectorizer, index):
    """Find the vocabulary subjects matching the given text and return
    them as a list of Candidates."""
    sentences = analyzer.tokenize_sentences(text)
    sent_tokens = vectorizer.transform(sentences)
    matches = []

    for sent_idx, token_matrix in enumerate(sent_tokens):
        tset = TokenSet(token_matrix.nonzero()[1])
        for ts, ambiguity in index.search(tset):
            matches.append(Match(subject_id=ts.subject_id,
                                 is_pref=ts.is_pref,
                                 n_tokens=len(ts),
                                 pos=sent_idx,
                                 ambiguity=ambiguity))

    return conflate_matches(matches, len(sentences))


def candidates_to_features(candidates,
                           related_matrix, broader_matrix,
                           narrower_matrix, collection_matrix,
                           doc_freq, subj_freq, idf):
    """Convert a list of Candidates to a NumPy feature matrix"""

    matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
    c_ids = [c.subject_id for c in candidates]
    # boolean vector marking which subjects appear among the candidates
    c_vec = np.zeros(related_matrix.shape[0], dtype=bool)
    c_vec[c_ids] = True
    # for each subject, count its broader/narrower/related subjects that
    # occur among the candidates
    broader = broader_matrix.multiply(c_vec).sum(axis=1)
    narrower = narrower_matrix.multiply(c_vec).sum(axis=1)
    related = related_matrix.multiply(c_vec).sum(axis=1)
    # for each subject, count the candidates sharing a collection with it
    collection = collection_matrix.multiply(c_vec).T.dot(
        collection_matrix).sum(axis=0)
    for idx, c in enumerate(candidates):
        subj = c.subject_id
        matrix[idx, Feature.freq] = c.freq
        matrix[idx, Feature.doc_freq] = doc_freq[subj]
        matrix[idx, Feature.subj_freq] = subj_freq.get(subj, 1) - 1
        matrix[idx, Feature.tfidf] = c.freq * idf[subj]
        matrix[idx, Feature.is_pref] = c.is_pref
        matrix[idx, Feature.n_tokens] = c.n_tokens
        matrix[idx, Feature.ambiguity] = c.ambiguity
        matrix[idx, Feature.first_occ] = c.first_occ
        matrix[idx, Feature.last_occ] = c.last_occ
        matrix[idx, Feature.spread] = c.spread
        matrix[idx, Feature.doc_length] = c.doc_length
        # normalize the relation counts by the number of candidates
        matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
        matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
        matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
        matrix[idx, Feature.collection] = collection[0, subj] / len(c_ids)
    return matrix


class MLLMCandidateGenerator(annif.parallel.BaseWorker):
    # cls.args is populated by BaseWorker.init in the pool initializer

    @classmethod
    def generate_candidates(cls, doc_subject_ids, text):
        candidates = generate_candidates(text, **cls.args)
        return doc_subject_ids, candidates


class MLLMFeatureConverter(annif.parallel.BaseWorker):

    @classmethod
    def candidates_to_features(cls, candidates):
        return candidates_to_features(candidates, **cls.args)


class MLLMModel:
    """Maui-like Lexical Matching model"""

    def generate_candidates(self, text, analyzer):
        return generate_candidates(text, analyzer,
                                   self._vectorizer, self._index)

    def _candidates_to_features(self, candidates):
        return candidates_to_features(candidates,
                                      self._related_matrix,
                                      self._broader_matrix,
                                      self._narrower_matrix,
                                      self._collection_matrix,
                                      self._doc_freq,
                                      self._subj_freq,
                                      self._idf)

    def _prepare_terms(self, graph, vocab, params):
        if annif.util.boolean(params['use_hidden_labels']):
            label_props = [SKOS.altLabel, SKOS.hiddenLabel]
        else:
            label_props = [SKOS.altLabel]

        terms = []
        subject_ids = []
        for subj_id, uri, pref, _ in vocab.subjects.active:
            subject_ids.append(subj_id)
            terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))

            for label in get_subject_labels(graph, uri, label_props,
                                            params['language']):
                terms.append(Term(subject_id=subj_id,
                                  label=label,
                                  is_pref=False))

        return (terms, subject_ids)

    def _prepare_relations(self, graph, vocab):
        self._broader_matrix = make_relation_matrix(
            graph, vocab, SKOS.broader)
        self._narrower_matrix = make_relation_matrix(
            graph, vocab, SKOS.narrower)
        self._related_matrix = make_relation_matrix(
            graph, vocab, SKOS.related)
        self._collection_matrix = make_collection_matrix(graph, vocab)

    def _prepare_train_index(self, vocab, analyzer, params):
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

    def _prepare_train_data(self, corpus, vocab, analyzer, n_jobs):
        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        train_x = []
        train_y = []

        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        cg_args = {
            'analyzer': analyzer,
            'vectorizer': self._vectorizer,
            'index': self._index
        }

        with pool_class(jobs,
                        initializer=MLLMCandidateGenerator.init,
                        initargs=(cg_args,)) as pool:
            params = (([vocab.subjects.by_uri(uri) for uri in doc.uris],
                       doc.text)
                      for doc in corpus.documents)
            for doc_subject_ids, candidates in pool.starmap(
                    MLLMCandidateGenerator.generate_candidates, params, 10):
                self._subj_freq.update(doc_subject_ids)
                self._doc_freq.update([c.subject_id for c in candidates])
                train_x.append(candidates)
                train_y += [(c.subject_id in doc_subject_ids)
                            for c in candidates]

        return (train_x, train_y)


    def _calculate_idf(self, subject_ids, doc_count):
        idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            idf[subj_id] = math.log((doc_count + 1) /
                                    (self._doc_freq[subj_id] + 1)) + 1

        return idf

    def _prepare_features(self, train_x, n_jobs):
        fc_args = {
            'related_matrix': self._related_matrix,
            'broader_matrix': self._broader_matrix,
            'narrower_matrix': self._narrower_matrix,
            'collection_matrix': self._collection_matrix,
            'doc_freq': self._doc_freq,
            'subj_freq': self._subj_freq,
            'idf': self._idf
        }

        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        with pool_class(jobs,
                        initializer=MLLMFeatureConverter.init,
                        initargs=(fc_args,)) as pool:
            features = pool.map(
                MLLMFeatureConverter.candidates_to_features, train_x, 10)

        return features

    def prepare_train(self, corpus, vocab, analyzer, params, n_jobs):
        # create an index from the vocabulary terms
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # convert the corpus into train data
        train_x, train_y = self._prepare_train_data(
            corpus, vocab, analyzer, n_jobs)

        # precalculate idf values for all candidate subjects
        self._idf = self._calculate_idf(subject_ids, len(train_x))

        # convert the train data into feature values
        features = self._prepare_features(train_x, n_jobs)

        return (np.vstack(features), np.array(train_y))


    def _create_classifier(self, params):
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))

    def train(self, train_x, train_y, params):
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)

    def _prediction_to_list(self, scores, candidates):
        # score[1] is the predicted probability of the positive class
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)

    def save(self, filename):
        return joblib.dump(self, filename)

    @staticmethod
    def load(filename):
        return joblib.load(filename)
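For orientation, a minimal usage sketch of the MLLMModel API defined above; the corpus, vocab, analyzer, params and n_jobs values are assumed to be supplied by the surrounding Annif backend, and the filename is arbitrary:

# Hypothetical driver code: corpus, vocab, analyzer, params and n_jobs
# come from the surrounding Annif backend and are assumed available here.
model = MLLMModel()
train_x, train_y = model.prepare_train(corpus, vocab, analyzer, params, n_jobs)
model.train(train_x, train_y, params)
model.save('mllm-model.gz')

# later: load the model and rank subjects for a new document
model = MLLMModel.load('mllm-model.gz')
candidates = model.generate_candidates(document_text, analyzer)
for score, subject_id in model.predict(candidates):  # sorted best first
    print(subject_id, score)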