Passed: Pull Request on master (#511) by Osma, created 02:00

annif.lexical.mllm.candidates_to_features() (grade A)

Complexity: Conditions 2
Size: Total Lines 33, Code Lines 31
Duplication: Lines 0, Ratio 0 %
Importance: Changes 0
Metric  Value
cc      2
eloc    31
nop     8
dl      0
loc     33
rs      9.1359
c       0
b       0
f       0

How to fix

Many Parameters

Methods with many parameters are not only hard to understand; their parameter lists also tend to become inconsistent as the method grows to need more, or different, data.

There are several approaches to avoid long parameter lists:

- Group values that travel together into a parameter object and pass that instead.
- Pass a whole object rather than several attributes extracted from it.
- Split the method into smaller methods that each need fewer inputs.

A sketch of the first approach follows, and after it the module under review.
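As an illustration only (not part of this pull request), the seven pieces of shared model state could be grouped into one parameter object, so that candidates_to_features() receives two arguments instead of eight. The FeatureData name and dataclass layout below are hypothetical; the fields mirror the arguments visible in the code under review:

# Hypothetical sketch; names are not from the pull request.
from dataclasses import dataclass
from typing import Any


@dataclass(frozen=True)
class FeatureData:
    related_matrix: Any      # sparse SKOS:related matrix
    broader_matrix: Any      # sparse SKOS:broader matrix
    narrower_matrix: Any     # sparse SKOS:narrower matrix
    collection_matrix: Any   # sparse collection membership matrix
    doc_freq: Any            # Counter of candidate occurrences per subject
    subj_freq: Any           # Counter of assigned subjects ("keyphraseness")
    idf: Any                 # dict of smoothed idf values per subject


def candidates_to_features(candidates, data: FeatureData):
    """Same computation as the reviewed function, reading the shared
    state from the parameter object (data.related_matrix and so on)."""
    ...

The call sites already gather exactly these seven values, both as MLLMModel attributes and as the fc_args dict passed to MLLMFeatureConverter, so a parameter object would only formalize a grouping the code performs anyway.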

"""MLLM (Maui-like Lexical Matching) model for Annif"""

import collections
import math
import joblib
from statistics import mean
from enum import IntEnum
import numpy as np
from rdflib.namespace import SKOS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.util
import annif.parallel
from annif.lexical.tokenset import TokenSet, TokenSetIndex
from annif.lexical.util import get_subject_labels
from annif.lexical.util import make_relation_matrix, make_collection_matrix


Term = collections.namedtuple('Term', 'subject_id label is_pref')

Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')

Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related collection',
    start=0)

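# Collapse raw per-sentence matches into one Candidate per subject,
# aggregating match frequency, mean ambiguity and positional statistics.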
def conflate_matches(matches, doc_length):
    subj_matches = collections.defaultdict(list)
    for match in matches:
        subj_matches[match.subject_id].append(match)
    return [
        Candidate(
            doc_length=doc_length,
            subject_id=subject_id,
            freq=len(matches) / doc_length,
            is_pref=mean((float(m.is_pref) for m in matches)),
            n_tokens=mean((m.n_tokens for m in matches)),
            ambiguity=mean((m.ambiguity for m in matches)),
            first_occ=matches[0].pos / doc_length,
            last_occ=matches[-1].pos / doc_length,
            spread=(matches[-1].pos - matches[0].pos) / doc_length
        )
        for subject_id, matches in subj_matches.items()]

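# Find every vocabulary term whose token set occurs in the document:
# tokenize into sentences, vectorize each sentence, and search the token
# set index, recording the sentence position of each match.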
def generate_candidates(text, analyzer, vectorizer, index):
    sentences = analyzer.tokenize_sentences(text)
    sent_tokens = vectorizer.transform(sentences)
    matches = []

    for sent_idx, token_matrix in enumerate(sent_tokens):
        tset = TokenSet(token_matrix.nonzero()[1])
        for ts, ambiguity in index.search(tset):
            matches.append(Match(subject_id=ts.subject_id,
                                 is_pref=ts.is_pref,
                                 n_tokens=len(ts),
                                 pos=sent_idx,
                                 ambiguity=ambiguity))

    return conflate_matches(matches, len(sentences))


def candidates_to_features(candidates,
                           related_matrix, broader_matrix,
                           narrower_matrix, collection_matrix,
                           doc_freq, subj_freq, idf):
    """Convert a list of Candidates to a NumPy feature matrix"""

    matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
    c_ids = [c.subject_id for c in candidates]
    c_vec = np.zeros(related_matrix.shape[0], dtype=bool)
    c_vec[c_ids] = True
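    # Relation features: with c_vec masking the candidate subjects, count
    # for each subject how many candidates it is linked to through SKOS
    # broader/narrower/related relations or shared collection membership.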
    broader = broader_matrix.multiply(c_vec).sum(axis=1)
    narrower = narrower_matrix.multiply(c_vec).sum(axis=1)
    related = related_matrix.multiply(c_vec).sum(axis=1)
    collection = collection_matrix.multiply(c_vec).T.dot(
        collection_matrix).sum(axis=0)
    for idx, c in enumerate(candidates):
        subj = c.subject_id
        matrix[idx, Feature.freq] = c.freq
        matrix[idx, Feature.doc_freq] = doc_freq[subj]
        matrix[idx, Feature.subj_freq] = subj_freq.get(subj, 1) - 1
        matrix[idx, Feature.tfidf] = c.freq * idf[subj]
        matrix[idx, Feature.is_pref] = c.is_pref
        matrix[idx, Feature.n_tokens] = c.n_tokens
        matrix[idx, Feature.ambiguity] = c.ambiguity
        matrix[idx, Feature.first_occ] = c.first_occ
        matrix[idx, Feature.last_occ] = c.last_occ
        matrix[idx, Feature.spread] = c.spread
        matrix[idx, Feature.doc_length] = c.doc_length
        matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
        matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
        matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
        matrix[idx, Feature.collection] = collection[0, subj] / len(c_ids)
    return matrix

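# Worker classes for annif.parallel: the pool initializer calls init()
# once per worker process, storing the shared args dict at class level so
# heavy state is not re-sent with every work item.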
class MLLMCandidateGenerator(annif.parallel.BaseWorker):

    @classmethod
    def generate_candidates(cls, doc_subject_ids, text):
        args = cls.args
        candidates = generate_candidates(
            text, args['analyzer'], args['vectorizer'], args['index'])
        return doc_subject_ids, candidates


class MLLMFeatureConverter(annif.parallel.BaseWorker):

    @classmethod
    def candidates_to_features(cls, candidates):
        return candidates_to_features(candidates, **cls.args)

class MLLMModel:
    """Maui-like Lexical Matching model"""

    def generate_candidates(self, text, analyzer):
        return generate_candidates(text, analyzer,
                                   self._vectorizer, self._index)

    def _candidates_to_features(self, candidates):
        return candidates_to_features(candidates,
                                      self._related_matrix,
                                      self._broader_matrix,
                                      self._narrower_matrix,
                                      self._collection_matrix,
                                      self._doc_freq,
                                      self._subj_freq,
                                      self._idf)

    def _prepare_terms(self, graph, vocab, params):
        if annif.util.boolean(params['use_hidden_labels']):
            label_props = [SKOS.altLabel, SKOS.hiddenLabel]
        else:
            label_props = [SKOS.altLabel]

        terms = []
        subject_ids = []
        for subj_id, uri, pref, _ in vocab.subjects.active:
            subject_ids.append(subj_id)
            terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))

            for label in get_subject_labels(graph, uri, label_props,
                                            params['language']):
                terms.append(Term(subject_id=subj_id,
                                  label=label,
                                  is_pref=False))

        return (terms, subject_ids)

    def _prepare_relations(self, graph, vocab):
        self._broader_matrix = make_relation_matrix(
            graph, vocab, SKOS.broader)
        self._narrower_matrix = make_relation_matrix(
            graph, vocab, SKOS.narrower)
        self._related_matrix = make_relation_matrix(
            graph, vocab, SKOS.related)
        self._collection_matrix = make_collection_matrix(graph, vocab)

    def _prepare_train_index(self, vocab, analyzer, params):
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

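    # Smoothed inverse document frequency, log((N + 1) / (df + 1)) + 1,
    # the same smooth_idf formulation used by scikit-learn.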
    def _calculate_idf(self, subject_ids, doc_count):
        idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            idf[subj_id] = math.log((doc_count + 1) /
                                    (self._doc_freq[subj_id] + 1)) + 1

        return idf

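    # Training data is built in two parallel phases: candidate generation
    # for each document, then conversion of candidate lists into feature
    # matrices using the shared matrices and corpus statistics.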
    def prepare_train(self, corpus, vocab, analyzer, params, n_jobs):
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        doc_count = 0
        train_x = []
        train_y = []

        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        cg_args = {
            'analyzer': analyzer,
            'vectorizer': self._vectorizer,
            'index': self._index
        }

        with pool_class(jobs,
                        initializer=MLLMCandidateGenerator.init,
                        initargs=(cg_args,)) as pool:
            params = (([vocab.subjects.by_uri(uri) for uri in doc.uris],
                       doc.text)
                      for doc in corpus.documents)
            for doc_subject_ids, candidates in pool.starmap(
                    MLLMCandidateGenerator.generate_candidates, params, 10):

                self._doc_freq.update([c.subject_id for c in candidates])
                train_x.append(candidates)
                train_y += [(c.subject_id in doc_subject_ids)
                            for c in candidates]
                doc_count += 1

        # precalculate idf values for all candidate subjects
        self._idf = self._calculate_idf(subject_ids, doc_count)

        fc_args = {
            'related_matrix': self._related_matrix,
            'broader_matrix': self._broader_matrix,
            'narrower_matrix': self._narrower_matrix,
            'collection_matrix': self._collection_matrix,
            'doc_freq': self._doc_freq,
            'subj_freq': self._subj_freq,
            'idf': self._idf
        }

        with pool_class(jobs,
                        initializer=MLLMFeatureConverter.init,
                        initargs=(fc_args,)) as pool:
            features = pool.map(
                MLLMFeatureConverter.candidates_to_features, train_x, 10)
        return (np.vstack(features), np.array(train_y))

    def _create_classifier(self, params):
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))

    def train(self, train_x, train_y, params):
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)

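    # predict_proba returns [P(False), P(True)] per candidate;
    # score[1] is the estimated probability that the subject applies.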
    def _prediction_to_list(self, scores, candidates):
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)

    def save(self, filename):
        return joblib.dump(self, filename)

    @staticmethod
    def load(filename):
        return joblib.load(filename)