Passed · Pull Request — master (#462) · by Osma · 02:32 · created

annif.lexical.mllm.MLLMModel._conflate_matches()  (rated A)

Complexity: Conditions 2
Size: Total Lines 17, Code Lines 16
Duplication: Lines 0, Ratio 0 %
Importance: Changes 0

Metric  Value
cc      2
eloc    16
nop     3
dl      0
loc     17
rs      9.6
c       0
b       0
f       0

Full source of annif.lexical.mllm:
"""MLLM (Maui-like Lexical Matching) model for Annif"""

import collections
import math
from statistics import mean
from enum import IntEnum
import numpy as np
from rdflib import URIRef
from rdflib.namespace import SKOS
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.util
from annif.lexical.tokenset import TokenSet, TokenSetIndex


Term = collections.namedtuple('Term', 'subject_id label is_pref')

Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')

Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related',
    start=0)

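# MLLMModel ties these pieces together: vocabulary labels are indexed as
# token sets, candidate subjects are generated for each document, turned
# into the Feature vectors defined above and scored with a bagged
# decision-tree classifier.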
class MLLMModel:
    """Maui-like Lexical Matching model"""

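    # Merge all Matches that point to the same subject into a single
    # Candidate, averaging per-match statistics and expressing positions
    # as fractions of the document length (number of sentences).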
    def _conflate_matches(self, matches, doc_length):
        subj_matches = collections.defaultdict(list)
        for match in matches:
            subj_matches[match.subject_id].append(match)
        return [
            Candidate(
                doc_length=doc_length,
                subject_id=subject_id,
                freq=len(matches) / doc_length,
                is_pref=mean((float(m.is_pref) for m in matches)),
                n_tokens=mean((m.n_tokens for m in matches)),
                ambiguity=mean((m.ambiguity for m in matches)),
                first_occ=matches[0].pos / doc_length,
                last_occ=matches[-1].pos / doc_length,
                spread=(matches[-1].pos - matches[0].pos) / doc_length
            )
            for subject_id, matches in subj_matches.items()]

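    # Split the text into sentences, vectorize each sentence with the
    # vocabulary-derived CountVectorizer and look its tokens up in the
    # TokenSetIndex; every hit becomes a Match tagged with its sentence
    # position, and the matches are conflated into Candidates.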
    def generate_candidates(self, text, analyzer):
        sentences = analyzer.tokenize_sentences(text)
        sent_tokens = self._vectorizer.transform(sentences)
        matches = []

        for sent_idx, token_matrix in enumerate(sent_tokens):
            tset = TokenSet(token_matrix.nonzero()[1])
            for ts, ambiguity in self._index.search(tset):
                matches.append(Match(subject_id=ts.subject_id,
                                     is_pref=ts.is_pref,
                                     n_tokens=len(ts),
                                     pos=sent_idx,
                                     ambiguity=ambiguity))

        return self._conflate_matches(matches, len(sentences))

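    # The broader/narrower/related features count how many of the other
    # candidate subjects are connected to each candidate in the relation
    # matrices, normalized by the total number of candidates.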
    def _candidates_to_features(self, candidates):
        """Convert a list of Candidates to a NumPy feature matrix"""
        matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
        c_ids = [c.subject_id for c in candidates]
        c_vec = np.zeros(self._related_matrix.shape[0], dtype=bool)
        c_vec[c_ids] = True
        broader = self._broader_matrix.multiply(c_vec).sum(axis=1)
        narrower = self._narrower_matrix.multiply(c_vec).sum(axis=1)
        related = self._related_matrix.multiply(c_vec).sum(axis=1)
        for idx, c in enumerate(candidates):
            subj = c.subject_id
            matrix[idx, Feature.freq] = c.freq
            matrix[idx, Feature.doc_freq] = self._doc_freq[subj]
            matrix[idx, Feature.subj_freq] = self._subj_freq.get(subj, 1) - 1
            matrix[idx, Feature.tfidf] = c.freq * self._idf[subj]
            matrix[idx, Feature.is_pref] = c.is_pref
            matrix[idx, Feature.n_tokens] = c.n_tokens
            matrix[idx, Feature.ambiguity] = c.ambiguity
            matrix[idx, Feature.first_occ] = c.first_occ
            matrix[idx, Feature.last_occ] = c.last_occ
            matrix[idx, Feature.spread] = c.spread
            matrix[idx, Feature.doc_length] = c.doc_length
            matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
            matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
            matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
        return matrix

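    # Collect the vocabulary terms to index: the preferred label of every
    # non-deprecated subject plus its alternative (and optionally hidden)
    # labels in the configured language.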
    def _prepare_terms(self, graph, vocab, params):
        terms = []
        subject_ids = []
        for subj_id, (uri, pref, _) in enumerate(vocab.subjects):
            if pref is None:
                continue  # deprecated subject
            subject_ids.append(subj_id)
            terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))

            if annif.util.boolean(params['use_hidden_labels']):
                label_props = [SKOS.altLabel, SKOS.hiddenLabel]
            else:
                label_props = [SKOS.altLabel]

            for prop in label_props:
                for label in graph.objects(URIRef(uri), prop):
                    if label.language != params['language']:
                        continue
                    terms.append(Term(subject_id=subj_id,
                                      label=str(label),
                                      is_pref=False))
        return (terms, subject_ids)

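    # Build boolean sparse adjacency matrices for the SKOS broader, narrower
    # and related relations between subjects; these back the corresponding
    # candidate features.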
    def _prepare_relations(self, graph, vocab):
        n_subj = len(vocab.subjects)
        self._broader_matrix = lil_matrix((n_subj, n_subj), dtype=bool)
        self._narrower_matrix = lil_matrix((n_subj, n_subj), dtype=bool)
        self._related_matrix = lil_matrix((n_subj, n_subj), dtype=bool)

        prop_matrix = [
            (SKOS.broader, self._broader_matrix),
            (SKOS.narrower, self._narrower_matrix),
            (SKOS.related, self._related_matrix)
        ]

        for subj_id, (uri, pref, _) in enumerate(vocab.subjects):
            if pref is None:
                continue  # deprecated subject

            for prop, matrix in prop_matrix:
                for other in graph.objects(URIRef(uri), prop):
                    other_id = vocab.subjects.by_uri(str(other),
                                                     warnings=False)
                    if other_id is not None:
                        matrix[subj_id, other_id] = True

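    # Fit a binary CountVectorizer on all term labels and add each label's
    # token set to the TokenSetIndex used for matching; returns the ids of
    # the non-deprecated subjects.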
    def _prepare_train_index(self, vocab, analyzer, params):
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

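    # Turn the training corpus into a feature matrix and a boolean target
    # vector: candidates are generated for every document, document
    # frequencies of candidate subjects and counts of manually assigned
    # subjects are accumulated, and a candidate is labelled True when it
    # is among the document's assigned subjects.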
    def prepare_train(self, corpus, vocab, analyzer, params):
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        doc_count = 0
        train_x = []
        train_y = []
        for idx, doc in enumerate(corpus.documents):
            doc_subject_ids = [vocab.subjects.by_uri(uri)
                               for uri in doc.uris]
            self._subj_freq.update(doc_subject_ids)
            candidates = self.generate_candidates(doc.text, analyzer)
            self._doc_freq.update([c.subject_id for c in candidates])
            train_x.append(candidates)
            train_y += [(c.subject_id in doc_subject_ids) for c in candidates]
            doc_count += 1

        # precalculate idf values for candidate subjects
        self._idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            self._idf[subj_id] = math.log((doc_count + 1) /
                                          (self._doc_freq[subj_id] + 1)) + 1
        return (np.vstack([self._candidates_to_features(candidates)
                           for candidates in train_x]), np.array(train_y))

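    # The classifier is a bagging ensemble of decision trees; leaf size,
    # tree size and the bootstrap sample fraction come from the
    # min_samples_leaf, max_leaf_nodes and max_samples parameters.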
    def _create_classifier(self, params):
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))

    def train(self, train_x, train_y, params):
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)

    def _prediction_to_list(self, scores, candidates):
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)
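For orientation, a minimal sketch of how these methods fit together. The corpus, vocab, analyzer and document_text arguments and the parameter values below are illustrative assumptions, not part of this file; in Annif this wiring is normally done by the corresponding backend.

def train_and_predict(corpus, vocab, analyzer, document_text):
    """Illustrative helper, not part of mllm.py: corpus, vocab and analyzer
    are assumed to be Annif objects providing the interfaces used above."""
    params = {
        'use_hidden_labels': 'false',  # assumed example values
        'language': 'en',
        'min_samples_leaf': '20',
        'max_leaf_nodes': '1000',
        'max_samples': '0.9',
    }
    model = MLLMModel()
    # index the vocabulary, build the training data and fit the classifier
    train_x, train_y = model.prepare_train(corpus, vocab, analyzer, params)
    model.train(train_x, train_y, params)
    # generate candidates for a new document and score them;
    # predict() returns (score, subject_id) pairs sorted best-first
    candidates = model.generate_candidates(document_text, analyzer)
    return model.predict(candidates)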