Passed
Pull Request — master (#462) by Osma
created 02:21

annif.lexical.mllm.MLLMModel._prepare_terms()   B

Complexity: Conditions 7
Size: Total Lines 22, Code Lines 19
Duplication: Lines 0, Ratio 0 %
Importance: Changes 0
Metric   Value
cc       7
eloc     19
nop      4
dl       0
loc      22
rs       8
c        0
b        0
f        0
"""MLLM (Maui-like Lexical Matching) model for Annif"""

import collections
import math
import joblib
from statistics import mean
from enum import IntEnum
import numpy as np
from rdflib import URIRef
from rdflib.namespace import SKOS
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.util
from annif.lexical.tokenset import TokenSet, TokenSetIndex


# Term: one (preferred or alternative) label of a vocabulary subject
Term = collections.namedtuple('Term', 'subject_id label is_pref')

# Match: a single occurrence of a subject's label in one sentence of a document
Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')

# Candidate: per-document aggregate of all the Matches for one subject
Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

# Feature: column indices of the feature matrix given to the classifier
Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related collection',
    start=0)


class MLLMModel:
    """Maui-like Lexical Matching model"""

    def _conflate_matches(self, matches, doc_length):
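        """Combine the Matches of each subject into a single Candidate"""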
        subj_matches = collections.defaultdict(list)
        for match in matches:
            subj_matches[match.subject_id].append(match)
        return [
            Candidate(
                doc_length=doc_length,
                subject_id=subject_id,
                freq=len(matches) / doc_length,
                is_pref=mean((float(m.is_pref) for m in matches)),
                n_tokens=mean((m.n_tokens for m in matches)),
                ambiguity=mean((m.ambiguity for m in matches)),
                first_occ=matches[0].pos / doc_length,
                last_occ=matches[-1].pos / doc_length,
                spread=(matches[-1].pos - matches[0].pos) / doc_length
            )
            for subject_id, matches in subj_matches.items()]

    def generate_candidates(self, text, analyzer):
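        """Find the subjects matching the given document text and return them as Candidates"""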
        sentences = analyzer.tokenize_sentences(text)
        sent_tokens = self._vectorizer.transform(sentences)
        matches = []

        for sent_idx, token_matrix in enumerate(sent_tokens):
            tset = TokenSet(token_matrix.nonzero()[1])
            for ts, ambiguity in self._index.search(tset):
                matches.append(Match(subject_id=ts.subject_id,
                                     is_pref=ts.is_pref,
                                     n_tokens=len(ts),
                                     pos=sent_idx,
                                     ambiguity=ambiguity))

        return self._conflate_matches(matches, len(sentences))

    def _candidates_to_features(self, candidates):
        """Convert a list of Candidates to a NumPy feature matrix"""
        matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
        c_ids = [c.subject_id for c in candidates]
        c_vec = np.zeros(self._related_matrix.shape[0], dtype=np.bool)
        c_vec[c_ids] = True
        broader = self._broader_matrix.multiply(c_vec).sum(axis=1)
        narrower = self._narrower_matrix.multiply(c_vec).sum(axis=1)
        related = self._related_matrix.multiply(c_vec).sum(axis=1)
        collection = self._collection_matrix.multiply(c_vec).sum(axis=1)
        for idx, c in enumerate(candidates):
            subj = c.subject_id
            matrix[idx, Feature.freq] = c.freq
            matrix[idx, Feature.doc_freq] = self._doc_freq[subj]
            matrix[idx, Feature.subj_freq] = self._subj_freq.get(subj, 1) - 1
            matrix[idx, Feature.tfidf] = c.freq * self._idf[subj]
            matrix[idx, Feature.is_pref] = c.is_pref
            matrix[idx, Feature.n_tokens] = c.n_tokens
            matrix[idx, Feature.ambiguity] = c.ambiguity
            matrix[idx, Feature.first_occ] = c.first_occ
            matrix[idx, Feature.last_occ] = c.last_occ
            matrix[idx, Feature.spread] = c.spread
            matrix[idx, Feature.doc_length] = c.doc_length
            matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
            matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
            matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
            matrix[idx, Feature.collection] = collection[subj, 0] / len(c_ids)
        return matrix

    def _prepare_terms(self, graph, vocab, params):
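        """Collect the preferred and alternative labels of all non-deprecated subjects as Terms"""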
        terms = []
        subject_ids = []
        for subj_id, (uri, pref, _) in enumerate(vocab.subjects):
            if pref is None:
                continue  # deprecated subject
            subject_ids.append(subj_id)
            terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))

            if annif.util.boolean(params['use_hidden_labels']):
                label_props = [SKOS.altLabel, SKOS.hiddenLabel]
            else:
                label_props = [SKOS.altLabel]

            for prop in label_props:
                for label in graph.objects(URIRef(uri), prop):
                    if label.language != params['language']:
                        continue
                    terms.append(Term(subject_id=subj_id,
                                      label=str(label),
                                      is_pref=False))
        return (terms, subject_ids)

    def _make_collection_matrix(self, graph, vocab):
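        """Build a boolean subject-by-subject matrix marking subjects that belong to the same SKOS collection"""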
        # make an index with all collection members
        c_members = collections.defaultdict(list)
        for coll, member in graph.subject_objects(SKOS.member):
            member_id = vocab.subjects.by_uri(str(member), warnings=False)
            if member_id is not None:
                c_members[str(coll)].append(member_id)

        n_subj = len(vocab.subjects)
        c_matrix = lil_matrix((n_subj, n_subj), dtype=np.bool)

        # populate the matrix by looking up members of the same collections
        for subj_id, (uri, pref, _) in enumerate(vocab.subjects):
            if pref is None:
                continue  # deprecated subject
            for coll in graph.subjects(SKOS.member, URIRef(uri)):
                other_ids = c_members[str(coll)]
                c_matrix[subj_id, other_ids] = True

        return c_matrix

    def _prepare_relations(self, graph, vocab):
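        """Build boolean matrices for the broader, narrower, related and collection relations between subjects"""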
        n_subj = len(vocab.subjects)
        self._broader_matrix = lil_matrix((n_subj, n_subj), dtype=np.bool)
        self._narrower_matrix = lil_matrix((n_subj, n_subj), dtype=np.bool)
        self._related_matrix = lil_matrix((n_subj, n_subj), dtype=np.bool)
        self._collection_matrix = self._make_collection_matrix(graph, vocab)

        prop_matrix = [
            (SKOS.broader, self._broader_matrix),
            (SKOS.narrower, self._narrower_matrix),
            (SKOS.related, self._related_matrix)
        ]

        for subj_id, (uri, pref, _) in enumerate(vocab.subjects):
            if pref is None:
                continue  # deprecated subject

            for prop, matrix in prop_matrix:
                for other in graph.objects(URIRef(uri), prop):
                    other_id = vocab.subjects.by_uri(str(other),
                                                     warnings=False)
                    if other_id is not None:
                        matrix[subj_id, other_id] = True

    def _prepare_train_index(self, vocab, analyzer, params):
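        """Build the term index and relation matrices from the vocabulary; return the ids of the usable subjects"""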
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

    def prepare_train(self, corpus, vocab, analyzer, params):
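        """Turn the training corpus into a feature matrix and a boolean target vector for the classifier"""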
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        doc_count = 0
        train_x = []
        train_y = []
        for idx, doc in enumerate(corpus.documents):
            doc_subject_ids = [vocab.subjects.by_uri(uri)
                               for uri in doc.uris]
            self._subj_freq.update(doc_subject_ids)
            candidates = self.generate_candidates(doc.text, analyzer)
            self._doc_freq.update([c.subject_id for c in candidates])
            train_x.append(candidates)
            train_y += [(c.subject_id in doc_subject_ids) for c in candidates]
            doc_count += 1

        # precalculate idf values for candidate subjects
        self._idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            self._idf[subj_id] = math.log((doc_count + 1) /
                                          (self._doc_freq[subj_id] + 1)) + 1
        return (np.vstack([self._candidates_to_features(candidates)
                           for candidates in train_x]), np.array(train_y))

    def _create_classifier(self, params):
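        """Create a bagging ensemble of decision trees using the given hyperparameters"""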
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))

    def train(self, train_x, train_y, params):
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)

    def _prediction_to_list(self, scores, candidates):
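        """Convert classifier scores into (score, subject_id) pairs sorted by descending score"""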
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
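        """Return (score, subject_id) pairs for the given candidates, best first"""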
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)

    def save(self, filename):
        return joblib.dump(self, filename)

    @staticmethod
    def load(filename):
        return joblib.load(filename)
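
For orientation, here is a rough sketch of how this class is typically driven end to end. The corpus, vocab and analyzer objects and the parameter values are hypothetical placeholders for illustration; only the MLLMModel methods themselves come from the code above.

# Hypothetical usage sketch; corpus, vocab, analyzer and the parameter values
# are assumed placeholders, not defined in this file.
params = {
    'language': 'en',
    'use_hidden_labels': 'false',
    'min_samples_leaf': '20',
    'max_leaf_nodes': '1000',
    'max_samples': '0.9',
}

model = MLLMModel()
train_x, train_y = model.prepare_train(corpus, vocab, analyzer, params)
model.train(train_x, train_y, params)

# score the subject candidates found in a new document
candidates = model.generate_candidates("Some document text ...", analyzer)
for score, subject_id in model.predict(candidates)[:10]:
    print(subject_id, score)

# persist and reload the trained model
model.save('mllm-model.gz')
model = MLLMModel.load('mllm-model.gz')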