Passed — Pull Request on master (#462), created by Osma, 03:28

annif.lexical.mllm.MLLMModel._prepare_terms()   B

Complexity
    Conditions   7

Size
    Total Lines  22
    Code Lines   19

Duplication
    Lines        0
    Ratio        0 %

Importance
    Changes      0

Metric  Value
cc      7
eloc    19
nop     4
dl      0
loc     22
rs      8
c       0
b       0
f       0
"""MLLM (Maui-like Lexical Matching) model for Annif"""

import collections
import math
import joblib
from statistics import mean
from enum import IntEnum
import numpy as np
from rdflib import URIRef
from rdflib.namespace import SKOS
from scipy.sparse import lil_matrix, csc_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.util
from annif.lexical.tokenset import TokenSet, TokenSetIndex


# A label (preferred or alternative) of a vocabulary subject
Term = collections.namedtuple('Term', 'subject_id label is_pref')

# A single occurrence of a term in a document; pos is the sentence index
Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')

# Aggregated per-subject statistics of all matches within one document
Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

# Columns of the feature matrix passed to the classifier
Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related collection',
    start=0)


class MLLMModel:
    """Maui-like Lexical Matching model"""

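    # Merge the per-sentence matches of each subject into a single Candidate
    # carrying aggregate statistics: frequency, mean match properties, and
    # first/last occurrence and spread relative to the document length.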
    def _conflate_matches(self, matches, doc_length):
        subj_matches = collections.defaultdict(list)
        for match in matches:
            subj_matches[match.subject_id].append(match)
        return [
            Candidate(
                doc_length=doc_length,
                subject_id=subject_id,
                freq=len(matches) / doc_length,
                is_pref=mean((float(m.is_pref) for m in matches)),
                n_tokens=mean((m.n_tokens for m in matches)),
                ambiguity=mean((m.ambiguity for m in matches)),
                first_occ=matches[0].pos / doc_length,
                last_occ=matches[-1].pos / doc_length,
                spread=(matches[-1].pos - matches[0].pos) / doc_length
            )
            for subject_id, matches in subj_matches.items()]

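    # Find candidate subjects for a document: split it into sentences, look
    # up each sentence's token set in the term index and conflate the
    # resulting matches into per-subject Candidates.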
    def generate_candidates(self, text, analyzer):
        sentences = analyzer.tokenize_sentences(text)
        sent_tokens = self._vectorizer.transform(sentences)
        matches = []

        for sent_idx, token_matrix in enumerate(sent_tokens):
            tset = TokenSet(token_matrix.nonzero()[1])
            for ts, ambiguity in self._index.search(tset):
                matches.append(Match(subject_id=ts.subject_id,
                                     is_pref=ts.is_pref,
                                     n_tokens=len(ts),
                                     pos=sent_idx,
                                     ambiguity=ambiguity))

        return self._conflate_matches(matches, len(sentences))

    def _candidates_to_features(self, candidates):
        """Convert a list of Candidates to a NumPy feature matrix"""
        matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
        c_ids = [c.subject_id for c in candidates]
        c_vec = np.zeros(self._related_matrix.shape[0], dtype=bool)
        c_vec[c_ids] = True
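        # Relation features: for each subject, count how many of the current
        # candidate subjects (masked by c_vec) appear among its broader,
        # narrower and related concepts, or share a skos:Collection with it;
        # the counts are normalized by the number of candidates below.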
        broader = self._broader_matrix.multiply(c_vec).sum(axis=1)
        narrower = self._narrower_matrix.multiply(c_vec).sum(axis=1)
        related = self._related_matrix.multiply(c_vec).sum(axis=1)
        collection = self._collection_matrix.multiply(c_vec).T.dot(
            self._collection_matrix).sum(axis=0)
        for idx, c in enumerate(candidates):
            subj = c.subject_id
            matrix[idx, Feature.freq] = c.freq
            matrix[idx, Feature.doc_freq] = self._doc_freq[subj]
            matrix[idx, Feature.subj_freq] = self._subj_freq.get(subj, 1) - 1
            matrix[idx, Feature.tfidf] = c.freq * self._idf[subj]
            matrix[idx, Feature.is_pref] = c.is_pref
            matrix[idx, Feature.n_tokens] = c.n_tokens
            matrix[idx, Feature.ambiguity] = c.ambiguity
            matrix[idx, Feature.first_occ] = c.first_occ
            matrix[idx, Feature.last_occ] = c.last_occ
            matrix[idx, Feature.spread] = c.spread
            matrix[idx, Feature.doc_length] = c.doc_length
            matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
            matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
            matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
            matrix[idx, Feature.collection] = collection[0, subj] / len(c_ids)
        return matrix

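    # Collect terms for all non-deprecated subjects: the preferred label from
    # the vocabulary plus skos:altLabel (and skos:hiddenLabel when
    # use_hidden_labels is enabled) values in the configured language.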
    def _prepare_terms(self, graph, vocab, params):
        terms = []
        subject_ids = []
        for subj_id, (uri, pref, _) in enumerate(vocab.subjects):
            if pref is None:
                continue  # deprecated subject
            subject_ids.append(subj_id)
            terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))

            if annif.util.boolean(params['use_hidden_labels']):
                label_props = [SKOS.altLabel, SKOS.hiddenLabel]
            else:
                label_props = [SKOS.altLabel]

            for prop in label_props:
                for label in graph.objects(URIRef(uri), prop):
                    if label.language != params['language']:
                        continue
                    terms.append(Term(subject_id=subj_id,
                                      label=str(label),
                                      is_pref=False))
        return (terms, subject_ids)

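    # Build a boolean subject-by-subject matrix where cell (i, j) is True if
    # subject i is connected to subject j via the given SKOS property.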
    def _make_relation_matrix(self, graph, vocab, property):
        n_subj = len(vocab.subjects)
        matrix = lil_matrix((n_subj, n_subj), dtype=bool)

        for subj_id, (uri, pref, _) in enumerate(vocab.subjects):
            if pref is None:
                continue  # deprecated subject

            for other in graph.objects(URIRef(uri), property):
                other_id = vocab.subjects.by_uri(str(other),
                                                 warnings=False)
                if other_id is not None:
                    matrix[subj_id, other_id] = True

        return csc_matrix(matrix)

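    # Build a boolean collection-by-subject matrix of skos:member
    # relationships between SKOS collections and vocabulary subjects.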
    def _make_collection_matrix(self, graph, vocab):
        # make an index with all collection members
        c_members = collections.defaultdict(list)
        for coll, member in graph.subject_objects(SKOS.member):
            member_id = vocab.subjects.by_uri(str(member), warnings=False)
            if member_id is not None:
                c_members[str(coll)].append(member_id)

        c_matrix = lil_matrix((len(c_members), len(vocab.subjects)),
                              dtype=bool)

        # populate the matrix for collection -> subject_id
        for c_id, members in enumerate(c_members.values()):
            c_matrix[c_id, members] = True

        return csc_matrix(c_matrix)

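    # Precompute the sparse matrices for skos:broader, skos:narrower,
    # skos:related and collection membership used as relation features.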
    def _prepare_relations(self, graph, vocab):
        self._broader_matrix = self._make_relation_matrix(
            graph, vocab, SKOS.broader)
        self._narrower_matrix = self._make_relation_matrix(
            graph, vocab, SKOS.narrower)
        self._related_matrix = self._make_relation_matrix(
            graph, vocab, SKOS.related)
        self._collection_matrix = self._make_collection_matrix(graph, vocab)

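    # Build the matching index: vectorize all term labels with a binary
    # CountVectorizer and add each label's token set to the TokenSetIndex.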
    def _prepare_train_index(self, vocab, analyzer, params):
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

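    # Generate candidates for every training document and turn them into a
    # feature matrix plus boolean labels indicating whether each candidate
    # subject was actually assigned to its document.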
    def prepare_train(self, corpus, vocab, analyzer, params):
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        doc_count = 0
        train_x = []
        train_y = []
        for idx, doc in enumerate(corpus.documents):
            doc_subject_ids = [vocab.subjects.by_uri(uri)
                               for uri in doc.uris]
            self._subj_freq.update(doc_subject_ids)
            candidates = self.generate_candidates(doc.text, analyzer)
            self._doc_freq.update([c.subject_id for c in candidates])
            train_x.append(candidates)
            train_y += [(c.subject_id in doc_subject_ids) for c in candidates]
            doc_count += 1

        # precalculate idf values for candidate subjects
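        # idf = log((doc_count + 1) / (doc_freq + 1)) + 1, i.e. smoothed
        # inverse document frequency over the training documents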
        self._idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            self._idf[subj_id] = math.log((doc_count + 1) /
                                          (self._doc_freq[subj_id] + 1)) + 1
        return (np.vstack([self._candidates_to_features(candidates)
                           for candidates in train_x]), np.array(train_y))

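    # The classifier is a bagged ensemble of decision trees; tree size and
    # bagging sample size are taken from the hyperparameters.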
    def _create_classifier(self, params):
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))

    def train(self, train_x, train_y, params):
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)

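    # Turn predict_proba output into (score, subject_id) pairs sorted by
    # descending score; score[1] is the probability of the positive class.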
    def _prediction_to_list(self, scores, candidates):
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

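    # Score a list of candidates with the trained classifier; returns an
    # empty list when there are no candidates to score.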
    def predict(self, candidates):
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)

    def save(self, filename):
        return joblib.dump(self, filename)

    @staticmethod
    def load(filename):
        return joblib.load(filename)