"""MLLM (Maui-like Lexical Matching) model for Annif"""

import collections
import math
import joblib
from statistics import mean
from enum import IntEnum
import numpy as np
from rdflib import URIRef
from rdflib.namespace import SKOS
from scipy.sparse import lil_matrix, csc_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.util
from annif.lexical.tokenset import TokenSet, TokenSetIndex


Term = collections.namedtuple('Term', 'subject_id label is_pref')

Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')

Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related collection',
    start=0)


class MLLMModel:
    """Maui-like Lexical Matching model"""

    def _conflate_matches(self, matches, doc_length):
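        """Combine the Match objects of a document into one Candidate per
        distinct subject, averaging and normalizing the per-match fields."""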
        subj_matches = collections.defaultdict(list)
        for match in matches:
            subj_matches[match.subject_id].append(match)
        return [
            Candidate(
                doc_length=doc_length,
                subject_id=subject_id,
                freq=len(matches) / doc_length,
                is_pref=mean((float(m.is_pref) for m in matches)),
                n_tokens=mean((m.n_tokens for m in matches)),
                ambiguity=mean((m.ambiguity for m in matches)),
                first_occ=matches[0].pos / doc_length,
                last_occ=matches[-1].pos / doc_length,
                spread=(matches[-1].pos - matches[0].pos) / doc_length
            )
            for subject_id, matches in subj_matches.items()]

    def generate_candidates(self, text, analyzer):
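        """Generate subject Candidates for a text by matching the token sets
        of its sentences against the vocabulary term index."""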
        sentences = analyzer.tokenize_sentences(text)
        sent_tokens = self._vectorizer.transform(sentences)
        matches = []

        for sent_idx, token_matrix in enumerate(sent_tokens):
            tset = TokenSet(token_matrix.nonzero()[1])
            for ts, ambiguity in self._index.search(tset):
                matches.append(Match(subject_id=ts.subject_id,
                                     is_pref=ts.is_pref,
                                     n_tokens=len(ts),
                                     pos=sent_idx,
                                     ambiguity=ambiguity))

        return self._conflate_matches(matches, len(sentences))

    def _candidates_to_features(self, candidates):
        """Convert a list of Candidates to a NumPy feature matrix"""
        matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
        c_ids = [c.subject_id for c in candidates]
        c_vec = np.zeros(self._related_matrix.shape[0], dtype=bool)
        c_vec[c_ids] = True
        # count, for each subject, how many of its broader/narrower/related
        # subjects (and fellow collection members) are among the candidates
        broader = self._broader_matrix.multiply(c_vec).sum(axis=1)
        narrower = self._narrower_matrix.multiply(c_vec).sum(axis=1)
        related = self._related_matrix.multiply(c_vec).sum(axis=1)
        collection = self._collection_matrix.multiply(c_vec).T.dot(
            self._collection_matrix).sum(axis=0)
        for idx, c in enumerate(candidates):
            subj = c.subject_id
            matrix[idx, Feature.freq] = c.freq
            matrix[idx, Feature.doc_freq] = self._doc_freq[subj]
            matrix[idx, Feature.subj_freq] = self._subj_freq.get(subj, 1) - 1
            matrix[idx, Feature.tfidf] = c.freq * self._idf[subj]
            matrix[idx, Feature.is_pref] = c.is_pref
            matrix[idx, Feature.n_tokens] = c.n_tokens
            matrix[idx, Feature.ambiguity] = c.ambiguity
            matrix[idx, Feature.first_occ] = c.first_occ
            matrix[idx, Feature.last_occ] = c.last_occ
            matrix[idx, Feature.spread] = c.spread
            matrix[idx, Feature.doc_length] = c.doc_length
            matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
            matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
            matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
            matrix[idx, Feature.collection] = collection[0, subj] / len(c_ids)
        return matrix

    def _prepare_terms(self, graph, vocab, params):
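        """Collect the preferred and alternative labels (plus hidden labels
        when use_hidden_labels is enabled) of all active vocabulary subjects
        as Term objects. Returns the terms and the active subject IDs."""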
105
|
|
|
terms = [] |
106
|
|
|
subject_ids = [] |
107
|
|
|
for subj_id, uri, pref, _ in vocab.subjects.active: |
108
|
|
|
subject_ids.append(subj_id) |
109
|
|
|
terms.append(Term(subject_id=subj_id, label=pref, is_pref=True)) |
110
|
|
|
|
111
|
|
|
if annif.util.boolean(params['use_hidden_labels']): |
112
|
|
|
label_props = [SKOS.altLabel, SKOS.hiddenLabel] |
113
|
|
|
else: |
114
|
|
|
label_props = [SKOS.altLabel] |
115
|
|
|
|
116
|
|
|
for prop in label_props: |
117
|
|
|
for label in graph.objects(URIRef(uri), prop): |
118
|
|
|
if label.language != params['language']: |
119
|
|
|
continue |
120
|
|
|
terms.append(Term(subject_id=subj_id, |
121
|
|
|
label=str(label), |
122
|
|
|
is_pref=False)) |
123
|
|
|
return (terms, subject_ids) |
124
|
|
|
|
125
|
|
|
def _make_relation_matrix(self, graph, vocab, property): |
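        """Build a sparse boolean subject-by-subject matrix where cell
        (i, j) is True if subject i has the given SKOS relation (e.g.
        skos:broader) to subject j."""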
        n_subj = len(vocab.subjects)
        matrix = lil_matrix((n_subj, n_subj), dtype=bool)

        for subj_id, uri, pref, _ in vocab.subjects.active:
            for other in graph.objects(URIRef(uri), property):
                other_id = vocab.subjects.by_uri(str(other),
                                                 warnings=False)
                if other_id is not None:
                    matrix[subj_id, other_id] = True

        return csc_matrix(matrix)

    def _make_collection_matrix(self, graph, vocab):
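        """Build a sparse boolean collection-by-subject matrix where cell
        (i, j) is True if subject j is a skos:member of collection i."""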
        # make an index with all collection members
        c_members = collections.defaultdict(list)
        for coll, member in graph.subject_objects(SKOS.member):
            member_id = vocab.subjects.by_uri(str(member), warnings=False)
            if member_id is not None:
                c_members[str(coll)].append(member_id)

        c_matrix = lil_matrix((len(c_members), len(vocab.subjects)),
                              dtype=bool)

        # populate the matrix for collection -> subject_id
        for c_id, members in enumerate(c_members.values()):
            c_matrix[c_id, members] = True

        return csc_matrix(c_matrix)

    def _prepare_relations(self, graph, vocab):
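        """Precompute the broader, narrower, related and collection
        membership matrices from the vocabulary graph."""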
        self._broader_matrix = self._make_relation_matrix(
            graph, vocab, SKOS.broader)
        self._narrower_matrix = self._make_relation_matrix(
            graph, vocab, SKOS.narrower)
        self._related_matrix = self._make_relation_matrix(
            graph, vocab, SKOS.related)
        self._collection_matrix = self._make_collection_matrix(graph, vocab)

    def _prepare_train_index(self, vocab, analyzer, params):
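        """Set up the vocabulary-based structures needed for training: the
        relation matrices, the label vectorizer and the token set index.
        Returns the IDs of all active subjects."""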
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

    def _calculate_idf(self, subject_ids, doc_count):
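        """Compute smoothed inverse document frequency values for the given
        subjects from their document frequencies in the training corpus."""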
        idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            idf[subj_id] = math.log((doc_count + 1) /
                                    (self._doc_freq[subj_id] + 1)) + 1

        return idf

    def prepare_train(self, corpus, vocab, analyzer, params):
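        """Build the training data: generate candidates for each document in
        the corpus, collect frequency statistics, and return a feature matrix
        together with a boolean vector of gold-standard labels."""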
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        doc_count = 0
        train_x = []
        train_y = []
        for doc in corpus.documents:
            doc_subject_ids = [vocab.subjects.by_uri(uri)
                               for uri in doc.uris]
            self._subj_freq.update(doc_subject_ids)
            candidates = self.generate_candidates(doc.text, analyzer)
            self._doc_freq.update([c.subject_id for c in candidates])
            train_x.append(candidates)
            train_y += [(c.subject_id in doc_subject_ids) for c in candidates]
            doc_count += 1

        # precalculate idf values for all candidate subjects
        self._idf = self._calculate_idf(subject_ids, doc_count)

        return (np.vstack([self._candidates_to_features(candidates)
                           for candidates in train_x]), np.array(train_y))

    def _create_classifier(self, params):
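        """Create the bagged decision tree classifier, configured from the
        min_samples_leaf, max_leaf_nodes and max_samples hyperparameters."""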
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))

    def train(self, train_x, train_y, params):
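        """Fit the classifier on a feature matrix and label vector produced
        by prepare_train."""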
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)

    def _prediction_to_list(self, scores, candidates):
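        """Pair the positive-class probabilities with candidate subject IDs
        and sort them by descending score."""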
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
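        """Score the given candidates with the trained classifier and return
        (score, subject_id) pairs, best first."""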
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)

    def save(self, filename):
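        """Serialize the model to a file using joblib."""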
        return joblib.dump(self, filename)

    @staticmethod
    def load(filename):
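        """Load a previously saved model from a file."""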
        return joblib.load(filename)