Passed · Pull Request — master (#462) · by Osma · 02:32 · created

annif.lexical.mllm.MLLMModel._conflate_matches()  (rated A)

Complexity: Conditions 2
Size: Total Lines 17, Code Lines 16
Duplication: Lines 0, Ratio 0 %
Importance: Changes 0

Metric  Value
cc      2
eloc    16
nop     3
dl      0
loc     17
rs      9.6
c       0
b       0
f       0

Full source of annif.lexical.mllm:
"""MLLM (Maui-like Lexical Matching) model for Annif"""

import collections
import math
from statistics import mean
from enum import IntEnum
import numpy as np
from rdflib import URIRef
from rdflib.namespace import SKOS
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.util
from annif.lexical.tokenset import TokenSet, TokenSetIndex


Term = collections.namedtuple('Term', 'subject_id label is_pref')

Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')

Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related',
    start=0)

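# MLLMModel ties these pieces together: vocabulary labels are indexed as
# token sets, candidate subjects are generated for each document, turned
# into the Feature vectors defined above and scored with a bagged
# decision-tree classifier.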
class MLLMModel:
    """Maui-like Lexical Matching model"""

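    # Merge all Matches that point to the same subject into a single
    # Candidate, averaging per-match statistics and expressing positions
    # as fractions of the document length (number of sentences).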
    def _conflate_matches(self, matches, doc_length):
        subj_matches = collections.defaultdict(list)
        for match in matches:
            subj_matches[match.subject_id].append(match)
        return [
            Candidate(
                doc_length=doc_length,
                subject_id=subject_id,
                freq=len(matches) / doc_length,
                is_pref=mean((float(m.is_pref) for m in matches)),
                n_tokens=mean((m.n_tokens for m in matches)),
                ambiguity=mean((m.ambiguity for m in matches)),
                first_occ=matches[0].pos / doc_length,
                last_occ=matches[-1].pos / doc_length,
                spread=(matches[-1].pos - matches[0].pos) / doc_length
            )
            for subject_id, matches in subj_matches.items()]

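    # Split the text into sentences, vectorize each sentence with the
    # vocabulary-derived CountVectorizer and look its tokens up in the
    # TokenSetIndex; every hit becomes a Match tagged with its sentence
    # position, and the matches are conflated into Candidates.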
    def generate_candidates(self, text, analyzer):
        sentences = analyzer.tokenize_sentences(text)
        sent_tokens = self._vectorizer.transform(sentences)
        matches = []

        for sent_idx, token_matrix in enumerate(sent_tokens):
            tset = TokenSet(token_matrix.nonzero()[1])
            for ts, ambiguity in self._index.search(tset):
                matches.append(Match(subject_id=ts.subject_id,
                                     is_pref=ts.is_pref,
                                     n_tokens=len(ts),
                                     pos=sent_idx,
                                     ambiguity=ambiguity))

        return self._conflate_matches(matches, len(sentences))

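    # The broader/narrower/related features count how many of the other
    # candidate subjects are connected to each candidate in the relation
    # matrices, normalized by the total number of candidates.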
    def _candidates_to_features(self, candidates):
        """Convert a list of Candidates to a NumPy feature matrix"""
        matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
        c_ids = [c.subject_id for c in candidates]
        c_vec = np.zeros(self._related_matrix.shape[0], dtype=bool)
        c_vec[c_ids] = True
        broader = self._broader_matrix.multiply(c_vec).sum(axis=1)
        narrower = self._narrower_matrix.multiply(c_vec).sum(axis=1)
        related = self._related_matrix.multiply(c_vec).sum(axis=1)
        for idx, c in enumerate(candidates):
            subj = c.subject_id
            matrix[idx, Feature.freq] = c.freq
            matrix[idx, Feature.doc_freq] = self._doc_freq[subj]
            matrix[idx, Feature.subj_freq] = self._subj_freq.get(subj, 1) - 1
            matrix[idx, Feature.tfidf] = c.freq * self._idf[subj]
            matrix[idx, Feature.is_pref] = c.is_pref
            matrix[idx, Feature.n_tokens] = c.n_tokens
            matrix[idx, Feature.ambiguity] = c.ambiguity
            matrix[idx, Feature.first_occ] = c.first_occ
            matrix[idx, Feature.last_occ] = c.last_occ
            matrix[idx, Feature.spread] = c.spread
            matrix[idx, Feature.doc_length] = c.doc_length
            matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
            matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
            matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
        return matrix

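    # Collect the vocabulary terms to index: the preferred label of every
    # non-deprecated subject plus its alternative (and optionally hidden)
    # labels in the configured language.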
    def _prepare_terms(self, graph, vocab, params):
        terms = []
        subject_ids = []
        for subj_id, (uri, pref, _) in enumerate(vocab.subjects):
            if pref is None:
                continue  # deprecated subject
            subject_ids.append(subj_id)
            terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))

            if annif.util.boolean(params['use_hidden_labels']):
                label_props = [SKOS.altLabel, SKOS.hiddenLabel]
            else:
                label_props = [SKOS.altLabel]

            for prop in label_props:
                for label in graph.objects(URIRef(uri), prop):
                    if label.language != params['language']:
                        continue
                    terms.append(Term(subject_id=subj_id,
                                      label=str(label),
                                      is_pref=False))
        return (terms, subject_ids)

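    # Build boolean sparse adjacency matrices for the SKOS broader, narrower
    # and related relations between subjects; these back the corresponding
    # candidate features.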
    def _prepare_relations(self, graph, vocab):
        n_subj = len(vocab.subjects)
        self._broader_matrix = lil_matrix((n_subj, n_subj), dtype=bool)
        self._narrower_matrix = lil_matrix((n_subj, n_subj), dtype=bool)
        self._related_matrix = lil_matrix((n_subj, n_subj), dtype=bool)

        prop_matrix = [
            (SKOS.broader, self._broader_matrix),
            (SKOS.narrower, self._narrower_matrix),
            (SKOS.related, self._related_matrix)
        ]

        for subj_id, (uri, pref, _) in enumerate(vocab.subjects):
            if pref is None:
                continue  # deprecated subject

            for prop, matrix in prop_matrix:
                for other in graph.objects(URIRef(uri), prop):
                    other_id = vocab.subjects.by_uri(str(other),
                                                     warnings=False)
                    if other_id is not None:
                        matrix[subj_id, other_id] = True

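    # Fit a binary CountVectorizer on all term labels and add each label's
    # token set to the TokenSetIndex used for matching; returns the ids of
    # the non-deprecated subjects.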
    def _prepare_train_index(self, vocab, analyzer, params):
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

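    # Turn the training corpus into a feature matrix and a boolean target
    # vector: candidates are generated for every document, document
    # frequencies of candidate subjects and counts of manually assigned
    # subjects are accumulated, and a candidate is labelled True when it
    # is among the document's assigned subjects.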
    def prepare_train(self, corpus, vocab, analyzer, params):
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        doc_count = 0
        train_x = []
        train_y = []
        for idx, doc in enumerate(corpus.documents):
            doc_subject_ids = [vocab.subjects.by_uri(uri)
                               for uri in doc.uris]
            self._subj_freq.update(doc_subject_ids)
            candidates = self.generate_candidates(doc.text, analyzer)
            self._doc_freq.update([c.subject_id for c in candidates])
            train_x.append(candidates)
            train_y += [(c.subject_id in doc_subject_ids) for c in candidates]
            doc_count += 1

        # precalculate idf values for candidate subjects
        self._idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            self._idf[subj_id] = math.log((doc_count + 1) /
                                          (self._doc_freq[subj_id] + 1)) + 1
        return (np.vstack([self._candidates_to_features(candidates)
                           for candidates in train_x]), np.array(train_y))

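    # The classifier is a bagging ensemble of decision trees; leaf size,
    # tree size and the bootstrap sample fraction come from the
    # min_samples_leaf, max_leaf_nodes and max_samples parameters.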
    def _create_classifier(self, params):
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))

    def train(self, train_x, train_y, params):
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)

    def _prediction_to_list(self, scores, candidates):
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)
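For orientation, a minimal sketch of how these methods fit together. The corpus, vocab, analyzer and document_text arguments and the parameter values below are illustrative assumptions, not part of this file; in Annif this wiring is normally done by the corresponding backend.

def train_and_predict(corpus, vocab, analyzer, document_text):
    """Illustrative helper, not part of mllm.py: corpus, vocab and analyzer
    are assumed to be Annif objects providing the interfaces used above."""
    params = {
        'use_hidden_labels': 'false',  # assumed example values
        'language': 'en',
        'min_samples_leaf': '20',
        'max_leaf_nodes': '1000',
        'max_samples': '0.9',
    }
    model = MLLMModel()
    # index the vocabulary, build the training data and fit the classifier
    train_x, train_y = model.prepare_train(corpus, vocab, analyzer, params)
    model.train(train_x, train_y, params)
    # generate candidates for a new document and score them;
    # predict() returns (score, subject_id) pairs sorted best-first
    candidates = model.generate_candidates(document_text, analyzer)
    return model.predict(candidates)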