Passed
Pull Request — master (#462) by Osma
created 02:21

annif.lexical.mllm.MLLMModel._prepare_terms()   B

Complexity: Conditions 7
Size: Total Lines 22, Code Lines 19
Duplication: Lines 0, Ratio 0 %
Importance: Changes 0
Metric   Value
cc       7
eloc     19
nop      4
dl       0
loc      22
rs       8
c        0
b        0
f        0
"""MLLM (Maui-like Lexical Matching) model for Annif"""

import collections
import math
import joblib
from statistics import mean
from enum import IntEnum
import numpy as np
from rdflib import URIRef
from rdflib.namespace import SKOS
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.util
from annif.lexical.tokenset import TokenSet, TokenSetIndex


# Term: one (preferred or alternative) label of a vocabulary subject
Term = collections.namedtuple('Term', 'subject_id label is_pref')

# Match: a single occurrence of a subject's label in one sentence of a document
Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')

# Candidate: per-document aggregate of all the Matches for one subject
Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

# Feature: column indices of the feature matrix given to the classifier
Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related collection',
    start=0)


class MLLMModel:
    """Maui-like Lexical Matching model"""

    def _conflate_matches(self, matches, doc_length):
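        """Combine the Matches of each subject into a single Candidate"""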
        subj_matches = collections.defaultdict(list)
        for match in matches:
            subj_matches[match.subject_id].append(match)
        return [
            Candidate(
                doc_length=doc_length,
                subject_id=subject_id,
                freq=len(matches) / doc_length,
                is_pref=mean((float(m.is_pref) for m in matches)),
                n_tokens=mean((m.n_tokens for m in matches)),
                ambiguity=mean((m.ambiguity for m in matches)),
                first_occ=matches[0].pos / doc_length,
                last_occ=matches[-1].pos / doc_length,
                spread=(matches[-1].pos - matches[0].pos) / doc_length
            )
            for subject_id, matches in subj_matches.items()]

    def generate_candidates(self, text, analyzer):
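        """Find the subjects matching the given document text and return them as Candidates"""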
        sentences = analyzer.tokenize_sentences(text)
        sent_tokens = self._vectorizer.transform(sentences)
        matches = []

        for sent_idx, token_matrix in enumerate(sent_tokens):
            tset = TokenSet(token_matrix.nonzero()[1])
            for ts, ambiguity in self._index.search(tset):
                matches.append(Match(subject_id=ts.subject_id,
                                     is_pref=ts.is_pref,
                                     n_tokens=len(ts),
                                     pos=sent_idx,
                                     ambiguity=ambiguity))

        return self._conflate_matches(matches, len(sentences))

    def _candidates_to_features(self, candidates):
        """Convert a list of Candidates to a NumPy feature matrix"""
        matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
        c_ids = [c.subject_id for c in candidates]
        c_vec = np.zeros(self._related_matrix.shape[0], dtype=np.bool)
        c_vec[c_ids] = True
        broader = self._broader_matrix.multiply(c_vec).sum(axis=1)
        narrower = self._narrower_matrix.multiply(c_vec).sum(axis=1)
        related = self._related_matrix.multiply(c_vec).sum(axis=1)
        collection = self._collection_matrix.multiply(c_vec).sum(axis=1)
        for idx, c in enumerate(candidates):
            subj = c.subject_id
            matrix[idx, Feature.freq] = c.freq
            matrix[idx, Feature.doc_freq] = self._doc_freq[subj]
            matrix[idx, Feature.subj_freq] = self._subj_freq.get(subj, 1) - 1
            matrix[idx, Feature.tfidf] = c.freq * self._idf[subj]
            matrix[idx, Feature.is_pref] = c.is_pref
            matrix[idx, Feature.n_tokens] = c.n_tokens
            matrix[idx, Feature.ambiguity] = c.ambiguity
            matrix[idx, Feature.first_occ] = c.first_occ
            matrix[idx, Feature.last_occ] = c.last_occ
            matrix[idx, Feature.spread] = c.spread
            matrix[idx, Feature.doc_length] = c.doc_length
            matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
            matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
            matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
            matrix[idx, Feature.collection] = collection[subj, 0] / len(c_ids)
        return matrix

    def _prepare_terms(self, graph, vocab, params):
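        """Collect the preferred and alternative labels of all non-deprecated subjects as Terms"""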
        terms = []
        subject_ids = []
        for subj_id, (uri, pref, _) in enumerate(vocab.subjects):
            if pref is None:
                continue  # deprecated subject
            subject_ids.append(subj_id)
            terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))

            if annif.util.boolean(params['use_hidden_labels']):
                label_props = [SKOS.altLabel, SKOS.hiddenLabel]
            else:
                label_props = [SKOS.altLabel]

            for prop in label_props:
                for label in graph.objects(URIRef(uri), prop):
                    if label.language != params['language']:
                        continue
                    terms.append(Term(subject_id=subj_id,
                                      label=str(label),
                                      is_pref=False))
        return (terms, subject_ids)

    def _make_collection_matrix(self, graph, vocab):
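        """Build a boolean subject-by-subject matrix marking subjects that belong to the same SKOS collection"""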
        # make an index with all collection members
        c_members = collections.defaultdict(list)
        for coll, member in graph.subject_objects(SKOS.member):
            member_id = vocab.subjects.by_uri(str(member), warnings=False)
            if member_id is not None:
                c_members[str(coll)].append(member_id)

        n_subj = len(vocab.subjects)
        c_matrix = lil_matrix((n_subj, n_subj), dtype=np.bool)

        # populate the matrix by looking up members of the same collections
        for subj_id, (uri, pref, _) in enumerate(vocab.subjects):
            if pref is None:
                continue  # deprecated subject
            for coll in graph.subjects(SKOS.member, URIRef(uri)):
                other_ids = c_members[str(coll)]
                c_matrix[subj_id, other_ids] = True

        return c_matrix

    def _prepare_relations(self, graph, vocab):
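        """Build boolean matrices for the broader, narrower, related and collection relations between subjects"""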
        n_subj = len(vocab.subjects)
        self._broader_matrix = lil_matrix((n_subj, n_subj), dtype=np.bool)
        self._narrower_matrix = lil_matrix((n_subj, n_subj), dtype=np.bool)
        self._related_matrix = lil_matrix((n_subj, n_subj), dtype=np.bool)
        self._collection_matrix = self._make_collection_matrix(graph, vocab)

        prop_matrix = [
            (SKOS.broader, self._broader_matrix),
            (SKOS.narrower, self._narrower_matrix),
            (SKOS.related, self._related_matrix)
        ]

        for subj_id, (uri, pref, _) in enumerate(vocab.subjects):
            if pref is None:
                continue  # deprecated subject

            for prop, matrix in prop_matrix:
                for other in graph.objects(URIRef(uri), prop):
                    other_id = vocab.subjects.by_uri(str(other),
                                                     warnings=False)
                    if other_id is not None:
                        matrix[subj_id, other_id] = True

    def _prepare_train_index(self, vocab, analyzer, params):
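        """Build the term index and relation matrices from the vocabulary; return the ids of the usable subjects"""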
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

    def prepare_train(self, corpus, vocab, analyzer, params):
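        """Turn the training corpus into a feature matrix and a boolean target vector for the classifier"""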
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        doc_count = 0
        train_x = []
        train_y = []
        for idx, doc in enumerate(corpus.documents):
            doc_subject_ids = [vocab.subjects.by_uri(uri)
                               for uri in doc.uris]
            self._subj_freq.update(doc_subject_ids)
            candidates = self.generate_candidates(doc.text, analyzer)
            self._doc_freq.update([c.subject_id for c in candidates])
            train_x.append(candidates)
            train_y += [(c.subject_id in doc_subject_ids) for c in candidates]
            doc_count += 1

        # precalculate idf values for candidate subjects
        self._idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            self._idf[subj_id] = math.log((doc_count + 1) /
                                          (self._doc_freq[subj_id] + 1)) + 1
        return (np.vstack([self._candidates_to_features(candidates)
                           for candidates in train_x]), np.array(train_y))

    def _create_classifier(self, params):
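        """Create a bagging ensemble of decision trees using the given hyperparameters"""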
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))

    def train(self, train_x, train_y, params):
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)

    def _prediction_to_list(self, scores, candidates):
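        """Convert classifier scores into (score, subject_id) pairs sorted by descending score"""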
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
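        """Return (score, subject_id) pairs for the given candidates, best first"""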
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)

    def save(self, filename):
        return joblib.dump(self, filename)

    @staticmethod
    def load(filename):
        return joblib.load(filename)
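
For orientation, here is a rough sketch of how this class is typically driven end to end. The corpus, vocab and analyzer objects and the parameter values are hypothetical placeholders for illustration; only the MLLMModel methods themselves come from the code above.

# Hypothetical usage sketch; corpus, vocab, analyzer and the parameter values
# are assumed placeholders, not defined in this file.
params = {
    'language': 'en',
    'use_hidden_labels': 'false',
    'min_samples_leaf': '20',
    'max_leaf_nodes': '1000',
    'max_samples': '0.9',
}

model = MLLMModel()
train_x, train_y = model.prepare_train(corpus, vocab, analyzer, params)
model.train(train_x, train_y, params)

# score the subject candidates found in a new document
candidates = model.generate_candidates("Some document text ...", analyzer)
for score, subject_id in model.predict(candidates)[:10]:
    print(subject_id, score)

# persist and reload the trained model
model.save('mllm-model.gz')
model = MLLMModel.load('mllm-model.gz')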