Passed
Pull Request — master (#462)
by Osma, created 02:45

annif.lexical.mllm.MLLMModel.prepare_train() — grade A

Complexity
    Conditions    2

Size
    Total Lines   25
    Code Lines    19

Duplication
    Lines         0
    Ratio         0 %

Importance
    Changes       0

Metric   Value
cc       2
eloc     19
nop      5
dl       0
loc      25
rs       9.45
c        0
b        0
f        0
"""MLLM (Maui-like Lexical Matching) model for Annif"""

import collections
import math
import joblib
from statistics import mean
from enum import IntEnum
import numpy as np
from rdflib import URIRef
from rdflib.namespace import SKOS
from scipy.sparse import lil_matrix, csc_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.util
from annif.lexical.tokenset import TokenSet, TokenSetIndex


Term = collections.namedtuple('Term', 'subject_id label is_pref')

Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')

Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related collection',
    start=0)
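
Because Feature is declared with start=0, its 15 members double as zero-based column indices into the feature matrix built by _candidates_to_features() below. A minimal standalone sketch (the Feature definition is copied from above):

from enum import IntEnum
import numpy as np

Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related collection',
    start=0)

# one row per candidate, one column per feature
matrix = np.zeros((1, len(Feature)), dtype=np.float32)
matrix[0, Feature.tfidf] = 0.5   # Feature.tfidf == 3, so this sets column 3
assert Feature.freq == 0 and Feature.collection == len(Feature) - 1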


def get_subject_labels(graph, uri, properties, language):
    """Yield the labels of the given subject URI that use one of the given
    label properties and match the given language"""
    for prop in properties:
        for label in graph.objects(URIRef(uri), prop):
            if label.language != language:
                continue
            yield str(label)
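
For illustration, get_subject_labels() can be exercised against a small hand-built rdflib graph; the URI and labels below are made up for the example:

from rdflib import Graph, Literal, URIRef
from rdflib.namespace import SKOS

g = Graph()
uri = 'http://example.org/subjects/cats'   # hypothetical subject URI
g.add((URIRef(uri), SKOS.altLabel, Literal('cats', lang='en')))
g.add((URIRef(uri), SKOS.altLabel, Literal('kissat', lang='fi')))

# only the English label survives the language filter
print(list(get_subject_labels(g, uri, [SKOS.altLabel], 'en')))  # ['cats']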


class MLLMModel:
    """Maui-like Lexical Matching model"""

    def _conflate_matches(self, matches, doc_length):
        """Combine the individual Matches of each subject into a single
        Candidate with averaged and position-normalized features"""
        subj_matches = collections.defaultdict(list)
        for match in matches:
            subj_matches[match.subject_id].append(match)
        return [
            Candidate(
                doc_length=doc_length,
                subject_id=subject_id,
                freq=len(matches) / doc_length,
                is_pref=mean((float(m.is_pref) for m in matches)),
                n_tokens=mean((m.n_tokens for m in matches)),
                ambiguity=mean((m.ambiguity for m in matches)),
                first_occ=matches[0].pos / doc_length,
                last_occ=matches[-1].pos / doc_length,
                spread=(matches[-1].pos - matches[0].pos) / doc_length
            )
            for subject_id, matches in subj_matches.items()]
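
_conflate_matches() uses no instance state, so its behaviour is easy to see on toy data; the numbers below are made up. Two matches for subject 42 in a ten-sentence document collapse into one Candidate:

matches = [
    Match(subject_id=42, is_pref=True, n_tokens=2, pos=0, ambiguity=1),
    Match(subject_id=42, is_pref=False, n_tokens=1, pos=3, ambiguity=2),
]
candidates = MLLMModel()._conflate_matches(matches, doc_length=10)
# -> one Candidate(subject_id=42, freq=0.2, is_pref=0.5, n_tokens=1.5,
#    ambiguity=1.5, first_occ=0.0, last_occ=0.3, spread=0.3, doc_length=10)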

    def generate_candidates(self, text, analyzer):
        """Generate subject Candidates for the given text, matching one
        sentence at a time against the token set index"""
        sentences = analyzer.tokenize_sentences(text)
        sent_tokens = self._vectorizer.transform(sentences)
        matches = []

        for sent_idx, token_matrix in enumerate(sent_tokens):
            tset = TokenSet(token_matrix.nonzero()[1])
            for ts, ambiguity in self._index.search(tset):
                matches.append(Match(subject_id=ts.subject_id,
                                     is_pref=ts.is_pref,
                                     n_tokens=len(ts),
                                     pos=sent_idx,
                                     ambiguity=ambiguity))

        return self._conflate_matches(matches, len(sentences))

    def _candidates_to_features(self, candidates):
        """Convert a list of Candidates to a NumPy feature matrix"""
        matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
        c_ids = [c.subject_id for c in candidates]
        c_vec = np.zeros(self._related_matrix.shape[0], dtype=bool)
        c_vec[c_ids] = True
        # per subject, the number of candidates it is related to via each
        # relation type (see the sketch after this method)
        broader = self._broader_matrix.multiply(c_vec).sum(axis=1)
        narrower = self._narrower_matrix.multiply(c_vec).sum(axis=1)
        related = self._related_matrix.multiply(c_vec).sum(axis=1)
        collection = self._collection_matrix.multiply(c_vec).T.dot(
            self._collection_matrix).sum(axis=0)
        for idx, c in enumerate(candidates):
            subj = c.subject_id
            matrix[idx, Feature.freq] = c.freq
            matrix[idx, Feature.doc_freq] = self._doc_freq[subj]
            matrix[idx, Feature.subj_freq] = self._subj_freq.get(subj, 1) - 1
            matrix[idx, Feature.tfidf] = c.freq * self._idf[subj]
            matrix[idx, Feature.is_pref] = c.is_pref
            matrix[idx, Feature.n_tokens] = c.n_tokens
            matrix[idx, Feature.ambiguity] = c.ambiguity
            matrix[idx, Feature.first_occ] = c.first_occ
            matrix[idx, Feature.last_occ] = c.last_occ
            matrix[idx, Feature.spread] = c.spread
            matrix[idx, Feature.doc_length] = c.doc_length
            matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
            matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
            matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
            matrix[idx, Feature.collection] = collection[0, subj] / len(c_ids)
        return matrix
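
The relation features rely on a sparse trick: multiplying a boolean relation matrix by the candidate mask c_vec zeroes out non-candidate columns, so the row sums count how many candidates each subject is related to. A minimal sketch with a made-up 4-subject "broader" matrix:

import numpy as np
from scipy.sparse import csc_matrix

# hypothetical relation matrix: subject 0 -> {1, 2}, subject 3 -> {1}
rel = csc_matrix(np.array([[0, 1, 1, 0],
                           [0, 0, 0, 0],
                           [0, 0, 0, 0],
                           [0, 1, 0, 0]], dtype=bool))
c_vec = np.zeros(4, dtype=bool)
c_vec[[1, 3]] = True              # candidates are subjects 1 and 3

counts = rel.multiply(c_vec).sum(axis=1)
print(counts.T)                   # [[1 0 0 1]]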

    def _prepare_terms(self, graph, vocab, params):
        """Collect the preferred and alternative labels of all active
        subjects as Term objects"""
        if annif.util.boolean(params['use_hidden_labels']):
            label_props = [SKOS.altLabel, SKOS.hiddenLabel]
        else:
            label_props = [SKOS.altLabel]

        terms = []
        subject_ids = []
        for subj_id, uri, pref, _ in vocab.subjects.active:
            subject_ids.append(subj_id)
            terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))

            for label in get_subject_labels(graph, uri, label_props,
                                            params['language']):
                terms.append(Term(subject_id=subj_id,
                                  label=label,
                                  is_pref=False))

        return (terms, subject_ids)

    def _make_relation_matrix(self, graph, vocab, property):
        """Build a sparse subject-to-subject matrix for the given SKOS
        relation property"""
        n_subj = len(vocab.subjects)
        matrix = lil_matrix((n_subj, n_subj), dtype=bool)

        for subj, obj in graph.subject_objects(property):
            subj_id = vocab.subjects.by_uri(str(subj), warnings=False)
            obj_id = vocab.subjects.by_uri(str(obj), warnings=False)
            if subj_id is not None and obj_id is not None:
                matrix[subj_id, obj_id] = True

        return csc_matrix(matrix)

    def _make_collection_matrix(self, graph, vocab):
        # make an index with all collection members
        c_members = collections.defaultdict(list)
        for coll, member in graph.subject_objects(SKOS.member):
            member_id = vocab.subjects.by_uri(str(member), warnings=False)
            if member_id is not None:
                c_members[str(coll)].append(member_id)

        c_matrix = lil_matrix((len(c_members), len(vocab.subjects)),
                              dtype=bool)

        # populate the matrix for collection -> subject_id
        for c_id, members in enumerate(c_members.values()):
            c_matrix[c_id, members] = True

        return csc_matrix(c_matrix)

    def _prepare_relations(self, graph, vocab):
        self._broader_matrix = self._make_relation_matrix(
            graph, vocab, SKOS.broader)
        self._narrower_matrix = self._make_relation_matrix(
            graph, vocab, SKOS.narrower)
        self._related_matrix = self._make_relation_matrix(
            graph, vocab, SKOS.related)
        self._collection_matrix = self._make_collection_matrix(graph, vocab)

    def _prepare_train_index(self, vocab, analyzer, params):
        """Build the vectorizer and the TokenSetIndex from the vocabulary
        terms; return the ids of all active subjects"""
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

    def _calculate_idf(self, subject_ids, doc_count):
        # smoothed inverse document frequency, as in scikit-learn's
        # TfidfTransformer with smooth_idf=True
        idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            idf[subj_id] = math.log((doc_count + 1) /
                                    (self._doc_freq[subj_id] + 1)) + 1

        return idf
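
This is the smoothed idf formula log((N + 1) / (df + 1)) + 1, where N is doc_count and df the subject's candidate document frequency. With made-up numbers, a subject generated as a candidate in 4 out of 10 training documents gets:

import math

idf = math.log((10 + 1) / (4 + 1)) + 1
print(round(idf, 3))   # 1.788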

    def prepare_train(self, corpus, vocab, analyzer, params):
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        doc_count = 0
        train_x = []
        train_y = []
        for doc in corpus.documents:
            doc_subject_ids = [vocab.subjects.by_uri(uri)
                               for uri in doc.uris]
            self._subj_freq.update(doc_subject_ids)
            candidates = self.generate_candidates(doc.text, analyzer)
            self._doc_freq.update([c.subject_id for c in candidates])
            train_x.append(candidates)
            train_y += [(c.subject_id in doc_subject_ids) for c in candidates]
            doc_count += 1

        # precalculate idf values for all candidate subjects
        self._idf = self._calculate_idf(subject_ids, doc_count)

        return (np.vstack([self._candidates_to_features(candidates)
                           for candidates in train_x]), np.array(train_y))

    def _create_classifier(self, params):
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))
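
The three hyperparameters arrive as strings in the params mapping; the values below are placeholders for illustration, not the project's defaults. A quick sketch:

params = {'min_samples_leaf': '20',
          'max_leaf_nodes': '1000',
          'max_samples': '0.9'}
classifier = MLLMModel()._create_classifier(params)
# -> a BaggingClassifier over DecisionTreeClassifier(min_samples_leaf=20,
#    max_leaf_nodes=1000), each tree fit on 90 % of the samples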

    def train(self, train_x, train_y, params):
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)

    def _prediction_to_list(self, scores, candidates):
        # score[1] is the classifier's probability of the positive class
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)

    def save(self, filename):
        return joblib.dump(self, filename)

    @staticmethod
    def load(filename):
        return joblib.load(filename)
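
Taken together, training and prediction follow this flow; corpus, vocab, analyzer and params stand in for the real Annif objects (document corpus, subject vocabulary, text analyzer and backend parameters) and are assumptions of the sketch, not part of this module:

model = MLLMModel()
train_x, train_y = model.prepare_train(corpus, vocab, analyzer, params)
model.train(train_x, train_y, params)
model.save('mllm-model.gz')

model = MLLMModel.load('mllm-model.gz')
candidates = model.generate_candidates('Some input document text...', analyzer)
for score, subject_id in model.predict(candidates):
    print(subject_id, score)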