Passed — Pull Request on master (#597) by Osma, created 02:48

annif.lexical.mllm.MLLMModel._prepare_terms() — rating B

Complexity: Conditions 5
Size: Total Lines 25, Code Lines 20
Duplication: Lines 25, Ratio 100 %
Importance: Changes 0

Metric   Value
cc       5
eloc     20
nop      4
dl       25
loc      25
rs       8.9332
c        0
b        0
f        0
"""MLLM (Maui-like Lexical Matching) model for Annif"""

import collections
import math
import joblib
from statistics import mean
from enum import IntEnum
import numpy as np
from rdflib.namespace import SKOS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.util
import annif.parallel
from annif.exception import OperationFailedException
from annif.lexical.tokenset import TokenSet, TokenSetIndex
from annif.lexical.util import get_subject_labels
from annif.lexical.util import make_relation_matrix, make_collection_matrix


Term = collections.namedtuple('Term', 'subject_id label is_pref')

Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')

Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

ModelData = collections.namedtuple(
    'ModelData',
    'broader narrower related collection ' +
    'doc_freq subj_freq idf')

Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related collection',
    start=0)

def conflate_matches(matches, doc_length):
    """Combine the Match objects of each subject into a single Candidate"""
    subj_matches = collections.defaultdict(list)
    for match in matches:
        subj_matches[match.subject_id].append(match)
    return [
        Candidate(
            doc_length=doc_length,
            subject_id=subject_id,
            freq=len(matches) / doc_length,
            is_pref=mean((float(m.is_pref) for m in matches)),
            n_tokens=mean((m.n_tokens for m in matches)),
            ambiguity=mean((m.ambiguity for m in matches)),
            first_occ=matches[0].pos / doc_length,
            last_occ=matches[-1].pos / doc_length,
            spread=(matches[-1].pos - matches[0].pos) / doc_length
        )
        for subject_id, matches in subj_matches.items()]

def generate_candidates(text, analyzer, vectorizer, index):
    """Generate subject Candidates for a document by matching its sentences
    against the token set index"""
    sentences = analyzer.tokenize_sentences(text)
    sent_tokens = vectorizer.transform(sentences)
    matches = []

    for sent_idx, token_matrix in enumerate(sent_tokens):
        tset = TokenSet(token_matrix.nonzero()[1])
        for ts, ambiguity in index.search(tset):
            matches.append(Match(subject_id=ts.subject_id,
                                 is_pref=ts.is_pref,
                                 n_tokens=len(ts),
                                 pos=sent_idx,
                                 ambiguity=ambiguity))

    return conflate_matches(matches, len(sentences))

def candidates_to_features(candidates, mdata):
    """Convert a list of Candidates to a NumPy feature matrix"""

    matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
    c_ids = [c.subject_id for c in candidates]
    c_vec = np.zeros(mdata.related.shape[0], dtype=bool)
    c_vec[c_ids] = True
    broader = mdata.broader.multiply(c_vec).sum(axis=1)
    narrower = mdata.narrower.multiply(c_vec).sum(axis=1)
    related = mdata.related.multiply(c_vec).sum(axis=1)
    collection = mdata.collection.multiply(c_vec).T.dot(
        mdata.collection).sum(axis=0)
    for idx, c in enumerate(candidates):
        subj = c.subject_id
        matrix[idx, Feature.freq] = c.freq
        matrix[idx, Feature.doc_freq] = mdata.doc_freq[subj]
        matrix[idx, Feature.subj_freq] = mdata.subj_freq.get(subj, 1) - 1
        matrix[idx, Feature.tfidf] = c.freq * mdata.idf[subj]
        matrix[idx, Feature.is_pref] = c.is_pref
        matrix[idx, Feature.n_tokens] = c.n_tokens
        matrix[idx, Feature.ambiguity] = c.ambiguity
        matrix[idx, Feature.first_occ] = c.first_occ
        matrix[idx, Feature.last_occ] = c.last_occ
        matrix[idx, Feature.spread] = c.spread
        matrix[idx, Feature.doc_length] = c.doc_length
        matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
        matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
        matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
        matrix[idx, Feature.collection] = collection[0, subj] / len(c_ids)
    return matrix


class MLLMCandidateGenerator(annif.parallel.BaseWorker):

    @classmethod
    def generate_candidates(cls, doc_subject_ids, text):
        candidates = generate_candidates(text, **cls.args)  # pragma: no cover
        return doc_subject_ids, candidates  # pragma: no cover


class MLLMFeatureConverter(annif.parallel.BaseWorker):

    @classmethod
    def candidates_to_features(cls, candidates):
        return candidates_to_features(candidates,
                                      **cls.args)  # pragma: no cover

class MLLMModel:
    """Maui-like Lexical Matching model"""

    def generate_candidates(self, text, analyzer):
        return generate_candidates(text, analyzer,
                                   self._vectorizer, self._index)

    @property
    def _model_data(self):
        return ModelData(broader=self._broader_matrix,
                         narrower=self._narrower_matrix,
                         related=self._related_matrix,
                         collection=self._collection_matrix,
                         doc_freq=self._doc_freq,
                         subj_freq=self._subj_freq,
                         idf=self._idf)

    def _candidates_to_features(self, candidates):
        return candidates_to_features(candidates, self._model_data)

    def _prepare_terms(self, graph, vocab, params):
        pref_label_props = [SKOS.prefLabel]
        if annif.util.boolean(params['use_hidden_labels']):
            nonpref_label_props = [SKOS.altLabel, SKOS.hiddenLabel]
        else:
            nonpref_label_props = [SKOS.altLabel]

        terms = []
        subject_ids = []
        for subj_id, uri, _, _ in vocab.subjects.active:
            subject_ids.append(subj_id)

            for label in get_subject_labels(graph, uri, pref_label_props,
                                            params['language']):
                terms.append(Term(subject_id=subj_id,
                                  label=label,
                                  is_pref=True))

            for label in get_subject_labels(graph, uri, nonpref_label_props,
                                            params['language']):
                terms.append(Term(subject_id=subj_id,
                                  label=label,
                                  is_pref=False))

        return (terms, subject_ids)

    def _prepare_relations(self, graph, vocab):
        self._broader_matrix = make_relation_matrix(
            graph, vocab, SKOS.broader)
        self._narrower_matrix = make_relation_matrix(
            graph, vocab, SKOS.narrower)
        self._related_matrix = make_relation_matrix(
            graph, vocab, SKOS.related)
        self._collection_matrix = make_collection_matrix(graph, vocab)

    def _prepare_train_index(self, vocab, analyzer, params):
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        # frequency of each token used in labels - how rare each word is
        token_freq = np.bincount(label_corpus.indices,
                                 minlength=label_corpus.shape[1])

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            # sort tokens by frequency - use the rarest token as index key
            tokens = sorted(tokens, key=token_freq.__getitem__)
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

    def _prepare_train_data(self, corpus, vocab, analyzer, n_jobs):
        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        train_x = []
        train_y = []

        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        cg_args = {
            'analyzer': analyzer,
            'vectorizer': self._vectorizer,
            'index': self._index
        }

        with pool_class(jobs,
                        initializer=MLLMCandidateGenerator.init,
                        initargs=(cg_args,)) as pool:
            params = (([vocab.subjects.by_uri(uri) for uri in doc.uris],
                       doc.text)
                      for doc in corpus.documents)
            for doc_subject_ids, candidates in pool.starmap(
                    MLLMCandidateGenerator.generate_candidates, params, 10):

                self._subj_freq.update(doc_subject_ids)
                self._doc_freq.update([c.subject_id for c in candidates])
                train_x.append(candidates)
                train_y += [(c.subject_id in doc_subject_ids)
                            for c in candidates]

        return (train_x, train_y)

    def _calculate_idf(self, subject_ids, doc_count):
        idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            idf[subj_id] = math.log((doc_count + 1) /
                                    (self._doc_freq[subj_id] + 1)) + 1

        return idf

    def _prepare_features(self, train_x, n_jobs):
        fc_args = {'mdata': self._model_data}
        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        with pool_class(jobs,
                        initializer=MLLMFeatureConverter.init,
                        initargs=(fc_args,)) as pool:
            features = pool.map(
                MLLMFeatureConverter.candidates_to_features, train_x, 10)

        return features

    def prepare_train(self, corpus, vocab, analyzer, params, n_jobs):
        # create an index from the vocabulary terms
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # convert the corpus into train data
        train_x, train_y = self._prepare_train_data(
            corpus, vocab, analyzer, n_jobs)

        # precalculate idf values for all candidate subjects
        self._idf = self._calculate_idf(subject_ids, len(train_x))

        # convert the train data into feature values
        features = self._prepare_features(train_x, n_jobs)

        return (np.vstack(features), np.array(train_y))

    def _create_classifier(self, params):
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))

    def train(self, train_x, train_y, params):
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)
        # sanity check: verify that the classifier has seen both classes
        if self._classifier.n_classes_ != 2:
            raise OperationFailedException(
                "Unable to create classifier: " +
                "Not enough positive and negative examples " +
                "in the training data. Please check that your training " +
                "data matches your vocabulary.")

    def _prediction_to_list(self, scores, candidates):
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)

    def save(self, filename):
        return joblib.dump(self, filename)

    @staticmethod
    def load(filename):
        return joblib.load(filename)
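
For orientation, here is a minimal usage sketch of the model above. It is not part of the pull request: the corpus, vocab and analyzer objects are hypothetical placeholders for what the surrounding Annif project normally supplies, and the parameter values are illustrative only.

# Minimal usage sketch (assumes corpus, vocab and analyzer are provided by
# the surrounding Annif project; parameter values are illustrative only).
from annif.lexical.mllm import MLLMModel

params = {
    'language': 'en',              # vocabulary language used for labels
    'use_hidden_labels': 'False',  # whether skos:hiddenLabel is matched too
    'min_samples_leaf': '20',      # DecisionTreeClassifier setting
    'max_leaf_nodes': '1000',      # DecisionTreeClassifier setting
    'max_samples': '0.9',          # BaggingClassifier setting
}

model = MLLMModel()
train_x, train_y = model.prepare_train(corpus, vocab, analyzer, params,
                                       n_jobs=1)
model.train(train_x, train_y, params)
model.save('mllm-model.gz')

# At suggestion time: reload the model, generate candidates and rank them.
model = MLLMModel.load('mllm-model.gz')
candidates = model.generate_candidates('Text of the document to be indexed',
                                       analyzer)
for score, subject_id in model.predict(candidates):
    print(subject_id, score)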