annif.lexical.mllm.candidates_to_features() - Code Metrics - Inspection of "Process training docs in parallel in MLLM backend" - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#511)

by Osma

created 2021-08-20 08:54 UTC

annif.lexical.mllm.candidates_to_features() A

↳ Parent: annif.lexical.mllm

Complexity

Conditions

Size

Total Lines	33
Code Lines	31

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	2
eloc	31
nop	8
dl	0
loc	33
rs	9.1359
c	0
b	0
f	0

How to fix Many Parameters

"""MLLM (Maui-like Lexical Matching) model for Annif"""

import collections
import math
import joblib
from statistics import mean
from enum import IntEnum
import numpy as np
from rdflib.namespace import SKOS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.util
import annif.parallel
from annif.lexical.tokenset import TokenSet, TokenSetIndex
from annif.lexical.util import get_subject_labels
from annif.lexical.util import make_relation_matrix, make_collection_matrix


# One vocabulary label of a subject. _prepare_terms() creates these with
# is_pref=True for the preferred label and is_pref=False for every other
# label it collects.
Term = collections.namedtuple('Term', 'subject_id label is_pref')

# One hit of a subject label in a document. generate_candidates() stores the
# index of the sentence where the hit was found in ``pos``.
Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')

# All matches of one subject in one document, aggregated by
# conflate_matches(). freq, first_occ, last_occ and spread are divided by
# doc_length there; doc_length is the number of sentences in the document.
Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

# Column layout of the matrix built by candidates_to_features(). Members are
# numbered from 0 so they can be used directly as column indices; changing
# the order changes the meaning of the columns of the feature matrix.
Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related collection',
    start=0)


def conflate_matches(matches, doc_length):
    """Merge the matches of each subject into a single Candidate.

    Matches are grouped by subject id, keeping their input order inside each
    group. The first and last match of a group are used as the first and
    last occurrence, so the matches are expected to arrive ordered by
    position. Frequency and position values are divided by doc_length.

    Returns a list with one Candidate per distinct subject id, in order of
    first appearance in matches.
    """

    by_subject = collections.defaultdict(list)
    for match in matches:
        by_subject[match.subject_id].append(match)

    candidates = []
    for subject_id, group in by_subject.items():
        first_pos = group[0].pos
        last_pos = group[-1].pos
        candidates.append(Candidate(
            doc_length=doc_length,
            subject_id=subject_id,
            freq=len(group) / doc_length,
            is_pref=mean(float(m.is_pref) for m in group),
            n_tokens=mean(m.n_tokens for m in group),
            ambiguity=mean(m.ambiguity for m in group),
            first_occ=first_pos / doc_length,
            last_occ=last_pos / doc_length,
            spread=(last_pos - first_pos) / doc_length))
    return candidates


def generate_candidates(text, analyzer, vectorizer, index):
    """Search a text for vocabulary labels and return the Candidates found.

    The analyzer splits the text into sentences, the vectorizer turns every
    sentence into a row of token indicators, and the index is searched once
    per sentence. The sentence number is stored as the position of each
    match and the number of sentences is used as the document length.
    """

    sentences = analyzer.tokenize_sentences(text)
    found = []
    for sent_no, sent_row in enumerate(vectorizer.transform(sentences)):
        # second element of nonzero(): the column (token) indices of the row
        sentence_tokens = TokenSet(sent_row.nonzero()[1])
        found.extend(
            Match(subject_id=hit.subject_id,
                  is_pref=hit.is_pref,
                  n_tokens=len(hit),
                  pos=sent_no,
                  ambiguity=ambiguity)
            for hit, ambiguity in index.search(sentence_tokens))

    return conflate_matches(found, len(sentences))


def candidates_to_features(candidates,
                           related_matrix, broader_matrix,
                           narrower_matrix, collection_matrix,
                           doc_freq, subj_freq, idf):
    """Convert a list of Candidates to a NumPy feature matrix

    Returns a float32 array with one row per candidate, in input order, and
    one column per member of Feature.

    candidates -- Candidate tuples (the callers pass the candidates of one
        document at a time)
    related_matrix, broader_matrix, narrower_matrix -- matrices with a
        scipy.sparse-style multiply(), one column per subject id and
        indexed by subject id on the rows (presumably built by
        make_relation_matrix; the orientation is not visible here)
    collection_matrix -- same kind of matrix with one column per subject id
        (rows are presumably collections -- confirm against
        make_collection_matrix)
    doc_freq -- mapping indexed by subject id; must not fail on unseen ids
    subj_freq -- mapping with get(); the stored count minus one is used,
        and subjects that are absent yield 0
    idf -- mapping indexed by subject id; must not fail on unseen ids
    """

    n_candidates = len(candidates)
    matrix = np.zeros((n_candidates, len(Feature)), dtype=np.float32)
    c_ids = [c.subject_id for c in candidates]

    # Boolean mask over subject ids: True for subjects among the candidates.
    # The builtin bool is used because the np.bool alias was deprecated in
    # NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
    c_vec = np.zeros(related_matrix.shape[0], dtype=bool)
    c_vec[c_ids] = True

    # Keep only the columns of candidate subjects, then sum every row: for
    # each subject this gives the total of its entries that point at a
    # candidate subject.
    broader = broader_matrix.multiply(c_vec).sum(axis=1)
    narrower = narrower_matrix.multiply(c_vec).sum(axis=1)
    related = related_matrix.multiply(c_vec).sum(axis=1)
    # Same masking for the collection matrix, followed by a product with the
    # unmasked matrix; the result is a single row indexed by subject id.
    collection = collection_matrix.multiply(c_vec).T.dot(
        collection_matrix).sum(axis=0)

    for idx, c in enumerate(candidates):
        subj = c.subject_id
        matrix[idx, Feature.freq] = c.freq
        matrix[idx, Feature.doc_freq] = doc_freq[subj]
        matrix[idx, Feature.subj_freq] = subj_freq.get(subj, 1) - 1
        matrix[idx, Feature.tfidf] = c.freq * idf[subj]
        matrix[idx, Feature.is_pref] = c.is_pref
        matrix[idx, Feature.n_tokens] = c.n_tokens
        matrix[idx, Feature.ambiguity] = c.ambiguity
        matrix[idx, Feature.first_occ] = c.first_occ
        matrix[idx, Feature.last_occ] = c.last_occ
        matrix[idx, Feature.spread] = c.spread
        matrix[idx, Feature.doc_length] = c.doc_length
        # relation features are normalized by the number of candidates
        matrix[idx, Feature.broader] = broader[subj, 0] / n_candidates
        matrix[idx, Feature.narrower] = narrower[subj, 0] / n_candidates
        matrix[idx, Feature.related] = related[subj, 0] / n_candidates
        matrix[idx, Feature.collection] = collection[0, subj] / n_candidates
    return matrix


class MLLMCandidateGenerator(annif.parallel.BaseWorker):
    """Pool worker that generates candidates for training documents."""

    @classmethod
    def generate_candidates(cls, doc_subject_ids, text):
        """Generate candidates for text and return them with the given ids.

        The remaining arguments of the module-level generate_candidates()
        are taken from cls.args. doc_subject_ids is passed through
        unchanged so that the caller receives it together with the
        candidates of the same document.
        """
        return doc_subject_ids, generate_candidates(text, **cls.args)


class MLLMFeatureConverter(annif.parallel.BaseWorker):
    """Pool worker that converts candidates into feature matrices."""

    @classmethod
    def candidates_to_features(cls, candidates):
        """Return the feature matrix for the given candidates.

        The matrices and frequency tables needed by the module-level
        candidates_to_features() are taken from cls.args.
        """
        shared_args = cls.args
        return candidates_to_features(candidates, **shared_args)


class MLLMModel:
    """Maui-like Lexical Matching model"""

    def generate_candidates(self, text, analyzer):
        """Generate candidates for a text using this model's index."""
        return generate_candidates(
            text=text,
            analyzer=analyzer,
            vectorizer=self._vectorizer,
            index=self._index)

    def _candidates_to_features(self, candidates):
        """Build the feature matrix for candidates from the model's data."""
        return candidates_to_features(
            candidates,
            related_matrix=self._related_matrix,
            broader_matrix=self._broader_matrix,
            narrower_matrix=self._narrower_matrix,
            collection_matrix=self._collection_matrix,
            doc_freq=self._doc_freq,
            subj_freq=self._subj_freq,
            idf=self._idf)

    def _prepare_terms(self, graph, vocab, params):
        """Collect the labels of all active subjects as Terms.

        Returns a (terms, subject_ids) tuple. Every active subject
        contributes its preferred label plus the alternative labels found
        in the graph; hidden labels are included as well when the
        'use_hidden_labels' parameter is true.
        """
        use_hidden = annif.util.boolean(params['use_hidden_labels'])
        label_props = [SKOS.altLabel]
        if use_hidden:
            label_props.append(SKOS.hiddenLabel)

        terms = []
        subject_ids = []
        for subj_id, uri, pref, _ in vocab.subjects.active:
            subject_ids.append(subj_id)
            terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))
            terms.extend(
                Term(subject_id=subj_id, label=other, is_pref=False)
                for other in get_subject_labels(
                    graph, uri, label_props, params['language']))

        return terms, subject_ids

    def _prepare_relations(self, graph, vocab):
        """Build the relation and collection matrices of the vocabulary."""

        def relation(prop):
            return make_relation_matrix(graph, vocab, prop)

        self._broader_matrix = relation(SKOS.broader)
        self._narrower_matrix = relation(SKOS.narrower)
        self._related_matrix = relation(SKOS.related)
        self._collection_matrix = make_collection_matrix(graph, vocab)

    def _prepare_train_index(self, vocab, analyzer, params):
        """Build the vectorizer and the token set index from the vocabulary.

        Also prepares the relation matrices. Returns the ids of the
        subjects whose labels were indexed.
        """
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        vectorizer = CountVectorizer(
            tokenizer=analyzer.tokenize_words,
            binary=True)
        self._vectorizer = vectorizer
        label_corpus = vectorizer.fit_transform(
            term.label for term in terms)

        # how many labels each token appears in - the rarer, the lower
        token_freq = np.bincount(label_corpus.indices,
                                 minlength=label_corpus.shape[1])

        self._index = index = TokenSetIndex()
        for term, label_row in zip(terms, label_corpus):
            # rarest token first - it is used as the index key
            by_rarity = sorted(label_row.nonzero()[1],
                               key=lambda token: token_freq[token])
            index.add(TokenSet(by_rarity, term.subject_id, term.is_pref))

        return subject_ids

    def _prepare_train_data(self, corpus, vocab, analyzer, n_jobs):
        """Generate candidates for all corpus documents using a worker pool.

        Resets and fills the subject and document frequency counters.
        Returns (train_x, train_y): train_x holds one list of candidates
        per document, train_y one boolean per candidate telling whether the
        candidate's subject is among the subjects assigned to its document.
        """
        # how many documents produced each subject (by id) as a candidate
        self._doc_freq = collections.Counter()
        # how often each subject was manually assigned
        # ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        train_x, train_y = [], []

        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        worker_args = dict(analyzer=analyzer,
                           vectorizer=self._vectorizer,
                           index=self._index)

        def doc_params():
            # lazily yields (assigned subject ids, text) for each document
            for doc in corpus.documents:
                yield ([vocab.subjects.by_uri(uri) for uri in doc.uris],
                       doc.text)

        with pool_class(jobs,
                        initializer=MLLMCandidateGenerator.init,
                        initargs=(worker_args,)) as pool:
            results = pool.starmap(
                MLLMCandidateGenerator.generate_candidates, doc_params(), 10)
            for doc_subject_ids, candidates in results:
                self._subj_freq.update(doc_subject_ids)
                self._doc_freq.update(c.subject_id for c in candidates)
                train_x.append(candidates)
                train_y.extend(c.subject_id in doc_subject_ids
                               for c in candidates)

        return train_x, train_y

    def _calculate_idf(self, subject_ids, doc_count):
        """Return smoothed idf values for the given subject ids.

        The result is a defaultdict, so subjects that are not listed in
        subject_ids get the value 0.0 on lookup.
        """
        numerator = doc_count + 1
        return collections.defaultdict(float, {
            subj_id: math.log(numerator / (self._doc_freq[subj_id] + 1)) + 1
            for subj_id in subject_ids})

    def _prepare_features(self, train_x, n_jobs):
        """Convert per-document candidate lists to feature matrices.

        The conversion runs in a worker pool; one matrix is returned for
        every element of train_x.
        """
        worker_args = dict(
            related_matrix=self._related_matrix,
            broader_matrix=self._broader_matrix,
            narrower_matrix=self._narrower_matrix,
            collection_matrix=self._collection_matrix,
            doc_freq=self._doc_freq,
            subj_freq=self._subj_freq,
            idf=self._idf)

        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        with pool_class(jobs,
                        initializer=MLLMFeatureConverter.init,
                        initargs=(worker_args,)) as pool:
            return pool.map(
                MLLMFeatureConverter.candidates_to_features, train_x, 10)

    def prepare_train(self, corpus, vocab, analyzer, params, n_jobs):
        """Prepare training data from a vocabulary and a document corpus.

        Returns (features, labels): the stacked feature matrix of all
        candidates and an array with the matching boolean labels.
        """
        # index the vocabulary terms
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # generate candidates and labels from the corpus
        doc_candidates, labels = self._prepare_train_data(
            corpus, vocab, analyzer, n_jobs)

        # idf values are needed before features can be computed
        self._idf = self._calculate_idf(subject_ids, len(doc_candidates))

        # turn the candidates into feature values
        feature_blocks = self._prepare_features(doc_candidates, n_jobs)

        return np.vstack(feature_blocks), np.array(labels)

    def _create_classifier(self, params):
        """Create an unfitted bagging ensemble of decision trees."""
        tree = DecisionTreeClassifier(
            min_samples_leaf=int(params['min_samples_leaf']),
            max_leaf_nodes=int(params['max_leaf_nodes']))
        return BaggingClassifier(
            tree, max_samples=float(params['max_samples']))

    def train(self, train_x, train_y, params):
        """Create a new classifier and fit it on the training data."""
        self._classifier = classifier = self._create_classifier(params)
        classifier.fit(train_x, train_y)

    def _prediction_to_list(self, scores, candidates):
        """Pair scores with subject ids, sorted in descending order.

        The second value of every score row is used (presumably the
        probability of the positive class).
        """
        ranked = [(row[1], candidate.subject_id)
                  for row, candidate in zip(scores, candidates)]
        ranked.sort(reverse=True)
        return ranked

    def predict(self, candidates):
        """Return (score, subject_id) pairs for the candidates, best first.

        An empty list is returned when there are no candidates.
        """
        if candidates:
            features = self._candidates_to_features(candidates)
            probabilities = self._classifier.predict_proba(features)
            return self._prediction_to_list(probabilities, candidates)
        return []

    def save(self, filename):
        """Store the whole model object in a file using joblib."""
        return joblib.dump(self, filename)

    @staticmethod
    def load(filename):
        """Load a model object stored with save().

        NOTE: joblib.load is based on pickle and can execute arbitrary code;
        only load files from trusted sources.
        """
        return joblib.load(filename)


1			"""MLLM (Maui-like Lexical Matching) model for Annif"""
2
3			import collections
4			import math
5			import joblib
6			from statistics import mean
7			from enum import IntEnum
8			import numpy as np
9			from rdflib.namespace import SKOS
10			from sklearn.feature_extraction.text import CountVectorizer
11			from sklearn.ensemble import BaggingClassifier
12			from sklearn.tree import DecisionTreeClassifier
13			import annif.util
14			import annif.parallel
15			from annif.lexical.tokenset import TokenSet, TokenSetIndex
16			from annif.lexical.util import get_subject_labels
17			from annif.lexical.util import make_relation_matrix, make_collection_matrix
18
19
20			Term = collections.namedtuple('Term', 'subject_id label is_pref')
21
22			Match = collections.namedtuple(
23			'Match', 'subject_id is_pref n_tokens pos ambiguity')
24
25			Candidate = collections.namedtuple(
26			'Candidate',
27			'doc_length subject_id freq is_pref n_tokens ambiguity ' +
28			'first_occ last_occ spread')
29
30			Feature = IntEnum(
31			'Feature',
32			'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
33			'first_occ last_occ spread doc_length ' +
34			'broader narrower related collection',
35			start=0)
36
37
38			def conflate_matches(matches, doc_length):
39			subj_matches = collections.defaultdict(list)
40			for match in matches:
41			subj_matches[match.subject_id].append(match)
42			return [
43			Candidate(
44			doc_length=doc_length,
45			subject_id=subject_id,
46			freq=len(matches) / doc_length,
47			is_pref=mean((float(m.is_pref) for m in matches)),
48			n_tokens=mean((m.n_tokens for m in matches)),
49			ambiguity=mean((m.ambiguity for m in matches)),
50			first_occ=matches[0].pos / doc_length,
51			last_occ=matches[-1].pos / doc_length,
52			spread=(matches[-1].pos - matches[0].pos) / doc_length
53			)
54			for subject_id, matches in subj_matches.items()]
55
56
57			def generate_candidates(text, analyzer, vectorizer, index):
58			sentences = analyzer.tokenize_sentences(text)
59			sent_tokens = vectorizer.transform(sentences)
60			matches = []
61
62			for sent_idx, token_matrix in enumerate(sent_tokens):
63			tset = TokenSet(token_matrix.nonzero()[1])
64			for ts, ambiguity in index.search(tset):
65			matches.append(Match(subject_id=ts.subject_id,
66			is_pref=ts.is_pref,
67			n_tokens=len(ts),
68			pos=sent_idx,
69			ambiguity=ambiguity))
70
71			return conflate_matches(matches, len(sentences))
72
73
74			def candidates_to_features(candidates,
75			related_matrix, broader_matrix,
76			narrower_matrix, collection_matrix,
77			doc_freq, subj_freq, idf):
78			"""Convert a list of Candidates to a NumPy feature matrix"""
79
80			matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
81			c_ids = [c.subject_id for c in candidates]
82			c_vec = np.zeros(related_matrix.shape[0], dtype=np.bool)
83			c_vec[c_ids] = True
84			broader = broader_matrix.multiply(c_vec).sum(axis=1)
85			narrower = narrower_matrix.multiply(c_vec).sum(axis=1)
86			related = related_matrix.multiply(c_vec).sum(axis=1)
87			collection = collection_matrix.multiply(c_vec).T.dot(
88			collection_matrix).sum(axis=0)
89			for idx, c in enumerate(candidates):
90			subj = c.subject_id
91			matrix[idx, Feature.freq] = c.freq
92			matrix[idx, Feature.doc_freq] = doc_freq[subj]
93			matrix[idx, Feature.subj_freq] = subj_freq.get(subj, 1) - 1
94			matrix[idx, Feature.tfidf] = c.freq * idf[subj]
95			matrix[idx, Feature.is_pref] = c.is_pref
96			matrix[idx, Feature.n_tokens] = c.n_tokens
97			matrix[idx, Feature.ambiguity] = c.ambiguity
98			matrix[idx, Feature.first_occ] = c.first_occ
99			matrix[idx, Feature.last_occ] = c.last_occ
100			matrix[idx, Feature.spread] = c.spread
101			matrix[idx, Feature.doc_length] = c.doc_length
102			matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
103			matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
104			matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
105			matrix[idx, Feature.collection] = collection[0, subj] / len(c_ids)
106			return matrix
107
108
109			class MLLMCandidateGenerator(annif.parallel.BaseWorker):
110
111			@classmethod
112			def generate_candidates(cls, doc_subject_ids, text):
113			candidates = generate_candidates(text, **cls.args)
114			return doc_subject_ids, candidates
115
116
117			class MLLMFeatureConverter(annif.parallel.BaseWorker):
118
119			@classmethod
120			def candidates_to_features(cls, candidates):
121			return candidates_to_features(candidates, **cls.args)
122
123
124			class MLLMModel:
125			"""Maui-like Lexical Matching model"""
126
127			def generate_candidates(self, text, analyzer):
128			return generate_candidates(text, analyzer,
129			self._vectorizer, self._index)
130
131			def _candidates_to_features(self, candidates):
132			return candidates_to_features(candidates,
133			self._related_matrix,
134			self._broader_matrix,
135			self._narrower_matrix,
136			self._collection_matrix,
137			self._doc_freq,
138			self._subj_freq,
139			self._idf)
140
141			def _prepare_terms(self, graph, vocab, params):
142			if annif.util.boolean(params['use_hidden_labels']):
143			label_props = [SKOS.altLabel, SKOS.hiddenLabel]
144			else:
145			label_props = [SKOS.altLabel]
146
147			terms = []
148			subject_ids = []
149			for subj_id, uri, pref, _ in vocab.subjects.active:
150			subject_ids.append(subj_id)
151			terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))
152
153			for label in get_subject_labels(graph, uri, label_props,
154			params['language']):
155			terms.append(Term(subject_id=subj_id,
156			label=label,
157			is_pref=False))
158
159			return (terms, subject_ids)
160
161			def _prepare_relations(self, graph, vocab):
162			self._broader_matrix = make_relation_matrix(
163			graph, vocab, SKOS.broader)
164			self._narrower_matrix = make_relation_matrix(
165			graph, vocab, SKOS.narrower)
166			self._related_matrix = make_relation_matrix(
167			graph, vocab, SKOS.related)
168			self._collection_matrix = make_collection_matrix(graph, vocab)
169
170			def _prepare_train_index(self, vocab, analyzer, params):
171			graph = vocab.as_graph()
172			terms, subject_ids = self._prepare_terms(graph, vocab, params)
173			self._prepare_relations(graph, vocab)
174
175			self._vectorizer = CountVectorizer(
176			binary=True,
177			tokenizer=analyzer.tokenize_words
178			)
179			label_corpus = self._vectorizer.fit_transform((t.label for t in terms))
180
181			# frequency of each token used in labels - how rare each word is
182			token_freq = np.bincount(label_corpus.indices,
183			minlength=label_corpus.shape[1])
184
185			self._index = TokenSetIndex()
186			for term, label_matrix in zip(terms, label_corpus):
187			tokens = label_matrix.nonzero()[1]
188			# sort tokens by frequency - use the rarest token as index key
189			tokens = sorted(tokens, key=token_freq.__getitem__)
190			tset = TokenSet(tokens, term.subject_id, term.is_pref)
191			self._index.add(tset)
192
193			return subject_ids
194
195			def _prepare_train_data(self, corpus, vocab, analyzer, n_jobs):
196			# frequency of subjects (by id) in the generated candidates
197			self._doc_freq = collections.Counter()
198			# frequency of manually assigned subjects ("domain keyphraseness")
199			self._subj_freq = collections.Counter()
200			train_x = []
201			train_y = []
202
203			jobs, pool_class = annif.parallel.get_pool(n_jobs)
204
205			cg_args = {
206			'analyzer': analyzer,
207			'vectorizer': self._vectorizer,
208			'index': self._index
209			}
210
211			with pool_class(jobs,
212			initializer=MLLMCandidateGenerator.init,
213			initargs=(cg_args,)) as pool:
214			params = (([vocab.subjects.by_uri(uri) for uri in doc.uris],
215			doc.text)
216			for doc in corpus.documents)
217			for doc_subject_ids, candidates in pool.starmap(
218			MLLMCandidateGenerator.generate_candidates, params, 10):
219
220			self._subj_freq.update(doc_subject_ids)
221			self._doc_freq.update([c.subject_id for c in candidates])
222			train_x.append(candidates)
223			train_y += [(c.subject_id in doc_subject_ids)
224			for c in candidates]
225
226			return (train_x, train_y)
227
228			def _calculate_idf(self, subject_ids, doc_count):
229			idf = collections.defaultdict(float)
230			for subj_id in subject_ids:
231			idf[subj_id] = math.log((doc_count + 1) /
232			(self._doc_freq[subj_id] + 1)) + 1
233
234			return idf
235
236			def _prepare_features(self, train_x, n_jobs):
237			fc_args = {
238			'related_matrix': self._related_matrix,
239			'broader_matrix': self._broader_matrix,
240			'narrower_matrix': self._narrower_matrix,
241			'collection_matrix': self._collection_matrix,
242			'doc_freq': self._doc_freq,
243			'subj_freq': self._subj_freq,
244			'idf': self._idf
245			}
246
247			jobs, pool_class = annif.parallel.get_pool(n_jobs)
248
249			with pool_class(jobs,
250			initializer=MLLMFeatureConverter.init,
251			initargs=(fc_args,)) as pool:
252			features = pool.map(
253			MLLMFeatureConverter.candidates_to_features, train_x, 10)
254
255			return features
256
257			def prepare_train(self, corpus, vocab, analyzer, params, n_jobs):
258			# create an index from the vocabulary terms
259			subject_ids = self._prepare_train_index(vocab, analyzer, params)
260
261			# convert the corpus into train data
262			train_x, train_y = self._prepare_train_data(
263			corpus, vocab, analyzer, n_jobs)
264
265			# precalculate idf values for all candidate subjects
266			self._idf = self._calculate_idf(subject_ids, len(train_x))
267
268			# convert the train data into feature values
269			features = self._prepare_features(train_x, n_jobs)
270
271			return (np.vstack(features), np.array(train_y))
272
273			def _create_classifier(self, params):
274			return BaggingClassifier(
275			DecisionTreeClassifier(
276			min_samples_leaf=int(params['min_samples_leaf']),
277			max_leaf_nodes=int(params['max_leaf_nodes'])
278			), max_samples=float(params['max_samples']))
279
280			def train(self, train_x, train_y, params):
281			# fit the model on the training corpus
282			self._classifier = self._create_classifier(params)
283			self._classifier.fit(train_x, train_y)
284
285			def _prediction_to_list(self, scores, candidates):
286			subj_scores = [(score[1], c.subject_id)
287			for score, c in zip(scores, candidates)]
288			return sorted(subj_scores, reverse=True)
289
290			def predict(self, candidates):
291			if not candidates:
292			return []
293			features = self._candidates_to_features(candidates)
294			scores = self._classifier.predict_proba(features)
295			return self._prediction_to_list(scores, candidates)
296
297			def save(self, filename):
298			return joblib.dump(self, filename)
299
300			@staticmethod
301			def load(filename):
302			return joblib.load(filename)
303

NatLibFi / Annif

Pull Request — master (#511)

annif.lexical.mllm.candidates_to_features() A

Complexity

Size

Duplication

Importance

How to fix Many Parameters

Many Parameters

Duplication Side-by-Side

Filter issues like