annif.lexical.mllm.MLLMModel.train() - Code Metrics - Inspection of "Merge pull request #462 from NatLibFi/feature-mllm..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( c17a7d...ea11a0 )

by Osma

created 2021-04-13 11:27 UTC

annif.lexical.mllm.MLLMModel.train() A

↳ Parent: annif.lexical.mllm

Complexity

Conditions

Size

Total Lines	4
Code Lines	3

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	3
nop	4
dl	0
loc	4
rs	10
c	0
b	0
f	0

"""MLLM (Maui-like Lexical Matchin) model for Annif"""

import collections
import math
import joblib
from statistics import mean
from enum import IntEnum
import numpy as np
from rdflib.namespace import SKOS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.util
from annif.lexical.tokenset import TokenSet, TokenSetIndex
from annif.lexical.util import get_subject_labels
from annif.lexical.util import make_relation_matrix, make_collection_matrix


# One label of a vocabulary subject; is_pref tells whether it is the
# preferred label (True) or an alternative/hidden one (False).
Term = collections.namedtuple(
    'Term',
    ['subject_id', 'label', 'is_pref'])

# One occurrence of a subject label in a document; pos is the index of
# the sentence where the label was found.
Match = collections.namedtuple(
    'Match',
    ['subject_id', 'is_pref', 'n_tokens', 'pos', 'ambiguity'])

# All matches of one subject within a document, merged into a single
# record.
Candidate = collections.namedtuple(
    'Candidate',
    [
        'doc_length', 'subject_id', 'freq', 'is_pref', 'n_tokens',
        'ambiguity', 'first_occ', 'last_occ', 'spread',
    ])

# Column indices of the feature matrix. Numbering starts at 0 so that the
# members can be used directly as NumPy column indices and len(Feature)
# gives the number of columns.
Feature = IntEnum(
    'Feature',
    [
        'freq', 'doc_freq', 'subj_freq', 'tfidf', 'is_pref', 'n_tokens',
        'ambiguity', 'first_occ', 'last_occ', 'spread', 'doc_length',
        'broader', 'narrower', 'related', 'collection',
    ],
    start=0)


class MLLMModel:
    """Maui-like Lexical Matching model

    Looks up vocabulary labels in document text to produce candidate
    subjects, converts the candidates into numeric feature vectors and
    scores them with a bagged decision tree classifier.

    There is no constructor. The state is created by the training methods:

    * _prepare_train_index() sets _vectorizer, _index and the relation
      matrices (_broader_matrix, _narrower_matrix, _related_matrix,
      _collection_matrix)
    * prepare_train() sets _doc_freq, _subj_freq and _idf
    * train() sets _classifier
    """

    def _conflate_matches(self, matches, doc_length):
        """Merge the matches of each subject into a single Candidate.

        matches: iterable of Match tuples. The first and last match of a
            subject are used as its first and last occurrence, so the
            matches are expected to arrive in order of position.
        doc_length: value that frequencies and positions are divided by
            (generate_candidates() passes the number of sentences).

        Returns a list with one Candidate per distinct subject_id.
        """
        subj_matches = collections.defaultdict(list)
        for match in matches:
            subj_matches[match.subject_id].append(match)
        return [
            Candidate(
                doc_length=doc_length,
                subject_id=subject_id,
                freq=len(group) / doc_length,
                is_pref=mean((float(m.is_pref) for m in group)),
                n_tokens=mean((m.n_tokens for m in group)),
                ambiguity=mean((m.ambiguity for m in group)),
                first_occ=group[0].pos / doc_length,
                last_occ=group[-1].pos / doc_length,
                spread=(group[-1].pos - group[0].pos) / doc_length
            )
            for subject_id, group in subj_matches.items()]

    def generate_candidates(self, text, analyzer):
        """Find vocabulary labels in the text and return Candidates.

        The text is split into sentences with
        analyzer.tokenize_sentences(); every sentence is vectorized and
        searched in the token set index. The sentence index serves as the
        position of a match and the number of sentences as the document
        length.
        """
        sentences = analyzer.tokenize_sentences(text)
        sent_tokens = self._vectorizer.transform(sentences)
        matches = []

        for sent_idx, token_matrix in enumerate(sent_tokens):
            # the column indices of the nonzero cells identify the
            # vectorizer tokens that occur in this sentence
            tset = TokenSet(token_matrix.nonzero()[1])
            for ts, ambiguity in self._index.search(tset):
                matches.append(Match(subject_id=ts.subject_id,
                                     is_pref=ts.is_pref,
                                     n_tokens=len(ts),
                                     pos=sent_idx,
                                     ambiguity=ambiguity))

        return self._conflate_matches(matches, len(sentences))

    def _candidates_to_features(self, candidates):
        """Convert a list of Candidates to a NumPy feature matrix

        Returns a float32 array with one row per candidate and one column
        per member of Feature. Reads the relation matrices as well as
        _doc_freq, _subj_freq and _idf, so those must have been set up
        before this is called.
        """
        matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
        c_ids = [c.subject_id for c in candidates]
        n_candidates = len(c_ids)
        # Boolean mask over subject ids marking the candidates of this
        # document. The builtin bool is used as dtype because the np.bool
        # alias was deprecated in NumPy 1.20 and removed in 1.24.
        c_vec = np.zeros(self._related_matrix.shape[0], dtype=bool)
        c_vec[c_ids] = True
        # Restrict each relation matrix to the candidate columns and sum
        # over them to get a per-subject count.
        broader = self._broader_matrix.multiply(c_vec).sum(axis=1)
        narrower = self._narrower_matrix.multiply(c_vec).sum(axis=1)
        related = self._related_matrix.multiply(c_vec).sum(axis=1)
        collection = self._collection_matrix.multiply(c_vec).T.dot(
            self._collection_matrix).sum(axis=0)
        for idx, c in enumerate(candidates):
            subj = c.subject_id
            matrix[idx, Feature.freq] = c.freq
            matrix[idx, Feature.doc_freq] = self._doc_freq[subj]
            # 0 for subjects that were never assigned, otherwise the
            # number of assignments minus one
            # NOTE(review): the "- 1" presumably discounts the document
            # being processed during training - confirm
            matrix[idx, Feature.subj_freq] = self._subj_freq.get(subj, 1) - 1
            matrix[idx, Feature.tfidf] = c.freq * self._idf[subj]
            matrix[idx, Feature.is_pref] = c.is_pref
            matrix[idx, Feature.n_tokens] = c.n_tokens
            matrix[idx, Feature.ambiguity] = c.ambiguity
            matrix[idx, Feature.first_occ] = c.first_occ
            matrix[idx, Feature.last_occ] = c.last_occ
            matrix[idx, Feature.spread] = c.spread
            matrix[idx, Feature.doc_length] = c.doc_length
            # relation counts are normalized by the number of candidates
            matrix[idx, Feature.broader] = broader[subj, 0] / n_candidates
            matrix[idx, Feature.narrower] = narrower[subj, 0] / n_candidates
            matrix[idx, Feature.related] = related[subj, 0] / n_candidates
            matrix[idx, Feature.collection] = \
                collection[0, subj] / n_candidates
        return matrix

    def _prepare_terms(self, graph, vocab, params):
        """Collect the labels of all active subjects of the vocabulary.

        Every active subject contributes its preferred label
        (is_pref=True) and its SKOS altLabels (is_pref=False). SKOS
        hiddenLabels are included as well when the 'use_hidden_labels'
        parameter is true. Non-preferred labels are read from the graph in
        the language given by the 'language' parameter.

        Returns a tuple (terms, subject_ids): the list of Term tuples and
        the list of ids of the active subjects.
        """
        if annif.util.boolean(params['use_hidden_labels']):
            label_props = [SKOS.altLabel, SKOS.hiddenLabel]
        else:
            label_props = [SKOS.altLabel]

        terms = []
        subject_ids = []
        for subj_id, uri, pref, _ in vocab.subjects.active:
            subject_ids.append(subj_id)
            terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))

            for label in get_subject_labels(graph, uri, label_props,
                                            params['language']):
                terms.append(Term(subject_id=subj_id,
                                  label=label,
                                  is_pref=False))

        return (terms, subject_ids)

    def _prepare_relations(self, graph, vocab):
        """Build the broader/narrower/related and collection matrices."""
        self._broader_matrix = make_relation_matrix(
            graph, vocab, SKOS.broader)
        self._narrower_matrix = make_relation_matrix(
            graph, vocab, SKOS.narrower)
        self._related_matrix = make_relation_matrix(
            graph, vocab, SKOS.related)
        self._collection_matrix = make_collection_matrix(graph, vocab)

    def _prepare_train_index(self, vocab, analyzer, params):
        """Set up the vectorizer, token set index and relation matrices.

        The vectorizer is fitted on the vocabulary labels only, using
        analyzer.tokenize_words as tokenizer, and each label is added to
        the index as a TokenSet.

        Returns the list of ids of the active subjects.
        """
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        self._index = TokenSetIndex()
        # rows of label_corpus are in the same order as terms
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

    def _calculate_idf(self, subject_ids, doc_count):
        """Calculate smoothed inverse document frequencies.

        For each given subject id the value is
        log((doc_count + 1) / (doc_freq + 1)) + 1, where doc_freq is read
        from self._doc_freq.

        Returns a defaultdict(float), so looking up a subject id that was
        not in subject_ids gives 0.0.
        """
        idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            idf[subj_id] = math.log((doc_count + 1) /
                                    (self._doc_freq[subj_id] + 1)) + 1

        return idf

    def prepare_train(self, corpus, vocab, analyzer, params):
        """Build the index and turn a document corpus into training data.

        Returns a tuple (train_x, train_y): the feature matrix with one
        row per generated candidate of every document, and a NumPy array
        that tells for each row whether the candidate subject was among
        the subjects assigned to its document.
        """
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        train_x = []
        train_y = []
        for doc in corpus.documents:
            doc_subject_ids = [vocab.subjects.by_uri(uri)
                               for uri in doc.uris]
            # the list (not the set) is counted so that the frequencies
            # are the same as if every URI was counted individually
            self._subj_freq.update(doc_subject_ids)
            assigned = set(doc_subject_ids)
            candidates = self.generate_candidates(doc.text, analyzer)
            self._doc_freq.update([c.subject_id for c in candidates])
            train_x.append(candidates)
            train_y += [(c.subject_id in assigned) for c in candidates]
        # one entry was appended to train_x per document
        doc_count = len(train_x)

        # precalculate idf values for all candidate subjects
        self._idf = self._calculate_idf(subject_ids, doc_count)

        # The features can only be calculated now, because they depend on
        # the frequencies collected over the whole corpus.
        return (np.vstack([self._candidates_to_features(candidates)
                           for candidates in train_x]), np.array(train_y))

    def _create_classifier(self, params):
        """Create an unfitted bagging ensemble of decision trees.

        Uses the parameters 'min_samples_leaf' and 'max_leaf_nodes'
        (converted to int) and 'max_samples' (converted to float).
        """
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))

    def train(self, train_x, train_y, params):
        """Create a new classifier and fit it on the given training data."""
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)

    def _prediction_to_list(self, scores, candidates):
        """Pair the scores with subject ids, best score first.

        scores: one row per candidate; the value at index 1 of the row is
            used as the score of the candidate.
        candidates: the Candidates, in the same order as the scores.

        Returns a list of (score, subject_id) tuples sorted in descending
        order.
        """
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
        """Score the candidates with the trained classifier.

        Returns a list of (score, subject_id) tuples sorted by descending
        score, or an empty list when there are no candidates.
        """
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        # NOTE(review): column 1 of predict_proba() is presumably the
        # probability of the True class - confirm
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)

    def save(self, filename):
        """Serialize the whole model object to a file using joblib."""
        return joblib.dump(self, filename)

    @staticmethod
    def load(filename):
        """Return the object stored in the given joblib file.

        NOTE: joblib.load() unpickles the file contents, which can execute
        arbitrary code; only load files from a trusted source.
        """
        return joblib.load(filename)


1			"""MLLM (Maui-like Lexical Matchin) model for Annif"""
2
3			import collections
4			import math
5			import joblib
6			from statistics import mean
7			from enum import IntEnum
8			import numpy as np
9			from rdflib.namespace import SKOS
10			from sklearn.feature_extraction.text import CountVectorizer
11			from sklearn.ensemble import BaggingClassifier
12			from sklearn.tree import DecisionTreeClassifier
13			import annif.util
14			from annif.lexical.tokenset import TokenSet, TokenSetIndex
15			from annif.lexical.util import get_subject_labels
16			from annif.lexical.util import make_relation_matrix, make_collection_matrix
17
18
19			Term = collections.namedtuple('Term', 'subject_id label is_pref')
20
21			Match = collections.namedtuple(
22			'Match', 'subject_id is_pref n_tokens pos ambiguity')
23
24			Candidate = collections.namedtuple(
25			'Candidate',
26			'doc_length subject_id freq is_pref n_tokens ambiguity ' +
27			'first_occ last_occ spread')
28
29			Feature = IntEnum(
30			'Feature',
31			'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
32			'first_occ last_occ spread doc_length ' +
33			'broader narrower related collection',
34			start=0)
35
36
37			class MLLMModel:
38			"""Maui-like Lexical Matching model"""
39
40			def _conflate_matches(self, matches, doc_length):
41			subj_matches = collections.defaultdict(list)
42			for match in matches:
43			subj_matches[match.subject_id].append(match)
44			return [
45			Candidate(
46			doc_length=doc_length,
47			subject_id=subject_id,
48			freq=len(matches) / doc_length,
49			is_pref=mean((float(m.is_pref) for m in matches)),
50			n_tokens=mean((m.n_tokens for m in matches)),
51			ambiguity=mean((m.ambiguity for m in matches)),
52			first_occ=matches[0].pos / doc_length,
53			last_occ=matches[-1].pos / doc_length,
54			spread=(matches[-1].pos - matches[0].pos) / doc_length
55			)
56			for subject_id, matches in subj_matches.items()]
57
58			def generate_candidates(self, text, analyzer):
59			sentences = analyzer.tokenize_sentences(text)
60			sent_tokens = self._vectorizer.transform(sentences)
61			matches = []
62
63			for sent_idx, token_matrix in enumerate(sent_tokens):
64			tset = TokenSet(token_matrix.nonzero()[1])
65			for ts, ambiguity in self._index.search(tset):
66			matches.append(Match(subject_id=ts.subject_id,
67			is_pref=ts.is_pref,
68			n_tokens=len(ts),
69			pos=sent_idx,
70			ambiguity=ambiguity))
71
72			return self._conflate_matches(matches, len(sentences))
73
74			def _candidates_to_features(self, candidates):
75			"""Convert a list of Candidates to a NumPy feature matrix"""
76			matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
77			c_ids = [c.subject_id for c in candidates]
78			c_vec = np.zeros(self._related_matrix.shape[0], dtype=np.bool)
79			c_vec[c_ids] = True
80			broader = self._broader_matrix.multiply(c_vec).sum(axis=1)
81			narrower = self._narrower_matrix.multiply(c_vec).sum(axis=1)
82			related = self._related_matrix.multiply(c_vec).sum(axis=1)
83			collection = self._collection_matrix.multiply(c_vec).T.dot(
84			self._collection_matrix).sum(axis=0)
85			for idx, c in enumerate(candidates):
86			subj = c.subject_id
87			matrix[idx, Feature.freq] = c.freq
88			matrix[idx, Feature.doc_freq] = self._doc_freq[subj]
89			matrix[idx, Feature.subj_freq] = self._subj_freq.get(subj, 1) - 1
90			matrix[idx, Feature.tfidf] = c.freq * self._idf[subj]
91			matrix[idx, Feature.is_pref] = c.is_pref
92			matrix[idx, Feature.n_tokens] = c.n_tokens
93			matrix[idx, Feature.ambiguity] = c.ambiguity
94			matrix[idx, Feature.first_occ] = c.first_occ
95			matrix[idx, Feature.last_occ] = c.last_occ
96			matrix[idx, Feature.spread] = c.spread
97			matrix[idx, Feature.doc_length] = c.doc_length
98			matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
99			matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
100			matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
101			matrix[idx, Feature.collection] = collection[0, subj] / len(c_ids)
102			return matrix
103
104			def _prepare_terms(self, graph, vocab, params):
105			if annif.util.boolean(params['use_hidden_labels']):
106			label_props = [SKOS.altLabel, SKOS.hiddenLabel]
107			else:
108			label_props = [SKOS.altLabel]
109
110			terms = []
111			subject_ids = []
112			for subj_id, uri, pref, _ in vocab.subjects.active:
113			subject_ids.append(subj_id)
114			terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))
115
116			for label in get_subject_labels(graph, uri, label_props,
117			params['language']):
118			terms.append(Term(subject_id=subj_id,
119			label=label,
120			is_pref=False))
121
122			return (terms, subject_ids)
123
124			def _prepare_relations(self, graph, vocab):
125			self._broader_matrix = make_relation_matrix(
126			graph, vocab, SKOS.broader)
127			self._narrower_matrix = make_relation_matrix(
128			graph, vocab, SKOS.narrower)
129			self._related_matrix = make_relation_matrix(
130			graph, vocab, SKOS.related)
131			self._collection_matrix = make_collection_matrix(graph, vocab)
132
133			def _prepare_train_index(self, vocab, analyzer, params):
134			graph = vocab.as_graph()
135			terms, subject_ids = self._prepare_terms(graph, vocab, params)
136			self._prepare_relations(graph, vocab)
137
138			self._vectorizer = CountVectorizer(
139			binary=True,
140			tokenizer=analyzer.tokenize_words
141			)
142			label_corpus = self._vectorizer.fit_transform((t.label for t in terms))
143
144			self._index = TokenSetIndex()
145			for term, label_matrix in zip(terms, label_corpus):
146			tokens = label_matrix.nonzero()[1]
147			tset = TokenSet(tokens, term.subject_id, term.is_pref)
148			self._index.add(tset)
149
150			return subject_ids
151
152			def _calculate_idf(self, subject_ids, doc_count):
153			idf = collections.defaultdict(float)
154			for subj_id in subject_ids:
155			idf[subj_id] = math.log((doc_count + 1) /
156			(self._doc_freq[subj_id] + 1)) + 1
157
158			return idf
159
160			def prepare_train(self, corpus, vocab, analyzer, params):
161			subject_ids = self._prepare_train_index(vocab, analyzer, params)
162
163			# frequency of subjects (by id) in the generated candidates
164			self._doc_freq = collections.Counter()
165			# frequency of manually assigned subjects ("domain keyphraseness")
166			self._subj_freq = collections.Counter()
167			doc_count = 0
168			train_x = []
169			train_y = []
170			for idx, doc in enumerate(corpus.documents):
171			doc_subject_ids = [vocab.subjects.by_uri(uri)
172			for uri in doc.uris]
173			self._subj_freq.update(doc_subject_ids)
174			candidates = self.generate_candidates(doc.text, analyzer)
175			self._doc_freq.update([c.subject_id for c in candidates])
176			train_x.append(candidates)
177			train_y += [(c.subject_id in doc_subject_ids) for c in candidates]
178			doc_count += 1
179
180			# precalculate idf values for all candidate subjects
181			self._idf = self._calculate_idf(subject_ids, doc_count)
182
183			return (np.vstack([self._candidates_to_features(candidates)
184			for candidates in train_x]), np.array(train_y))
185
186			def _create_classifier(self, params):
187			return BaggingClassifier(
188			DecisionTreeClassifier(
189			min_samples_leaf=int(params['min_samples_leaf']),
190			max_leaf_nodes=int(params['max_leaf_nodes'])
191			), max_samples=float(params['max_samples']))
192
193			def train(self, train_x, train_y, params):
194			# fit the model on the training corpus
195			self._classifier = self._create_classifier(params)
196			self._classifier.fit(train_x, train_y)
197
198			def _prediction_to_list(self, scores, candidates):
199			subj_scores = [(score[1], c.subject_id)
200			for score, c in zip(scores, candidates)]
201			return sorted(subj_scores, reverse=True)
202
203			def predict(self, candidates):
204			if not candidates:
205			return []
206			features = self._candidates_to_features(candidates)
207			scores = self._classifier.predict_proba(features)
208			return self._prediction_to_list(scores, candidates)
209
210			def save(self, filename):
211			return joblib.dump(self, filename)
212
213			@staticmethod
214			def load(filename):
215			return joblib.load(filename)
216

NatLibFi / Annif

Push — master ( c17a7d...ea11a0 )

annif.lexical.mllm.MLLMModel.train() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like