Passed
Pull Request — master (#462)
by Osma
01:51
created

annif.backend.mllm.MLLMModel._prepare_terms()   B

Complexity

Conditions 7

Size

Total Lines 22
Code Lines 19

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric  Value
cc      7
eloc    19
nop     4
dl      0
loc     22
rs      8
c       0
b       0
f       0
"""Maui-like Lexical Matching backend"""

import collections
import math
from enum import IntEnum
from statistics import mean
import os.path
import joblib
import numpy as np
from rdflib import URIRef
from rdflib.namespace import SKOS
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.util
from annif.exception import NotInitializedException
from annif.suggestion import VectorSuggestionResult
from . import backend
from . import hyperopt

Term = collections.namedtuple('Term', 'subject_id label is_pref')
Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')
Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'broader narrower related',
    start=0)


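# Note on the Feature enum above: with start=0 its members double as column
# indices of the candidate feature matrix built in
# MLLMModel._candidates_to_features below, e.g. Feature.freq == 0,
# Feature.tfidf == 3 and Feature.related == 13, so len(Feature) == 14 is the
# matrix width.

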
class TokenSet:
    """Represents a set of tokens (expressed as integer token IDs) that can
    be matched with another set of tokens. A TokenSet can optionally
    be associated with a subject from the vocabulary."""

    def __init__(self, tokens, subject_id=None, is_pref=False):
        self._tokens = set(tokens)
        self.subject_id = subject_id
        self.is_pref = is_pref

    def __len__(self):
        return len(self._tokens)

    def __iter__(self):
        return iter(self._tokens)

    def contains(self, other):
        """Returns True iff the tokens in the other TokenSet are all
        included within this TokenSet."""

        return other._tokens.issubset(self._tokens)

    def sample(self):
        """Return an arbitrary token from this TokenSet, or None if empty"""
        try:
            return next(iter(self._tokens))
        except StopIteration:
            return None


class TokenSetIndex:
    """A searchable index of TokenSets (representing vocabulary terms)"""

    def __init__(self):
        self._index = collections.defaultdict(set)

    def __len__(self):
        return len(self._index)

    def add(self, tset):
        """Add a TokenSet into this index"""
        token = tset.sample()
        if token is not None:
            self._index[token].add(tset)

    def search(self, tset):
        """Return the TokenSets that are contained in the given TokenSet.
        The matches are returned as a list of (TokenSet, ambiguity) pairs
        where ambiguity is an integer indicating the number of other TokenSets
        that also match the same tokens."""

        subj_tsets = {}
        subj_ambiguity = collections.Counter()

        for token in tset:
            for ts in self._index[token]:
                if not tset.contains(ts):
                    continue
                if ts.subject_id not in subj_tsets or \
                   not subj_tsets[ts.subject_id].is_pref:
                    subj_tsets[ts.subject_id] = ts

        for ts in subj_tsets.values():
            for other in subj_tsets.values():
                if ts == other:
                    continue
                if other.contains(ts):
                    subj_ambiguity.update([ts.subject_id])

        return [(ts, subj_ambiguity[ts.subject_id])
                for ts in subj_tsets.values()]


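# Illustration of TokenSetIndex.search above (hypothetical token IDs): with an
# index holding TokenSet({1, 2}, subject_id=0, is_pref=True) and
# TokenSet({1, 2, 3}, subject_id=1, is_pref=True), searching with a document
# TokenSet({1, 2, 3, 4}) matches both subjects; subject 0 is returned with
# ambiguity 1, because its tokens are also covered by subject 1's TokenSet,
# while subject 1 is returned with ambiguity 0.

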
class MLLMModel:
    """Maui-like Lexical Matching model"""

    def _conflate_matches(self, matches, doc_length):
        subj_matches = collections.defaultdict(list)
        for match in matches:
            subj_matches[match.subject_id].append(match)
        return [
            Candidate(
                doc_length=doc_length,
                subject_id=subject_id,
                freq=len(matches) / doc_length,
                is_pref=mean((float(m.is_pref) for m in matches)),
                n_tokens=mean((m.n_tokens for m in matches)),
                ambiguity=mean((m.ambiguity for m in matches)),
                first_occ=matches[0].pos / doc_length,
                last_occ=matches[-1].pos / doc_length,
                spread=(matches[-1].pos - matches[0].pos) / doc_length
            )
            for subject_id, matches in subj_matches.items()]

    def generate_candidates(self, text, analyzer):
        sentences = analyzer.tokenize_sentences(text)
        sent_tokens = self._vectorizer.transform(sentences)
        matches = []

        for sent_idx, token_matrix in enumerate(sent_tokens):
            tset = TokenSet(token_matrix.nonzero()[1])
            for ts, ambiguity in self._index.search(tset):
                matches.append(Match(subject_id=ts.subject_id,
                                     is_pref=ts.is_pref,
                                     n_tokens=len(ts),
                                     pos=sent_idx,
                                     ambiguity=ambiguity))

        return self._conflate_matches(matches, len(sentences))

    def _candidates_to_features(self, candidates):
        """Convert a list of Candidates to a NumPy feature matrix"""
        matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
        c_ids = [c.subject_id for c in candidates]
        c_vec = np.zeros(self._related_matrix.shape[0], dtype=bool)
        c_vec[c_ids] = True
        broader = self._broader_matrix.multiply(c_vec).sum(axis=1)
        narrower = self._narrower_matrix.multiply(c_vec).sum(axis=1)
        related = self._related_matrix.multiply(c_vec).sum(axis=1)
        for idx, c in enumerate(candidates):
            subj = c.subject_id
            matrix[idx, Feature.freq] = c.freq
            matrix[idx, Feature.doc_freq] = self._doc_freq[subj]
            matrix[idx, Feature.subj_freq] = self._subj_freq.get(subj, 1) - 1
            matrix[idx, Feature.tfidf] = c.freq * self._idf[subj]
            matrix[idx, Feature.is_pref] = c.is_pref
            matrix[idx, Feature.n_tokens] = c.n_tokens
            matrix[idx, Feature.ambiguity] = c.ambiguity
            matrix[idx, Feature.first_occ] = c.first_occ
            matrix[idx, Feature.last_occ] = c.last_occ
            matrix[idx, Feature.spread] = c.spread
            matrix[idx, Feature.doc_length] = c.doc_length
            matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
            matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
            matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
        return matrix

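    # Illustration of _candidates_to_features above (hypothetical numbers):
    # the broader/narrower/related features measure how strongly a candidate
    # is supported by the other candidates of the same document. For example,
    # if 2 of a document's 10 candidate subjects are SKOS broader concepts of
    # candidate c, then matrix[idx, Feature.broader] == 2 / 10 == 0.2 for
    # that row.
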
    def _prepare_terms(self, graph, vocab, params):
        terms = []
        subject_ids = []
        for subj_id, (uri, pref, _) in enumerate(vocab.subjects):
            if pref is None:
                continue  # deprecated subject
            subject_ids.append(subj_id)
            terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))

            if annif.util.boolean(params['use_hidden_labels']):
                label_props = [SKOS.altLabel, SKOS.hiddenLabel]
            else:
                label_props = [SKOS.altLabel]

            for prop in label_props:
                for label in graph.objects(URIRef(uri), prop):
                    if label.language != params['language']:
                        continue
                    terms.append(Term(subject_id=subj_id,
                                      label=str(label),
                                      is_pref=False))
        return (terms, subject_ids)

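    # Illustration of _prepare_terms above (hypothetical subject): a concept
    # with prefLabel "cats" and altLabel "felines" in the project language
    # yields Term(subject_id=i, label='cats', is_pref=True) and
    # Term(subject_id=i, label='felines', is_pref=False); deprecated subjects
    # (pref label None) are skipped and left out of subject_ids.
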
    def _prepare_relations(self, graph, vocab):
        n_subj = len(vocab.subjects)
        self._broader_matrix = lil_matrix((n_subj, n_subj), dtype=bool)
        self._narrower_matrix = lil_matrix((n_subj, n_subj), dtype=bool)
        self._related_matrix = lil_matrix((n_subj, n_subj), dtype=bool)

        prop_matrix = [
            (SKOS.broader, self._broader_matrix),
            (SKOS.narrower, self._narrower_matrix),
            (SKOS.related, self._related_matrix)
        ]

        for subj_id, (uri, pref, _) in enumerate(vocab.subjects):
            if pref is None:
                continue  # deprecated subject

            for prop, matrix in prop_matrix:
                for other in graph.objects(URIRef(uri), prop):
                    other_id = vocab.subjects.by_uri(str(other),
                                                     warnings=False)
                    if other_id is not None:
                        matrix[subj_id, other_id] = True

    def _prepare_train_index(self, vocab, analyzer, params):
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

    def prepare_train(self, corpus, vocab, analyzer, params):
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        doc_count = 0
        train_x = []
        train_y = []
        for idx, doc in enumerate(corpus.documents):
            doc_subject_ids = [vocab.subjects.by_uri(uri)
                               for uri in doc.uris]
            self._subj_freq.update(doc_subject_ids)
            candidates = self.generate_candidates(doc.text, analyzer)
            self._doc_freq.update([c.subject_id for c in candidates])
            train_x.append(candidates)
            train_y += [(c.subject_id in doc_subject_ids) for c in candidates]
            doc_count += 1

        # precalculate idf values for candidate subjects
        self._idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            self._idf[subj_id] = math.log((doc_count + 1) /
                                          (self._doc_freq[subj_id] + 1)) + 1
        return (np.vstack([self._candidates_to_features(candidates)
                           for candidates in train_x]), np.array(train_y))

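    # Illustration of the idf calculation above (hypothetical counts): the
    # smoothed formula log((1 + N) / (1 + df)) + 1 means that with 999
    # training documents, a subject that appeared as a candidate in 9 of them
    # gets idf = log(1000 / 10) + 1, about 5.61 (natural log), one that was a
    # candidate in every document gets exactly 1.0, and the tfidf feature is
    # the candidate's within-document freq multiplied by this value.
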
    def _create_classifier(self, params):
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))

    def train(self, train_x, train_y, params):
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)

    def _prediction_to_list(self, scores, candidates):
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)


class MLLMOptimizer(hyperopt.HyperparameterOptimizer):
    """Hyperparameter optimizer for the MLLM backend"""

    def _prepare(self, n_jobs=1):
        self._backend.initialize()
        self._train_x, self._train_y = self._backend._load_train_data()
        self._candidates = []
        self._gold_subjects = []

        # TODO parallelize generation of candidates
        for doc in self._corpus.documents:
            candidates = self._backend._generate_candidates(doc.text)
            self._candidates.append(candidates)
            self._gold_subjects.append(
                annif.corpus.SubjectSet((doc.uris, doc.labels)))

    def _objective(self, trial):
        params = {
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 5, 30),
            'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 100, 2000),
            'max_samples': trial.suggest_float('max_samples', 0.5, 1.0),
            'use_hidden_labels':
                trial.suggest_categorical('use_hidden_labels', [True, False]),
            'limit': 100
        }
        model = self._backend._model._create_classifier(params)
        model.fit(self._train_x, self._train_y)

        batch = annif.eval.EvaluationBatch(self._backend.project.subjects)
        for goldsubj, candidates in zip(self._gold_subjects, self._candidates):
            if candidates:
                features = \
                    self._backend._model._candidates_to_features(candidates)
                scores = model.predict_proba(features)
                ranking = self._backend._model._prediction_to_list(
                    scores, candidates)
            else:
                ranking = []
            results = self._backend._prediction_to_result(ranking, params)
            batch.evaluate(results, goldsubj)
        results = batch.results(metrics=[self._metric])
        return results[self._metric]

    def _postprocess(self, study):
        bp = study.best_params
        lines = [
            f"min_samples_leaf={bp['min_samples_leaf']}",
            f"max_leaf_nodes={bp['max_leaf_nodes']}",
            f"max_samples={bp['max_samples']:.4f}",
            f"use_hidden_labels={bp['use_hidden_labels']}"
        ]
        return hyperopt.HPRecommendation(lines=lines, score=study.best_value)


class MLLMBackend(hyperopt.AnnifHyperoptBackend):
    """Maui-like Lexical Matching backend for Annif"""
    name = "mllm"
    needs_subject_index = True

    # defaults for uninitialized instances
    _model = None

    MODEL_FILE = 'mllm-model.gz'
    TRAIN_FILE = 'mllm-train.gz'

    DEFAULT_PARAMETERS = {
        'min_samples_leaf': 20,
        'max_leaf_nodes': 1000,
        'max_samples': 0.9,
        'use_hidden_labels': False
    }

    def get_hp_optimizer(self, corpus, metric):
        return MLLMOptimizer(self, corpus, metric)

    def default_params(self):
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

    def _load_model(self):
        path = os.path.join(self.datadir, self.MODEL_FILE)
        self.debug('loading model from {}'.format(path))
        if os.path.exists(path):
            return joblib.load(path)
        else:
            raise NotInitializedException(
                'model {} not found'.format(path),
                backend_id=self.backend_id)

    def _load_train_data(self):
        path = os.path.join(self.datadir, self.TRAIN_FILE)
        if os.path.exists(path):
            return joblib.load(path)
        else:
            raise NotInitializedException(
                'train data file {} not found'.format(path),
                backend_id=self.backend_id)

    def initialize(self):
        if self._model is None:
            self._model = self._load_model()

    def _train(self, corpus, params):
        self.info('starting train')
        if corpus != 'cached':
            self.info("preparing training data")
            self._model = MLLMModel()
            train_data = self._model.prepare_train(corpus,
                                                   self.project.vocab,
                                                   self.project.analyzer,
                                                   params)
            annif.util.atomic_save(train_data,
                                   self.datadir,
                                   self.TRAIN_FILE,
                                   method=joblib.dump)
        else:
            self.info("reusing cached training data from previous run")
            self._model = self._load_model()
            train_data = self._load_train_data()

        self.info("training model")
        self._model.train(train_data[0], train_data[1], params)

        self.info('saving model')
        annif.util.atomic_save(
            self._model,
            self.datadir,
            self.MODEL_FILE,
            method=joblib.dump)

    def _generate_candidates(self, text):
        return self._model.generate_candidates(text, self.project.analyzer)

    def _prediction_to_result(self, prediction, params):
        vector = np.zeros(len(self.project.subjects), dtype=np.float32)
        for score, subject_id in prediction:
            vector[subject_id] = score
        result = VectorSuggestionResult(vector)
        return result.filter(self.project.subjects,
                             limit=int(params['limit']))

    def _suggest(self, text, params):
        candidates = self._generate_candidates(text)
        prediction = self._model.predict(candidates)
        return self._prediction_to_result(prediction, params)
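

# Sketch of how the pieces above fit together, with hypothetical project,
# corpus and params objects standing in for what the backend receives from
# Annif:
#
#   model = MLLMModel()
#   train_x, train_y = model.prepare_train(corpus, project.vocab,
#                                          project.analyzer, params)
#   model.train(train_x, train_y, params)
#   candidates = model.generate_candidates("text of a new document",
#                                          project.analyzer)
#   ranking = model.predict(candidates)  # [(score, subject_id), ...] best first
#
# MLLMBackend._train and _suggest wrap this flow, adding persistence with
# joblib and conversion of the ranking into a VectorSuggestionResult.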