Passed
Pull Request — master (#462), by Osma, created 03:40

annif.backend.mllm.TokenSet.__iter__()   A

Complexity:  1 condition
Size:        2 total lines, 2 code lines
Duplication: 0 lines (0 %)
Importance:  0 changes

Metric  Value
eloc    2
dl      0
loc     2
rs      10
c       0
b       0
f       0
cc      1
nop     1
"""Maui-like Lexical Matching backend"""

import collections
import math
from enum import IntEnum
from statistics import mean
import os.path
import joblib
import numpy as np
from rdflib import URIRef
from rdflib.namespace import SKOS
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import annif.corpus
import annif.eval
import annif.util
from annif.exception import NotInitializedException
from annif.suggestion import VectorSuggestionResult
from . import backend
from . import hyperopt

Term = collections.namedtuple('Term', 'subject_id label is_pref')
Match = collections.namedtuple(
    'Match', 'subject_id is_pref n_tokens pos ambiguity')
Candidate = collections.namedtuple(
    'Candidate',
    'doc_length subject_id freq is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread')

Feature = IntEnum(
    'Feature',
    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +
    'first_occ last_occ spread doc_length ' +
    'related',
    start=0)
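
# Illustrative note (an addition, not part of the original listing): because
# the enum starts at 0, Feature members double as column indices into the
# feature matrix built by MLLMModel._candidates_to_features() below, e.g.
#
#   >>> int(Feature.freq), int(Feature.related), len(Feature)
#   (0, 11, 12)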


class TokenSet:
    """Represents a set of tokens (expressed as integer token IDs) that can
    be matched with another set of tokens. A TokenSet can optionally
    be associated with a subject from the vocabulary."""

    def __init__(self, tokens, subject_id=None, is_pref=False):
        self._tokens = set(tokens)
        self.subject_id = subject_id
        self.is_pref = is_pref

    def __len__(self):
        return len(self._tokens)

    def __iter__(self):
        return iter(self._tokens)

    def contains(self, other):
        """Returns True iff the tokens in the other TokenSet are all
        included within this TokenSet."""

        return other._tokens.issubset(self._tokens)

    def sample(self):
        """Return an arbitrary token from this TokenSet, or None if empty"""
        try:
            return next(iter(self._tokens))
        except StopIteration:
            return None
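
    # Illustrative usage sketch (an addition, not from the PR), assuming
    # small integer token IDs:
    #
    #   >>> doc = TokenSet([1, 2, 3, 4])
    #   >>> term = TokenSet([2, 3], subject_id=42, is_pref=True)
    #   >>> doc.contains(term)
    #   True
    #   >>> term.contains(doc)
    #   False
    #   >>> TokenSet([]).sample() is None
    #   True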


class TokenSetIndex:
    """A searchable index of TokenSets (representing vocabulary terms)"""

    def __init__(self):
        self._index = collections.defaultdict(set)

    def __len__(self):
        return len(self._index)

    def add(self, tset):
        """Add a TokenSet into this index"""
        token = tset.sample()
        if token is not None:
            self._index[token].add(tset)

    def search(self, tset):
        """Return the TokenSets that are contained in the given TokenSet.
        The matches are returned as a list of (TokenSet, ambiguity) pairs
        where ambiguity is an integer indicating the number of other TokenSets
        that also match the same tokens."""

        subj_tsets = {}
        subj_ambiguity = collections.Counter()

        for token in tset:
            for ts in self._index[token]:
                if not tset.contains(ts):
                    continue
                if ts.subject_id not in subj_tsets or \
                   not subj_tsets[ts.subject_id].is_pref:
                    subj_tsets[ts.subject_id] = ts

        for ts in subj_tsets.values():
            for other in subj_tsets.values():
                if ts == other:
                    continue
                if other.contains(ts):
                    subj_ambiguity.update([ts.subject_id])

        return [(ts, subj_ambiguity[ts.subject_id])
                for ts in subj_tsets.values()]
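
    # Illustrative search sketch (an addition, not from the PR):
    #
    #   >>> index = TokenSetIndex()
    #   >>> index.add(TokenSet([1, 2], subject_id=7, is_pref=True))
    #   >>> index.add(TokenSet([1, 2, 3], subject_id=8, is_pref=True))
    #   >>> index.search(TokenSet([1, 2, 3, 4]))
    #
    # Both vocabulary TokenSets are contained in the document TokenSet, so
    # both are returned; the subject-7 set is itself contained in the
    # subject-8 set, so it gets ambiguity 1 while the subject-8 set gets 0.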


class MLLMModel:
    """Maui-like Lexical Matching model"""

    def _conflate_matches(self, matches, doc_length):
        subj_matches = collections.defaultdict(list)
        for match in matches:
            subj_matches[match.subject_id].append(match)
        return [
            Candidate(
                doc_length=doc_length,
                subject_id=subject_id,
                freq=len(matches) / doc_length,
                is_pref=mean((float(m.is_pref) for m in matches)),
                n_tokens=mean((m.n_tokens for m in matches)),
                ambiguity=mean((m.ambiguity for m in matches)),
                first_occ=matches[0].pos / doc_length,
                last_occ=matches[-1].pos / doc_length,
                spread=(matches[-1].pos - matches[0].pos) / doc_length
            )
            for subject_id, matches in subj_matches.items()]
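
    # Illustrative conflation sketch (an addition, not from the PR): with
    # doc_length=10 and two matches for subject 42 at sentence positions
    # 1 and 4, e.g.
    #
    #   Match(subject_id=42, is_pref=True, n_tokens=2, pos=1, ambiguity=0)
    #   Match(subject_id=42, is_pref=False, n_tokens=1, pos=4, ambiguity=2)
    #
    # the resulting Candidate has freq=0.2, is_pref=0.5, n_tokens=1.5,
    # ambiguity=1, first_occ=0.1, last_occ=0.4 and spread=0.3.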

    def generate_candidates(self, text, analyzer):
        sentences = analyzer.tokenize_sentences(text)
        sent_tokens = self._vectorizer.transform(sentences)
        matches = []

        for sent_idx, token_matrix in enumerate(sent_tokens):
            tset = TokenSet(token_matrix.nonzero()[1])
            for ts, ambiguity in self._index.search(tset):
                matches.append(Match(subject_id=ts.subject_id,
                                     is_pref=ts.is_pref,
                                     n_tokens=len(ts),
                                     pos=sent_idx,
                                     ambiguity=ambiguity))

        return self._conflate_matches(matches, len(sentences))

    def _candidates_to_features(self, candidates):
        """Convert a list of Candidates to a NumPy feature matrix"""
        matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
        c_ids = [c.subject_id for c in candidates]
        c_vec = np.zeros(self._related_matrix.shape[0], dtype=np.bool)
        c_vec[c_ids] = True
        # for each subject, count how many of the candidate subjects it is
        # skos:related to
        rels = self._related_matrix.multiply(c_vec).sum(axis=1)
        for idx, c in enumerate(candidates):
            subj = c.subject_id
            matrix[idx, Feature.freq] = c.freq
            matrix[idx, Feature.doc_freq] = self._doc_freq[subj]
            matrix[idx, Feature.subj_freq] = self._subj_freq.get(subj, 1) - 1
            matrix[idx, Feature.tfidf] = c.freq * self._idf[subj]
            matrix[idx, Feature.is_pref] = c.is_pref
            matrix[idx, Feature.n_tokens] = c.n_tokens
            matrix[idx, Feature.ambiguity] = c.ambiguity
            matrix[idx, Feature.first_occ] = c.first_occ
            matrix[idx, Feature.last_occ] = c.last_occ
            matrix[idx, Feature.spread] = c.spread
            matrix[idx, Feature.doc_length] = c.doc_length
            matrix[idx, Feature.related] = rels[subj, 0] / len(c_ids)
        return matrix

    def prepare_train(self, corpus, vocab, analyzer, params):
        graph = vocab.as_graph()
        terms = []
        subject_ids = []
        n_subj = len(vocab.subjects)
        self._related_matrix = lil_matrix((n_subj, n_subj), dtype=np.bool)
        for subj_id, (uri, pref, _) in enumerate(vocab.subjects):
            if pref is None:
                continue  # deprecated subject
            subject_ids.append(subj_id)
            terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))

            if annif.util.boolean(params['use_hidden_labels']):
                props = [SKOS.altLabel, SKOS.hiddenLabel]
            else:
                props = [SKOS.altLabel]

            non_pref = graph.preferredLabel(URIRef(uri),
                                            lang=params['language'],
                                            labelProperties=props)
            # preferredLabel returns (labelProperty, label) pairs
            for _, label in non_pref:
                terms.append(Term(subject_id=subj_id,
                                  label=str(label),
                                  is_pref=False))

            for related in graph.objects(URIRef(uri), SKOS.related):
                related_id = vocab.subjects.by_uri(str(related),
                                                   warnings=False)
                if related_id is not None:
                    self._related_matrix[subj_id, related_id] = True

        self._vectorizer = CountVectorizer(
            binary=True,
            tokenizer=analyzer.tokenize_words
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        doc_count = 0
        train_x = []
        train_y = []
        for idx, doc in enumerate(corpus.documents):
            doc_subject_ids = [vocab.subjects.by_uri(uri)
                               for uri in doc.uris]
            self._subj_freq.update(doc_subject_ids)
            candidates = self.generate_candidates(doc.text, analyzer)
            self._doc_freq.update([c.subject_id for c in candidates])
            train_x.append(candidates)
            train_y += [(c.subject_id in doc_subject_ids) for c in candidates]
            doc_count += 1

        # precalculate idf values for candidate subjects
        self._idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            self._idf[subj_id] = math.log((doc_count + 1) /
                                          (self._doc_freq[subj_id] + 1)) + 1
        return (np.vstack([self._candidates_to_features(candidates)
                           for candidates in train_x]), np.array(train_y))
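
    # Illustrative note (an addition, not from the PR): the returned pair is
    # (train_x, train_y), where train_x has one float32 row per candidate
    # with len(Feature) == 12 columns, and train_y is a parallel boolean
    # array that is True when the candidate subject was manually assigned
    # to the document.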

    def _create_classifier(self, params):
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params['min_samples_leaf']),
                max_leaf_nodes=int(params['max_leaf_nodes'])
            ), max_samples=float(params['max_samples']))
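
    # Illustrative note (an addition, not from the PR): with the defaults in
    # MLLMBackend.DEFAULT_PARAMETERS further below, this builds roughly
    #
    #   BaggingClassifier(
    #       DecisionTreeClassifier(min_samples_leaf=20, max_leaf_nodes=1000),
    #       max_samples=0.9)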

    def train(self, train_x, train_y, params):
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)

    def _prediction_to_list(self, scores, candidates):
        subj_scores = [(score[1], c.subject_id)
                       for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates):
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)
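
    # Illustrative note (an addition, not from the PR): predict_proba returns
    # one (negative, positive) probability row per candidate, so scores of
    # [[0.9, 0.1], [0.2, 0.8]] for candidate subjects 5 and 7 yield
    # [(0.8, 7), (0.1, 5)], i.e. highest positive-class score first.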


class MLLMOptimizer(hyperopt.HyperparameterOptimizer):
    """Hyperparameter optimizer for the MLLM backend"""

    def _prepare(self, n_jobs=1):
        self._backend.initialize()
        self._train_x, self._train_y = self._backend._load_train_data()
        self._candidates = []
        self._gold_subjects = []

        # TODO parallelize generation of candidates
        for doc in self._corpus.documents:
            candidates = self._backend._generate_candidates(doc.text)
            self._candidates.append(candidates)
            self._gold_subjects.append(
                annif.corpus.SubjectSet((doc.uris, doc.labels)))

    def _objective(self, trial):
        params = {
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 5, 30),
            'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 100, 2000),
            'max_samples': trial.suggest_float('max_samples', 0.5, 1.0),
            'use_hidden_labels':
                trial.suggest_categorical('use_hidden_labels', [True, False]),
            'limit': 100
        }
        model = self._backend._model._create_classifier(params)
        model.fit(self._train_x, self._train_y)

        batch = annif.eval.EvaluationBatch(self._backend.project.subjects)
        for goldsubj, candidates in zip(self._gold_subjects, self._candidates):
            if candidates:
                features = \
                    self._backend._model._candidates_to_features(candidates)
                scores = model.predict_proba(features)
                ranking = self._backend._model._prediction_to_list(
                    scores, candidates)
            else:
                ranking = []
            results = self._backend._prediction_to_result(ranking, params)
            batch.evaluate(results, goldsubj)
        results = batch.results(metrics=[self._metric])
        return results[self._metric]

    def _postprocess(self, study):
        bp = study.best_params
        lines = [
            f"min_samples_leaf={bp['min_samples_leaf']}",
            f"max_leaf_nodes={bp['max_leaf_nodes']}",
            f"max_samples={bp['max_samples']:.4f}",
            f"use_hidden_labels={bp['use_hidden_labels']}"
        ]
        return hyperopt.HPRecommendation(lines=lines, score=study.best_value)


class MLLMBackend(hyperopt.AnnifHyperoptBackend):
    """Maui-like Lexical Matching backend for Annif"""
    name = "mllm"
    needs_subject_index = True

    # defaults for uninitialized instances
    _model = None

    MODEL_FILE = 'mllm-model.gz'
    TRAIN_FILE = 'mllm-train.gz'

    DEFAULT_PARAMETERS = {
        'min_samples_leaf': 20,
        'max_leaf_nodes': 1000,
        'max_samples': 0.9,
        'use_hidden_labels': False
    }
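
    # Illustrative projects.cfg sketch (an assumption, not part of the PR;
    # the project id and vocab name below are hypothetical):
    #
    #   [yso-mllm-en]
    #   name=YSO MLLM English
    #   language=en
    #   backend=mllm
    #   vocab=yso-en
    #   analyzer=snowball(english)
    #   limit=100
    #   use_hidden_labels=False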

    def get_hp_optimizer(self, corpus, metric):
        return MLLMOptimizer(self, corpus, metric)

    def default_params(self):
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

    def _load_model(self):
        path = os.path.join(self.datadir, self.MODEL_FILE)
        self.debug('loading model from {}'.format(path))
        if os.path.exists(path):
            return joblib.load(path)
        else:
            raise NotInitializedException(
                'model {} not found'.format(path),
                backend_id=self.backend_id)

    def _load_train_data(self):
        path = os.path.join(self.datadir, self.TRAIN_FILE)
        if os.path.exists(path):
            return joblib.load(path)
        else:
            raise NotInitializedException(
                'train data file {} not found'.format(path),
                backend_id=self.backend_id)

    def initialize(self):
        if self._model is None:
            self._model = self._load_model()

    def _train(self, corpus, params):
        self.info('starting train')
        if corpus != 'cached':
            self.info("preparing training data")
            self._model = MLLMModel()
            train_data = self._model.prepare_train(corpus,
                                                   self.project.vocab,
                                                   self.project.analyzer,
                                                   params)
            annif.util.atomic_save(train_data,
                                   self.datadir,
                                   self.TRAIN_FILE,
                                   method=joblib.dump)
        else:
            self.info("reusing cached training data from previous run")
            self._model = self._load_model()
            train_data = self._load_train_data()

        self.info("training model")
        self._model.train(train_data[0], train_data[1], params)

        self.info('saving model')
        annif.util.atomic_save(
            self._model,
            self.datadir,
            self.MODEL_FILE,
            method=joblib.dump)

    def _generate_candidates(self, text):
        return self._model.generate_candidates(text, self.project.analyzer)

    def _prediction_to_result(self, prediction, params):
        vector = np.zeros(len(self.project.subjects), dtype=np.float32)
        for score, subject_id in prediction:
            vector[subject_id] = score
        result = VectorSuggestionResult(vector)
        return result.filter(self.project.subjects,
                             limit=int(params['limit']))

    def _suggest(self, text, params):
        candidates = self._generate_candidates(text)
        prediction = self._model.predict(candidates)
        return self._prediction_to_result(prediction, params)
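
# Illustrative CLI sketch (an assumption, not part of the PR; the project id
# is hypothetical): once a project is configured with backend=mllm, it is
# trained and queried through the usual Annif commands, e.g.
#
#   annif train yso-mllm-en /path/to/training/corpus
#   annif suggest yso-mllm-en < document.txt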