"""MLLM (Maui-like Lexical Matching) model for Annif"""

from __future__ import annotations

import collections
import math
from enum import IntEnum
from statistics import mean
from typing import TYPE_CHECKING, Any

import joblib
import numpy as np
from rdflib.namespace import SKOS
from sklearn.ensemble import BaggingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier

import annif.parallel
import annif.util
from annif.exception import OperationFailedException
from annif.lexical.tokenset import TokenSet, TokenSetIndex
from annif.lexical.util import (
    get_subject_labels,
    make_collection_matrix,
    make_relation_matrix,
)

if TYPE_CHECKING:
    from collections import defaultdict

    from rdflib.graph import Graph
    from rdflib.term import URIRef

    from annif.analyzer import Analyzer
    from annif.corpus.document import DocumentCorpus
    from annif.vocab import AnnifVocabulary

Term = collections.namedtuple("Term", "subject_id label is_pref")

Match = collections.namedtuple("Match", "subject_id is_pref n_tokens pos ambiguity")

Candidate = collections.namedtuple(
    "Candidate",
    "doc_length subject_id freq is_pref n_tokens ambiguity "
    + "first_occ last_occ spread",
)

ModelData = collections.namedtuple(
    "ModelData", "broader narrower related collection " + "doc_freq subj_freq idf"
)

Feature = IntEnum(
    "Feature",
    "freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity "
    + "first_occ last_occ spread doc_length "
    + "broader narrower related collection",
    start=0,
)


def conflate_matches(matches: list[Match], doc_length: int) -> list[Candidate]:
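    """Combine the Matches found for each subject into a single Candidate with
    aggregated frequency, position and ambiguity statistics."""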
    subj_matches = collections.defaultdict(list)
    for match in matches:
        subj_matches[match.subject_id].append(match)
    return [
        Candidate(
            doc_length=doc_length,
            subject_id=subject_id,
            freq=len(matches) / doc_length,
            is_pref=mean((float(m.is_pref) for m in matches)),
            n_tokens=mean((m.n_tokens for m in matches)),
            ambiguity=mean((m.ambiguity for m in matches)),
            first_occ=matches[0].pos / doc_length,
            last_occ=matches[-1].pos / doc_length,
            spread=(matches[-1].pos - matches[0].pos) / doc_length,
        )
        for subject_id, matches in subj_matches.items()
    ]


def generate_candidates(
    text: str,
    analyzer: Analyzer,
    vectorizer: CountVectorizer,
    index: TokenSetIndex,
) -> list[Candidate]:
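    """Tokenize the text sentence by sentence, match each sentence against the
    vocabulary token set index and return the conflated Candidates."""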
    sentences = analyzer.tokenize_sentences(text)
    sent_tokens = vectorizer.transform(sentences)
    matches = []

    for sent_idx, token_matrix in enumerate(sent_tokens):
        tset = TokenSet(token_matrix.nonzero()[1])
        for ts, ambiguity in index.search(tset):
            matches.append(
                Match(
                    subject_id=ts.subject_id,
                    is_pref=ts.is_pref,
                    n_tokens=len(ts),
                    pos=sent_idx,
                    ambiguity=ambiguity,
                )
            )

    return conflate_matches(matches, len(sentences))


def candidates_to_features(
    candidates: list[Candidate], mdata: "ModelData"
) -> np.ndarray:
    """Convert a list of Candidates to a NumPy feature matrix"""

    matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
    c_ids = [c.subject_id for c in candidates]
    c_vec = np.zeros(mdata.related.shape[0], dtype=bool)
    c_vec[c_ids] = True
    broader = mdata.broader.multiply(c_vec).sum(axis=1)
    narrower = mdata.narrower.multiply(c_vec).sum(axis=1)
    related = mdata.related.multiply(c_vec).sum(axis=1)
    collection = mdata.collection.multiply(c_vec).T.dot(mdata.collection).sum(axis=0)
    for idx, c in enumerate(candidates):
        subj = c.subject_id
        matrix[idx, Feature.freq] = c.freq
        matrix[idx, Feature.doc_freq] = mdata.doc_freq[subj]
        matrix[idx, Feature.subj_freq] = mdata.subj_freq.get(subj, 1) - 1
        matrix[idx, Feature.tfidf] = c.freq * mdata.idf[subj]
        matrix[idx, Feature.is_pref] = c.is_pref
        matrix[idx, Feature.n_tokens] = c.n_tokens
        matrix[idx, Feature.ambiguity] = c.ambiguity
        matrix[idx, Feature.first_occ] = c.first_occ
        matrix[idx, Feature.last_occ] = c.last_occ
        matrix[idx, Feature.spread] = c.spread
        matrix[idx, Feature.doc_length] = c.doc_length
        matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
        matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
        matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
        matrix[idx, Feature.collection] = collection[0, subj] / len(c_ids)
    return matrix


class MLLMCandidateGenerator(annif.parallel.BaseWorker):
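    """Worker class for generating Candidates from documents in parallel; the
    keyword arguments for generate_candidates are stored on the class by the
    BaseWorker initializer."""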
    @classmethod
    def generate_candidates(cls, doc_subject_set, text):
        candidates = generate_candidates(text, **cls.args)  # pragma: no cover
        return doc_subject_set, candidates  # pragma: no cover


class MLLMFeatureConverter(annif.parallel.BaseWorker):
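    """Worker class for converting Candidates into feature matrices in parallel;
    the ModelData argument is stored on the class by the BaseWorker initializer."""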
    @classmethod
    def candidates_to_features(cls, candidates):
        return candidates_to_features(candidates, **cls.args)  # pragma: no cover


class MLLMModel:
    """Maui-like Lexical Matching model"""

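    # Typical usage (a sketch based on the methods below; in Annif the MLLM
    # backend drives this sequence):
    #
    #   model = MLLMModel()
    #   train_x, train_y = model.prepare_train(corpus, vocab, analyzer, params, n_jobs)
    #   model.train(train_x, train_y, params)
    #   candidates = model.generate_candidates(text, analyzer)
    #   results = model.predict(candidates)
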
    def generate_candidates(self, text: str, analyzer: Analyzer) -> list[Candidate]:
        return generate_candidates(text, analyzer, self._vectorizer, self._index)

    @property
    def _model_data(self) -> ModelData:
        return ModelData(
            broader=self._broader_matrix,
            narrower=self._narrower_matrix,
            related=self._related_matrix,
            collection=self._collection_matrix,
            doc_freq=self._doc_freq,
            subj_freq=self._subj_freq,
            idf=self._idf,
        )

    def _candidates_to_features(self, candidates: list[Candidate]) -> np.ndarray:
        return candidates_to_features(candidates, self._model_data)

    @staticmethod
    def _get_label_props(params: dict[str, Any]) -> tuple[list[URIRef], list[URIRef]]:
        pref_label_props = [SKOS.prefLabel]

        if annif.util.boolean(params["use_hidden_labels"]):
            nonpref_label_props = [SKOS.altLabel, SKOS.hiddenLabel]
        else:
            nonpref_label_props = [SKOS.altLabel]

        return (pref_label_props, nonpref_label_props)

    def _prepare_terms(
        self,
        graph: Graph,
        vocab: AnnifVocabulary,
        params: dict[str, Any],
    ) -> tuple[list[Term], list[int]]:
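        """Collect the preferred and non-preferred labels of all active subjects
        as Term objects, along with the list of active subject ids."""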
        pref_label_props, nonpref_label_props = self._get_label_props(params)

        terms = []
        subject_ids = []
        for subj_id, subject in vocab.subjects.active:
            subject_ids.append(subj_id)

            for label in get_subject_labels(
                graph, subject.uri, pref_label_props, params["language"]
            ):
                terms.append(Term(subject_id=subj_id, label=label, is_pref=True))

            for label in get_subject_labels(
                graph, subject.uri, nonpref_label_props, params["language"]
            ):
                terms.append(Term(subject_id=subj_id, label=label, is_pref=False))

        return (terms, subject_ids)

    def _prepare_relations(self, graph: Graph, vocab: AnnifVocabulary) -> None:
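        """Precompute subject-to-subject matrices for the SKOS broader, narrower
        and related relations as well as collection membership."""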
        self._broader_matrix = make_relation_matrix(graph, vocab, SKOS.broader)
        self._narrower_matrix = make_relation_matrix(graph, vocab, SKOS.narrower)
        self._related_matrix = make_relation_matrix(graph, vocab, SKOS.related)
        self._collection_matrix = make_collection_matrix(graph, vocab)

    def _prepare_train_index(
        self,
        vocab: AnnifVocabulary,
        analyzer: Analyzer,
        params: dict[str, Any],
    ) -> list[int]:
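        """Build the vectorizer and the TokenSetIndex from the vocabulary term
        labels and return the list of active subject ids."""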
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        self._vectorizer = CountVectorizer(
            binary=True, tokenizer=analyzer.tokenize_words, token_pattern=None
        )
        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))

        # frequency of each token used in labels - how rare each word is
        token_freq = np.bincount(label_corpus.indices, minlength=label_corpus.shape[1])

        self._index = TokenSetIndex()
        for term, label_matrix in zip(terms, label_corpus):
            tokens = label_matrix.nonzero()[1]
            # sort tokens by frequency - use the rarest token as index key
            tokens = sorted(tokens, key=token_freq.__getitem__)
            tset = TokenSet(tokens, term.subject_id, term.is_pref)
            self._index.add(tset)

        return subject_ids

    def _prepare_train_data(
        self, corpus: DocumentCorpus, analyzer: Analyzer, n_jobs: int
    ) -> tuple[list[list[Candidate]], list[bool]]:
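        """Generate candidates for all documents in the corpus in parallel and
        collect them, together with boolean labels indicating whether each
        candidate was among the manually assigned subjects of its document."""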
        # frequency of subjects (by id) in the generated candidates
        self._doc_freq = collections.Counter()
        # frequency of manually assigned subjects ("domain keyphraseness")
        self._subj_freq = collections.Counter()
        train_x = []
        train_y = []

        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        cg_args = {
            "analyzer": analyzer,
            "vectorizer": self._vectorizer,
            "index": self._index,
        }

        with pool_class(
            jobs, initializer=MLLMCandidateGenerator.init, initargs=(cg_args,)
        ) as pool:
            params = ((doc.subject_set, doc.text) for doc in corpus.documents)
            for doc_subject_ids, candidates in pool.starmap(
                MLLMCandidateGenerator.generate_candidates, params, 10
            ):
                self._subj_freq.update(doc_subject_ids)
                self._doc_freq.update([c.subject_id for c in candidates])
                train_x.append(candidates)
                train_y += [(c.subject_id in doc_subject_ids) for c in candidates]

        return (train_x, train_y)

    def _calculate_idf(
        self, subject_ids: list[int], doc_count: int
    ) -> defaultdict[int, float]:
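        """Calculate a smoothed inverse document frequency for each subject:
        idf = log((doc_count + 1) / (doc_freq + 1)) + 1."""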
        idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            idf[subj_id] = math.log((doc_count + 1) / (self._doc_freq[subj_id] + 1)) + 1

        return idf

    def _prepare_features(
        self, train_x: list[list[Candidate]], n_jobs: int
    ) -> list[np.ndarray]:
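        """Convert the per-document candidate lists into feature matrices, using
        a worker pool for parallelism."""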
        fc_args = {"mdata": self._model_data}
        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        with pool_class(
            jobs, initializer=MLLMFeatureConverter.init, initargs=(fc_args,)
        ) as pool:
            features = pool.map(
                MLLMFeatureConverter.candidates_to_features, train_x, 10
            )

        return features

    def prepare_train(
        self,
        corpus: DocumentCorpus,
        vocab: AnnifVocabulary,
        analyzer: Analyzer,
        params: dict[str, Any],
        n_jobs: int,
    ) -> tuple[np.ndarray, np.ndarray]:
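        """Turn a document corpus and vocabulary into the feature matrix and
        label vector that can be passed to train()."""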
        # create an index from the vocabulary terms
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # convert the corpus into train data
        train_x, train_y = self._prepare_train_data(corpus, analyzer, n_jobs)

        # precalculate idf values for all candidate subjects
        self._idf = self._calculate_idf(subject_ids, len(train_x))

        # convert the train data into feature values
        features = self._prepare_features(train_x, n_jobs)

        return (np.vstack(features), np.array(train_y))

    def _create_classifier(self, params: dict[str, Any]) -> BaggingClassifier:
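        """Create a bagging ensemble of decision trees using the given
        hyperparameters."""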
        return BaggingClassifier(
            DecisionTreeClassifier(
                min_samples_leaf=int(params["min_samples_leaf"]),
                max_leaf_nodes=int(params["max_leaf_nodes"]),
            ),
            max_samples=float(params["max_samples"]),
        )

    def train(
        self,
        train_x: np.ndarray | list[tuple[int, int]],
        train_y: list[bool] | np.ndarray,
        params: dict[str, Any],
    ) -> None:
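        """Fit the classifier on the prepared training data."""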
        # fit the model on the training corpus
        self._classifier = self._create_classifier(params)
        self._classifier.fit(train_x, train_y)
        # sanity check: verify that the classifier has seen both classes
        if self._classifier.n_classes_ != 2:
            raise OperationFailedException(
                "Unable to create classifier: "
                + "Not enough positive and negative examples "
                + "in the training data. Please check that your training "
                + "data matches your vocabulary."
            )

    def _prediction_to_list(
        self, scores: np.ndarray, candidates: list[Candidate]
    ) -> list[tuple[np.float64, int]]:
        subj_scores = [(score[1], c.subject_id) for score, c in zip(scores, candidates)]
        return sorted(subj_scores, reverse=True)

    def predict(self, candidates: list[Candidate]) -> list[tuple[np.float64, int]]:
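        """Score the candidates with the trained classifier and return a list of
        (score, subject_id) tuples sorted by descending score."""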
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return self._prediction_to_list(scores, candidates)

    def save(self, filename: str) -> list[str]:
        return joblib.dump(self, filename)

    @staticmethod
    def load(filename: str) -> MLLMModel:
        return joblib.load(filename)