annif.lexical.mllm.MLLMModel.prepare_train() - Code Metrics - Inspection of "WIP: run hyperparameter optimization in parallel o..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — main (#873)

by Osma

created 2025-08-15 13:35 UTC

annif.lexical.mllm.MLLMModel.prepare_train() A

↳ Parent: annif.lexical.mllm

Complexity

Conditions

Size

Total Lines	21
Code Lines	12

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	12
nop	6
dl	0
loc	21
rs	9.8
c	0
b	0
f	0

"""MLLM (Maui-like Lexical Matchin) model for Annif"""

from __future__ import annotations

import collections
import math
from enum import IntEnum
from statistics import mean
from typing import TYPE_CHECKING, Any

import joblib
import numpy as np
from rdflib.namespace import SKOS
from sklearn.ensemble import BaggingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier

import annif.parallel
import annif.util
from annif.exception import OperationFailedException
from annif.lexical.tokenset import TokenSet, TokenSetIndex
from annif.lexical.util import (
    get_subject_labels,
    make_collection_matrix,
    make_relation_matrix,
)

if TYPE_CHECKING:
    from collections import defaultdict

    from rdflib.graph import Graph
    from rdflib.term import URIRef

    from annif.analyzer import Analyzer
    from annif.corpus.document import DocumentCorpus
    from annif.vocab import AnnifVocabulary

# A single vocabulary label: the subject it belongs to, the label text and
# whether it is a preferred (True) or non-preferred (False) label.
Term = collections.namedtuple("Term", ["subject_id", "label", "is_pref"])

# One hit of a vocabulary label in a document; pos is the index of the
# sentence where the hit was found.
Match = collections.namedtuple(
    "Match", ["subject_id", "is_pref", "n_tokens", "pos", "ambiguity"]
)

# All matches of one subject within one document, aggregated.
Candidate = collections.namedtuple(
    "Candidate",
    [
        "doc_length",
        "subject_id",
        "freq",
        "is_pref",
        "n_tokens",
        "ambiguity",
        "first_occ",
        "last_occ",
        "spread",
    ],
)

# Model-wide data needed to turn candidates into feature vectors.
ModelData = collections.namedtuple(
    "ModelData",
    [
        "broader",
        "narrower",
        "related",
        "collection",
        "doc_freq",
        "subj_freq",
        "idf",
    ],
)

# Column indices of the feature matrix, numbered from 0.
Feature = IntEnum(
    "Feature",
    [
        "freq",
        "doc_freq",
        "subj_freq",
        "tfidf",
        "is_pref",
        "n_tokens",
        "ambiguity",
        "first_occ",
        "last_occ",
        "spread",
        "doc_length",
        "broader",
        "narrower",
        "related",
        "collection",
    ],
    start=0,
)


def conflate_matches(matches: list[Match], doc_length: int) -> list[Candidate]:
    """Merge the matches of each subject into a single Candidate.

    Matches are grouped by subject_id, keeping their input order. The first
    and last match of each group supply first_occ, last_occ and spread, so
    the result is only meaningful when matches arrive in ascending pos order
    (generate_candidates appends them sentence by sentence). All positional
    values and the frequency are divided by doc_length.
    """
    grouped = collections.defaultdict(list)
    for hit in matches:
        grouped[hit.subject_id].append(hit)

    candidates = []
    for subject_id, hits in grouped.items():
        first_pos = hits[0].pos
        last_pos = hits[-1].pos
        candidates.append(
            Candidate(
                doc_length=doc_length,
                subject_id=subject_id,
                freq=len(hits) / doc_length,
                is_pref=mean(float(hit.is_pref) for hit in hits),
                n_tokens=mean(hit.n_tokens for hit in hits),
                ambiguity=mean(hit.ambiguity for hit in hits),
                first_occ=first_pos / doc_length,
                last_occ=last_pos / doc_length,
                spread=(last_pos - first_pos) / doc_length,
            )
        )
    return candidates


def generate_candidates(
    text: str,
    analyzer: Analyzer,
    vectorizer: CountVectorizer,
    index: TokenSetIndex,
) -> list[Candidate]:
    """Find vocabulary matches in text and conflate them into Candidates.

    The text is split into sentences by the analyzer and each sentence is
    vectorized. The set of token ids present in a sentence is looked up in
    the index, and every result becomes a Match whose pos is the sentence
    number. The number of sentences is used as the document length.
    """
    sentences = analyzer.tokenize_sentences(text)
    matches = [
        Match(
            subject_id=found.subject_id,
            is_pref=found.is_pref,
            n_tokens=len(found),
            pos=sentence_no,
            ambiguity=ambiguity,
        )
        for sentence_no, row in enumerate(vectorizer.transform(sentences))
        # the column indices of the nonzero cells are the token ids
        for found, ambiguity in index.search(TokenSet(row.nonzero()[1]))
    ]
    return conflate_matches(matches, len(sentences))


def candidates_to_features(
    candidates: list[Candidate], mdata: "ModelData"
) -> np.ndarray:
    """Convert a list of Candidates to a NumPy feature matrix.

    Returns a float32 array with one row per candidate and one column per
    member of Feature. The broader, narrower, related and collection columns
    are derived from the matching mdata matrices, restricted to the subjects
    present in this candidate list and divided by the number of candidates.
    """
    n_candidates = len(candidates)
    features = np.zeros((n_candidates, len(Feature)), dtype=np.float32)
    subject_ids = [cand.subject_id for cand in candidates]

    # boolean mask marking which subjects occur among the candidates
    present = np.zeros(mdata.related.shape[0], dtype=bool)
    present[subject_ids] = True

    def masked_row_sums(relation):
        # zero out the entries of absent subjects, then total each row
        # (assumes sparse matrices whose columns are indexed by subject id)
        return relation.multiply(present).sum(axis=1)

    broader = masked_row_sums(mdata.broader)
    narrower = masked_row_sums(mdata.narrower)
    related = masked_row_sums(mdata.related)
    collection = (
        mdata.collection.multiply(present).T.dot(mdata.collection).sum(axis=0)
    )

    # each row is a view, so writing to it fills the feature matrix in place
    for row, cand in zip(features, candidates):
        subj = cand.subject_id
        row[Feature.freq] = cand.freq
        row[Feature.doc_freq] = mdata.doc_freq[subj]
        # one less than the recorded count, or 0 when the subject is unknown
        # (presumably to discount the document at hand - confirm)
        row[Feature.subj_freq] = mdata.subj_freq.get(subj, 1) - 1
        row[Feature.tfidf] = cand.freq * mdata.idf[subj]
        row[Feature.is_pref] = cand.is_pref
        row[Feature.n_tokens] = cand.n_tokens
        row[Feature.ambiguity] = cand.ambiguity
        row[Feature.first_occ] = cand.first_occ
        row[Feature.last_occ] = cand.last_occ
        row[Feature.spread] = cand.spread
        row[Feature.doc_length] = cand.doc_length
        row[Feature.broader] = broader[subj, 0] / n_candidates
        row[Feature.narrower] = narrower[subj, 0] / n_candidates
        row[Feature.related] = related[subj, 0] / n_candidates
        row[Feature.collection] = collection[0, subj] / n_candidates
    return features


def create_classifier(params: dict[str, Any]) -> BaggingClassifier:
    """Build an unfitted bagging ensemble of decision trees.

    Reads min_samples_leaf and max_leaf_nodes (converted to int) for the
    trees and max_samples (converted to float) for the ensemble from params.
    """
    base_tree = DecisionTreeClassifier(
        min_samples_leaf=int(params["min_samples_leaf"]),
        max_leaf_nodes=int(params["max_leaf_nodes"]),
    )
    max_samples = float(params["max_samples"])
    return BaggingClassifier(base_tree, max_samples=max_samples)


def prediction_to_list(
    scores: np.ndarray, candidates: list[Candidate]
) -> list[tuple[np.float64, int]]:
    """Pair each candidate's subject id with its score, best score first.

    Only the second value of each score row is used. Ties on the score are
    ordered by descending subject id, because whole tuples are compared.
    """
    ranked = []
    for score_row, cand in zip(scores, candidates):
        ranked.append((score_row[1], cand.subject_id))
    ranked.sort(reverse=True)
    return ranked


class MLLMCandidateGenerator(annif.parallel.BaseWorker):
    """Pool worker that generates candidates for one document at a time."""

    @classmethod
    def generate_candidates(cls, doc_subject_set, text):
        """Return the given subject set together with the text's candidates.

        cls.args is unpacked as the remaining keyword arguments of the
        module-level generate_candidates(); presumably it is stored by
        BaseWorker.init when the pool starts (confirm in annif.parallel).
        """
        found = generate_candidates(text, **cls.args)  # pragma: no cover
        return doc_subject_set, found  # pragma: no cover


class MLLMFeatureConverter(annif.parallel.BaseWorker):
    """Pool worker that converts one candidate list into a feature matrix."""

    @classmethod
    def candidates_to_features(cls, candidates):
        """Run the module-level candidates_to_features() with cls.args.

        cls.args is unpacked as keyword arguments; presumably it is stored
        by BaseWorker.init when the pool starts (confirm in annif.parallel).
        """
        matrix = candidates_to_features(candidates, **cls.args)  # pragma: no cover
        return matrix  # pragma: no cover


class MLLMModel:
    """Maui-like Lexical Matching model"""

    def generate_candidates(self, text: str, analyzer: Analyzer) -> list[Candidate]:
        """Generate candidates for text using this model's vectorizer and index."""
        return generate_candidates(
            text=text,
            analyzer=analyzer,
            vectorizer=self._vectorizer,
            index=self._index,
        )

    @property
    def _model_data(self) -> ModelData:
        """Bundle the relation matrices and frequency data into a ModelData.

        A new tuple is built on every access, so it always reflects the
        attributes currently set on the model.
        """
        fields = {
            "broader": self._broader_matrix,
            "narrower": self._narrower_matrix,
            "related": self._related_matrix,
            "collection": self._collection_matrix,
            "doc_freq": self._doc_freq,
            "subj_freq": self._subj_freq,
            "idf": self._idf,
        }
        return ModelData(**fields)

    def _candidates_to_features(self, candidates: list[Candidate]) -> np.ndarray:
        """Convert candidates to a feature matrix using this model's data."""
        return candidates_to_features(candidates, mdata=self._model_data)

    @staticmethod
    def _get_label_props(params: dict[str, Any]) -> tuple[list[URIRef], list[URIRef]]:
        """Return the SKOS properties for preferred and non-preferred labels.

        Preferred labels always come from skos:prefLabel. Non-preferred
        labels come from skos:altLabel, plus skos:hiddenLabel when the
        use_hidden_labels parameter is true.
        """
        pref_label_props = [SKOS.prefLabel]
        nonpref_label_props = [SKOS.altLabel]
        if annif.util.boolean(params["use_hidden_labels"]):
            nonpref_label_props.append(SKOS.hiddenLabel)
        return (pref_label_props, nonpref_label_props)

    def _prepare_terms(
        self,
        graph: Graph,
        vocab: AnnifVocabulary,
        params: dict[str, Any],
    ) -> tuple[list[Term], list[int]]:
        """Collect the labels of all active subjects as Term tuples.

        Returns the terms and the ids of the active subjects. For each
        subject the preferred labels come first, then the non-preferred
        ones; labels are requested in the language given by params.
        """
        pref_props, nonpref_props = self._get_label_props(params)
        # (label properties, is_pref flag) in the order they are emitted
        label_groups = ((pref_props, True), (nonpref_props, False))

        terms = []
        subject_ids = []
        for subj_id, subject in vocab.subjects.active:
            subject_ids.append(subj_id)
            for props, is_pref in label_groups:
                labels = get_subject_labels(
                    graph, subject.uri, props, params["language"]
                )
                terms.extend(
                    Term(subject_id=subj_id, label=label, is_pref=is_pref)
                    for label in labels
                )

        return (terms, subject_ids)

    def _prepare_relations(self, graph: Graph, vocab: AnnifVocabulary) -> None:
        """Build and store the broader, narrower, related and collection matrices."""
        relation_attrs = (
            ("_broader_matrix", SKOS.broader),
            ("_narrower_matrix", SKOS.narrower),
            ("_related_matrix", SKOS.related),
        )
        # each matrix is stored as soon as it has been built
        for attr_name, skos_property in relation_attrs:
            setattr(self, attr_name, make_relation_matrix(graph, vocab, skos_property))
        self._collection_matrix = make_collection_matrix(graph, vocab)

    def _prepare_train_index(
        self,
        vocab: AnnifVocabulary,
        analyzer: Analyzer,
        params: dict[str, Any],
    ) -> list[int]:
        """Build the vectorizer, relation matrices and token set index.

        Fits a binary CountVectorizer on the vocabulary labels, tokenizing
        with the analyzer, and adds one TokenSet per label to a new
        TokenSetIndex. Returns the ids of the active subjects.
        """
        graph = vocab.as_graph()
        terms, subject_ids = self._prepare_terms(graph, vocab, params)
        self._prepare_relations(graph, vocab)

        vectorizer = CountVectorizer(
            binary=True, tokenizer=analyzer.tokenize_words, token_pattern=None
        )
        self._vectorizer = vectorizer
        label_corpus = vectorizer.fit_transform(term.label for term in terms)

        # number of labels each token appears in - a low count means a rare word
        token_freq = np.bincount(label_corpus.indices, minlength=label_corpus.shape[1])

        index = TokenSetIndex()
        self._index = index
        for term, label_row in zip(terms, label_corpus):
            # order the label's tokens rarest first - the rarest token is
            # used as the index key
            rarest_first = sorted(
                label_row.nonzero()[1], key=lambda token: token_freq[token]
            )
            index.add(TokenSet(rarest_first, term.subject_id, term.is_pref))

        return subject_ids

    def _prepare_train_data(
        self, corpus: DocumentCorpus, analyzer: Analyzer, n_jobs: int
    ) -> tuple[list[list[Candidate]], list[bool]]:
        """Generate candidates for every corpus document using a worker pool.

        Resets and fills self._doc_freq and self._subj_freq. Returns one
        candidate list per document, plus a flat list with one boolean per
        candidate telling whether its subject is in the document's own
        subject set.
        """
        # how many documents each subject (by id) was proposed for
        doc_freq = collections.Counter()
        # how often each subject was manually assigned ("domain keyphraseness")
        subj_freq = collections.Counter()
        self._doc_freq = doc_freq
        self._subj_freq = subj_freq

        candidate_lists = []
        labels = []

        jobs, pool_class = annif.parallel.get_pool(n_jobs)
        worker_args = {
            "analyzer": analyzer,
            "vectorizer": self._vectorizer,
            "index": self._index,
        }

        with pool_class(
            jobs, initializer=MLLMCandidateGenerator.init, initargs=(worker_args,)
        ) as pool:
            work_items = ((doc.subject_set, doc.text) for doc in corpus.documents)
            # the trailing 10 is presumably the pool's chunk size - confirm
            results = pool.starmap(
                MLLMCandidateGenerator.generate_candidates, work_items, 10
            )
            for gold_subjects, candidates in results:
                proposed = [cand.subject_id for cand in candidates]
                subj_freq.update(gold_subjects)
                doc_freq.update(proposed)
                candidate_lists.append(candidates)
                labels.extend(subj_id in gold_subjects for subj_id in proposed)

        return (candidate_lists, labels)

    def _calculate_idf(
        self, subject_ids: list[int], doc_count: int
    ) -> defaultdict[int, float]:
        idf = collections.defaultdict(float)
        for subj_id in subject_ids:
            idf[subj_id] = math.log((doc_count + 1) / (self._doc_freq[subj_id] + 1)) + 1

        return idf

    def _prepare_features(
        self, train_x: list[list[Candidate]], n_jobs: int
    ) -> list[np.ndarray]:
        """Convert each document's candidate list to features in a worker pool.

        The current model data is handed to the workers through the pool
        initializer. Returns whatever pool.map returns - annotated as a list
        of feature arrays, one per candidate list.
        """
        worker_args = {"mdata": self._model_data}
        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        with pool_class(
            jobs, initializer=MLLMFeatureConverter.init, initargs=(worker_args,)
        ) as pool:
            # the trailing 10 is presumably the pool's chunk size - confirm
            return pool.map(MLLMFeatureConverter.candidates_to_features, train_x, 10)

    def prepare_train(
        self,
        corpus: DocumentCorpus,
        vocab: AnnifVocabulary,
        analyzer: Analyzer,
        params: dict[str, Any],
        n_jobs: int,
    ) -> tuple[np.ndarray, np.ndarray]:
        """Turn a vocabulary and a training corpus into classifier input.

        Builds the label index, generates candidates for every document,
        computes idf values and converts the candidates to features. Returns
        the stacked feature matrix and the array of labels, one row and one
        label per candidate.
        """
        # index the vocabulary terms
        subject_ids = self._prepare_train_index(vocab, analyzer, params)

        # one candidate list per document, plus a flat list of labels
        candidate_lists, labels = self._prepare_train_data(corpus, analyzer, n_jobs)

        # idf must be set before the features are computed, since the
        # model data passed to the feature workers includes it
        doc_count = len(candidate_lists)
        self._idf = self._calculate_idf(subject_ids, doc_count)

        # one feature matrix per document, stacked into a single matrix
        feature_blocks = self._prepare_features(candidate_lists, n_jobs)
        feature_matrix = np.vstack(feature_blocks)
        label_vector = np.array(labels)

        return (feature_matrix, label_vector)

    def train(
        self,
        train_x: np.ndarray | list[tuple[int, int]],
        train_y: list[bool] | np.ndarray,
        params: dict[str, Any],
    ) -> None:
        """Create a classifier from params and fit it on the training data.

        Raises OperationFailedException when the fitted classifier has not
        seen exactly two classes. The classifier is stored on the model
        before fitting, so it remains set even when the check fails.
        """
        classifier = create_classifier(params)
        self._classifier = classifier
        classifier.fit(train_x, train_y)

        # sanity check: both positive and negative examples must be present
        if classifier.n_classes_ == 2:
            return

        raise OperationFailedException(
            "Unable to create classifier: "
            "Not enough positive and negative examples "
            "in the training data. Please check that your training "
            "data matches your vocabulary."
        )

    def predict(self, candidates: list[Candidate]) -> list[tuple[np.float64, int]]:
        if not candidates:
            return []
        features = self._candidates_to_features(candidates)
        scores = self._classifier.predict_proba(features)
        return prediction_to_list(scores, candidates)

    def save(self, filename: str) -> list[str]:
        """Serialize the whole model object to filename with joblib.

        Returns the value of joblib.dump, annotated here as a list of strings.
        """
        written = joblib.dump(self, filename)
        return written

    @staticmethod
    def load(filename: str) -> MLLMModel:
        """Load a model object from filename with joblib.

        NOTE(review): joblib persistence is pickle-based, so filename should
        only ever point to a trusted file - confirm where callers get it from.
        """
        model = joblib.load(filename)
        return model


1			"""MLLM (Maui-like Lexical Matchin) model for Annif"""
2
3			from __future__ import annotations
4
5			import collections
6			import math
7			from enum import IntEnum
8			from statistics import mean
9			from typing import TYPE_CHECKING, Any
10
11			import joblib
12			import numpy as np
13			from rdflib.namespace import SKOS
14			from sklearn.ensemble import BaggingClassifier
15			from sklearn.feature_extraction.text import CountVectorizer
16			from sklearn.tree import DecisionTreeClassifier
17
18			import annif.parallel
19			import annif.util
20			from annif.exception import OperationFailedException
21			from annif.lexical.tokenset import TokenSet, TokenSetIndex
22			from annif.lexical.util import (
23			get_subject_labels,
24			make_collection_matrix,
25			make_relation_matrix,
26			)
27
28			if TYPE_CHECKING:
29			from collections import defaultdict
30
31			from rdflib.graph import Graph
32			from rdflib.term import URIRef
33
34			from annif.analyzer import Analyzer
35			from annif.corpus.document import DocumentCorpus
36			from annif.vocab import AnnifVocabulary
37
38			Term = collections.namedtuple("Term", "subject_id label is_pref")
39
40			Match = collections.namedtuple("Match", "subject_id is_pref n_tokens pos ambiguity")
41
42			Candidate = collections.namedtuple(
43			"Candidate",
44			"doc_length subject_id freq is_pref n_tokens ambiguity "
45			+ "first_occ last_occ spread",
46			)
47
48			ModelData = collections.namedtuple(
49			"ModelData", "broader narrower related collection " + "doc_freq subj_freq idf"
50			)
51
52			Feature = IntEnum(
53			"Feature",
54			"freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity "
55			+ "first_occ last_occ spread doc_length "
56			+ "broader narrower related collection",
57			start=0,
58			)
59
60
61			def conflate_matches(matches: list[Match], doc_length: int) -> list[Candidate]:
62			subj_matches = collections.defaultdict(list)
63			for match in matches:
64			subj_matches[match.subject_id].append(match)
65			return [
66			Candidate(
67			doc_length=doc_length,
68			subject_id=subject_id,
69			freq=len(matches) / doc_length,
70			is_pref=mean((float(m.is_pref) for m in matches)),
71			n_tokens=mean((m.n_tokens for m in matches)),
72			ambiguity=mean((m.ambiguity for m in matches)),
73			first_occ=matches[0].pos / doc_length,
74			last_occ=matches[-1].pos / doc_length,
75			spread=(matches[-1].pos - matches[0].pos) / doc_length,
76			)
77			for subject_id, matches in subj_matches.items()
78			]
79
80
81			def generate_candidates(
82			text: str,
83			analyzer: Analyzer,
84			vectorizer: CountVectorizer,
85			index: TokenSetIndex,
86			) -> list[Candidate]:
87			sentences = analyzer.tokenize_sentences(text)
88			sent_tokens = vectorizer.transform(sentences)
89			matches = []
90
91			for sent_idx, token_matrix in enumerate(sent_tokens):
92			tset = TokenSet(token_matrix.nonzero()[1])
93			for ts, ambiguity in index.search(tset):
94			matches.append(
95			Match(
96			subject_id=ts.subject_id,
97			is_pref=ts.is_pref,
98			n_tokens=len(ts),
99			pos=sent_idx,
100			ambiguity=ambiguity,
101			)
102			)
103
104			return conflate_matches(matches, len(sentences))
105
106
107			def candidates_to_features(
108			candidates: list[Candidate], mdata: "ModelData"
109			) -> np.ndarray:
110			"""Convert a list of Candidates to a NumPy feature matrix"""
111
112			matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)
113			c_ids = [c.subject_id for c in candidates]
114			c_vec = np.zeros(mdata.related.shape[0], dtype=bool)
115			c_vec[c_ids] = True
116			broader = mdata.broader.multiply(c_vec).sum(axis=1)
117			narrower = mdata.narrower.multiply(c_vec).sum(axis=1)
118			related = mdata.related.multiply(c_vec).sum(axis=1)
119			collection = mdata.collection.multiply(c_vec).T.dot(mdata.collection).sum(axis=0)
120			for idx, c in enumerate(candidates):
121			subj = c.subject_id
122			matrix[idx, Feature.freq] = c.freq
123			matrix[idx, Feature.doc_freq] = mdata.doc_freq[subj]
124			matrix[idx, Feature.subj_freq] = mdata.subj_freq.get(subj, 1) - 1
125			matrix[idx, Feature.tfidf] = c.freq * mdata.idf[subj]
126			matrix[idx, Feature.is_pref] = c.is_pref
127			matrix[idx, Feature.n_tokens] = c.n_tokens
128			matrix[idx, Feature.ambiguity] = c.ambiguity
129			matrix[idx, Feature.first_occ] = c.first_occ
130			matrix[idx, Feature.last_occ] = c.last_occ
131			matrix[idx, Feature.spread] = c.spread
132			matrix[idx, Feature.doc_length] = c.doc_length
133			matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)
134			matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)
135			matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)
136			matrix[idx, Feature.collection] = collection[0, subj] / len(c_ids)
137			return matrix
138
139
140			def create_classifier(params: dict[str, Any]) -> BaggingClassifier:
141			return BaggingClassifier(
142			DecisionTreeClassifier(
143			min_samples_leaf=int(params["min_samples_leaf"]),
144			max_leaf_nodes=int(params["max_leaf_nodes"]),
145			),
146			max_samples=float(params["max_samples"]),
147			)
148
149
150			def prediction_to_list(
151			scores: np.ndarray, candidates: list[Candidate]
152			) -> list[tuple[np.float64, int]]:
153			subj_scores = [(score[1], c.subject_id) for score, c in zip(scores, candidates)]
154			return sorted(subj_scores, reverse=True)
155
156
157			class MLLMCandidateGenerator(annif.parallel.BaseWorker):
158			@classmethod
159			def generate_candidates(cls, doc_subject_set, text):
160			candidates = generate_candidates(text, **cls.args) # pragma: no cover
161			return doc_subject_set, candidates # pragma: no cover
162
163
164			class MLLMFeatureConverter(annif.parallel.BaseWorker):
165			@classmethod
166			def candidates_to_features(cls, candidates):
167			return candidates_to_features(candidates, **cls.args) # pragma: no cover
168
169
170			class MLLMModel:
171			"""Maui-like Lexical Matching model"""
172
173			def generate_candidates(self, text: str, analyzer: Analyzer) -> list[Candidate]:
174			return generate_candidates(text, analyzer, self._vectorizer, self._index)
175
176			@property
177			def _model_data(self) -> ModelData:
178			return ModelData(
179			broader=self._broader_matrix,
180			narrower=self._narrower_matrix,
181			related=self._related_matrix,
182			collection=self._collection_matrix,
183			doc_freq=self._doc_freq,
184			subj_freq=self._subj_freq,
185			idf=self._idf,
186			)
187
188			def _candidates_to_features(self, candidates: list[Candidate]) -> np.ndarray:
189			return candidates_to_features(candidates, self._model_data)
190
191			@staticmethod
192			def _get_label_props(params: dict[str, Any]) -> tuple[list[URIRef], list[URIRef]]:
193			pref_label_props = [SKOS.prefLabel]
194
195			if annif.util.boolean(params["use_hidden_labels"]):
196			nonpref_label_props = [SKOS.altLabel, SKOS.hiddenLabel]
197			else:
198			nonpref_label_props = [SKOS.altLabel]
199
200			return (pref_label_props, nonpref_label_props)
201
202			def _prepare_terms(
203			self,
204			graph: Graph,
205			vocab: AnnifVocabulary,
206			params: dict[str, Any],
207			) -> tuple[list[Term], list[int]]:
208			pref_label_props, nonpref_label_props = self._get_label_props(params)
209
210			terms = []
211			subject_ids = []
212			for subj_id, subject in vocab.subjects.active:
213			subject_ids.append(subj_id)
214
215			for label in get_subject_labels(
216			graph, subject.uri, pref_label_props, params["language"]
217			):
218			terms.append(Term(subject_id=subj_id, label=label, is_pref=True))
219
220			for label in get_subject_labels(
221			graph, subject.uri, nonpref_label_props, params["language"]
222			):
223			terms.append(Term(subject_id=subj_id, label=label, is_pref=False))
224
225			return (terms, subject_ids)
226
227			def _prepare_relations(self, graph: Graph, vocab: AnnifVocabulary) -> None:
228			self._broader_matrix = make_relation_matrix(graph, vocab, SKOS.broader)
229			self._narrower_matrix = make_relation_matrix(graph, vocab, SKOS.narrower)
230			self._related_matrix = make_relation_matrix(graph, vocab, SKOS.related)
231			self._collection_matrix = make_collection_matrix(graph, vocab)
232
233			def _prepare_train_index(
234			self,
235			vocab: AnnifVocabulary,
236			analyzer: Analyzer,
237			params: dict[str, Any],
238			) -> list[int]:
239			graph = vocab.as_graph()
240			terms, subject_ids = self._prepare_terms(graph, vocab, params)
241			self._prepare_relations(graph, vocab)
242
243			self._vectorizer = CountVectorizer(
244			binary=True, tokenizer=analyzer.tokenize_words, token_pattern=None
245			)
246			label_corpus = self._vectorizer.fit_transform((t.label for t in terms))
247
248			# frequency of each token used in labels - how rare each word is
249			token_freq = np.bincount(label_corpus.indices, minlength=label_corpus.shape[1])
250
251			self._index = TokenSetIndex()
252			for term, label_matrix in zip(terms, label_corpus):
253			tokens = label_matrix.nonzero()[1]
254			# sort tokens by frequency - use the rarest token as index key
255			tokens = sorted(tokens, key=token_freq.__getitem__)
256			tset = TokenSet(tokens, term.subject_id, term.is_pref)
257			self._index.add(tset)
258
259			return subject_ids
260
261			def _prepare_train_data(
262			self, corpus: DocumentCorpus, analyzer: Analyzer, n_jobs: int
263			) -> tuple[list[list[Candidate]], list[bool]]:
264			# frequency of subjects (by id) in the generated candidates
265			self._doc_freq = collections.Counter()
266			# frequency of manually assigned subjects ("domain keyphraseness")
267			self._subj_freq = collections.Counter()
268			train_x = []
269			train_y = []
270
271			jobs, pool_class = annif.parallel.get_pool(n_jobs)
272
273			cg_args = {
274			"analyzer": analyzer,
275			"vectorizer": self._vectorizer,
276			"index": self._index,
277			}
278
279			with pool_class(
280			jobs, initializer=MLLMCandidateGenerator.init, initargs=(cg_args,)
281			) as pool:
282			params = ((doc.subject_set, doc.text) for doc in corpus.documents)
283			for doc_subject_ids, candidates in pool.starmap(
284			MLLMCandidateGenerator.generate_candidates, params, 10
285			):
286			self._subj_freq.update(doc_subject_ids)
287			self._doc_freq.update([c.subject_id for c in candidates])
288			train_x.append(candidates)
289			train_y += [(c.subject_id in doc_subject_ids) for c in candidates]
290
291			return (train_x, train_y)
292
293			def _calculate_idf(
294			self, subject_ids: list[int], doc_count: int
295			) -> defaultdict[int, float]:
296			idf = collections.defaultdict(float)
297			for subj_id in subject_ids:
298			idf[subj_id] = math.log((doc_count + 1) / (self._doc_freq[subj_id] + 1)) + 1
299
300			return idf
301
302			def _prepare_features(
303			self, train_x: list[list[Candidate]], n_jobs: int
304			) -> list[np.ndarray]:
305			fc_args = {"mdata": self._model_data}
306			jobs, pool_class = annif.parallel.get_pool(n_jobs)
307
308			with pool_class(
309			jobs, initializer=MLLMFeatureConverter.init, initargs=(fc_args,)
310			) as pool:
311			features = pool.map(
312			MLLMFeatureConverter.candidates_to_features, train_x, 10
313			)
314
315			return features
316
317			def prepare_train(
318			self,
319			corpus: DocumentCorpus,
320			vocab: AnnifVocabulary,
321			analyzer: Analyzer,
322			params: dict[str, Any],
323			n_jobs: int,
324			) -> tuple[np.ndarray, np.ndarray]:
325			# create an index from the vocabulary terms
326			subject_ids = self._prepare_train_index(vocab, analyzer, params)
327
328			# convert the corpus into train data
329			train_x, train_y = self._prepare_train_data(corpus, analyzer, n_jobs)
330
331			# precalculate idf values for all candidate subjects
332			self._idf = self._calculate_idf(subject_ids, len(train_x))
333
334			# convert the train data into feature values
335			features = self._prepare_features(train_x, n_jobs)
336
337			return (np.vstack(features), np.array(train_y))
338
339			def train(
340			self,
341			train_x: np.ndarray \| list[tuple[int, int]],
342			train_y: list[bool] \| np.ndarray,
343			params: dict[str, Any],
344			) -> None:
345			# fit the model on the training corpus
346			self._classifier = create_classifier(params)
347			self._classifier.fit(train_x, train_y)
348			# sanity check: verify that the classifier has seen both classes
349			if self._classifier.n_classes_ != 2:
350			raise OperationFailedException(
351			"Unable to create classifier: "
352			+ "Not enough positive and negative examples "
353			+ "in the training data. Please check that your training "
354			+ "data matches your vocabulary."
355			)
356
357			def predict(self, candidates: list[Candidate]) -> list[tuple[np.float64, int]]:
358			if not candidates:
359			return []
360			features = self._candidates_to_features(candidates)
361			scores = self._classifier.predict_proba(features)
362			return prediction_to_list(scores, candidates)
363
364			def save(self, filename: str) -> list[str]:
365			return joblib.dump(self, filename)
366
367			@staticmethod
368			def load(filename: str) -> MLLMModel:
369			return joblib.load(filename)
370

NatLibFi / Annif

Pull Request — main (#873)

annif.lexical.mllm.MLLMModel.prepare_train() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like