"""Annif backend using the transformer variant of pecos."""

import logging
import os.path as osp
import sys
from typing import Any

import numpy as np
import scipy.sparse as sp
from pecos.utils.featurization.text.preprocess import Preprocessor
from pecos.xmc.xtransformer import matcher, model
from pecos.xmc.xtransformer.model import XTransformer
from pecos.xmc.xtransformer.module import MLProblemWithText

from annif.corpus.document import DocumentCorpus
from annif.exception import NotInitializedException, NotSupportedException
from annif.suggestion import SubjectSuggestion, SuggestionBatch
from annif.util import (
    apply_param_parse_config,
    atomic_save,
    atomic_save_folder,
    boolean,
)

from . import backend, mixins


class XTransformerBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
    """XTransformer based backend for Annif"""

    name = "xtransformer"
    needs_subject_index = True

    _model = None

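    # file/folder names used within the project data directory for the cached
    # training data and the trained model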
    train_X_file = "xtransformer-train-X.npz"
    train_y_file = "xtransformer-train-y.npz"
    train_txt_file = "xtransformer-train-raw.txt"
    model_folder = "xtransformer-model"
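
    # type conversions applied to the backend parameters before they are
    # passed on to PECOS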
    PARAM_CONFIG = {
        "min_df": int,
        "ngram": int,
        "fix_clustering": boolean,
        "nr_splits": int,
        "min_codes": int,
        "max_leaf_size": int,
        "Cn": float,
        "Cp": float,
        "cost_sensitive_ranker": boolean,
        "rel_mode": str,
        "rel_norm": str,
        "neg_mining_chain": str,
        "imbalanced_ratio": float,
        "imbalanced_depth": int,
        "max_match_clusters": int,
        "do_fine_tune": boolean,
        "model_shortcut": str,
        "beam_size": int,
        "limit": int,
        "post_processor": str,
        "negative_sampling": str,
        "ensemble_method": str,
        "threshold": float,
        "loss_function": str,
        "truncate_length": int,
        "hidden_dropout_prob": float,
        "batch_size": int,
        "gradient_accumulation_steps": int,
        "learning_rate": float,
        "weight_decay": float,
        "adam_epsilon": float,
        "num_train_epochs": int,
        "max_steps": int,
        "lr_schedule": str,
        "warmup_steps": int,
        "logging_steps": int,
        "save_steps": int,
        "max_active_matching_labels": int,
        "max_num_labels_in_gpu": int,
        "use_gpu": boolean,
        "bootstrap_model": str,
    }
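
    # default values for the parameters above; values given in the project
    # configuration override these defaults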
    DEFAULT_PARAMETERS = {
        "min_df": 1,
        "ngram": 1,
        "fix_clustering": False,
        "nr_splits": 16,
        "min_codes": None,
        "max_leaf_size": 100,
        "Cn": 0.5,
        "Cp": 5.0,
        "cost_sensitive_ranker": True,
        "rel_mode": "induce",
        "rel_norm": "l1",
        "neg_mining_chain": "tfn+man",
        "imbalanced_ratio": 0.0,
        "imbalanced_depth": 100,
        "max_match_clusters": 32768,
        "do_fine_tune": True,
        "model_shortcut": "distilbert-base-multilingual-uncased",
        "beam_size": 20,
        "limit": 100,
        "post_processor": "sigmoid",
        "negative_sampling": "tfn",
        "ensemble_method": "transformer-only",
        "threshold": 0.1,
        "loss_function": "squared-hinge",
        "truncate_length": 128,
        "hidden_dropout_prob": 0.1,
        "batch_size": 32,
        "gradient_accumulation_steps": 1,
        "learning_rate": 1e-4,
        "weight_decay": 0.0,
        "adam_epsilon": 1e-8,
        "num_train_epochs": 1,
        "max_steps": 0,
        "lr_schedule": "linear",
        "warmup_steps": 0,
        "logging_steps": 100,
        "save_steps": 1000,
        "max_active_matching_labels": None,
        "max_num_labels_in_gpu": 65536,
        "use_gpu": True,
        "bootstrap_model": "linear",
    }

    def _initialize_model(self):
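        """load the trained XTransformer model from the data directory, if it
        exists; otherwise raise NotInitializedException"""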
        if self._model is None:
            path = osp.join(self.datadir, self.model_folder)
            self.debug("loading model from {}".format(path))
            if osp.exists(path):
                self._model = XTransformer.load(path)
            else:
                raise NotInitializedException(
                    "model {} not found".format(path), backend_id=self.backend_id
                )

    def initialize(self, parallel: bool = False) -> None:
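        """initialize the backend by loading the vectorizer and the trained model"""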
        self.initialize_vectorizer()
        self._initialize_model()

    def default_params(self):
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

    def _create_train_files(self, veccorpus, corpus):
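        """write the raw training texts, their TF-IDF feature vectors and the
        corresponding binary label matrix into the project data directory"""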
        self.info("creating train file")
        Xs = []
        ys = []
        txt_pth = osp.join(self.datadir, self.train_txt_file)
        with open(txt_pth, "w", encoding="utf-8") as txt_file:
            for doc, vector in zip(corpus.documents, veccorpus):
                subject_set = doc.subject_set
                if not (subject_set and doc.text):
                    continue  # noqa
                print(" ".join(doc.text.split()), file=txt_file)
                Xs.append(sp.csr_matrix(vector, dtype=np.float32).sorted_indices())
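                # one-row binary label matrix over the whole subject vocabulary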
                ys.append(
                    sp.csr_matrix(
                        (
                            np.ones(len(subject_set)),
                            (np.zeros(len(subject_set)), [s for s in subject_set]),
                        ),
                        shape=(1, len(self.project.subjects)),
                        dtype=np.float32,
                    ).sorted_indices()
                )
        atomic_save(
            sp.vstack(Xs, format="csr"),
            self.datadir,
            self.train_X_file,
            method=lambda mtrx, target: sp.save_npz(target, mtrx, compressed=True),
        )
        atomic_save(
            sp.vstack(ys, format="csr"),
            self.datadir,
            self.train_y_file,
            method=lambda mtrx, target: sp.save_npz(target, mtrx, compressed=True),
        )

    def _create_model(self, params, jobs):
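        """train a new XTransformer model from the cached training files and
        save it into the project data directory"""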
        train_txts = Preprocessor.load_data_from_file(
            osp.join(self.datadir, self.train_txt_file),
            label_text_path=None,
            text_pos=0,
        )["corpus"]
        train_X = sp.load_npz(osp.join(self.datadir, self.train_X_file))
        train_y = sp.load_npz(osp.join(self.datadir, self.train_y_file))
        model_path = osp.join(self.datadir, self.model_folder)
        new_params = apply_param_parse_config(self.PARAM_CONFIG, self.params)
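        # PECOS calls the prediction cut-off "only_topk"; map Annif's "limit" onto it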
        new_params["only_topk"] = new_params.pop("limit")
        train_params = XTransformer.TrainParams.from_dict(
            new_params, recursive=True
        ).to_dict()
        pred_params = XTransformer.PredParams.from_dict(
            new_params, recursive=True
        ).to_dict()

        self.info("Start training")
        # enable PECOS training progress logging on stdout
        matcher.LOGGER.setLevel(logging.DEBUG)
        matcher.LOGGER.addHandler(logging.StreamHandler(stream=sys.stdout))
        model.LOGGER.setLevel(logging.DEBUG)
        model.LOGGER.addHandler(logging.StreamHandler(stream=sys.stdout))
        self._model = XTransformer.train(
            MLProblemWithText(train_txts, train_y, X_feat=train_X),
            clustering=None,
            val_prob=None,
            train_params=train_params,
            pred_params=pred_params,
            beam_size=int(params["beam_size"]),
            steps_scale=None,
            label_feat=None,
        )
        atomic_save_folder(self._model, model_path)

    def _train(
        self,
        corpus: DocumentCorpus,
        params: dict[str, Any],
        jobs: int = 0,
    ) -> None:
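        """train the backend, either from a document corpus or by reusing
        cached training data from a previous run"""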
        if corpus == "cached":
            self.info("Reusing cached training data from previous run.")
        else:
            if corpus.is_empty():
                raise NotSupportedException(
                    "Cannot train xtransformer project with no documents"
                )
            input = (doc.text for doc in corpus.documents)
            vecparams = {
                "min_df": int(params["min_df"]),
                "tokenizer": self.project.analyzer.tokenize_words,
                "ngram_range": (1, int(params["ngram"])),
            }
            veccorpus = self.create_vectorizer(input, vecparams)
            self._create_train_files(veccorpus, corpus)
        self._create_model(params, jobs)

    def _suggest_batch(
        self, texts: list[str], params: dict[str, Any]
    ) -> SuggestionBatch:
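        """suggest subjects for a batch of texts using the trained XTransformer model"""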
        vector = self.vectorizer.transform(texts)
        if vector.nnz == 0:  # All zero vector, empty result
            return list()
        new_params = apply_param_parse_config(self.PARAM_CONFIG, params)
        prediction = self._model.predict(
            texts,
            X_feat=vector.sorted_indices(),
            batch_size=new_params["batch_size"],
            use_gpu=True,
            only_top_k=new_params["limit"],
            post_processor=new_params["post_processor"],
        )
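        # prediction is a sparse (n_texts x n_subjects) score matrix;
        # convert each row into a list of SubjectSuggestions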
        current_batchsize = prediction.get_shape()[0]
        batch_result = []
        for i in range(current_batchsize):
            results = []
            row = prediction.getrow(i)
            for idx, score in zip(row.indices, row.data):
                results.append(SubjectSuggestion(subject_id=idx, score=score))
            batch_result.append(results)
        return SuggestionBatch.from_sequence(batch_result, self.project.subjects)