Passed · Push — master (eb437a...a53d46) · by Juho · 03:20 (queued 15s)

XTransformerBackend._create_train_files() — rated B

Complexity
    Conditions: 7

Size
    Total Lines: 33
    Code Lines: 30

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric  Value
------  -----
cc      7
eloc    30
nop     3
dl      0
loc     33
rs      7.76
c       0
b       0
f       0
"""Annif backend using the transformer variant of pecos."""

import logging
import os.path as osp
from sys import stdout

import numpy as np
import scipy.sparse as sp
from pecos.utils.featurization.text.preprocess import Preprocessor
from pecos.xmc.xtransformer import matcher
from pecos.xmc.xtransformer.model import XTransformer
from pecos.xmc.xtransformer.module import MLProblemWithText

from annif.exception import NotInitializedException, NotSupportedException
from annif.suggestion import ListSuggestionResult, SubjectSuggestion
from annif.util import (
    apply_param_parse_config,
    atomic_save,
    atomic_save_folder,
    boolean,
)

from . import backend, mixins


class XTransformerBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
    """XTransformer based backend for Annif"""

    name = "xtransformer"
    needs_subject_index = True

    _model = None

    train_X_file = "xtransformer-train-X.npz"
    train_y_file = "xtransformer-train-y.npz"
    train_txt_file = "xtransformer-train-raw.txt"
    model_folder = "xtransformer-model"

    PARAM_CONFIG = {
        "min_df": int,
        "ngram": int,
        "fix_clustering": boolean,
        "nr_splits": int,
        "min_codes": int,
        "max_leaf_size": int,
        "imbalanced_ratio": float,
        "imbalanced_depth": int,
        "max_match_clusters": int,
        "do_fine_tune": boolean,
        "model_shortcut": str,
        "beam_size": int,
        "limit": int,
        "post_processor": str,
        "negative_sampling": str,
        "ensemble_method": str,
        "threshold": float,
        "loss_function": str,
        "truncate_length": int,
        "hidden_dropout_prob": float,
        "batch_size": int,
        "gradient_accumulation_steps": int,
        "learning_rate": float,
        "weight_decay": float,
        "adam_epsilon": float,
        "num_train_epochs": int,
        "max_steps": int,
        "lr_schedule": str,
        "warmup_steps": int,
        "logging_steps": int,
        "save_steps": int,
        "max_active_matching_labels": int,
        "max_num_labels_in_gpu": int,
        "use_gpu": boolean,
        "bootstrap_model": str,
    }

    DEFAULT_PARAMETERS = {
        "min_df": 1,
        "ngram": 1,
        "fix_clustering": False,
        "nr_splits": 16,
        "min_codes": None,
        "max_leaf_size": 100,
        "imbalanced_ratio": 0.0,
        "imbalanced_depth": 100,
        "max_match_clusters": 32768,
        "do_fine_tune": True,
        "model_shortcut": "distilbert-base-multilingual-cased",
        "beam_size": 20,
        "limit": 100,
        "post_processor": "sigmoid",
        "negative_sampling": "tfn",
        "ensemble_method": "transformer-only",
        "threshold": 0.1,
        "loss_function": "squared-hinge",
        "truncate_length": 128,
        "hidden_dropout_prob": 0.1,
        "batch_size": 32,
        "gradient_accumulation_steps": 1,
        "learning_rate": 1e-4,
        "weight_decay": 0.0,
        "adam_epsilon": 1e-8,
        "num_train_epochs": 1,
        "max_steps": 0,
        "lr_schedule": "linear",
        "warmup_steps": 0,
        "logging_steps": 100,
        "save_steps": 1000,
        "max_active_matching_labels": None,
        "max_num_labels_in_gpu": 65536,
        "use_gpu": True,
        "bootstrap_model": "linear",
    }
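    # NOTE: project configuration values arrive as strings; apply_param_parse_config
    # coerces them to typed values using the PARAM_CONFIG mapping above, e.g.
    # (hypothetical input values):
    #     apply_param_parse_config(PARAM_CONFIG, {"min_df": "2", "use_gpu": "false"})
    #     # -> {"min_df": 2, "use_gpu": False}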
    def _initialize_model(self):
        if self._model is None:
            path = osp.join(self.datadir, self.model_folder)
            self.debug("loading model from {}".format(path))
            if osp.exists(path):
                self._model = XTransformer.load(path)
            else:
                raise NotInitializedException(
                    "model {} not found".format(path), backend_id=self.backend_id
                )

    def initialize(self, parallel=False):
        self.initialize_vectorizer()
        self._initialize_model()

    def default_params(self):
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

    def _create_train_files(self, veccorpus, corpus):
        self.info("creating train file")
        Xs = []
        ys = []
        txt_pth = osp.join(self.datadir, self.train_txt_file)
        with open(txt_pth, "w", encoding="utf-8") as txt_file:
            for doc, vector in zip(corpus.documents, veccorpus):
                subject_set = doc.subject_set
                if not (subject_set and doc.text):
                    continue  # noqa
                print(" ".join(doc.text.split()), file=txt_file)
                Xs.append(sp.csr_matrix(vector, dtype=np.float32).sorted_indices())
                ys.append(
                    sp.csr_matrix(
                        (
                            np.ones(len(subject_set)),
                            (np.zeros(len(subject_set)), list(subject_set)),
                        ),
                        shape=(1, len(self.project.subjects)),
                        dtype=np.float32,
                    ).sorted_indices()
                )
        atomic_save(
            sp.vstack(Xs, format="csr"),
            self.datadir,
            self.train_X_file,
            method=lambda mtrx, target: sp.save_npz(target, mtrx, compressed=True),
        )
        atomic_save(
            sp.vstack(ys, format="csr"),
            self.datadir,
            self.train_y_file,
            method=lambda mtrx, target: sp.save_npz(target, mtrx, compressed=True),
        )

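    # Each ys entry above is a single-row CSR matrix: the data is a run of
    # ones, every row index is 0, and the column indices are the document's
    # subject ids, giving a (1, n_subjects) multi-hot row. sp.vstack then
    # stacks the rows into an (n_documents, n_subjects) training matrix.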
    def _create_model(self, params, jobs):
        train_txts = Preprocessor.load_data_from_file(
            osp.join(self.datadir, self.train_txt_file),
            label_text_path=None,
            text_pos=0,
        )["corpus"]
        train_X = sp.load_npz(osp.join(self.datadir, self.train_X_file))
        train_y = sp.load_npz(osp.join(self.datadir, self.train_y_file))
        model_path = osp.join(self.datadir, self.model_folder)
        new_params = apply_param_parse_config(self.PARAM_CONFIG, self.params)
        new_params["only_topk"] = new_params.pop("limit")
        train_params = XTransformer.TrainParams.from_dict(
            new_params, recursive=True
        ).to_dict()
        pred_params = XTransformer.PredParams.from_dict(
            new_params, recursive=True
        ).to_dict()

        self.info("Start training")
        # enable progress
        matcher.LOGGER.setLevel(logging.INFO)
        matcher.LOGGER.addHandler(logging.StreamHandler(stream=stdout))
        self._model = XTransformer.train(
            MLProblemWithText(train_txts, train_y, X_feat=train_X),
            clustering=None,
            val_prob=None,
            train_params=train_params,
            pred_params=pred_params,
            beam_size=params["beam_size"],
            steps_scale=None,
            label_feat=None,
        )
        atomic_save_folder(self._model, model_path)

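    # MLProblemWithText pairs the raw training texts with the tf-idf feature
    # matrix (X_feat) and the multi-hot label matrix built above. The flat
    # new_params dict is split into train-time and prediction-time settings
    # via TrainParams/PredParams.from_dict(recursive=True), which appears to
    # pick out the fields each nested parameter set recognizes (an assumption
    # about pecos' from_dict behavior, not verified here).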
    def _train(self, corpus, params, jobs=0):
        if corpus == "cached":
            self.info("Reusing cached training data from previous run.")
        else:
            if corpus.is_empty():
                raise NotSupportedException(
                    "Cannot train project with no documents"
                )
            input = (doc.text for doc in corpus.documents)
            vecparams = {
                "min_df": int(params["min_df"]),
                "tokenizer": self.project.analyzer.tokenize_words,
                "ngram_range": (1, int(params["ngram"])),
            }
            veccorpus = self.create_vectorizer(input, vecparams)
            self._create_train_files(veccorpus, corpus)
        self._create_model(params, jobs)

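    # vecparams mirrors scikit-learn TfidfVectorizer arguments (assuming
    # TfidfVectorizerMixin wraps sklearn's TfidfVectorizer): with hypothetical
    # project settings min_df="2" and ngram="2", the vectorizer would drop
    # terms occurring in fewer than two documents and index both unigrams and
    # bigrams (ngram_range=(1, 2)).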
    def _suggest(self, text, params):
        text = " ".join(text.split())
        vector = self.vectorizer.transform([text])
        if vector.nnz == 0:  # All zero vector, empty result
            return ListSuggestionResult([])
        new_params = apply_param_parse_config(self.PARAM_CONFIG, params)
        prediction = self._model.predict(
            [text],
            X_feat=vector.sorted_indices(),
            batch_size=new_params["batch_size"],
            use_gpu=False,
            only_top_k=new_params["limit"],
            post_processor=new_params["post_processor"],
        )
        results = []
        for idx, score in zip(prediction.indices, prediction.data):
            results.append(SubjectSuggestion(subject_id=idx, score=score))
        return ListSuggestionResult(results)
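A minimal standalone sketch of the label-matrix round trip performed by
_create_train_files and _create_model above, with hypothetical subject ids
and a made-up vocabulary size; only the numpy/scipy calls are taken from the
listing (atomic_save's temp-file handling is omitted):

import numpy as np
import scipy.sparse as sp

n_subjects = 10  # hypothetical vocabulary size
docs_subjects = [[1, 4], [0, 4, 7]]  # hypothetical subject ids per document

rows = []
for subject_ids in docs_subjects:
    # One multi-hot row per document: ones at the subject-id columns.
    row = sp.csr_matrix(
        (np.ones(len(subject_ids)), (np.zeros(len(subject_ids)), subject_ids)),
        shape=(1, n_subjects),
        dtype=np.float32,
    ).sorted_indices()
    rows.append(row)

train_y = sp.vstack(rows, format="csr")  # shape (2, 10)
sp.save_npz("train-y.npz", train_y, compressed=True)  # as in atomic_save's method
assert sp.load_npz("train-y.npz").shape == (2, 10)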