Passed
Pull Request — master (#540), by unknown, created 02:12

XTransformerBackend._create_train_files() — rated B

Complexity: Conditions 7
Size: Total Lines 42, Code Lines 42
Duplication: Lines 0, Ratio 0 %
Importance: Changes 0
Metric  Value
cc      7
eloc    42
nop     3
dl      0
loc     42
rs      7.472
c       0
b       0
f       0
"""Annif backend using the transformer variant of pecos."""

from sys import stdout
import os.path as osp
import logging
import scipy.sparse as sp
import numpy as np

from annif.exception import NotInitializedException, NotSupportedException
from annif.suggestion import ListSuggestionResult, SubjectSuggestion
from . import mixins
from . import backend
from annif.util import boolean, apply_param_parse_config, atomic_save

from pecos.xmc.xtransformer.model import XTransformer
from pecos.xmc.xtransformer.module import MLProblemWithText
from pecos.utils.featurization.text.preprocess import Preprocessor
from pecos.xmc.xtransformer import matcher


class XTransformerBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
    """XTransformer based backend for Annif"""
    name = 'xtransformer'
    needs_subject_index = True

    _model = None

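    # File names of the cached training artifacts and the model folder,
    # all stored under the project's data directory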
    train_X_file = 'xtransformer-train-X.npz'
    train_y_file = 'xtransformer-train-y.npz'
    train_txt_file = 'xtransformer-train-raw.txt'
    model_folder = 'xtransformer-model'

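    # Parser functions applied to the raw string parameters from the
    # project configuration (see apply_param_parse_config below)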
    PARAM_CONFIG = {
        'min_df': int,
        'ngram': int,
        'fix_clustering': boolean,
        'nr_splits': int,
        'min_codes': int,
        'max_leaf_size': int,
        'imbalanced_ratio': float,
        'imbalanced_depth': int,
        'max_match_clusters': int,
        'do_fine_tune': boolean,
        'model_shortcut': str,
        'beam_size': int,
        'limit': int,
        'post_processor': str,
        'negative_sampling': str,
        'ensemble_method': str,
        'threshold': float,
        'loss_function': str,
        'truncate_length': int,
        'hidden_dropout_prob': float,
        'batch_size': int,
        'gradient_accumulation_steps': int,
        'learning_rate': float,
        'weight_decay': float,
        'adam_epsilon': float,
        'num_train_epochs': int,
        'max_steps': int,
        'lr_schedule': str,
        'warmup_steps': int,
        'logging_steps': int,
        'save_steps': int,
        'max_active_matching_labels': int,
        'max_num_labels_in_gpu': int,
        'use_gpu': boolean,
        'bootstrap_model': str
    }

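    # Backend-specific defaults, merged over the generic AnnifBackend
    # defaults in default_params()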
    DEFAULT_PARAMETERS = {
        'min_df': 1,
        'ngram': 1,
        'fix_clustering': False,
        'nr_splits': 16,
        'min_codes': None,
        'max_leaf_size': 100,
        'imbalanced_ratio': 0.0,
        'imbalanced_depth': 100,
        'max_match_clusters': 32768,
        'do_fine_tune': True,
        # 'model_shortcut': 'distilbert-base-multilingual-cased',
        'model_shortcut': 'bert-base-multilingual-uncased',
        'beam_size': 20,
        'limit': 100,
        'post_processor': 'sigmoid',
        'negative_sampling': 'tfn',
        'ensemble_method': 'transformer-only',
        'threshold': 0.1,
        'loss_function': 'squared-hinge',
        'truncate_length': 128,
        'hidden_dropout_prob': 0.1,
        'batch_size': 32,
        'gradient_accumulation_steps': 1,
        'learning_rate': 1e-4,
        'weight_decay': 0.0,
        'adam_epsilon': 1e-8,
        'num_train_epochs': 1,
        'max_steps': 0,
        'lr_schedule': 'linear',
        'warmup_steps': 0,
        'logging_steps': 100,
        'save_steps': 1000,
        'max_active_matching_labels': None,
        'max_num_labels_in_gpu': 65536,
        'use_gpu': True,
        'bootstrap_model': 'linear'
    }

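    # Lazily load a previously trained model from the data directory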
    def _initialize_model(self):
        if self._model is None:
            path = osp.join(self.datadir, self.model_folder)
            self.debug('loading model from {}'.format(path))
            if osp.exists(path):
                self._model = XTransformer.load(path)
            else:
                raise NotInitializedException(
                    'model {} not found'.format(path),
                    backend_id=self.backend_id)

    def initialize(self, parallel=False):
        self.initialize_vectorizer()
        self._initialize_model()

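    # Combine the generic backend defaults with the backend-specific ones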
    def default_params(self):
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

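    # Write the training corpus to disk as a raw text file plus sparse
    # feature (X) and label (y) matrices, one row per document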
    def _create_train_files(self, veccorpus, corpus):
        self.info('creating train file')
        Xs = []
        ys = []
        txt_pth = osp.join(self.datadir, self.train_txt_file)
        with open(txt_pth, 'w', encoding='utf-8') as txt_file:
            for doc, vector in zip(corpus.documents, veccorpus):
                subject_ids = [
                    self.project.subjects.by_uri(uri)
                    for uri
                    in doc.uris]
                subject_ids = [s_id for s_id in subject_ids if s_id]
                if not (subject_ids and doc.text):
                    continue  # noqa
                print(' '.join(doc.text.split()), file=txt_file)
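                # TF-IDF features for this document as a sorted CSR row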
                Xs.append(
                    sp.csr_matrix(vector, dtype=np.float32).sorted_indices())
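                # One-hot label row: 1.0 in each subject id column,
                # shape (1, number of subjects)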
                ys.append(
                    sp.csr_matrix((
                        np.ones(len(subject_ids)),
                        (
                            np.zeros(len(subject_ids)),
                            subject_ids)),
                        shape=(1, len(self.project.subjects)),
                        dtype=np.float32
                        ).sorted_indices())
        atomic_save(
            sp.vstack(Xs, format='csr'),
            self.datadir,
            self.train_X_file,
            method=lambda mtrx, target: sp.save_npz(
                target,
                mtrx,
                compressed=True))
        atomic_save(
            sp.vstack(ys, format='csr'),
            self.datadir,
            self.train_y_file,
            method=lambda mtrx, target: sp.save_npz(
                target,
                mtrx,
                compressed=True))

    def _create_model(self, params, jobs):
        train_txts = Preprocessor.load_data_from_file(
            osp.join(self.datadir, self.train_txt_file),
            label_text_path=None,
            text_pos=0)['corpus']
        train_X = sp.load_npz(osp.join(self.datadir, self.train_X_file))
        train_y = sp.load_npz(osp.join(self.datadir, self.train_y_file))
        model_path = osp.join(self.datadir, self.model_folder)
        new_params = apply_param_parse_config(
            self.PARAM_CONFIG,
            self.params)
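        # Annif's 'limit' parameter corresponds to pecos' 'only_topk'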
        new_params['only_topk'] = new_params.pop('limit')
        train_params = XTransformer.TrainParams.from_dict(
            new_params,
            recursive=True).to_dict()
        pred_params = XTransformer.PredParams.from_dict(
            new_params,
            recursive=True).to_dict()

        self.info('Start training')
        # enable progress logging from the pecos matcher
        matcher.LOGGER.setLevel(logging.INFO)
        matcher.LOGGER.addHandler(logging.StreamHandler(stream=stdout))
        self._model = XTransformer.train(
            MLProblemWithText(train_txts, train_y, X_feat=train_X),
            clustering=None,
            val_prob=None,
            train_params=train_params,
            pred_params=pred_params,
            beam_size=params['beam_size'],
            steps_scale=None,
            label_feat=None,
            )
        atomic_save(self._model, model_path, None)

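    # Train the backend: vectorize the corpus (or reuse cached training
    # files) and fit the XTransformer model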
    def _train(self, corpus, params, jobs=0):
        if corpus == 'cached':
            self.info("Reusing cached training data from previous run.")
        else:
            if corpus.is_empty():
                raise NotSupportedException(
                    'Cannot train project with no documents')
            input = (doc.text for doc in corpus.documents)
            vecparams = {'min_df': int(params['min_df']),
                         'tokenizer': self.project.analyzer.tokenize_words,
                         'ngram_range': (1, int(params['ngram']))}
            veccorpus = self.create_vectorizer(input, vecparams)
            self._create_train_files(veccorpus, corpus)
        self._create_model(params, jobs)

    def _suggest(self, text, params):
        text = ' '.join(text.split())
        vector = self.vectorizer.transform([text])
        if vector.nnz == 0:  # All zero vector, empty result
            return ListSuggestionResult([])
        new_params = apply_param_parse_config(
            self.PARAM_CONFIG,
            params
        )
        prediction = self._model.predict(
            [text],
            X_feat=vector.sorted_indices(),
            batch_size=new_params['batch_size'],
            use_gpu=new_params['use_gpu'],
            only_top_k=new_params['limit'],
            post_processor=new_params['post_processor'])
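        # prediction is a 1 x n_subjects sparse row: its indices are
        # subject ids and its data are the corresponding scores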
        results = []
        for idx, score in zip(prediction.indices, prediction.data):
            subject = self.project.subjects[idx]
            results.append(SubjectSuggestion(
                uri=subject[0],
                label=subject[1],
                notation=subject[2],
                score=score
            ))
        return ListSuggestionResult(results)
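
For reference, a minimal standalone sketch of the label-matrix construction in _create_train_files(): each document's subject ids become a one-hot CSR row. The vocabulary size and subject ids below are hypothetical values, not taken from the code above.

import numpy as np
import scipy.sparse as sp

n_subjects = 10          # assumed vocabulary size
subject_ids = [2, 5, 7]  # assumed subject ids for one document

# 1.0 at (row 0, column subject_id), as in _create_train_files()
row = sp.csr_matrix(
    (np.ones(len(subject_ids)),
     (np.zeros(len(subject_ids)), subject_ids)),
    shape=(1, n_subjects), dtype=np.float32).sorted_indices()

print(row.toarray())
# [[0. 0. 1. 0. 0. 1. 0. 1. 0. 0.]]

Stacking one such row per document with sp.vstack(..., format='csr') yields the train_y matrix that is saved via save_npz and later fed to MLProblemWithText in _create_model().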