"""Annif backend using the transformer variant of pecos."""

import logging
import os.path as osp
import sys
from typing import Any

import numpy as np
import scipy.sparse as sp
from pecos.utils.featurization.text.preprocess import Preprocessor
from pecos.xmc.xtransformer import matcher, model
from pecos.xmc.xtransformer.model import XTransformer
from pecos.xmc.xtransformer.module import MLProblemWithText

from annif.corpus.document import DocumentCorpus
from annif.exception import NotInitializedException, NotSupportedException
from annif.suggestion import SubjectSuggestion, SuggestionBatch
from annif.util import (
    apply_param_parse_config,
    atomic_save,
    atomic_save_folder,
    boolean,
)

from . import backend, mixins


class XTransformerBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
    """XTransformer based backend for Annif"""

    name = "xtransformer"
    needs_subject_index = True

    _model = None

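    # file/folder names used within the project data directory for the cached
    # training data and the trained model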
    train_X_file = "xtransformer-train-X.npz"
    train_y_file = "xtransformer-train-y.npz"
    train_txt_file = "xtransformer-train-raw.txt"
    model_folder = "xtransformer-model"
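
    # type conversions applied to the backend parameters before they are
    # passed on to PECOS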
    PARAM_CONFIG = {
        "min_df": int,
        "ngram": int,
        "fix_clustering": boolean,
        "nr_splits": int,
        "min_codes": int,
        "max_leaf_size": int,
        "Cn": float,
        "Cp": float,
        "cost_sensitive_ranker": boolean,
        "rel_mode": str,
        "rel_norm": str,
        "neg_mining_chain": str,
        "imbalanced_ratio": float,
        "imbalanced_depth": int,
        "max_match_clusters": int,
        "do_fine_tune": boolean,
        "model_shortcut": str,
        "beam_size": int,
        "limit": int,
        "post_processor": str,
        "negative_sampling": str,
        "ensemble_method": str,
        "threshold": float,
        "loss_function": str,
        "truncate_length": int,
        "hidden_dropout_prob": float,
        "batch_size": int,
        "gradient_accumulation_steps": int,
        "learning_rate": float,
        "weight_decay": float,
        "adam_epsilon": float,
        "num_train_epochs": int,
        "max_steps": int,
        "lr_schedule": str,
        "warmup_steps": int,
        "logging_steps": int,
        "save_steps": int,
        "max_active_matching_labels": int,
        "max_num_labels_in_gpu": int,
        "use_gpu": boolean,
        "bootstrap_model": str,
    }
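
    # default values for the parameters above; values given in the project
    # configuration override these defaults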
    DEFAULT_PARAMETERS = {
        "min_df": 1,
        "ngram": 1,
        "fix_clustering": False,
        "nr_splits": 16,
        "min_codes": None,
        "max_leaf_size": 100,
        "Cn": 0.5,
        "Cp": 5.0,
        "cost_sensitive_ranker": True,
        "rel_mode": "induce",
        "rel_norm": "l1",
        "neg_mining_chain": "tfn+man",
        "imbalanced_ratio": 0.0,
        "imbalanced_depth": 100,
        "max_match_clusters": 32768,
        "do_fine_tune": True,
        "model_shortcut": "distilbert-base-multilingual-uncased",
        "beam_size": 20,
        "limit": 100,
        "post_processor": "sigmoid",
        "negative_sampling": "tfn",
        "ensemble_method": "transformer-only",
        "threshold": 0.1,
        "loss_function": "squared-hinge",
        "truncate_length": 128,
        "hidden_dropout_prob": 0.1,
        "batch_size": 32,
        "gradient_accumulation_steps": 1,
        "learning_rate": 1e-4,
        "weight_decay": 0.0,
        "adam_epsilon": 1e-8,
        "num_train_epochs": 1,
        "max_steps": 0,
        "lr_schedule": "linear",
        "warmup_steps": 0,
        "logging_steps": 100,
        "save_steps": 1000,
        "max_active_matching_labels": None,
        "max_num_labels_in_gpu": 65536,
        "use_gpu": True,
        "bootstrap_model": "linear",
    }

    def _initialize_model(self):
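        """load the trained XTransformer model from the data directory, if it
        exists; otherwise raise NotInitializedException"""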
        if self._model is None:
            path = osp.join(self.datadir, self.model_folder)
            self.debug("loading model from {}".format(path))
            if osp.exists(path):
                self._model = XTransformer.load(path)
            else:
                raise NotInitializedException(
                    "model {} not found".format(path), backend_id=self.backend_id
                )

    def initialize(self, parallel: bool = False) -> None:
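        """initialize the backend by loading the vectorizer and the trained model"""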
        self.initialize_vectorizer()
        self._initialize_model()

    def default_params(self):
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

    def _create_train_files(self, veccorpus, corpus):
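        """write the raw training texts, their TF-IDF feature vectors and the
        corresponding binary label matrix into the project data directory"""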
        self.info("creating train file")
        Xs = []
        ys = []
        txt_pth = osp.join(self.datadir, self.train_txt_file)
        with open(txt_pth, "w", encoding="utf-8") as txt_file:
            for doc, vector in zip(corpus.documents, veccorpus):
                subject_set = doc.subject_set
                if not (subject_set and doc.text):
                    continue  # noqa
                print(" ".join(doc.text.split()), file=txt_file)
                Xs.append(sp.csr_matrix(vector, dtype=np.float32).sorted_indices())
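                # one-row binary label matrix over the whole subject vocabulary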
                ys.append(
                    sp.csr_matrix(
                        (
                            np.ones(len(subject_set)),
                            (np.zeros(len(subject_set)), [s for s in subject_set]),
                        ),
                        shape=(1, len(self.project.subjects)),
                        dtype=np.float32,
                    ).sorted_indices()
                )
        atomic_save(
            sp.vstack(Xs, format="csr"),
            self.datadir,
            self.train_X_file,
            method=lambda mtrx, target: sp.save_npz(target, mtrx, compressed=True),
        )
        atomic_save(
            sp.vstack(ys, format="csr"),
            self.datadir,
            self.train_y_file,
            method=lambda mtrx, target: sp.save_npz(target, mtrx, compressed=True),
        )

    def _create_model(self, params, jobs):
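        """train a new XTransformer model from the cached training files and
        save it into the project data directory"""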
        train_txts = Preprocessor.load_data_from_file(
            osp.join(self.datadir, self.train_txt_file),
            label_text_path=None,
            text_pos=0,
        )["corpus"]
        train_X = sp.load_npz(osp.join(self.datadir, self.train_X_file))
        train_y = sp.load_npz(osp.join(self.datadir, self.train_y_file))
        model_path = osp.join(self.datadir, self.model_folder)
        new_params = apply_param_parse_config(self.PARAM_CONFIG, self.params)
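        # PECOS calls the prediction cut-off "only_topk"; map Annif's "limit" onto it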
        new_params["only_topk"] = new_params.pop("limit")
        train_params = XTransformer.TrainParams.from_dict(
            new_params, recursive=True
        ).to_dict()
        pred_params = XTransformer.PredParams.from_dict(
            new_params, recursive=True
        ).to_dict()

        self.info("Start training")
        # enable PECOS training progress logging on stdout
        matcher.LOGGER.setLevel(logging.DEBUG)
        matcher.LOGGER.addHandler(logging.StreamHandler(stream=sys.stdout))
        model.LOGGER.setLevel(logging.DEBUG)
        model.LOGGER.addHandler(logging.StreamHandler(stream=sys.stdout))
        self._model = XTransformer.train(
            MLProblemWithText(train_txts, train_y, X_feat=train_X),
            clustering=None,
            val_prob=None,
            train_params=train_params,
            pred_params=pred_params,
            beam_size=int(params["beam_size"]),
            steps_scale=None,
            label_feat=None,
        )
        atomic_save_folder(self._model, model_path)

    def _train(
        self,
        corpus: DocumentCorpus,
        params: dict[str, Any],
        jobs: int = 0,
    ) -> None:
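        """train the backend, either from a document corpus or by reusing
        cached training data from a previous run"""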
        if corpus == "cached":
            self.info("Reusing cached training data from previous run.")
        else:
            if corpus.is_empty():
                raise NotSupportedException(
                    "Cannot train xtransformer project with no documents"
                )
            input = (doc.text for doc in corpus.documents)
            vecparams = {
                "min_df": int(params["min_df"]),
                "tokenizer": self.project.analyzer.tokenize_words,
                "ngram_range": (1, int(params["ngram"])),
            }
            veccorpus = self.create_vectorizer(input, vecparams)
            self._create_train_files(veccorpus, corpus)
        self._create_model(params, jobs)

    def _suggest_batch(
        self, texts: list[str], params: dict[str, Any]
    ) -> SuggestionBatch:
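        """suggest subjects for a batch of texts using the trained XTransformer model"""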
        vector = self.vectorizer.transform(texts)
        if vector.nnz == 0:  # All zero vector, empty result
            return list()
        new_params = apply_param_parse_config(self.PARAM_CONFIG, params)
        prediction = self._model.predict(
            texts,
            X_feat=vector.sorted_indices(),
            batch_size=new_params["batch_size"],
            use_gpu=True,
            only_top_k=new_params["limit"],
            post_processor=new_params["post_processor"],
        )
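        # prediction is a sparse (n_texts x n_subjects) score matrix;
        # convert each row into a list of SubjectSuggestions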
        current_batchsize = prediction.get_shape()[0]
        batch_result = []
        for i in range(current_batchsize):
            results = []
            row = prediction.getrow(i)
            for idx, score in zip(row.indices, row.data):
                results.append(SubjectSuggestion(subject_id=idx, score=score))
            batch_result.append(results)
        return SuggestionBatch.from_sequence(batch_result, self.project.subjects)