"""Annif backend using the transformer variant of pecos."""

import logging
import os.path as osp
from sys import stdout

import numpy as np
import scipy.sparse as sp
from pecos.utils.featurization.text.preprocess import Preprocessor
from pecos.xmc.xtransformer import matcher
from pecos.xmc.xtransformer.model import XTransformer
from pecos.xmc.xtransformer.module import MLProblemWithText

from annif.exception import NotInitializedException, NotSupportedException
from annif.suggestion import ListSuggestionResult, SubjectSuggestion
from annif.util import (
    apply_param_parse_config,
    atomic_save,
    atomic_save_folder,
    boolean,
)

from . import backend, mixins

class XTransformerBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
    """XTransformer based backend for Annif"""
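    # Illustrative projects.cfg entry for a project using this backend
    # (hypothetical project id, vocabulary and analyzer; values are examples
    # only, and the backend parameters are the ones defined in PARAM_CONFIG
    # below):
    #
    #   [xtransformer-en]
    #   name=XTransformer English
    #   language=en
    #   backend=xtransformer
    #   vocab=yso
    #   analyzer=snowball(english)
    #   batch_size=16
    #   num_train_epochs=3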

    name = "xtransformer"
    needs_subject_index = True

    _model = None

    train_X_file = "xtransformer-train-X.npz"
    train_y_file = "xtransformer-train-y.npz"
    train_txt_file = "xtransformer-train-raw.txt"
    model_folder = "xtransformer-model"

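    # Type conversions applied to the string values of these parameters when
    # they are read from the project configuration
    # (see annif.util.apply_param_parse_config).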
    PARAM_CONFIG = {
        "min_df": int,
        "ngram": int,
        "fix_clustering": boolean,
        "nr_splits": int,
        "min_codes": int,
        "max_leaf_size": int,
        "imbalanced_ratio": float,
        "imbalanced_depth": int,
        "max_match_clusters": int,
        "do_fine_tune": boolean,
        "model_shortcut": str,
        "beam_size": int,
        "limit": int,
        "post_processor": str,
        "negative_sampling": str,
        "ensemble_method": str,
        "threshold": float,
        "loss_function": str,
        "truncate_length": int,
        "hidden_dropout_prob": float,
        "batch_size": int,
        "gradient_accumulation_steps": int,
        "learning_rate": float,
        "weight_decay": float,
        "adam_epsilon": float,
        "num_train_epochs": int,
        "max_steps": int,
        "lr_schedule": str,
        "warmup_steps": int,
        "logging_steps": int,
        "save_steps": int,
        "max_active_matching_labels": int,
        "max_num_labels_in_gpu": int,
        "use_gpu": boolean,
        "bootstrap_model": str,
    }
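
    # Defaults for the parameters above; any of them can be overridden in the
    # project configuration.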
    DEFAULT_PARAMETERS = {
        "min_df": 1,
        "ngram": 1,
        "fix_clustering": False,
        "nr_splits": 16,
        "min_codes": None,
        "max_leaf_size": 100,
        "imbalanced_ratio": 0.0,
        "imbalanced_depth": 100,
        "max_match_clusters": 32768,
        "do_fine_tune": True,
        "model_shortcut": "distilbert-base-multilingual-cased",
        "beam_size": 20,
        "limit": 100,
        "post_processor": "sigmoid",
        "negative_sampling": "tfn",
        "ensemble_method": "transformer-only",
        "threshold": 0.1,
        "loss_function": "squared-hinge",
        "truncate_length": 128,
        "hidden_dropout_prob": 0.1,
        "batch_size": 32,
        "gradient_accumulation_steps": 1,
        "learning_rate": 1e-4,
        "weight_decay": 0.0,
        "adam_epsilon": 1e-8,
        "num_train_epochs": 1,
        "max_steps": 0,
        "lr_schedule": "linear",
        "warmup_steps": 0,
        "logging_steps": 100,
        "save_steps": 1000,
        "max_active_matching_labels": None,
        "max_num_labels_in_gpu": 65536,
        "use_gpu": True,
        "bootstrap_model": "linear",
    }

    def _initialize_model(self):
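        """Load the trained XTransformer model from the data directory unless
        it is already loaded; raise NotInitializedException if no model has
        been trained yet."""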
        if self._model is None:
            path = osp.join(self.datadir, self.model_folder)
            self.debug("loading model from {}".format(path))
            if osp.exists(path):
                self._model = XTransformer.load(path)
            else:
                raise NotInitializedException(
                    "model {} not found".format(path), backend_id=self.backend_id
                )

    def initialize(self, parallel=False):
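        """Load the TF-IDF vectorizer and the trained XTransformer model."""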
        self.initialize_vectorizer()
        self._initialize_model()

    def default_params(self):
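        """Combine the common AnnifBackend defaults with the
        XTransformer-specific defaults defined above."""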
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

    def _create_train_files(self, veccorpus, corpus):
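        """Write the training data to the data directory: one
        whitespace-normalized line of text per document, plus compressed
        sparse matrices holding the TF-IDF feature vectors (train_X_file)
        and the one-hot subject assignments (train_y_file). Documents
        without text or subjects are skipped."""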
        self.info("creating train file")
        Xs = []
        ys = []
        txt_pth = osp.join(self.datadir, self.train_txt_file)
        with open(txt_pth, "w", encoding="utf-8") as txt_file:
            for doc, vector in zip(corpus.documents, veccorpus):
                subject_set = doc.subject_set
                if not (subject_set and doc.text):
                    continue  # noqa
                print(" ".join(doc.text.split()), file=txt_file)
                Xs.append(sp.csr_matrix(vector, dtype=np.float32).sorted_indices())
                ys.append(
                    sp.csr_matrix(
                        (
                            np.ones(len(subject_set)),
                            (np.zeros(len(subject_set)), [s for s in subject_set]),
                        ),
                        shape=(1, len(self.project.subjects)),
                        dtype=np.float32,
                    ).sorted_indices()
                )
        atomic_save(
            sp.vstack(Xs, format="csr"),
            self.datadir,
            self.train_X_file,
            method=lambda mtrx, target: sp.save_npz(target, mtrx, compressed=True),
        )
        atomic_save(
            sp.vstack(ys, format="csr"),
            self.datadir,
            self.train_y_file,
            method=lambda mtrx, target: sp.save_npz(target, mtrx, compressed=True),
        )

    def _create_model(self, params, jobs):
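        """Load the cached training files, convert the backend parameters
        into XTransformer train and prediction parameters, train the model
        and save it atomically under the data directory."""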
        train_txts = Preprocessor.load_data_from_file(
            osp.join(self.datadir, self.train_txt_file),
            label_text_path=None,
            text_pos=0,
        )["corpus"]
        train_X = sp.load_npz(osp.join(self.datadir, self.train_X_file))
        train_y = sp.load_npz(osp.join(self.datadir, self.train_y_file))
        model_path = osp.join(self.datadir, self.model_folder)
        new_params = apply_param_parse_config(self.PARAM_CONFIG, self.params)
        new_params["only_topk"] = new_params.pop("limit")
        train_params = XTransformer.TrainParams.from_dict(
            new_params, recursive=True
        ).to_dict()
        pred_params = XTransformer.PredParams.from_dict(
            new_params, recursive=True
        ).to_dict()

        self.info("Start training")
        # enable progress
        matcher.LOGGER.setLevel(logging.INFO)
        matcher.LOGGER.addHandler(logging.StreamHandler(stream=stdout))
        self._model = XTransformer.train(
            MLProblemWithText(train_txts, train_y, X_feat=train_X),
            clustering=None,
            val_prob=None,
            train_params=train_params,
            pred_params=pred_params,
            beam_size=new_params["beam_size"],
            steps_scale=None,
            label_feat=None,
        )
        atomic_save_folder(self._model, model_path)

    def _train(self, corpus, params, jobs=0):
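        """Train the backend. With corpus="cached" the training files from a
        previous run are reused; otherwise the corpus is vectorized and the
        training files are regenerated before the XTransformer model is
        trained."""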
        if corpus == "cached":
            self.info("Reusing cached training data from previous run.")
        else:
            if corpus.is_empty():
                raise NotSupportedException(
                    "Cannot train xtransformer project with no documents"
                )
            input = (doc.text for doc in corpus.documents)
            vecparams = {
                "min_df": int(params["min_df"]),
                "tokenizer": self.project.analyzer.tokenize_words,
                "ngram_range": (1, int(params["ngram"])),
            }
            veccorpus = self.create_vectorizer(input, vecparams)
            self._create_train_files(veccorpus, corpus)
        self._create_model(params, jobs)

    def _suggest(self, text, params):
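        """Suggest subjects for the given text: the text is vectorized with
        the TF-IDF vectorizer and passed to the XTransformer model, and the
        prediction scores are returned as subject suggestions."""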
        text = " ".join(text.split())
        vector = self.vectorizer.transform([text])
        if vector.nnz == 0:  # All zero vector, empty result
            return ListSuggestionResult([])
        new_params = apply_param_parse_config(self.PARAM_CONFIG, params)
        prediction = self._model.predict(
            [text],
            X_feat=vector.sorted_indices(),
            batch_size=new_params["batch_size"],
            use_gpu=False,
            only_top_k=new_params["limit"],
            post_processor=new_params["post_processor"],
        )
        results = []
        for idx, score in zip(prediction.indices, prediction.data):
            results.append(SubjectSuggestion(subject_id=idx, score=score))
        return ListSuggestionResult(results)