Passed
Pull Request — main (#798) · created by unknown · 02:57

XTransformerBackend.default_params()   A

Complexity
    Conditions: 1

Size
    Total Lines: 4
    Code Lines: 4

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric   Value
cc       1
eloc     4
nop      1
dl       0
loc      4
rs       10
c        0
b        0
f        0
"""Annif backend using the transformer variant of pecos."""

import logging
import os.path as osp
import sys
from typing import Any

import numpy as np
import scipy.sparse as sp
from pecos.utils.featurization.text.preprocess import Preprocessor
from pecos.xmc.xtransformer import matcher, model
from pecos.xmc.xtransformer.model import XTransformer
from pecos.xmc.xtransformer.module import MLProblemWithText

from annif.corpus.document import DocumentCorpus
from annif.exception import NotInitializedException, NotSupportedException
from annif.suggestion import SubjectSuggestion, SuggestionBatch
from annif.util import (
    apply_param_parse_config,
    atomic_save,
    atomic_save_folder,
    boolean,
)

from . import backend, mixins


class XTransformerBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
    """XTransformer-based backend for Annif"""

    name = "xtransformer"
    needs_subject_index = True

    _model = None

    train_X_file = "xtransformer-train-X.npz"
    train_y_file = "xtransformer-train-y.npz"
    train_txt_file = "xtransformer-train-raw.txt"
    model_folder = "xtransformer-model"

    PARAM_CONFIG = {
        "min_df": int,
        "ngram": int,
        "fix_clustering": boolean,
        "nr_splits": int,
        "min_codes": int,
        "max_leaf_size": int,
        "Cn": float,
        "Cp": float,
        "cost_sensitive_ranker": boolean,
        "rel_mode": str,
        "rel_norm": str,
        "neg_mining_chain": str,
        "imbalanced_ratio": float,
        "imbalanced_depth": int,
        "max_match_clusters": int,
        "do_fine_tune": boolean,
        "model_shortcut": str,
        "beam_size": int,
        "limit": int,
        "post_processor": str,
        "negative_sampling": str,
        "ensemble_method": str,
        "threshold": float,
        "loss_function": str,
        "truncate_length": int,
        "hidden_dropout_prob": float,
        "batch_size": int,
        "gradient_accumulation_steps": int,
        "learning_rate": float,
        "weight_decay": float,
        "adam_epsilon": float,
        "num_train_epochs": int,
        "max_steps": int,
        "lr_schedule": str,
        "warmup_steps": int,
        "logging_steps": int,
        "save_steps": int,
        "max_active_matching_labels": int,
        "max_num_labels_in_gpu": int,
        "use_gpu": boolean,
        "bootstrap_model": str,
    }
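
    # Illustrative note (assumption, not in the original source):
    # apply_param_parse_config (used in _create_model and _suggest_batch)
    # coerces raw string parameter values with the callables above, roughly:
    #   apply_param_parse_config(PARAM_CONFIG, {"min_df": "2", "use_gpu": "true"})
    #   -> {"min_df": 2, "use_gpu": True}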

    DEFAULT_PARAMETERS = {
        "min_df": 1,
        "ngram": 1,
        "fix_clustering": False,
        "nr_splits": 16,
        "min_codes": None,
        "max_leaf_size": 100,
        "Cn": 0.5,
        "Cp": 5.0,
        "cost_sensitive_ranker": True,
        "rel_mode": "induce",
        "rel_norm": "l1",
        "neg_mining_chain": "tfn+man",
        "imbalanced_ratio": 0.0,
        "imbalanced_depth": 100,
        "max_match_clusters": 32768,
        "do_fine_tune": True,
        "model_shortcut": "distilbert-base-multilingual-uncased",
        "beam_size": 20,
        "limit": 100,
        "post_processor": "sigmoid",
        "negative_sampling": "tfn",
        "ensemble_method": "transformer-only",
        "threshold": 0.1,
        "loss_function": "squared-hinge",
        "truncate_length": 128,
        "hidden_dropout_prob": 0.1,
        "batch_size": 32,
        "gradient_accumulation_steps": 1,
        "learning_rate": 1e-4,
        "weight_decay": 0.0,
        "adam_epsilon": 1e-8,
        "num_train_epochs": 1,
        "max_steps": 0,
        "lr_schedule": "linear",
        "warmup_steps": 0,
        "logging_steps": 100,
        "save_steps": 1000,
        "max_active_matching_labels": None,
        "max_num_labels_in_gpu": 65536,
        "use_gpu": True,
        "bootstrap_model": "linear",
    }
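
    # Hypothetical projects.cfg entry (section name and values are
    # illustrative, not from the original source); any of the defaults
    # above can be overridden per project:
    #
    #   [xtransformer-en]
    #   name=XTransformer English
    #   language=en
    #   backend=xtransformer
    #   vocab=my-vocab
    #   batch_size=16
    #   num_train_epochs=3
    #   use_gpu=False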

    def _initialize_model(self):
        if self._model is None:
            path = osp.join(self.datadir, self.model_folder)
            self.debug("loading model from {}".format(path))
            if osp.exists(path):
                self._model = XTransformer.load(path)
            else:
                raise NotInitializedException(
                    "model {} not found".format(path), backend_id=self.backend_id
                )

    def initialize(self, parallel: bool = False) -> None:
        self.initialize_vectorizer()
        self._initialize_model()

    def default_params(self):
        # backend-specific defaults take precedence over the common ones
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

    def _create_train_files(self, veccorpus, corpus):
        self.info("creating train file")
        Xs = []
        ys = []
        txt_pth = osp.join(self.datadir, self.train_txt_file)
        with open(txt_pth, "w", encoding="utf-8") as txt_file:
            for doc, vector in zip(corpus.documents, veccorpus):
                subject_set = doc.subject_set
                if not (subject_set and doc.text):
                    continue  # noqa
                print(" ".join(doc.text.split()), file=txt_file)
                Xs.append(sp.csr_matrix(vector, dtype=np.float32).sorted_indices())
                # one binary indicator row over the full subject vocabulary
                ys.append(
                    sp.csr_matrix(
                        (
                            np.ones(len(subject_set)),
                            (np.zeros(len(subject_set)), list(subject_set)),
                        ),
                        shape=(1, len(self.project.subjects)),
                        dtype=np.float32,
                    ).sorted_indices()
                )
        atomic_save(
            sp.vstack(Xs, format="csr"),
            self.datadir,
            self.train_X_file,
            method=lambda mtrx, target: sp.save_npz(target, mtrx, compressed=True),
        )
        atomic_save(
            sp.vstack(ys, format="csr"),
            self.datadir,
            self.train_y_file,
            method=lambda mtrx, target: sp.save_npz(target, mtrx, compressed=True),
        )
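
    # Worked example (illustrative, not from the original source): with
    # len(self.project.subjects) == 7 and subject_set == {2, 5}, the label
    # row built above is
    #   sp.csr_matrix((np.ones(2), ([0, 0], [2, 5])), shape=(1, 7))
    # i.e. the dense row [0, 0, 1, 0, 0, 1, 0].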

    def _create_model(self, params, jobs):
        train_txts = Preprocessor.load_data_from_file(
            osp.join(self.datadir, self.train_txt_file),
            label_text_path=None,
            text_pos=0,
        )["corpus"]
        train_X = sp.load_npz(osp.join(self.datadir, self.train_X_file))
        train_y = sp.load_npz(osp.join(self.datadir, self.train_y_file))
        model_path = osp.join(self.datadir, self.model_folder)
        new_params = apply_param_parse_config(self.PARAM_CONFIG, self.params)
        # pecos calls this parameter "only_topk"; Annif exposes it as "limit"
        new_params["only_topk"] = new_params.pop("limit")
        train_params = XTransformer.TrainParams.from_dict(
            new_params, recursive=True
        ).to_dict()
        pred_params = XTransformer.PredParams.from_dict(
            new_params, recursive=True
        ).to_dict()

        self.info("Start training")
        # enable progress logging from pecos
        matcher.LOGGER.setLevel(logging.DEBUG)
        matcher.LOGGER.addHandler(logging.StreamHandler(stream=sys.stdout))
        model.LOGGER.setLevel(logging.DEBUG)
        model.LOGGER.addHandler(logging.StreamHandler(stream=sys.stdout))
        self._model = XTransformer.train(
            MLProblemWithText(train_txts, train_y, X_feat=train_X),
            clustering=None,
            val_prob=None,
            train_params=train_params,
            pred_params=pred_params,
            beam_size=int(params["beam_size"]),
            steps_scale=None,
            label_feat=None,
        )
        atomic_save_folder(self._model, model_path)

    def _train(
        self,
        corpus: DocumentCorpus,
        params: dict[str, Any],
        jobs: int = 0,
    ) -> None:
        if corpus == "cached":
            self.info("Reusing cached training data from previous run.")
        else:
            if corpus.is_empty():
                raise NotSupportedException(
                    "Cannot train project with no documents"
                )
            inputs = (doc.text for doc in corpus.documents)
            vecparams = {
                "min_df": int(params["min_df"]),
                "tokenizer": self.project.analyzer.tokenize_words,
                "ngram_range": (1, int(params["ngram"])),
            }
            veccorpus = self.create_vectorizer(inputs, vecparams)
            self._create_train_files(veccorpus, corpus)
        self._create_model(params, jobs)

    def _suggest_batch(
        self, texts: list[str], params: dict[str, Any]
    ) -> SuggestionBatch:
        vector = self.vectorizer.transform(texts)
        if vector.nnz == 0:  # all-zero vectors: no usable input features
            # return an empty result per text instead of a bare list,
            # matching the declared SuggestionBatch return type
            return SuggestionBatch.from_sequence(
                [[] for _ in texts], self.project.subjects
            )
        new_params = apply_param_parse_config(self.PARAM_CONFIG, params)
        prediction = self._model.predict(
            texts,
            X_feat=vector.sorted_indices(),
            batch_size=new_params["batch_size"],
            # honour the configured use_gpu setting rather than hardcoding True
            use_gpu=new_params["use_gpu"],
            only_top_k=new_params["limit"],
            post_processor=new_params["post_processor"],
        )
        current_batchsize = prediction.get_shape()[0]
        batch_result = []
        for i in range(current_batchsize):
            results = []
            row = prediction.getrow(i)
            for idx, score in zip(row.indices, row.data):
                results.append(SubjectSuggestion(subject_id=idx, score=score))
            batch_result.append(results)
        return SuggestionBatch.from_sequence(batch_result, self.project.subjects)
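
The row-iteration pattern at the end of _suggest_batch can be exercised in isolation. The sketch below is illustrative only (toy scores, no pecos or Annif dependencies); it shows how the non-zero entries of each CSR prediction row map to (subject_id, score) pairs:

import numpy as np
import scipy.sparse as sp

# toy "prediction" matrix: 2 texts x 5 subjects
prediction = sp.csr_matrix(
    np.array(
        [
            [0.0, 0.9, 0.0, 0.3, 0.0],
            [0.5, 0.0, 0.0, 0.0, 0.1],
        ]
    )
)

batch_result = []
for i in range(prediction.get_shape()[0]):
    row = prediction.getrow(i)
    # row.indices holds the non-zero column ids (subject ids),
    # row.data the corresponding scores
    batch_result.append(
        [(int(idx), float(score)) for idx, score in zip(row.indices, row.data)]
    )

print(batch_result)  # [[(1, 0.9), (3, 0.3)], [(0, 0.5), (4, 0.1)]]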