Passed · Push — master (eb437a...a53d46) · by Juho · 03:20 (queued 15s)

XTransformerBackend._create_train_files() — rated B

Complexity
    Conditions: 7

Size
    Total Lines: 33
    Code Lines: 30

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric  Value
------  -----
cc      7
eloc    30
nop     3
dl      0
loc     33
rs      7.76
c       0
b       0
f       0
"""Annif backend using the transformer variant of pecos."""

import logging
import os.path as osp
from sys import stdout

import numpy as np
import scipy.sparse as sp
from pecos.utils.featurization.text.preprocess import Preprocessor
from pecos.xmc.xtransformer import matcher
from pecos.xmc.xtransformer.model import XTransformer
from pecos.xmc.xtransformer.module import MLProblemWithText

from annif.exception import NotInitializedException, NotSupportedException
from annif.suggestion import ListSuggestionResult, SubjectSuggestion
from annif.util import (
    apply_param_parse_config,
    atomic_save,
    atomic_save_folder,
    boolean,
)

from . import backend, mixins


class XTransformerBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
    """XTransformer based backend for Annif"""

    name = "xtransformer"
    needs_subject_index = True

    _model = None

    train_X_file = "xtransformer-train-X.npz"
    train_y_file = "xtransformer-train-y.npz"
    train_txt_file = "xtransformer-train-raw.txt"
    model_folder = "xtransformer-model"

    PARAM_CONFIG = {
        "min_df": int,
        "ngram": int,
        "fix_clustering": boolean,
        "nr_splits": int,
        "min_codes": int,
        "max_leaf_size": int,
        "imbalanced_ratio": float,
        "imbalanced_depth": int,
        "max_match_clusters": int,
        "do_fine_tune": boolean,
        "model_shortcut": str,
        "beam_size": int,
        "limit": int,
        "post_processor": str,
        "negative_sampling": str,
        "ensemble_method": str,
        "threshold": float,
        "loss_function": str,
        "truncate_length": int,
        "hidden_dropout_prob": float,
        "batch_size": int,
        "gradient_accumulation_steps": int,
        "learning_rate": float,
        "weight_decay": float,
        "adam_epsilon": float,
        "num_train_epochs": int,
        "max_steps": int,
        "lr_schedule": str,
        "warmup_steps": int,
        "logging_steps": int,
        "save_steps": int,
        "max_active_matching_labels": int,
        "max_num_labels_in_gpu": int,
        "use_gpu": boolean,
        "bootstrap_model": str,
    }

    DEFAULT_PARAMETERS = {
        "min_df": 1,
        "ngram": 1,
        "fix_clustering": False,
        "nr_splits": 16,
        "min_codes": None,
        "max_leaf_size": 100,
        "imbalanced_ratio": 0.0,
        "imbalanced_depth": 100,
        "max_match_clusters": 32768,
        "do_fine_tune": True,
        "model_shortcut": "distilbert-base-multilingual-cased",
        "beam_size": 20,
        "limit": 100,
        "post_processor": "sigmoid",
        "negative_sampling": "tfn",
        "ensemble_method": "transformer-only",
        "threshold": 0.1,
        "loss_function": "squared-hinge",
        "truncate_length": 128,
        "hidden_dropout_prob": 0.1,
        "batch_size": 32,
        "gradient_accumulation_steps": 1,
        "learning_rate": 1e-4,
        "weight_decay": 0.0,
        "adam_epsilon": 1e-8,
        "num_train_epochs": 1,
        "max_steps": 0,
        "lr_schedule": "linear",
        "warmup_steps": 0,
        "logging_steps": 100,
        "save_steps": 1000,
        "max_active_matching_labels": None,
        "max_num_labels_in_gpu": 65536,
        "use_gpu": True,
        "bootstrap_model": "linear",
    }
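    # NOTE: project configuration values arrive as strings; apply_param_parse_config
    # coerces them to typed values using the PARAM_CONFIG mapping above, e.g.
    # (hypothetical input values):
    #     apply_param_parse_config(PARAM_CONFIG, {"min_df": "2", "use_gpu": "false"})
    #     # -> {"min_df": 2, "use_gpu": False}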
    def _initialize_model(self):
        if self._model is None:
            path = osp.join(self.datadir, self.model_folder)
            self.debug("loading model from {}".format(path))
            if osp.exists(path):
                self._model = XTransformer.load(path)
            else:
                raise NotInitializedException(
                    "model {} not found".format(path), backend_id=self.backend_id
                )

    def initialize(self, parallel=False):
        self.initialize_vectorizer()
        self._initialize_model()

    def default_params(self):
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

    def _create_train_files(self, veccorpus, corpus):
        self.info("creating train file")
        Xs = []
        ys = []
        txt_pth = osp.join(self.datadir, self.train_txt_file)
        with open(txt_pth, "w", encoding="utf-8") as txt_file:
            for doc, vector in zip(corpus.documents, veccorpus):
                subject_set = doc.subject_set
                if not (subject_set and doc.text):
                    continue  # noqa
                print(" ".join(doc.text.split()), file=txt_file)
                Xs.append(sp.csr_matrix(vector, dtype=np.float32).sorted_indices())
                ys.append(
                    sp.csr_matrix(
                        (
                            np.ones(len(subject_set)),
                            (np.zeros(len(subject_set)), list(subject_set)),
                        ),
                        shape=(1, len(self.project.subjects)),
                        dtype=np.float32,
                    ).sorted_indices()
                )
        atomic_save(
            sp.vstack(Xs, format="csr"),
            self.datadir,
            self.train_X_file,
            method=lambda mtrx, target: sp.save_npz(target, mtrx, compressed=True),
        )
        atomic_save(
            sp.vstack(ys, format="csr"),
            self.datadir,
            self.train_y_file,
            method=lambda mtrx, target: sp.save_npz(target, mtrx, compressed=True),
        )

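    # Each ys entry above is a single-row CSR matrix: the data is a run of
    # ones, every row index is 0, and the column indices are the document's
    # subject ids, giving a (1, n_subjects) multi-hot row. sp.vstack then
    # stacks the rows into an (n_documents, n_subjects) training matrix.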
    def _create_model(self, params, jobs):
        train_txts = Preprocessor.load_data_from_file(
            osp.join(self.datadir, self.train_txt_file),
            label_text_path=None,
            text_pos=0,
        )["corpus"]
        train_X = sp.load_npz(osp.join(self.datadir, self.train_X_file))
        train_y = sp.load_npz(osp.join(self.datadir, self.train_y_file))
        model_path = osp.join(self.datadir, self.model_folder)
        new_params = apply_param_parse_config(self.PARAM_CONFIG, self.params)
        new_params["only_topk"] = new_params.pop("limit")
        train_params = XTransformer.TrainParams.from_dict(
            new_params, recursive=True
        ).to_dict()
        pred_params = XTransformer.PredParams.from_dict(
            new_params, recursive=True
        ).to_dict()

        self.info("Start training")
        # enable progress
        matcher.LOGGER.setLevel(logging.INFO)
        matcher.LOGGER.addHandler(logging.StreamHandler(stream=stdout))
        self._model = XTransformer.train(
            MLProblemWithText(train_txts, train_y, X_feat=train_X),
            clustering=None,
            val_prob=None,
            train_params=train_params,
            pred_params=pred_params,
            beam_size=params["beam_size"],
            steps_scale=None,
            label_feat=None,
        )
        atomic_save_folder(self._model, model_path)

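    # MLProblemWithText pairs the raw training texts with the tf-idf feature
    # matrix (X_feat) and the multi-hot label matrix built above. The flat
    # new_params dict is split into train-time and prediction-time settings
    # via TrainParams/PredParams.from_dict(recursive=True), which appears to
    # pick out the fields each nested parameter set recognizes (an assumption
    # about pecos' from_dict behavior, not verified here).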
    def _train(self, corpus, params, jobs=0):
        if corpus == "cached":
            self.info("Reusing cached training data from previous run.")
        else:
            if corpus.is_empty():
                raise NotSupportedException(
                    "Cannot train project with no documents"
                )
            input = (doc.text for doc in corpus.documents)
            vecparams = {
                "min_df": int(params["min_df"]),
                "tokenizer": self.project.analyzer.tokenize_words,
                "ngram_range": (1, int(params["ngram"])),
            }
            veccorpus = self.create_vectorizer(input, vecparams)
            self._create_train_files(veccorpus, corpus)
        self._create_model(params, jobs)

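    # vecparams mirrors scikit-learn TfidfVectorizer arguments (assuming
    # TfidfVectorizerMixin wraps sklearn's TfidfVectorizer): with hypothetical
    # project settings min_df="2" and ngram="2", the vectorizer would drop
    # terms occurring in fewer than two documents and index both unigrams and
    # bigrams (ngram_range=(1, 2)).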
    def _suggest(self, text, params):
        text = " ".join(text.split())
        vector = self.vectorizer.transform([text])
        if vector.nnz == 0:  # All zero vector, empty result
            return ListSuggestionResult([])
        new_params = apply_param_parse_config(self.PARAM_CONFIG, params)
        prediction = self._model.predict(
            [text],
            X_feat=vector.sorted_indices(),
            batch_size=new_params["batch_size"],
            use_gpu=False,
            only_top_k=new_params["limit"],
            post_processor=new_params["post_processor"],
        )
        results = []
        for idx, score in zip(prediction.indices, prediction.data):
            results.append(SubjectSuggestion(subject_id=idx, score=score))
        return ListSuggestionResult(results)
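A minimal standalone sketch of the label-matrix round trip performed by
_create_train_files and _create_model above, with hypothetical subject ids
and a made-up vocabulary size; only the numpy/scipy calls are taken from the
listing (atomic_save's temp-file handling is omitted):

import numpy as np
import scipy.sparse as sp

n_subjects = 10  # hypothetical vocabulary size
docs_subjects = [[1, 4], [0, 4, 7]]  # hypothetical subject ids per document

rows = []
for subject_ids in docs_subjects:
    # One multi-hot row per document: ones at the subject-id columns.
    row = sp.csr_matrix(
        (np.ones(len(subject_ids)), (np.zeros(len(subject_ids)), subject_ids)),
        shape=(1, n_subjects),
        dtype=np.float32,
    ).sorted_indices()
    rows.append(row)

train_y = sp.vstack(rows, format="csr")  # shape (2, 10)
sp.save_npz("train-y.npz", train_y, compressed=True)  # as in atomic_save's method
assert sp.load_npz("train-y.npz").shape == (2, 10)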