Passed
Pull Request — master (#540), by unknown, created 02:12

XTransformerBackend._create_train_files() — rated B

Complexity: Conditions 7
Size: Total Lines 42, Code Lines 42
Duplication: Lines 0, Ratio 0 %
Importance: Changes 0
Metric  Value
cc      7
eloc    42
nop     3
dl      0
loc     42
rs      7.472
c       0
b       0
f       0
"""Annif backend using the transformer variant of pecos."""

from sys import stdout
import os.path as osp
import logging
import scipy.sparse as sp
import numpy as np

from annif.exception import NotInitializedException, NotSupportedException
from annif.suggestion import ListSuggestionResult, SubjectSuggestion
from . import mixins
from . import backend
from annif.util import boolean, apply_param_parse_config, atomic_save

from pecos.xmc.xtransformer.model import XTransformer
from pecos.xmc.xtransformer.module import MLProblemWithText
from pecos.utils.featurization.text.preprocess import Preprocessor
from pecos.xmc.xtransformer import matcher


class XTransformerBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
    """XTransformer based backend for Annif"""
    name = 'xtransformer'
    needs_subject_index = True

    _model = None

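    # File names of the cached training artifacts and the model folder,
    # all stored under the project's data directory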
    train_X_file = 'xtransformer-train-X.npz'
    train_y_file = 'xtransformer-train-y.npz'
    train_txt_file = 'xtransformer-train-raw.txt'
    model_folder = 'xtransformer-model'

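    # Parser functions applied to the raw string parameters from the
    # project configuration (see apply_param_parse_config below)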
    PARAM_CONFIG = {
        'min_df': int,
        'ngram': int,
        'fix_clustering': boolean,
        'nr_splits': int,
        'min_codes': int,
        'max_leaf_size': int,
        'imbalanced_ratio': float,
        'imbalanced_depth': int,
        'max_match_clusters': int,
        'do_fine_tune': boolean,
        'model_shortcut': str,
        'beam_size': int,
        'limit': int,
        'post_processor': str,
        'negative_sampling': str,
        'ensemble_method': str,
        'threshold': float,
        'loss_function': str,
        'truncate_length': int,
        'hidden_dropout_prob': float,
        'batch_size': int,
        'gradient_accumulation_steps': int,
        'learning_rate': float,
        'weight_decay': float,
        'adam_epsilon': float,
        'num_train_epochs': int,
        'max_steps': int,
        'lr_schedule': str,
        'warmup_steps': int,
        'logging_steps': int,
        'save_steps': int,
        'max_active_matching_labels': int,
        'max_num_labels_in_gpu': int,
        'use_gpu': boolean,
        'bootstrap_model': str
    }

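    # Backend-specific defaults, merged over the generic AnnifBackend
    # defaults in default_params()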
    DEFAULT_PARAMETERS = {
        'min_df': 1,
        'ngram': 1,
        'fix_clustering': False,
        'nr_splits': 16,
        'min_codes': None,
        'max_leaf_size': 100,
        'imbalanced_ratio': 0.0,
        'imbalanced_depth': 100,
        'max_match_clusters': 32768,
        'do_fine_tune': True,
        # 'model_shortcut': 'distilbert-base-multilingual-cased',
        'model_shortcut': 'bert-base-multilingual-uncased',
        'beam_size': 20,
        'limit': 100,
        'post_processor': 'sigmoid',
        'negative_sampling': 'tfn',
        'ensemble_method': 'transformer-only',
        'threshold': 0.1,
        'loss_function': 'squared-hinge',
        'truncate_length': 128,
        'hidden_dropout_prob': 0.1,
        'batch_size': 32,
        'gradient_accumulation_steps': 1,
        'learning_rate': 1e-4,
        'weight_decay': 0.0,
        'adam_epsilon': 1e-8,
        'num_train_epochs': 1,
        'max_steps': 0,
        'lr_schedule': 'linear',
        'warmup_steps': 0,
        'logging_steps': 100,
        'save_steps': 1000,
        'max_active_matching_labels': None,
        'max_num_labels_in_gpu': 65536,
        'use_gpu': True,
        'bootstrap_model': 'linear'
    }

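    # Lazily load a previously trained model from the data directory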
    def _initialize_model(self):
        if self._model is None:
            path = osp.join(self.datadir, self.model_folder)
            self.debug('loading model from {}'.format(path))
            if osp.exists(path):
                self._model = XTransformer.load(path)
            else:
                raise NotInitializedException(
                    'model {} not found'.format(path),
                    backend_id=self.backend_id)

    def initialize(self, parallel=False):
        self.initialize_vectorizer()
        self._initialize_model()

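    # Combine the generic backend defaults with the backend-specific ones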
    def default_params(self):
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

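    # Write the training corpus to disk as a raw text file plus sparse
    # feature (X) and label (y) matrices, one row per document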
    def _create_train_files(self, veccorpus, corpus):
        self.info('creating train file')
        Xs = []
        ys = []
        txt_pth = osp.join(self.datadir, self.train_txt_file)
        with open(txt_pth, 'w', encoding='utf-8') as txt_file:
            for doc, vector in zip(corpus.documents, veccorpus):
                subject_ids = [
                    self.project.subjects.by_uri(uri)
                    for uri
                    in doc.uris]
                subject_ids = [s_id for s_id in subject_ids if s_id]
                if not (subject_ids and doc.text):
                    continue  # noqa
                print(' '.join(doc.text.split()), file=txt_file)
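                # TF-IDF features for this document as a sorted CSR row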
                Xs.append(
                    sp.csr_matrix(vector, dtype=np.float32).sorted_indices())
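                # One-hot label row: 1.0 in each subject id column,
                # shape (1, number of subjects)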
                ys.append(
                    sp.csr_matrix((
                        np.ones(len(subject_ids)),
                        (
                            np.zeros(len(subject_ids)),
                            subject_ids)),
                        shape=(1, len(self.project.subjects)),
                        dtype=np.float32
                        ).sorted_indices())
        atomic_save(
            sp.vstack(Xs, format='csr'),
            self.datadir,
            self.train_X_file,
            method=lambda mtrx, target: sp.save_npz(
                target,
                mtrx,
                compressed=True))
        atomic_save(
            sp.vstack(ys, format='csr'),
            self.datadir,
            self.train_y_file,
            method=lambda mtrx, target: sp.save_npz(
                target,
                mtrx,
                compressed=True))

    def _create_model(self, params, jobs):
        train_txts = Preprocessor.load_data_from_file(
            osp.join(self.datadir, self.train_txt_file),
            label_text_path=None,
            text_pos=0)['corpus']
        train_X = sp.load_npz(osp.join(self.datadir, self.train_X_file))
        train_y = sp.load_npz(osp.join(self.datadir, self.train_y_file))
        model_path = osp.join(self.datadir, self.model_folder)
        new_params = apply_param_parse_config(
            self.PARAM_CONFIG,
            self.params)
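        # Annif's 'limit' parameter corresponds to pecos' 'only_topk'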
        new_params['only_topk'] = new_params.pop('limit')
        train_params = XTransformer.TrainParams.from_dict(
            new_params,
            recursive=True).to_dict()
        pred_params = XTransformer.PredParams.from_dict(
            new_params,
            recursive=True).to_dict()

        self.info('Start training')
        # enable progress logging from the pecos matcher
        matcher.LOGGER.setLevel(logging.INFO)
        matcher.LOGGER.addHandler(logging.StreamHandler(stream=stdout))
        self._model = XTransformer.train(
            MLProblemWithText(train_txts, train_y, X_feat=train_X),
            clustering=None,
            val_prob=None,
            train_params=train_params,
            pred_params=pred_params,
            beam_size=params['beam_size'],
            steps_scale=None,
            label_feat=None,
            )
        atomic_save(self._model, model_path, None)

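    # Train the backend: vectorize the corpus (or reuse cached training
    # files) and fit the XTransformer model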
    def _train(self, corpus, params, jobs=0):
        if corpus == 'cached':
            self.info("Reusing cached training data from previous run.")
        else:
            if corpus.is_empty():
                raise NotSupportedException(
                    'Cannot train project with no documents')
            input = (doc.text for doc in corpus.documents)
            vecparams = {'min_df': int(params['min_df']),
                         'tokenizer': self.project.analyzer.tokenize_words,
                         'ngram_range': (1, int(params['ngram']))}
            veccorpus = self.create_vectorizer(input, vecparams)
            self._create_train_files(veccorpus, corpus)
        self._create_model(params, jobs)

    def _suggest(self, text, params):
        text = ' '.join(text.split())
        vector = self.vectorizer.transform([text])
        if vector.nnz == 0:  # All zero vector, empty result
            return ListSuggestionResult([])
        new_params = apply_param_parse_config(
            self.PARAM_CONFIG,
            params
        )
        prediction = self._model.predict(
            [text],
            X_feat=vector.sorted_indices(),
            batch_size=new_params['batch_size'],
            use_gpu=new_params['use_gpu'],
            only_top_k=new_params['limit'],
            post_processor=new_params['post_processor'])
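        # prediction is a 1 x n_subjects sparse row: its indices are
        # subject ids and its data are the corresponding scores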
        results = []
        for idx, score in zip(prediction.indices, prediction.data):
            subject = self.project.subjects[idx]
            results.append(SubjectSuggestion(
                uri=subject[0],
                label=subject[1],
                notation=subject[2],
                score=score
            ))
        return ListSuggestionResult(results)
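
For reference, a minimal standalone sketch of the label-matrix construction in _create_train_files(): each document's subject ids become a one-hot CSR row. The vocabulary size and subject ids below are hypothetical values, not taken from the code above.

import numpy as np
import scipy.sparse as sp

n_subjects = 10          # assumed vocabulary size
subject_ids = [2, 5, 7]  # assumed subject ids for one document

# 1.0 at (row 0, column subject_id), as in _create_train_files()
row = sp.csr_matrix(
    (np.ones(len(subject_ids)),
     (np.zeros(len(subject_ids)), subject_ids)),
    shape=(1, n_subjects), dtype=np.float32).sorted_indices()

print(row.toarray())
# [[0. 0. 1. 0. 0. 1. 0. 1. 0. 0.]]

Stacking one such row per document with sp.vstack(..., format='csr') yields the train_y matrix that is saved via save_npz and later fed to MLProblemWithText in _create_model().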