Passed
Pull Request — master (#540)
created by unknown · 02:44

annif.backend.xtransformer — Rating: A

Complexity

Total Complexity 19

Size/Duplication

Total Lines 242
Duplicated Lines 90.91 %

Importance

Changes 0
Metric   Value
eloc     216
dl       220
loc      242
rs       10
c        0
b        0
f        0
wmc      19

7 Methods

Rating   Name                                         Duplication   Size   Complexity
A        XTransformerBackend.initialize()                       3      3            1
A        XTransformerBackend._create_model()                   34     34            1
A        XTransformerBackend._suggest()                        23     23            3
A        XTransformerBackend._train()                          14     14            3
A        XTransformerBackend._initialize_model()               10     10            3
B        XTransformerBackend._create_train_files()             38     38            7
A        XTransformerBackend.default_params()                   4      4            1

How to fix: Duplicated Code

Duplicate code is one of the most pungent code smells. A rule of thumb that is often used is to restructure code once it is duplicated in three or more places.

The usual solution to common duplication problems is to extract the shared logic into a single place, such as a helper function, base class, or mixin, and call it from each former copy.
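As a concrete illustration: the two atomic_save(...) calls at the end of _create_train_files() in the listing below differ only in the matrix being stacked and the target file name. A minimal sketch of extracting them into a shared helper (save_sparse is a hypothetical name, not part of the actual module):

import scipy.sparse as sp

from annif.util import atomic_save


def save_sparse(matrices, datadir, filename):
    """Stack sparse matrix rows and save them atomically as a compressed
    .npz file.

    Hypothetical helper capturing the two near-identical atomic_save(...)
    calls in _create_train_files() below.
    """
    atomic_save(
        sp.vstack(matrices, format='csr'),
        datadir,
        filename,
        method=lambda mtrx, target: sp.save_npz(
            target, mtrx, compressed=True))

With such a helper, _create_train_files() would end with save_sparse(Xs, self.datadir, self.train_X_file) and save_sparse(ys, self.datadir, self.train_y_file), leaving a single copy of the save logic to maintain.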

"""Annif backend using the transformer variant of pecos."""

from sys import stdout
import os.path as osp
import logging
import scipy.sparse as sp
import numpy as np

from annif.exception import NotInitializedException, NotSupportedException
from annif.suggestion import ListSuggestionResult, SubjectSuggestion
from . import mixins
from . import backend
from annif.util import boolean, apply_param_parse_config, atomic_save_folder, \
    atomic_save

from pecos.xmc.xtransformer.model import XTransformer
from pecos.xmc.xtransformer.module import MLProblemWithText
from pecos.utils.featurization.text.preprocess import Preprocessor
from pecos.xmc.xtransformer import matcher


class XTransformerBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
    """XTransformer based backend for Annif"""
    name = 'xtransformer'
    needs_subject_index = True

    _model = None

    train_X_file = 'xtransformer-train-X.npz'
    train_y_file = 'xtransformer-train-y.npz'
    train_txt_file = 'xtransformer-train-raw.txt'
    model_folder = 'xtransformer-model'

    # Parsers for the string-valued configuration parameters
    PARAM_CONFIG = {
        'min_df': int,
        'ngram': int,
        'fix_clustering': boolean,
        'nr_splits': int,
        'min_codes': int,
        'max_leaf_size': int,
        'imbalanced_ratio': float,
        'imbalanced_depth': int,
        'max_match_clusters': int,
        'do_fine_tune': boolean,
        'model_shortcut': str,
        'beam_size': int,
        'limit': int,
        'post_processor': str,
        'negative_sampling': str,
        'ensemble_method': str,
        'threshold': float,
        'loss_function': str,
        'truncate_length': int,
        'hidden_dropout_prob': float,
        'batch_size': int,
        'gradient_accumulation_steps': int,
        'learning_rate': float,
        'weight_decay': float,
        'adam_epsilon': float,
        'num_train_epochs': int,
        'max_steps': int,
        'lr_schedule': str,
        'warmup_steps': int,
        'logging_steps': int,
        'save_steps': int,
        'max_active_matching_labels': int,
        'max_num_labels_in_gpu': int,
        'use_gpu': boolean,
        'bootstrap_model': str
    }

    DEFAULT_PARAMETERS = {
        'min_df': 1,
        'ngram': 1,
        'fix_clustering': False,
        'nr_splits': 16,
        'min_codes': None,
        'max_leaf_size': 100,
        'imbalanced_ratio': 0.0,
        'imbalanced_depth': 100,
        'max_match_clusters': 32768,
        'do_fine_tune': True,
        'model_shortcut': 'distilbert-base-multilingual-cased',
        'beam_size': 20,
        'limit': 100,
        'post_processor': 'sigmoid',
        'negative_sampling': 'tfn',
        'ensemble_method': 'transformer-only',
        'threshold': 0.1,
        'loss_function': 'squared-hinge',
        'truncate_length': 128,
        'hidden_dropout_prob': 0.1,
        'batch_size': 32,
        'gradient_accumulation_steps': 1,
        'learning_rate': 1e-4,
        'weight_decay': 0.0,
        'adam_epsilon': 1e-8,
        'num_train_epochs': 1,
        'max_steps': 0,
        'lr_schedule': 'linear',
        'warmup_steps': 0,
        'logging_steps': 100,
        'save_steps': 1000,
        'max_active_matching_labels': None,
        'max_num_labels_in_gpu': 65536,
        'use_gpu': True,
        'bootstrap_model': 'linear'
    }

    def _initialize_model(self):
        if self._model is None:
            path = osp.join(self.datadir, self.model_folder)
            self.debug('loading model from {}'.format(path))
            if osp.exists(path):
                self._model = XTransformer.load(path)
            else:
                raise NotInitializedException(
                    'model {} not found'.format(path),
                    backend_id=self.backend_id)

    def initialize(self, parallel=False):
        self.initialize_vectorizer()
        self._initialize_model()

    def default_params(self):
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

    def _create_train_files(self, veccorpus, corpus):
        self.info('creating train file')
        Xs = []
        ys = []
        txt_pth = osp.join(self.datadir, self.train_txt_file)
        with open(txt_pth, 'w', encoding='utf-8') as txt_file:
            for doc, vector in zip(corpus.documents, veccorpus):
                subject_set = doc.subject_set
                if not (subject_set and doc.text):
                    continue  # noqa
                print(' '.join(doc.text.split()), file=txt_file)
                # Feature matrix row: the tf-idf vector of the document
                Xs.append(
                    sp.csr_matrix(vector, dtype=np.float32).sorted_indices())
                # Label matrix row: binary indicators over all subjects
                ys.append(
                    sp.csr_matrix((
                        np.ones(len(subject_set)),
                        (
                            np.zeros(len(subject_set)),
                            list(subject_set))),
                        shape=(1, len(self.project.subjects)),
                        dtype=np.float32
                        ).sorted_indices())
        atomic_save(
            sp.vstack(Xs, format='csr'),
            self.datadir,
            self.train_X_file,
            method=lambda mtrx, target: sp.save_npz(
                target,
                mtrx,
                compressed=True))
        atomic_save(
            sp.vstack(ys, format='csr'),
            self.datadir,
            self.train_y_file,
            method=lambda mtrx, target: sp.save_npz(
                target,
                mtrx,
                compressed=True))

    def _create_model(self, params, jobs):
        train_txts = Preprocessor.load_data_from_file(
            osp.join(self.datadir, self.train_txt_file),
            label_text_path=None,
            text_pos=0)['corpus']
        train_X = sp.load_npz(osp.join(self.datadir, self.train_X_file))
        train_y = sp.load_npz(osp.join(self.datadir, self.train_y_file))
        model_path = osp.join(self.datadir, self.model_folder)
        new_params = apply_param_parse_config(
            self.PARAM_CONFIG,
            self.params)
        new_params['only_topk'] = new_params.pop('limit')
        train_params = XTransformer.TrainParams.from_dict(
            new_params,
            recursive=True).to_dict()
        pred_params = XTransformer.PredParams.from_dict(
            new_params,
            recursive=True).to_dict()

        self.info('Start training')
        # enable progress
        matcher.LOGGER.setLevel(logging.INFO)
        matcher.LOGGER.addHandler(logging.StreamHandler(stream=stdout))
        self._model = XTransformer.train(
            MLProblemWithText(train_txts, train_y, X_feat=train_X),
            clustering=None,
            val_prob=None,
            train_params=train_params,
            pred_params=pred_params,
            beam_size=params['beam_size'],
            steps_scale=None,
            label_feat=None,
            )
        atomic_save_folder(self._model, model_path)

    def _train(self, corpus, params, jobs=0):
        if corpus == 'cached':
            self.info("Reusing cached training data from previous run.")
        else:
            if corpus.is_empty():
                raise NotSupportedException(
                    'Cannot train project with no documents')
            inputs = (doc.text for doc in corpus.documents)
            vecparams = {'min_df': int(params['min_df']),
                         'tokenizer': self.project.analyzer.tokenize_words,
                         'ngram_range': (1, int(params['ngram']))}
            veccorpus = self.create_vectorizer(inputs, vecparams)
            self._create_train_files(veccorpus, corpus)
        self._create_model(params, jobs)

    def _suggest(self, text, params):
        text = ' '.join(text.split())
        vector = self.vectorizer.transform([text])
        if vector.nnz == 0:  # All zero vector, empty result
            return ListSuggestionResult([])
        new_params = apply_param_parse_config(
            self.PARAM_CONFIG,
            params
        )
        prediction = self._model.predict(
            [text],
            X_feat=vector.sorted_indices(),
            batch_size=new_params['batch_size'],
            use_gpu=False,
            only_top_k=new_params['limit'],
            post_processor=new_params['post_processor'])
        results = []
        for idx, score in zip(prediction.indices, prediction.data):
            results.append(SubjectSuggestion(
                subject_id=idx,
                score=score
            ))
        return ListSuggestionResult(results)
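For context on where PARAM_CONFIG and DEFAULT_PARAMETERS come into play: the values are normally supplied as string parameters through Annif's project configuration and converted by apply_param_parse_config(). A minimal sketch, assuming the standard projects.cfg format (the project id, vocab and analyzer values here are placeholders):

[xtransformer-en]
name=XTransformer English
language=en
backend=xtransformer
analyzer=snowball(english)
vocab=my-vocab
batch_size=16
num_train_epochs=3
limit=100

Any key listed in PARAM_CONFIG can be overridden this way; keys left unspecified fall back to DEFAULT_PARAMETERS via default_params().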