Passed
Pull Request against master (#663) by Juho
created 03:19

annif.project (rating: C)

Complexity
    Total Complexity: 55

Size/Duplication
    Total Lines: 298
    Duplicated Lines: 0 %

Importance
    Changes: 0
Metric    Value
eloc      217
dl        0
loc       298
rs        6
c         0
b         0
f         0
wmc       55

26 Methods

Rating   Name   Duplication   Size   Complexity  
A AnnifProject._get_info() 0 8 3
A AnnifProject.subjects() 0 3 1
A AnnifProject.modification_time() 0 3 1
A AnnifProject._initialize_vocab() 0 7 2
A AnnifProject.suggest() 0 15 3
A AnnifProject.analyzer() 0 10 3
A AnnifProject.transform() 0 7 2
A AnnifProject.vocab_lang() 0 5 2
A AnnifProject.is_trained() 0 3 1
A AnnifProject.vocab() 0 5 2
A AnnifProject.backend() 0 20 4
A AnnifProject._initialize_subjects() 0 8 2
A AnnifProject._initialize_backend() 0 9 3
A AnnifProject._init_access() 0 8 2
A AnnifProject._suggest_with_backend() 0 7 2
A AnnifProject.__init__() 0 12 1
A AnnifProject._initialize_analyzer() 0 6 2
A AnnifProject.initialize() 0 15 2
A AnnifProject._suggest_batch_with_backend() 0 5 2
A AnnifProject.remove_model_data() 0 9 2
A AnnifProject.dump() 0 9 1
A AnnifProject.hyperopt() 0 10 2
A AnnifProject.suggest_corpus() 0 7 1
A AnnifProject.train() 0 8 3
A AnnifProject.learn() 0 11 3
A AnnifProject.suggest_batch() 0 9 3

How to fix: Complexity

Complex classes like annif.project often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
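In annif.project, one such cohesive group is the vocabulary access: the _vocab and _vocab_lang fields together with _initialize_vocab(), vocab, vocab_lang and subjects all share the vocab prefix or depend on it. The fragment below is a minimal, hypothetical sketch of what extracting that group could look like; the ProjectVocabAccess name and its constructor are illustrative assumptions, not code from the Annif repository.

# Hypothetical Extract Class sketch (ProjectVocabAccess is not part of Annif):
# the vocab-related state and lazy initialization move out of AnnifProject.
from annif.exception import ConfigurationException


class ProjectVocabAccess:
    """Lazily resolved access to the vocabulary configured for a project."""

    def __init__(self, project_id, vocab_spec, language, registry):
        self.project_id = project_id
        self.vocab_spec = vocab_spec
        self.language = language
        self.registry = registry
        self._vocab = None
        self._vocab_lang = None

    def _initialize(self):
        # same checks and registry lookup as AnnifProject._initialize_vocab()
        if self.vocab_spec is None:
            raise ConfigurationException(
                "vocab setting is missing", project_id=self.project_id
            )
        self._vocab, self._vocab_lang = self.registry.get_vocab(
            self.vocab_spec, self.language
        )

    @property
    def vocab(self):
        if self._vocab is None:
            self._initialize()
        return self._vocab

    @property
    def vocab_lang(self):
        if self._vocab_lang is None:
            self._initialize()
        return self._vocab_lang

    @property
    def subjects(self):
        return self.vocab.subjects

AnnifProject would then hold a single instance of this helper and delegate its vocab, vocab_lang and subjects properties to it, moving several lazily initialized fields and their branches out of the 55-point WMC without changing the class's public API. The annif.project module as analysed in this inspection follows below.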

"""Project management functionality for Annif"""

import enum
import itertools
import os.path
from shutil import rmtree

import annif
import annif.analyzer
import annif.backend
import annif.corpus
import annif.suggestion
import annif.transform
from annif.datadir import DatadirMixin
from annif.exception import (
    AnnifException,
    ConfigurationException,
    NotInitializedException,
    NotSupportedException,
)

logger = annif.logger


class Access(enum.IntEnum):
    """Enumeration of access levels for projects"""

    private = 1
    hidden = 2
    public = 3


class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project."""

    # defaults for uninitialized instances
    _transform = None
    _analyzer = None
    _backend = None
    _vocab = None
    _vocab_lang = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = "public"
    DOC_BATCH_SIZE = 32

    def __init__(self, project_id, config, datadir, registry):
        DatadirMixin.__init__(self, datadir, "projects", project_id)
        self.project_id = project_id
        self.name = config.get("name", project_id)
        self.language = config["language"]
        self.analyzer_spec = config.get("analyzer", None)
        self.transform_spec = config.get("transform", "pass")
        self.vocab_spec = config.get("vocab", None)
        self.config = config
        self._base_datadir = datadir
        self.registry = registry
        self._init_access()

    def _init_access(self):
        access = self.config.get("access", self.DEFAULT_ACCESS)
        try:
            self.access = getattr(Access, access)
        except AttributeError:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id,
            )

    def _initialize_analyzer(self):
        if not self.analyzer_spec:
            return  # not configured, so assume it's not needed
        analyzer = self.analyzer
        logger.debug(
            "Project '%s': initialized analyzer: %s", self.project_id, str(analyzer)
        )

    def _initialize_subjects(self):
        try:
            subjects = self.subjects
            logger.debug(
                "Project '%s': initialized subjects: %s", self.project_id, str(subjects)
            )
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self, parallel):
        logger.debug("Project '%s': initializing backend", self.project_id)
        try:
            if not self.backend:
                logger.debug("Cannot initialize backend: does not exist")
                return
            self.backend.initialize(parallel)
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self, parallel=False):
        """Initialize this project and its backend so that they are ready to
        be used. If parallel is True, expect that the project will be used
        for parallel processing."""

        if self.initialized:
            return

        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_backend(parallel)

        self.initialized = True

    def _suggest_with_backend(self, text, backend_params):
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        hits = self.backend.suggest(text, beparams)
        logger.debug("Got %d hits from backend %s", len(hits), self.backend.backend_id)
        return hits

    def _suggest_batch_with_backend(self, texts, backend_params):
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        return self.backend.suggest_batch(texts, beparams)

    @property
    def analyzer(self):
        if self._analyzer is None:
            if self.analyzer_spec:
                self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
            else:
                raise ConfigurationException(
                    "analyzer setting is missing", project_id=self.project_id
                )
        return self._analyzer

    @property
    def transform(self):
        if self._transform is None:
            self._transform = annif.transform.get_transform(
                self.transform_spec, project=self
            )
        return self._transform

    @property
    def backend(self):
        if self._backend is None:
            if "backend" not in self.config:
                raise ConfigurationException(
                    "backend setting is missing", project_id=self.project_id
                )
            backend_id = self.config["backend"]
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, config_params=self.config, project=self
                )
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id,
                )
        return self._backend

    def _initialize_vocab(self):
        if self.vocab_spec is None:
            raise ConfigurationException(
                "vocab setting is missing", project_id=self.project_id
            )
        self._vocab, self._vocab_lang = self.registry.get_vocab(
            self.vocab_spec, self.language
        )

    @property
    def vocab(self):
        if self._vocab is None:
            self._initialize_vocab()
        return self._vocab

    @property
    def vocab_lang(self):
        if self._vocab_lang is None:
            self._initialize_vocab()
        return self._vocab_lang

    @property
    def subjects(self):
        return self.vocab.subjects

    def _get_info(self, key):
        try:
            be = self.backend
            if be is not None:
                return getattr(be, key)
        except AnnifException as err:
            logger.warning(err.format_message())
            return None

    @property
    def is_trained(self):
        return self._get_info("is_trained")

    @property
    def modification_time(self):
        return self._get_info("modification_time")

    def suggest(self, text, backend_params=None):
        """Suggest subjects for the given text by passing it to the backend.
        Returns a list of SubjectSuggestion objects ordered by decreasing score."""
        if not self.is_trained:
            if self.is_trained is None:
                logger.warning("Could not get train state information.")
            else:
                raise NotInitializedException("Project is not trained.")
        logger.debug(
            'Suggesting subjects for text "%s..." (len=%d)', text[:20], len(text)
        )
        text = self.transform.transform_text(text)
        hits = self._suggest_with_backend(text, backend_params)
        logger.debug("%d hits from backend", len(hits))
        return hits

    def suggest_corpus(self, corpus, backend_params=None):
        """Suggest subjects for the given documents corpus in batches of documents."""
        suggestions = (
            self.suggest_batch([doc.text for doc in doc_batch], backend_params)
            for doc_batch in corpus.doc_batches(self.DOC_BATCH_SIZE)
        )
        return itertools.chain.from_iterable(suggestions)

    def suggest_batch(self, texts, backend_params=None):
        """Suggest subjects for the given documents batch."""
        if not self.is_trained:
            if self.is_trained is None:
                logger.warning("Could not get train state information.")
            else:
                raise NotInitializedException("Project is not trained.")
        texts = [self.transform.transform_text(text) for text in texts]
        return self._suggest_batch_with_backend(texts, backend_params)

    def train(self, corpus, backend_params=None, jobs=0):
        """train the project using documents from a metadata source"""
        if corpus != "cached":
            corpus = self.transform.transform_corpus(corpus)
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        self.backend.train(corpus, beparams, jobs)

    def learn(self, corpus, backend_params=None):
        """further train the project using documents from a metadata source"""
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        corpus = self.transform.transform_corpus(corpus)
        if isinstance(self.backend, annif.backend.backend.AnnifLearningBackend):
            self.backend.learn(corpus, beparams)
        else:
            raise NotSupportedException(
                "Learning not supported by backend", project_id=self.project_id
            )

    def hyperopt(self, corpus, trials, jobs, metric, results_file):
        """optimize the hyperparameters of the project using a validation
        corpus against a given metric"""
        if isinstance(self.backend, annif.backend.hyperopt.AnnifHyperoptBackend):
            optimizer = self.backend.get_hp_optimizer(corpus, metric)
            return optimizer.optimize(trials, jobs, results_file)

        raise NotSupportedException(
            "Hyperparameter optimization not supported by backend",
            project_id=self.project_id,
        )

    def dump(self):
        """return this project as a dict"""
        return {
            "project_id": self.project_id,
            "name": self.name,
            "language": self.language,
            "backend": {"backend_id": self.config.get("backend")},
            "is_trained": self.is_trained,
            "modification_time": self.modification_time,
        }

    def remove_model_data(self):
        """remove the data of this project"""
        datadir_path = self._datadir_path
        if os.path.isdir(datadir_path):
            rmtree(datadir_path)
            logger.info("Removed model data for project {}.".format(self.project_id))
        else:
            logger.warning(
                "No model data to remove for project {}.".format(self.project_id)
            )
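
For orientation, the public entry points above (initialize(), suggest(), suggest_corpus(), train(), learn() and dump()) are normally driven through Annif's CLI and REST API via the project registry, which builds AnnifProject instances from the projects configuration file. The fragment below is only a rough, hypothetical sketch of that flow using the constructor and methods shown in the listing; the configuration values, the tfidf backend, the example-vocab vocabulary and the registry object are assumptions made for illustration.

# Hypothetical usage sketch; in real deployments the Annif registry constructs
# and caches AnnifProject objects from the projects configuration file.
config = {
    "name": "Example project",       # optional, defaults to the project_id
    "language": "en",
    "backend": "tfidf",              # assumes this backend's dependencies are installed
    "analyzer": "snowball(english)",
    "vocab": "example-vocab",        # assumes this vocabulary is known to the registry
}
project = AnnifProject("example-en", config, "data/", registry)  # 'registry' assumed
project.initialize()

# Assuming the backend has already been trained (otherwise project.train(corpus)
# must be run first), suggest() returns SubjectSuggestion objects ordered by
# decreasing score.
suggestions = project.suggest("Text of the document to be indexed.")
for hit in suggestions:
    print(hit)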