Passed
Pull Request — master (#663)
by Juho
03:01
created

annif.project   B

Complexity

Total Complexity 50

Size/Duplication

Total Lines 273
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 198
dl 0
loc 273
rs 8.4
c 0
b 0
f 0
wmc 50

24 Methods

Rating   Name   Duplication   Size   Complexity  
A AnnifProject._get_info() 0 8 3
A AnnifProject.subjects() 0 3 1
A AnnifProject.modification_time() 0 3 1
A AnnifProject._initialize_vocab() 0 7 2
A AnnifProject.analyzer() 0 10 3
A AnnifProject.transform() 0 7 2
A AnnifProject.vocab_lang() 0 5 2
A AnnifProject.is_trained() 0 3 1
A AnnifProject.vocab() 0 5 2
A AnnifProject.backend() 0 20 4
A AnnifProject._initialize_subjects() 0 8 2
A AnnifProject._initialize_backend() 0 9 3
A AnnifProject._init_access() 0 8 2
A AnnifProject.__init__() 0 12 1
A AnnifProject._initialize_analyzer() 0 6 2
A AnnifProject.initialize() 0 15 2
A AnnifProject.remove_model_data() 0 9 2
A AnnifProject.dump() 0 9 1
A AnnifProject.hyperopt() 0 10 2
A AnnifProject.suggest_corpus() 0 7 1
A AnnifProject.train() 0 8 3
A AnnifProject._suggest_with_backend() 0 5 2
A AnnifProject.suggest() 0 9 3
A AnnifProject.learn() 0 11 3

How to fix   Complexity   

Complexity

Complex classes like the one in annif.project often do a lot of different things. To break such a class down, we need to identify a cohesive component within it. A common way to find such a component is to look for fields and methods that share the same prefix or suffix; in this class, for example, the methods whose names start with `_initialize_`.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
"""Project management functionality for Annif"""
2
3
import enum
4
import itertools
5
import os.path
6
from shutil import rmtree
7
8
import annif
9
import annif.analyzer
10
import annif.backend
11
import annif.corpus
12
import annif.suggestion
13
import annif.transform
14
from annif.datadir import DatadirMixin
15
from annif.exception import (
16
    AnnifException,
17
    ConfigurationException,
18
    NotInitializedException,
19
    NotSupportedException,
20
)
21
22
# Module-wide logger: reuse the logger object exposed by the annif package
# instead of creating a separate one for this module.
logger = annif.logger
23
24
25
@enum.unique
class Access(enum.IntEnum):
    """Access levels that can be assigned to a project.

    The members are integers, so levels can be compared with each other:
    private < hidden < public.
    """

    # Explicit values keep the ordering obvious and stable.
    private = 1
    hidden = 2
    public = 3
31
32
33
class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project.

    The analyzer, transform, backend and vocabulary objects are created
    lazily the first time the corresponding property is read and are then
    cached on the instance.
    """

    # defaults for uninitialized instances
    _transform = None
    _analyzer = None
    _backend = None
    _vocab = None
    _vocab_lang = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = "public"

    def __init__(self, project_id, config, datadir, registry):
        """Store the project settings; nothing is loaded at this point.

        project_id -- identifier of the project
        config -- mapping of configuration settings; "language" is
            required, "name", "analyzer", "transform", "vocab", "access"
            and "backend" are read with defaults or on demand
        datadir -- base data directory, passed on to DatadirMixin
        registry -- object whose get_vocab() is used to load the vocabulary

        Raises KeyError if "language" is missing and ConfigurationException
        if the "access" setting is not a valid access level.
        """
        DatadirMixin.__init__(self, datadir, "projects", project_id)
        self.project_id = project_id
        self.name = config.get("name", project_id)
        self.language = config["language"]
        self.analyzer_spec = config.get("analyzer", None)
        self.transform_spec = config.get("transform", "pass")
        self.vocab_spec = config.get("vocab", None)
        self.config = config
        self._base_datadir = datadir
        self.registry = registry
        self._init_access()

    def _init_access(self):
        """Set self.access from the "access" setting (DEFAULT_ACCESS if unset).

        Raises ConfigurationException if the value is not the name of an
        Access member.
        """
        access = self.config.get("access", self.DEFAULT_ACCESS)
        try:
            # Look the name up among the enum members only. getattr() on the
            # enum class would also succeed for unrelated attributes such as
            # "mro" or "__doc__" and silently accept them as access levels.
            self.access = Access[access]
        except KeyError:
            raise ConfigurationException(
                f"'{access}' is not a valid access setting",
                project_id=self.project_id,
            ) from None

    def _initialize_analyzer(self):
        """Create the analyzer now if one has been configured."""
        if not self.analyzer_spec:
            return  # not configured, so assume it's not needed
        analyzer = self.analyzer
        logger.debug(
            "Project '%s': initialized analyzer: %s", self.project_id, str(analyzer)
        )

    def _initialize_subjects(self):
        """Load the subjects now; an AnnifException is logged, not raised."""
        try:
            subjects = self.subjects
            logger.debug(
                "Project '%s': initialized subjects: %s", self.project_id, str(subjects)
            )
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self, parallel):
        """Create and initialize the backend.

        An AnnifException raised while doing so is logged as a warning
        instead of being propagated.
        """
        logger.debug("Project '%s': initializing backend", self.project_id)
        try:
            backend = self.backend
            if not backend:
                logger.debug("Cannot initialize backend: does not exist")
                return
            backend.initialize(parallel)
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self, parallel=False):
        """Initialize this project and its backend so that they are ready to
        be used. If parallel is True, expect that the project will be used
        for parallel processing.

        Calling this again on an already initialized project does nothing.
        """

        if self.initialized:
            return

        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_backend(parallel)

        self.initialized = True

    def _params_for_backend(self, backend_params):
        """Return the parameters meant for this project's backend.

        backend_params is None or a mapping keyed by backend id; an empty
        dict is returned when there is no entry for the backend.
        """
        if backend_params is None:
            backend_params = {}
        return backend_params.get(self.backend.backend_id, {})

    def _suggest_with_backend(self, texts, backend_params):
        """Pass the (already transformed) texts to the backend's suggest()."""
        beparams = self._params_for_backend(backend_params)
        return self.backend.suggest(texts, beparams)

    @property
    def analyzer(self):
        """The analyzer built from the "analyzer" setting (created on first use).

        Raises ConfigurationException if no analyzer has been configured.
        """
        if self._analyzer is None:
            if not self.analyzer_spec:
                raise ConfigurationException(
                    "analyzer setting is missing", project_id=self.project_id
                )
            self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
        return self._analyzer

    @property
    def transform(self):
        """The transform built from the "transform" setting (created on first use)."""
        if self._transform is None:
            self._transform = annif.transform.get_transform(
                self.transform_spec, project=self
            )
        return self._transform

    @property
    def backend(self):
        """The backend object of this project (created on first use).

        Raises ConfigurationException if the "backend" setting is missing.
        If creating the backend raises ValueError, a warning is logged and
        None is returned; the creation is attempted again on the next access.
        """
        if self._backend is None:
            if "backend" not in self.config:
                raise ConfigurationException(
                    "backend setting is missing", project_id=self.project_id
                )
            backend_id = self.config["backend"]
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, config_params=self.config, project=self
                )
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id,
                )
        return self._backend

    def _initialize_vocab(self):
        """Load the vocabulary and its language through the registry.

        Raises ConfigurationException if no vocabulary has been configured.
        """
        if self.vocab_spec is None:
            raise ConfigurationException(
                "vocab setting is missing", project_id=self.project_id
            )
        self._vocab, self._vocab_lang = self.registry.get_vocab(
            self.vocab_spec, self.language
        )

    @property
    def vocab(self):
        """The vocabulary object of this project (loaded on first use)."""
        if self._vocab is None:
            self._initialize_vocab()
        return self._vocab

    @property
    def vocab_lang(self):
        """The vocabulary language returned by the registry (loaded on first use)."""
        if self._vocab_lang is None:
            self._initialize_vocab()
        return self._vocab_lang

    @property
    def subjects(self):
        """The subjects of the project's vocabulary."""
        return self.vocab.subjects

    def _get_info(self, key):
        """Return attribute `key` of the backend, or None if unavailable.

        None is returned when there is no backend object or when reading
        the information raises an AnnifException (which is logged as a
        warning).
        """
        try:
            be = self.backend
            if be is None:
                return None
            return getattr(be, key)
        except AnnifException as err:
            logger.warning(err.format_message())
            return None

    @property
    def is_trained(self):
        """The backend's is_trained value, or None if it cannot be determined."""
        return self._get_info("is_trained")

    @property
    def modification_time(self):
        """The backend's modification_time value, or None if it cannot be
        determined."""
        return self._get_info("modification_time")

    def suggest_corpus(self, corpus, backend_params=None):
        """Suggest subjects for the given documents corpus in batches of documents."""
        # Generator expression: each batch is only processed when the
        # returned iterator reaches it.
        suggestions = (
            self.suggest([doc.text for doc in doc_batch], backend_params)
            for doc_batch in corpus.doc_batches
        )
        return itertools.chain.from_iterable(suggestions)

    def suggest(self, texts, backend_params=None):
        """Suggest subjects for the given documents batch.

        Raises NotInitializedException if the project is known to be
        untrained. If the train state cannot be determined, a warning is
        logged and the suggestion is attempted anyway.
        """
        # Read the train state once: every access queries the backend and may
        # log a warning, so reading it twice would duplicate that warning.
        trained = self.is_trained
        if trained is None:
            logger.warning("Could not get train state information.")
        elif not trained:
            raise NotInitializedException("Project is not trained.")
        texts = [self.transform.transform_text(text) for text in texts]
        return self._suggest_with_backend(texts, backend_params)

    def train(self, corpus, backend_params=None, jobs=0):
        """train the project using documents from a metadata source

        When corpus is the string "cached" it is handed to the backend as
        is; any other corpus is first passed through the project transform.
        """
        if corpus != "cached":
            corpus = self.transform.transform_corpus(corpus)
        beparams = self._params_for_backend(backend_params)
        self.backend.train(corpus, beparams, jobs)

    def learn(self, corpus, backend_params=None):
        """further train the project using documents from a metadata source

        Raises NotSupportedException if the backend is not an
        AnnifLearningBackend.
        """
        beparams = self._params_for_backend(backend_params)
        corpus = self.transform.transform_corpus(corpus)
        if not isinstance(self.backend, annif.backend.backend.AnnifLearningBackend):
            raise NotSupportedException(
                "Learning not supported by backend", project_id=self.project_id
            )
        self.backend.learn(corpus, beparams)

    def hyperopt(self, corpus, trials, jobs, metric, results_file):
        """optimize the hyperparameters of the project using a validation
        corpus against a given metric

        Raises NotSupportedException if the backend is not an
        AnnifHyperoptBackend.
        """
        if isinstance(self.backend, annif.backend.hyperopt.AnnifHyperoptBackend):
            optimizer = self.backend.get_hp_optimizer(corpus, metric)
            return optimizer.optimize(trials, jobs, results_file)

        raise NotSupportedException(
            "Hyperparameter optimization not supported by backend",
            project_id=self.project_id,
        )

    def dump(self):
        """return this project as a dict"""
        return {
            "project_id": self.project_id,
            "name": self.name,
            "language": self.language,
            "backend": {"backend_id": self.config.get("backend")},
            "is_trained": self.is_trained,
            "modification_time": self.modification_time,
        }

    def remove_model_data(self):
        """remove the data of this project

        Deletes the project's data directory recursively if it exists;
        otherwise only a warning is logged.
        """
        # NOTE(review): _datadir_path is not defined in this class; presumably
        # it is provided by DatadirMixin.
        datadir_path = self._datadir_path
        if os.path.isdir(datadir_path):
            rmtree(datadir_path)
            logger.info("Removed model data for project %s.", self.project_id)
        else:
            logger.warning("No model data to remove for project %s.", self.project_id)
274