annif.project   C
last analyzed

Complexity

Total Complexity 55

Size/Duplication

Total Lines 356
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 265
dl 0
loc 356
rs 6
c 0
b 0
f 0
wmc 55

24 Methods

Rating   Name   Duplication   Size   Complexity  
A AnnifProject._get_info() 0 8 3
A AnnifProject._initialize_subjects() 0 8 2
A AnnifProject.remove_model_data() 0 9 2
A AnnifProject.dump() 0 22 2
A AnnifProject._initialize_backend() 0 9 3
A AnnifProject.subjects() 0 10 3
A AnnifProject.hyperopt() 0 17 2
A AnnifProject.suggest_corpus() 0 13 1
A AnnifProject.modification_time() 0 3 1
A AnnifProject.train() 0 13 3
A AnnifProject._init_access() 0 8 2
A AnnifProject._initialize_vocab() 0 13 4
A AnnifProject._suggest_with_backend() 0 9 2
A AnnifProject.__init__() 0 18 1
A AnnifProject.suggest() 0 13 3
A AnnifProject.analyzer() 0 10 3
A AnnifProject._initialize_analyzer() 0 6 2
A AnnifProject.transform() 0 7 2
A AnnifProject.learn() 0 15 3
A AnnifProject.vocab_lang() 0 5 2
A AnnifProject.is_trained() 0 3 1
A AnnifProject.vocab() 0 5 2
A AnnifProject.backend() 0 20 4
A AnnifProject.initialize() 0 15 2

How to fix   Complexity   

Complexity

Complex classes like annif.project often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
"""Project management functionality for Annif"""
2
3
from __future__ import annotations
4
5
import enum
6
import os.path
7
import re
8
from shutil import rmtree
9
from typing import TYPE_CHECKING
10
11
import annif
12
import annif.analyzer
13
import annif.backend
14
import annif.corpus
15
import annif.transform
16
from annif.datadir import DatadirMixin
17
from annif.exception import (
18
    AnnifException,
19
    ConfigurationException,
20
    NotInitializedException,
21
    NotSupportedException,
22
)
23
from annif.util import parse_args
24
from annif.vocab import SubjectIndexFilter
25
26
if TYPE_CHECKING:
27
    from collections import defaultdict
28
    from configparser import SectionProxy
29
    from datetime import datetime
30
31
    from click.utils import LazyFile
32
33
    from annif.analyzer import Analyzer
34
    from annif.backend import AnnifBackend
35
    from annif.backend.hyperopt import HPRecommendation
36
    from annif.corpus.document import DocumentCorpus
37
    from annif.corpus.subject import SubjectIndex
38
    from annif.registry import AnnifRegistry
39
    from annif.transform.transform import TransformChain
40
    from annif.vocab import AnnifVocabulary
41
42
logger = annif.logger
43
44
45
class Access(enum.IntEnum):
    """Access levels that can be assigned to a project.

    Members are integers, so levels can be compared numerically:
    private < hidden < public.
    """

    private = 1
    hidden = 2
    public = 3
51
52
53
class AnnifProject(DatadirMixin):
54
    """Class representing the configuration of a single Annif project."""
55
56
    # Class-level fallbacks used until an instance stores its own value.
    # The lazily created components start out as None ...
    _analyzer = None
    _transform = None
    _backend = None
    _subject_index = None
    # ... as does the vocabulary state, which _initialize_vocab() fills in.
    _vocab = None
    _vocab_lang = None
    _vocab_kwargs = {}
    # Set to True at the end of initialize().
    initialized = False

    # Value used when the configuration has no "access" setting.
    DEFAULT_ACCESS = "public"
68
69
    def __init__(
        self,
        project_id: str,
        config: dict[str, str] | SectionProxy,
        datadir: str,
        registry: AnnifRegistry,
    ) -> None:
        """Store the project settings; nothing is loaded at this point.

        The "language" setting is mandatory (it is read with a plain key
        lookup). "name" falls back to the project id, "transform" to "pass",
        and "analyzer" and "vocab" to None. The access level is validated
        immediately via _init_access().
        """
        DatadirMixin.__init__(self, datadir, "projects", project_id)
        self.project_id = project_id
        self.registry = registry
        self._base_datadir = datadir
        # _init_access() reads self.config, so it must be set before the call.
        self.config = config

        self.name = config.get("name", project_id)
        self.language = config["language"]
        self.analyzer_spec = config.get("analyzer")
        self.transform_spec = config.get("transform", "pass")
        self.vocab_spec = config.get("vocab")

        self._init_access()
87
88
    def _init_access(self) -> None:
        """Resolve the "access" setting into an Access member on self.access.

        DEFAULT_ACCESS is used when the configuration has no "access" setting.

        Raises:
            ConfigurationException: if the value is not the name of an Access
                member.
        """
        access = self.config.get("access", self.DEFAULT_ACCESS)
        try:
            # Look the name up among the enum members only. getattr() would
            # also accept any other attribute of the class (for example
            # "real" or "bit_length", inherited from int) and silently store
            # something that is not an Access level.
            self.access = Access[access]
        except KeyError as err:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id,
            ) from err
97
98
    def _initialize_analyzer(self) -> None:
99
        if not self.analyzer_spec:
100
            return  # not configured, so assume it's not needed
101
        analyzer = self.analyzer
102
        logger.debug(
103
            "Project '%s': initialized analyzer: %s", self.project_id, str(analyzer)
104
        )
105
106
    def _initialize_subjects(self) -> None:
        """Load the subject index now and log it.

        An AnnifException raised while doing so is logged as a warning and
        not propagated.
        """
        try:
            subject_index = self.subjects
            logger.debug(
                "Project '%s': initialized subjects: %s",
                self.project_id,
                str(subject_index),
            )
        except AnnifException as err:
            logger.warning(err.format_message())
114
115
    def _initialize_backend(self, parallel: bool) -> None:
        """Create the backend and call its initialize(parallel).

        Nothing is initialized when no backend object is available. An
        AnnifException raised along the way is logged as a warning and not
        propagated.
        """
        logger.debug("Project '%s': initializing backend", self.project_id)
        try:
            backend = self.backend
            if backend:
                backend.initialize(parallel)
            else:
                logger.debug("Cannot initialize backend: does not exist")
        except AnnifException as err:
            logger.warning(err.format_message())
124
125
    def initialize(self, parallel: bool = False) -> None:
126
        """Initialize this project and its backend so that they are ready to
127
        be used. If parallel is True, expect that the project will be used
128
        for parallel processing."""
129
130
        if self.initialized:
131
            return
132
133
        logger.debug("Initializing project '%s'", self.project_id)
134
135
        self._initialize_analyzer()
136
        self._initialize_subjects()
137
        self._initialize_backend(parallel)
138
139
        self.initialized = True
140
141
    def _suggest_with_backend(
142
        self,
143
        texts: list[str],
144
        backend_params: defaultdict[str, dict] | None,
145
    ) -> annif.suggestion.SuggestionBatch:
146
        if backend_params is None:
147
            backend_params = {}
148
        beparams = backend_params.get(self.backend.backend_id, {})
149
        return self.backend.suggest(texts, beparams)
150
151
    @property
152
    def analyzer(self) -> Analyzer:
153
        if self._analyzer is None:
154
            if self.analyzer_spec:
155
                self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
156
            else:
157
                raise ConfigurationException(
158
                    "analyzer setting is missing", project_id=self.project_id
159
                )
160
        return self._analyzer
161
162
    @property
163
    def transform(self) -> TransformChain:
164
        if self._transform is None:
165
            self._transform = annif.transform.get_transform(
166
                self.transform_spec, project=self
167
            )
168
        return self._transform
169
170
    @property
171
    def backend(self) -> AnnifBackend | None:
172
        if self._backend is None:
173
            if "backend" not in self.config:
174
                raise ConfigurationException(
175
                    "backend setting is missing", project_id=self.project_id
176
                )
177
            backend_id = self.config["backend"]
178
            try:
179
                backend_class = annif.backend.get_backend(backend_id)
180
                self._backend = backend_class(
181
                    backend_id, config_params=self.config, project=self
182
                )
183
            except ValueError:
184
                logger.warning(
185
                    "Could not create backend %s, "
186
                    "make sure you've installed optional dependencies",
187
                    backend_id,
188
                )
189
        return self._backend
190
191
    def _initialize_vocab(self) -> None:
192
        if self.vocab_spec is None:
193
            raise ConfigurationException(
194
                "vocab setting is missing", project_id=self.project_id
195
            )
196
197
        match = re.match(r"([\w-]+)(\((.*)\))?$", self.vocab_spec)
198
        if match is None:
199
            raise ValueError(f"Invalid vocabulary specification: {self.vocab_spec}")
200
        vocab_id = match.group(1)
201
        posargs, self._vocab_kwargs = parse_args(match.group(3))
202
        self._vocab_lang = posargs[0] if posargs else self.language
203
        self._vocab = self.registry.get_vocab(vocab_id)
204
205
    @property
206
    def vocab(self) -> AnnifVocabulary:
207
        if self._vocab is None:
208
            self._initialize_vocab()
209
        return self._vocab
210
211
    @property
212
    def vocab_lang(self) -> str:
213
        if self._vocab_lang is None:
214
            self._initialize_vocab()
215
        return self._vocab_lang
216
217
    @property
218
    def subjects(self) -> SubjectIndex:
219
        if self._subject_index is None:
220
            self._subject_index = self.vocab.subjects
221
            if "exclude" in self._vocab_kwargs:
222
                exclude_list = self._vocab_kwargs["exclude"].split("|")
223
                self._subject_index = SubjectIndexFilter(
224
                    self._subject_index, exclude=exclude_list
225
                )
226
        return self._subject_index
227
228
    def _get_info(self, key: str) -> bool | datetime | None:
229
        try:
230
            be = self.backend
231
            if be is not None:
232
                return getattr(be, key)
233
        except AnnifException as err:
234
            logger.warning(err.format_message())
235
            return None
236
237
    @property
238
    def is_trained(self) -> bool | None:
239
        return self._get_info("is_trained")
240
241
    @property
242
    def modification_time(self) -> datetime | None:
243
        return self._get_info("modification_time")
244
245
    def suggest_corpus(
        self,
        corpus: DocumentCorpus,
        backend_params: defaultdict[str, dict] | None = None,
    ) -> annif.suggestion.SuggestionResults:
        """Suggest subjects for a document corpus, one batch at a time.

        The texts of each batch in corpus.doc_batches are passed to
        suggest(). The per-batch results are produced lazily by a generator
        that is wrapped in a SuggestionResults object.
        """
        batches = corpus.doc_batches
        batch_suggestions = (
            self.suggest([document.text for document in batch], backend_params)
            for batch in batches
        )
        import annif.suggestion

        return annif.suggestion.SuggestionResults(batch_suggestions)
258
259
    def suggest(
260
        self,
261
        texts: list[str],
262
        backend_params: defaultdict[str, dict] | None = None,
263
    ) -> annif.suggestion.SuggestionBatch:
264
        """Suggest subjects for the given documents batch."""
265
        if not self.is_trained:
266
            if self.is_trained is None:
267
                logger.warning("Could not get train state information.")
268
            else:
269
                raise NotInitializedException("Project is not trained.")
270
        texts = [self.transform.transform_text(text) for text in texts]
271
        return self._suggest_with_backend(texts, backend_params)
272
273
    def train(
274
        self,
275
        corpus: DocumentCorpus,
276
        backend_params: defaultdict[str, dict] | None = None,
277
        jobs: int = 0,
278
    ) -> None:
279
        """train the project using documents from a metadata source"""
280
        if corpus != "cached":
281
            corpus = self.transform.transform_corpus(corpus)
282
        if backend_params is None:
283
            backend_params = {}
284
        beparams = backend_params.get(self.backend.backend_id, {})
285
        self.backend.train(corpus, beparams, jobs)
286
287
    def learn(
        self,
        corpus: DocumentCorpus,
        backend_params: defaultdict[str, dict] | None = None,
    ) -> None:
        """Further train the project's backend on the given corpus.

        The corpus is run through the project's transform, and only the
        backend_params entry for this project's backend is forwarded.

        Raises:
            NotSupportedException: if the backend is not an
                AnnifLearningBackend.
        """
        params_by_backend = {} if backend_params is None else backend_params
        own_params = params_by_backend.get(self.backend.backend_id, {})
        transformed = self.transform.transform_corpus(corpus)
        if not isinstance(self.backend, annif.backend.backend.AnnifLearningBackend):
            raise NotSupportedException(
                "Learning not supported by backend", project_id=self.project_id
            )
        self.backend.learn(transformed, own_params)
303
304
    def hyperopt(
        self,
        corpus: DocumentCorpus,
        trials: int,
        jobs: int,
        metric: str,
        results_file: LazyFile | None,
    ) -> HPRecommendation:
        """Optimize the project's hyperparameters on a validation corpus
        against the given metric.

        The backend supplies an optimizer for the corpus and metric, which is
        then run with the given trials, jobs and results_file.

        Raises:
            NotSupportedException: if the backend is not an
                AnnifHyperoptBackend.
        """
        backend = self.backend
        if not isinstance(backend, annif.backend.hyperopt.AnnifHyperoptBackend):
            raise NotSupportedException(
                "Hyperparameter optimization not supported by backend",
                project_id=self.project_id,
            )

        optimizer = backend.get_hp_optimizer(corpus, metric)
        return optimizer.optimize(trials, jobs, results_file)
322
323
    def dump(self) -> dict[str, str | dict | bool | datetime | None]:
324
        """return this project as a dict"""
325
326
        try:
327
            vocab = {
328
                "vocab_id": self.vocab.vocab_id,
329
                "languages": sorted(self.vocab.languages),
330
            }
331
            vocab_lang = self.vocab_lang
332
        except ConfigurationException:
333
            vocab = None
334
            vocab_lang = None
335
336
        return {
337
            "project_id": self.project_id,
338
            "name": self.name,
339
            "language": self.language,
340
            "backend": {"backend_id": self.config.get("backend")},
341
            "vocab": vocab,
342
            "vocab_language": vocab_lang,
343
            "is_trained": self.is_trained,
344
            "modification_time": self.modification_time,
345
        }
346
347
    def remove_model_data(self) -> None:
        """Delete this project's data directory, if there is one.

        The directory at self._datadir_path is removed recursively and an
        info message is logged. When no such directory exists, only a warning
        is logged.
        """
        model_dir = self._datadir_path
        if not os.path.isdir(model_dir):
            logger.warning(
                "No model data to remove for project {}.".format(self.project_id)
            )
            return
        rmtree(model_dir)
        logger.info("Removed model data for project {}.".format(self.project_id))
357