annif.project.AnnifProject._initialize_analyzer() - Code Metrics - Inspection of "Bump the github-actions group across 2 directories..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — dependabot/github_actions/dot-... ( 23c3ab...32b3fe )

unknown

created 2025-04-01 23:55 UTC

annif.project.AnnifProject._initialize_analyzer() A

↳ Parent: annif.project

Complexity

Conditions

Size

Total Lines	6
Code Lines	6

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	2
eloc	6
nop	1
dl	0
loc	6
rs	10
c	0
b	0
f	0

"""Project management functionality for Annif"""

from __future__ import annotations

import enum
import os.path
import re
from shutil import rmtree
from typing import TYPE_CHECKING

import annif
import annif.analyzer
import annif.backend
import annif.corpus
import annif.transform
from annif.datadir import DatadirMixin
from annif.exception import (
    AnnifException,
    ConfigurationException,
    NotInitializedException,
    NotSupportedException,
)
from annif.util import parse_args
from annif.vocab import SubjectIndexFilter

if TYPE_CHECKING:
    from collections import defaultdict
    from configparser import SectionProxy
    from datetime import datetime

    from click.utils import LazyFile

    from annif.analyzer import Analyzer
    from annif.backend import AnnifBackend
    from annif.backend.hyperopt import HPRecommendation
    from annif.corpus.document import DocumentCorpus
    from annif.corpus.subject import SubjectIndex
    from annif.registry import AnnifRegistry
    from annif.transform.transform import TransformChain
    from annif.vocab import AnnifVocabulary

# module-level alias for the logger object exposed by the annif package
logger = annif.logger


@enum.unique
class Access(enum.IntEnum):
    """Access levels that can be assigned to a project.

    Members are looked up by name from the "access" configuration setting.
    Being an IntEnum, the levels are ordered: private < hidden < public."""

    private = 1
    hidden = 2
    public = 3


class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project.

    The analyzer, transform, backend, vocabulary and subject index are
    created lazily on first access through the corresponding properties and
    are then cached on the instance."""

    # defaults for uninitialized instances
    _transform = None
    _analyzer = None
    _backend = None
    _vocab = None
    _vocab_lang = None
    # class-level default shared by all instances; it is only read here and
    # rebound per instance in _initialize_vocab, never mutated in place
    _vocab_kwargs = {}
    _subject_index = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = "public"

    def __init__(
        self,
        project_id: str,
        config: dict[str, str] | SectionProxy,
        datadir: str,
        registry: AnnifRegistry,
    ) -> None:
        """Create a project from its configuration section.

        The config mapping must contain "language"; the settings "name",
        "analyzer", "transform", "vocab" and "access" are optional here.
        Raises KeyError if "language" is missing and ConfigurationException
        if the "access" setting is not a valid access level."""
        DatadirMixin.__init__(self, datadir, "projects", project_id)
        self.project_id = project_id
        self.name = config.get("name", project_id)
        self.language = config["language"]
        self.analyzer_spec = config.get("analyzer", None)
        self.transform_spec = config.get("transform", "pass")
        self.vocab_spec = config.get("vocab", None)
        self.config = config
        self._base_datadir = datadir
        self.registry = registry
        self._init_access()

    def _init_access(self) -> None:
        """Set self.access from the "access" configuration setting, falling
        back to DEFAULT_ACCESS when the setting is absent.

        Raises ConfigurationException if the value does not name a member
        of Access."""
        access = self.config.get("access", self.DEFAULT_ACCESS)
        try:
            # Look the level up by member name. Unlike getattr(), this
            # rejects names of non-member attributes of the enum class
            # (e.g. "__doc__"), which would otherwise be accepted silently.
            self.access = Access[access]
        except KeyError as err:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id,
            ) from err

    def _initialize_analyzer(self) -> None:
        """Create the analyzer, if one has been configured, and log it."""
        if not self.analyzer_spec:
            return  # not configured, so assume it's not needed
        analyzer = self.analyzer
        logger.debug(
            "Project '%s': initialized analyzer: %s", self.project_id, str(analyzer)
        )

    def _initialize_subjects(self) -> None:
        """Load the subject index and log it. An AnnifException raised
        while doing so is logged as a warning instead of being propagated."""
        try:
            subjects = self.subjects
            logger.debug(
                "Project '%s': initialized subjects: %s", self.project_id, str(subjects)
            )
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self, parallel: bool) -> None:
        """Create and initialize the backend, passing on the parallel flag.
        An AnnifException raised while doing so is logged as a warning
        instead of being propagated."""
        logger.debug("Project '%s': initializing backend", self.project_id)
        try:
            if not self.backend:
                logger.debug("Cannot initialize backend: does not exist")
                return
            self.backend.initialize(parallel)
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self, parallel: bool = False) -> None:
        """Initialize this project and its backend so that they are ready to
        be used. If parallel is True, expect that the project will be used
        for parallel processing."""

        if self.initialized:
            return

        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_backend(parallel)

        self.initialized = True

    def _suggest_with_backend(
        self,
        texts: list[str],
        backend_params: defaultdict[str, dict] | None,
    ) -> annif.suggestion.SuggestionBatch:
        """Pass the texts to the backend's suggest method, along with the
        entry of backend_params keyed by this backend's id (or an empty dict
        if there is none)."""
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        return self.backend.suggest(texts, beparams)

    @property
    def analyzer(self) -> Analyzer:
        """The analyzer built from the "analyzer" setting, created on first
        access. Raises ConfigurationException if the setting is missing."""
        if self._analyzer is None:
            if self.analyzer_spec:
                self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
            else:
                raise ConfigurationException(
                    "analyzer setting is missing", project_id=self.project_id
                )
        return self._analyzer

    @property
    def transform(self) -> TransformChain:
        """The transform built from the "transform" setting (default
        "pass"), created on first access."""
        if self._transform is None:
            self._transform = annif.transform.get_transform(
                self.transform_spec, project=self
            )
        return self._transform

    @property
    def backend(self) -> AnnifBackend | None:
        """The backend named by the "backend" setting, created on first
        access.

        Raises ConfigurationException if the setting is missing. If creating
        the backend raises ValueError, a warning is logged and None is
        returned; nothing is cached in that case, so the next access tries
        again."""
        if self._backend is None:
            if "backend" not in self.config:
                raise ConfigurationException(
                    "backend setting is missing", project_id=self.project_id
                )
            backend_id = self.config["backend"]
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, config_params=self.config, project=self
                )
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id,
                )
        return self._backend

    def _initialize_vocab(self) -> None:
        """Parse the "vocab" setting and set the vocabulary, its language
        and its keyword arguments on this instance.

        The specification has the form "vocab_id" or "vocab_id(args)"; the
        first positional argument, if any, is used as the vocabulary
        language, otherwise the project language is used.

        Raises ConfigurationException if the setting is missing and
        ValueError if it does not match the expected form."""
        if self.vocab_spec is None:
            raise ConfigurationException(
                "vocab setting is missing", project_id=self.project_id
            )

        match = re.match(r"([\w-]+)(\((.*)\))?$", self.vocab_spec)
        if match is None:
            raise ValueError(f"Invalid vocabulary specification: {self.vocab_spec}")
        vocab_id = match.group(1)
        posargs, self._vocab_kwargs = parse_args(match.group(3))
        self._vocab_lang = posargs[0] if posargs else self.language
        self._vocab = self.registry.get_vocab(vocab_id)

    @property
    def vocab(self) -> AnnifVocabulary:
        """The vocabulary of this project, looked up from the registry on
        first access."""
        if self._vocab is None:
            self._initialize_vocab()
        return self._vocab

    @property
    def vocab_lang(self) -> str:
        """The language of the vocabulary, determined on first access."""
        if self._vocab_lang is None:
            self._initialize_vocab()
        return self._vocab_lang

    @property
    def subjects(self) -> SubjectIndex:
        """The subject index of the vocabulary, created on first access. If
        the vocab specification has an "exclude" keyword argument, its value
        is split on "|" and the index is wrapped in a SubjectIndexFilter
        with the resulting exclude list."""
        if self._subject_index is None:
            self._subject_index = self.vocab.subjects
            if "exclude" in self._vocab_kwargs:
                exclude_list = self._vocab_kwargs["exclude"].split("|")
                self._subject_index = SubjectIndexFilter(
                    self._subject_index, exclude=exclude_list
                )
        return self._subject_index

    def _get_info(self, key: str) -> bool | datetime | None:
        """Return the attribute named key of the backend. Return None if
        there is no backend or if an AnnifException is raised, which is
        logged as a warning."""
        try:
            be = self.backend
            if be is not None:
                return getattr(be, key)
        except AnnifException as err:
            logger.warning(err.format_message())
        return None

    @property
    def is_trained(self) -> bool | None:
        """The is_trained attribute of the backend, or None if it could not
        be determined."""
        return self._get_info("is_trained")

    @property
    def modification_time(self) -> datetime | None:
        """The modification_time attribute of the backend, or None if it
        could not be determined."""
        return self._get_info("modification_time")

    def suggest_corpus(
        self,
        corpus: DocumentCorpus,
        backend_params: defaultdict[str, dict] | None = None,
    ) -> annif.suggestion.SuggestionResults:
        """Suggest subjects for the given documents corpus in batches of documents."""
        # generator expression: batches are only processed when the
        # SuggestionResults object consumes them
        suggestions = (
            self.suggest([doc.text for doc in doc_batch], backend_params)
            for doc_batch in corpus.doc_batches
        )
        # local import: annif.suggestion is not imported at module level
        import annif.suggestion

        return annif.suggestion.SuggestionResults(suggestions)

    def suggest(
        self,
        texts: list[str],
        backend_params: defaultdict[str, dict] | None = None,
    ) -> annif.suggestion.SuggestionBatch:
        """Suggest subjects for the given documents batch.

        Raises NotInitializedException if the backend reports that it is
        not trained. If the train state is unknown (None), a warning is
        logged and the suggestion is attempted anyway."""
        # read the property once: every access queries the backend again
        trained = self.is_trained
        if trained is None:
            logger.warning("Could not get train state information.")
        elif not trained:
            raise NotInitializedException("Project is not trained.")
        texts = [self.transform.transform_text(text) for text in texts]
        return self._suggest_with_backend(texts, backend_params)

    def train(
        self,
        corpus: DocumentCorpus,
        backend_params: defaultdict[str, dict] | None = None,
        jobs: int = 0,
    ) -> None:
        """train the project using documents from a metadata source

        If corpus is the string "cached" it is passed to the backend as is;
        any other corpus is first run through the project's transform."""
        if corpus != "cached":
            corpus = self.transform.transform_corpus(corpus)
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        self.backend.train(corpus, beparams, jobs)

    def learn(
        self,
        corpus: DocumentCorpus,
        backend_params: defaultdict[str, dict] | None = None,
    ) -> None:
        """further train the project using documents from a metadata source

        Raises NotSupportedException if the backend is not an
        AnnifLearningBackend."""
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        corpus = self.transform.transform_corpus(corpus)
        if isinstance(self.backend, annif.backend.backend.AnnifLearningBackend):
            self.backend.learn(corpus, beparams)
        else:
            raise NotSupportedException(
                "Learning not supported by backend", project_id=self.project_id
            )

    def hyperopt(
        self,
        corpus: DocumentCorpus,
        trials: int,
        jobs: int,
        metric: str,
        results_file: LazyFile | None,
    ) -> HPRecommendation:
        """optimize the hyperparameters of the project using a validation
        corpus against a given metric

        Raises NotSupportedException if the backend is not an
        AnnifHyperoptBackend."""
        if isinstance(self.backend, annif.backend.hyperopt.AnnifHyperoptBackend):
            optimizer = self.backend.get_hp_optimizer(corpus, metric)
            return optimizer.optimize(trials, jobs, results_file)

        raise NotSupportedException(
            "Hyperparameter optimization not supported by backend",
            project_id=self.project_id,
        )

    def dump(self) -> dict[str, str | dict | bool | datetime | None]:
        """return this project as a dict

        The "vocab" and "vocab_language" values are None if accessing the
        vocabulary raises ConfigurationException."""

        try:
            vocab = {
                "vocab_id": self.vocab.vocab_id,
                "languages": sorted(self.vocab.languages),
            }
            vocab_lang = self.vocab_lang
        except ConfigurationException:
            vocab = None
            vocab_lang = None

        return {
            "project_id": self.project_id,
            "name": self.name,
            "language": self.language,
            "backend": {"backend_id": self.config.get("backend")},
            "vocab": vocab,
            "vocab_language": vocab_lang,
            "is_trained": self.is_trained,
            "modification_time": self.modification_time,
        }

    def remove_model_data(self) -> None:
        """remove the data of this project

        Deletes the directory self._datadir_path and everything in it; if
        that directory does not exist, only a warning is logged."""
        datadir_path = self._datadir_path
        if os.path.isdir(datadir_path):
            rmtree(datadir_path)
            logger.info("Removed model data for project %s.", self.project_id)
        else:
            logger.warning("No model data to remove for project %s.", self.project_id)


1			"""Project management functionality for Annif"""
2
3			from __future__ import annotations
4
5			import enum
6			import os.path
7			import re
8			from shutil import rmtree
9			from typing import TYPE_CHECKING
10
11			import annif
12			import annif.analyzer
13			import annif.backend
14			import annif.corpus
15			import annif.transform
16			from annif.datadir import DatadirMixin
17			from annif.exception import (
18			AnnifException,
19			ConfigurationException,
20			NotInitializedException,
21			NotSupportedException,
22			)
23			from annif.util import parse_args
24			from annif.vocab import SubjectIndexFilter
25
26			if TYPE_CHECKING:
27			from collections import defaultdict
28			from configparser import SectionProxy
29			from datetime import datetime
30
31			from click.utils import LazyFile
32
33			from annif.analyzer import Analyzer
34			from annif.backend import AnnifBackend
35			from annif.backend.hyperopt import HPRecommendation
36			from annif.corpus.document import DocumentCorpus
37			from annif.corpus.subject import SubjectIndex
38			from annif.registry import AnnifRegistry
39			from annif.transform.transform import TransformChain
40			from annif.vocab import AnnifVocabulary
41
42			logger = annif.logger
43
44
45			class Access(enum.IntEnum):
46			"""Enumeration of access levels for projects"""
47
48			private = 1
49			hidden = 2
50			public = 3
51
52
53			class AnnifProject(DatadirMixin):
54			"""Class representing the configuration of a single Annif project."""
55
56			# defaults for uninitialized instances
57			_transform = None
58			_analyzer = None
59			_backend = None
60			_vocab = None
61			_vocab_lang = None
62			_vocab_kwargs = {}
63			_subject_index = None
64			initialized = False
65
66			# default values for configuration settings
67			DEFAULT_ACCESS = "public"
68
69			def __init__(
70			self,
71			project_id: str,
72			config: dict[str, str] \| SectionProxy,
73			datadir: str,
74			registry: AnnifRegistry,
75			) -> None:
76			DatadirMixin.__init__(self, datadir, "projects", project_id)
77			self.project_id = project_id
78			self.name = config.get("name", project_id)
79			self.language = config["language"]
80			self.analyzer_spec = config.get("analyzer", None)
81			self.transform_spec = config.get("transform", "pass")
82			self.vocab_spec = config.get("vocab", None)
83			self.config = config
84			self._base_datadir = datadir
85			self.registry = registry
86			self._init_access()
87
88			def _init_access(self) -> None:
89			access = self.config.get("access", self.DEFAULT_ACCESS)
90			try:
91			self.access = getattr(Access, access)
92			except AttributeError:
93			raise ConfigurationException(
94			"'{}' is not a valid access setting".format(access),
95			project_id=self.project_id,
96			)
97
98			def _initialize_analyzer(self) -> None:
99			if not self.analyzer_spec:
100			return # not configured, so assume it's not needed
101			analyzer = self.analyzer
102			logger.debug(
103			"Project '%s': initialized analyzer: %s", self.project_id, str(analyzer)
104			)
105
106			def _initialize_subjects(self) -> None:
107			try:
108			subjects = self.subjects
109			logger.debug(
110			"Project '%s': initialized subjects: %s", self.project_id, str(subjects)
111			)
112			except AnnifException as err:
113			logger.warning(err.format_message())
114
115			def _initialize_backend(self, parallel: bool) -> None:
116			logger.debug("Project '%s': initializing backend", self.project_id)
117			try:
118			if not self.backend:
119			logger.debug("Cannot initialize backend: does not exist")
120			return
121			self.backend.initialize(parallel)
122			except AnnifException as err:
123			logger.warning(err.format_message())
124
125			def initialize(self, parallel: bool = False) -> None:
126			"""Initialize this project and its backend so that they are ready to
127			be used. If parallel is True, expect that the project will be used
128			for parallel processing."""
129
130			if self.initialized:
131			return
132
133			logger.debug("Initializing project '%s'", self.project_id)
134
135			self._initialize_analyzer()
136			self._initialize_subjects()
137			self._initialize_backend(parallel)
138
139			self.initialized = True
140
141			def _suggest_with_backend(
142			self,
143			texts: list[str],
144			backend_params: defaultdict[str, dict] \| None,
145			) -> annif.suggestion.SuggestionBatch:
146			if backend_params is None:
147			backend_params = {}
148			beparams = backend_params.get(self.backend.backend_id, {})
149			return self.backend.suggest(texts, beparams)
150
151			@property
152			def analyzer(self) -> Analyzer:
153			if self._analyzer is None:
154			if self.analyzer_spec:
155			self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
156			else:
157			raise ConfigurationException(
158			"analyzer setting is missing", project_id=self.project_id
159			)
160			return self._analyzer
161
162			@property
163			def transform(self) -> TransformChain:
164			if self._transform is None:
165			self._transform = annif.transform.get_transform(
166			self.transform_spec, project=self
167			)
168			return self._transform
169
170			@property
171			def backend(self) -> AnnifBackend \| None:
172			if self._backend is None:
173			if "backend" not in self.config:
174			raise ConfigurationException(
175			"backend setting is missing", project_id=self.project_id
176			)
177			backend_id = self.config["backend"]
178			try:
179			backend_class = annif.backend.get_backend(backend_id)
180			self._backend = backend_class(
181			backend_id, config_params=self.config, project=self
182			)
183			except ValueError:
184			logger.warning(
185			"Could not create backend %s, "
186			"make sure you've installed optional dependencies",
187			backend_id,
188			)
189			return self._backend
190
191			def _initialize_vocab(self) -> None:
192			if self.vocab_spec is None:
193			raise ConfigurationException(
194			"vocab setting is missing", project_id=self.project_id
195			)
196
197			match = re.match(r"([\w-]+)(\((.*)\))?$", self.vocab_spec)
198			if match is None:
199			raise ValueError(f"Invalid vocabulary specification: {self.vocab_spec}")
200			vocab_id = match.group(1)
201			posargs, self._vocab_kwargs = parse_args(match.group(3))
202			self._vocab_lang = posargs[0] if posargs else self.language
203			self._vocab = self.registry.get_vocab(vocab_id)
204
205			@property
206			def vocab(self) -> AnnifVocabulary:
207			if self._vocab is None:
208			self._initialize_vocab()
209			return self._vocab
210
211			@property
212			def vocab_lang(self) -> str:
213			if self._vocab_lang is None:
214			self._initialize_vocab()
215			return self._vocab_lang
216
217			@property
218			def subjects(self) -> SubjectIndex:
219			if self._subject_index is None:
220			self._subject_index = self.vocab.subjects
221			if "exclude" in self._vocab_kwargs:
222			exclude_list = self._vocab_kwargs["exclude"].split("\|")
223			self._subject_index = SubjectIndexFilter(
224			self._subject_index, exclude=exclude_list
225			)
226			return self._subject_index
227
228			def _get_info(self, key: str) -> bool \| datetime \| None:
229			try:
230			be = self.backend
231			if be is not None:
232			return getattr(be, key)
233			except AnnifException as err:
234			logger.warning(err.format_message())
235			return None
236
237			@property
238			def is_trained(self) -> bool \| None:
239			return self._get_info("is_trained")
240
241			@property
242			def modification_time(self) -> datetime \| None:
243			return self._get_info("modification_time")
244
245			def suggest_corpus(
246			self,
247			corpus: DocumentCorpus,
248			backend_params: defaultdict[str, dict] \| None = None,
249			) -> annif.suggestion.SuggestionResults:
250			"""Suggest subjects for the given documents corpus in batches of documents."""
251			suggestions = (
252			self.suggest([doc.text for doc in doc_batch], backend_params)
253			for doc_batch in corpus.doc_batches
254			)
255			import annif.suggestion
256
257			return annif.suggestion.SuggestionResults(suggestions)
258
259			def suggest(
260			self,
261			texts: list[str],
262			backend_params: defaultdict[str, dict] \| None = None,
263			) -> annif.suggestion.SuggestionBatch:
264			"""Suggest subjects for the given documents batch."""
265			if not self.is_trained:
266			if self.is_trained is None:
267			logger.warning("Could not get train state information.")
268			else:
269			raise NotInitializedException("Project is not trained.")
270			texts = [self.transform.transform_text(text) for text in texts]
271			return self._suggest_with_backend(texts, backend_params)
272
273			def train(
274			self,
275			corpus: DocumentCorpus,
276			backend_params: defaultdict[str, dict] \| None = None,
277			jobs: int = 0,
278			) -> None:
279			"""train the project using documents from a metadata source"""
280			if corpus != "cached":
281			corpus = self.transform.transform_corpus(corpus)
282			if backend_params is None:
283			backend_params = {}
284			beparams = backend_params.get(self.backend.backend_id, {})
285			self.backend.train(corpus, beparams, jobs)
286
287			def learn(
288			self,
289			corpus: DocumentCorpus,
290			backend_params: defaultdict[str, dict] \| None = None,
291			) -> None:
292			"""further train the project using documents from a metadata source"""
293			if backend_params is None:
294			backend_params = {}
295			beparams = backend_params.get(self.backend.backend_id, {})
296			corpus = self.transform.transform_corpus(corpus)
297			if isinstance(self.backend, annif.backend.backend.AnnifLearningBackend):
298			self.backend.learn(corpus, beparams)
299			else:
300			raise NotSupportedException(
301			"Learning not supported by backend", project_id=self.project_id
302			)
303
304			def hyperopt(
305			self,
306			corpus: DocumentCorpus,
307			trials: int,
308			jobs: int,
309			metric: str,
310			results_file: LazyFile \| None,
311			) -> HPRecommendation:
312			"""optimize the hyperparameters of the project using a validation
313			corpus against a given metric"""
314			if isinstance(self.backend, annif.backend.hyperopt.AnnifHyperoptBackend):
315			optimizer = self.backend.get_hp_optimizer(corpus, metric)
316			return optimizer.optimize(trials, jobs, results_file)
317
318			raise NotSupportedException(
319			"Hyperparameter optimization not supported " "by backend",
320			project_id=self.project_id,
321			)
322
323			def dump(self) -> dict[str, str \| dict \| bool \| datetime \| None]:
324			"""return this project as a dict"""
325
326			try:
327			vocab = {
328			"vocab_id": self.vocab.vocab_id,
329			"languages": sorted(self.vocab.languages),
330			}
331			vocab_lang = self.vocab_lang
332			except ConfigurationException:
333			vocab = None
334			vocab_lang = None
335
336			return {
337			"project_id": self.project_id,
338			"name": self.name,
339			"language": self.language,
340			"backend": {"backend_id": self.config.get("backend")},
341			"vocab": vocab,
342			"vocab_language": vocab_lang,
343			"is_trained": self.is_trained,
344			"modification_time": self.modification_time,
345			}
346
347			def remove_model_data(self) -> None:
348			"""remove the data of this project"""
349			datadir_path = self._datadir_path
350			if os.path.isdir(datadir_path):
351			rmtree(datadir_path)
352			logger.info("Removed model data for project {}.".format(self.project_id))
353			else:
354			logger.warning(
355			"No model data to remove for project {}.".format(self.project_id)
356			)
357

NatLibFi / Annif

Push — dependabot/github_actions/dot-... ( 23c3ab...32b3fe )

annif.project.AnnifProject._initialize_analyzer() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like