annif.project - Code Metrics - Inspection of "Support for batch suggest operations in suggest an..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#663)

by Juho

created 2023-02-03 10:33 UTC

annif.project B

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	273
Duplicated Lines	0 %

Importance

Changes

Metric	Value
eloc	198
dl	0
loc	273
rs	8.4
c	0
b	0
f	0
wmc	50

24 Methods

Rating	Name	Size	Complexity
A	AnnifProject._get_info()	8	3
A	AnnifProject.subjects()	3	1
A	AnnifProject.modification_time()	3	1
A	AnnifProject._initialize_vocab()	7	2
A	AnnifProject.analyzer()	10	3
A	AnnifProject.transform()	7	2
A	AnnifProject.vocab_lang()	5	2
A	AnnifProject.is_trained()	3	1
A	AnnifProject.vocab()	5	2
A	AnnifProject.backend()	20	4
A	AnnifProject._initialize_subjects()	8	2
A	AnnifProject._initialize_backend()	9	3
A	AnnifProject._init_access()	8	2
A	AnnifProject.__init__()	12	1
A	AnnifProject._initialize_analyzer()	6	2
A	AnnifProject.initialize()	15	2
A	AnnifProject.remove_model_data()	9	2
A	AnnifProject.dump()	9	1
A	AnnifProject.hyperopt()	10	2
A	AnnifProject.suggest_corpus()	7	1
A	AnnifProject.train()	8	3
A	AnnifProject._suggest_with_backend()	5	2
A	AnnifProject.suggest()	9	3
A	AnnifProject.learn()	11	3

How to fix Complexity

"""Project management functionality for Annif"""

import enum
import itertools
import os.path
from shutil import rmtree

import annif
import annif.analyzer
import annif.backend
import annif.corpus
import annif.suggestion
import annif.transform
from annif.datadir import DatadirMixin
from annif.exception import (
    AnnifException,
    ConfigurationException,
    NotInitializedException,
    NotSupportedException,
)

# Module-level alias for the logger object exposed by the annif package,
# so the code below can simply call logger.debug()/warning()/info().
logger = annif.logger


@enum.unique
class Access(enum.IntEnum):
    """Access levels that can be assigned to a project.

    The members are plain integers, so two levels can be compared
    numerically: ``private`` has the lowest value and ``public`` the
    highest. Members are looked up by name when a project's "access"
    configuration setting is resolved.
    """

    # Keep the numeric values stable and distinct; the ordering of the
    # levels is defined by these integers.
    private = 1
    hidden = 2
    public = 3


class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project.

    The analyzer, transform, backend and vocabulary are created lazily the
    first time the corresponding property is read and are cached on the
    instance afterwards. ``initialize()`` can be used to trigger that
    creation eagerly.
    """

    # defaults for uninitialized instances
    _transform = None
    _analyzer = None
    _backend = None
    _vocab = None
    _vocab_lang = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = "public"

    def __init__(self, project_id, config, datadir, registry):
        """Store the project settings found in ``config``.

        ``config`` must provide a "language" entry; "name", "analyzer",
        "transform", "vocab" and "access" are optional and fall back to
        the project id, None, "pass", None and DEFAULT_ACCESS
        respectively. ``registry`` is kept for later vocabulary lookups.

        Raises ConfigurationException if the "access" setting does not
        name a member of Access.
        """
        DatadirMixin.__init__(self, datadir, "projects", project_id)
        self.project_id = project_id
        self.name = config.get("name", project_id)
        self.language = config["language"]
        self.analyzer_spec = config.get("analyzer", None)
        self.transform_spec = config.get("transform", "pass")
        self.vocab_spec = config.get("vocab", None)
        self.config = config
        self._base_datadir = datadir
        self.registry = registry
        self._init_access()

    def _init_access(self):
        """Resolve the "access" setting into an Access member.

        Raises ConfigurationException if the configured value is not the
        name of an Access member.
        """
        access = self.config.get("access", self.DEFAULT_ACCESS)
        try:
            # Look the name up among the enum members only. A plain
            # getattr() on the enum class would also accept unrelated
            # attributes inherited from int/type (e.g. "real", "mro",
            # "__doc__") and silently store a non-Access object.
            self.access = Access[access]
        except KeyError:
            # The KeyError carries no extra information for the user, so
            # it is not chained onto the configuration error.
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id,
            ) from None

    def _initialize_analyzer(self):
        """Create the analyzer now, if one has been configured."""
        if not self.analyzer_spec:
            return  # not configured, so assume it's not needed
        analyzer = self.analyzer
        logger.debug(
            "Project '%s': initialized analyzer: %s", self.project_id, str(analyzer)
        )

    def _initialize_subjects(self):
        """Load the subjects now; an AnnifException is logged as a warning
        instead of being propagated."""
        try:
            subjects = self.subjects
            logger.debug(
                "Project '%s': initialized subjects: %s", self.project_id, str(subjects)
            )
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self, parallel):
        """Create and initialize the backend, passing ``parallel`` on to it.

        An AnnifException raised along the way is logged as a warning
        instead of being propagated.
        """
        logger.debug("Project '%s': initializing backend", self.project_id)
        try:
            if not self.backend:
                logger.debug("Cannot initialize backend: does not exist")
                return
            self.backend.initialize(parallel)
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self, parallel=False):
        """Initialize this project and its backend so that they are ready to
        be used. If parallel is True, expect that the project will be used
        for parallel processing."""

        if self.initialized:
            return

        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_backend(parallel)

        self.initialized = True

    def _suggest_with_backend(self, texts, backend_params):
        """Pass ``texts`` to the backend's suggest() method.

        ``backend_params`` is an optional mapping keyed by backend id; only
        the entry for this project's backend (or an empty dict) is
        forwarded.
        """
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        return self.backend.suggest(texts, beparams)

    @property
    def analyzer(self):
        """The analyzer built from the "analyzer" setting (created on first
        access). Raises ConfigurationException if the setting is missing."""
        if self._analyzer is None:
            if self.analyzer_spec:
                self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
            else:
                raise ConfigurationException(
                    "analyzer setting is missing", project_id=self.project_id
                )
        return self._analyzer

    @property
    def transform(self):
        """The transform built from the "transform" setting (created on
        first access)."""
        if self._transform is None:
            self._transform = annif.transform.get_transform(
                self.transform_spec, project=self
            )
        return self._transform

    @property
    def backend(self):
        """The backend instance named by the "backend" setting (created on
        first access).

        Raises ConfigurationException if the setting is missing. If creating
        the backend raises ValueError, a warning is logged and None is
        returned; nothing is cached in that case, so the next access tries
        again.
        """
        if self._backend is None:
            if "backend" not in self.config:
                raise ConfigurationException(
                    "backend setting is missing", project_id=self.project_id
                )
            backend_id = self.config["backend"]
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, config_params=self.config, project=self
                )
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id,
                )
        return self._backend

    def _initialize_vocab(self):
        """Fetch the vocabulary and its language from the registry.

        Raises ConfigurationException if the "vocab" setting is missing.
        """
        if self.vocab_spec is None:
            raise ConfigurationException(
                "vocab setting is missing", project_id=self.project_id
            )
        self._vocab, self._vocab_lang = self.registry.get_vocab(
            self.vocab_spec, self.language
        )

    @property
    def vocab(self):
        """The vocabulary of this project (loaded on first access)."""
        if self._vocab is None:
            self._initialize_vocab()
        return self._vocab

    @property
    def vocab_lang(self):
        """The language reported by the registry for the vocabulary (loaded
        on first access)."""
        if self._vocab_lang is None:
            self._initialize_vocab()
        return self._vocab_lang

    @property
    def subjects(self):
        """The ``subjects`` attribute of the project vocabulary."""
        return self.vocab.subjects

    def _get_info(self, key):
        """Return attribute ``key`` of the backend.

        Returns None if there is no backend, or if an AnnifException is
        raised while obtaining the value (the error is logged as a
        warning).
        """
        try:
            be = self.backend
            if be is not None:
                return getattr(be, key)
        except AnnifException as err:
            logger.warning(err.format_message())
        return None

    @property
    def is_trained(self):
        """The backend's ``is_trained`` value, or None if it is unavailable."""
        return self._get_info("is_trained")

    @property
    def modification_time(self):
        """The backend's ``modification_time`` value, or None if it is
        unavailable."""
        return self._get_info("modification_time")

    def suggest_corpus(self, corpus, backend_params=None):
        """Suggest subjects for the given documents corpus in batches of documents.

        The batches are taken from ``corpus.doc_batches`` and the ``text``
        of each document is passed to suggest(). The result is a lazy
        iterator that chains the per-batch results together; suggest() is
        only called for a batch once the iterator reaches it.
        """
        suggestions = (
            self.suggest([doc.text for doc in doc_batch], backend_params)
            for doc_batch in corpus.doc_batches
        )
        return itertools.chain.from_iterable(suggestions)

    def suggest(self, texts, backend_params=None):
        """Suggest subjects for the given documents batch.

        Each text is run through the project transform before being handed
        to the backend. Raises NotInitializedException if the backend
        reports that it is not trained; if the train state cannot be
        determined (None), only a warning is logged and the suggestion is
        attempted anyway.
        """
        # Read the property once: every access goes through the backend
        # lookup again and may log a warning of its own.
        trained = self.is_trained
        if trained is None:
            logger.warning("Could not get train state information.")
        elif not trained:
            raise NotInitializedException("Project is not trained.")
        texts = [self.transform.transform_text(text) for text in texts]
        return self._suggest_with_backend(texts, backend_params)

    def train(self, corpus, backend_params=None, jobs=0):
        """train the project using documents from a metadata source

        ``corpus`` is run through the project transform unless it is the
        literal string "cached", which is passed to the backend unchanged.
        ``backend_params`` is an optional mapping keyed by backend id.
        """
        if corpus != "cached":
            corpus = self.transform.transform_corpus(corpus)
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        self.backend.train(corpus, beparams, jobs)

    def learn(self, corpus, backend_params=None):
        """further train the project using documents from a metadata source

        Raises NotSupportedException if the backend is not an
        AnnifLearningBackend.
        """
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        corpus = self.transform.transform_corpus(corpus)
        if isinstance(self.backend, annif.backend.backend.AnnifLearningBackend):
            self.backend.learn(corpus, beparams)
        else:
            raise NotSupportedException(
                "Learning not supported by backend", project_id=self.project_id
            )

    def hyperopt(self, corpus, trials, jobs, metric, results_file):
        """optimize the hyperparameters of the project using a validation
        corpus against a given metric

        Returns whatever the backend's optimizer returns from optimize().
        Raises NotSupportedException if the backend is not an
        AnnifHyperoptBackend.
        """
        if isinstance(self.backend, annif.backend.hyperopt.AnnifHyperoptBackend):
            optimizer = self.backend.get_hp_optimizer(corpus, metric)
            return optimizer.optimize(trials, jobs, results_file)

        raise NotSupportedException(
            "Hyperparameter optimization not supported by backend",
            project_id=self.project_id,
        )

    def dump(self):
        """return this project as a dict"""
        return {
            "project_id": self.project_id,
            "name": self.name,
            "language": self.language,
            "backend": {"backend_id": self.config.get("backend")},
            "is_trained": self.is_trained,
            "modification_time": self.modification_time,
        }

    def remove_model_data(self):
        """remove the data of this project

        Deletes the directory at ``self._datadir_path`` recursively if it
        exists; otherwise only a warning is logged.
        """
        datadir_path = self._datadir_path
        if os.path.isdir(datadir_path):
            rmtree(datadir_path)
            logger.info("Removed model data for project {}.".format(self.project_id))
        else:
            logger.warning(
                "No model data to remove for project {}.".format(self.project_id)
            )


1			"""Project management functionality for Annif"""
2
3			import enum
4			import itertools
5			import os.path
6			from shutil import rmtree
7
8			import annif
9			import annif.analyzer
10			import annif.backend
11			import annif.corpus
12			import annif.suggestion
13			import annif.transform
14			from annif.datadir import DatadirMixin
15			from annif.exception import (
16			AnnifException,
17			ConfigurationException,
18			NotInitializedException,
19			NotSupportedException,
20			)
21
22			logger = annif.logger
23
24
25			class Access(enum.IntEnum):
26			"""Enumeration of access levels for projects"""
27
28			private = 1
29			hidden = 2
30			public = 3
31
32
33			class AnnifProject(DatadirMixin):
34			"""Class representing the configuration of a single Annif project."""
35
36			# defaults for uninitialized instances
37			_transform = None
38			_analyzer = None
39			_backend = None
40			_vocab = None
41			_vocab_lang = None
42			initialized = False
43
44			# default values for configuration settings
45			DEFAULT_ACCESS = "public"
46
47			def __init__(self, project_id, config, datadir, registry):
48			DatadirMixin.__init__(self, datadir, "projects", project_id)
49			self.project_id = project_id
50			self.name = config.get("name", project_id)
51			self.language = config["language"]
52			self.analyzer_spec = config.get("analyzer", None)
53			self.transform_spec = config.get("transform", "pass")
54			self.vocab_spec = config.get("vocab", None)
55			self.config = config
56			self._base_datadir = datadir
57			self.registry = registry
58			self._init_access()
59
60			def _init_access(self):
61			access = self.config.get("access", self.DEFAULT_ACCESS)
62			try:
63			self.access = getattr(Access, access)
64			except AttributeError:
65			raise ConfigurationException(
66			"'{}' is not a valid access setting".format(access),
67			project_id=self.project_id,
68			)
69
70			def _initialize_analyzer(self):
71			if not self.analyzer_spec:
72			return # not configured, so assume it's not needed
73			analyzer = self.analyzer
74			logger.debug(
75			"Project '%s': initialized analyzer: %s", self.project_id, str(analyzer)
76			)
77
78			def _initialize_subjects(self):
79			try:
80			subjects = self.subjects
81			logger.debug(
82			"Project '%s': initialized subjects: %s", self.project_id, str(subjects)
83			)
84			except AnnifException as err:
85			logger.warning(err.format_message())
86
87			def _initialize_backend(self, parallel):
88			logger.debug("Project '%s': initializing backend", self.project_id)
89			try:
90			if not self.backend:
91			logger.debug("Cannot initialize backend: does not exist")
92			return
93			self.backend.initialize(parallel)
94			except AnnifException as err:
95			logger.warning(err.format_message())
96
97			def initialize(self, parallel=False):
98			"""Initialize this project and its backend so that they are ready to
99			be used. If parallel is True, expect that the project will be used
100			for parallel processing."""
101
102			if self.initialized:
103			return
104
105			logger.debug("Initializing project '%s'", self.project_id)
106
107			self._initialize_analyzer()
108			self._initialize_subjects()
109			self._initialize_backend(parallel)
110
111			self.initialized = True
112
113			def _suggest_with_backend(self, texts, backend_params):
114			if backend_params is None:
115			backend_params = {}
116			beparams = backend_params.get(self.backend.backend_id, {})
117			return self.backend.suggest(texts, beparams)
118
119			@property
120			def analyzer(self):
121			if self._analyzer is None:
122			if self.analyzer_spec:
123			self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
124			else:
125			raise ConfigurationException(
126			"analyzer setting is missing", project_id=self.project_id
127			)
128			return self._analyzer
129
130			@property
131			def transform(self):
132			if self._transform is None:
133			self._transform = annif.transform.get_transform(
134			self.transform_spec, project=self
135			)
136			return self._transform
137
138			@property
139			def backend(self):
140			if self._backend is None:
141			if "backend" not in self.config:
142			raise ConfigurationException(
143			"backend setting is missing", project_id=self.project_id
144			)
145			backend_id = self.config["backend"]
146			try:
147			backend_class = annif.backend.get_backend(backend_id)
148			self._backend = backend_class(
149			backend_id, config_params=self.config, project=self
150			)
151			except ValueError:
152			logger.warning(
153			"Could not create backend %s, "
154			"make sure you've installed optional dependencies",
155			backend_id,
156			)
157			return self._backend
158
159			def _initialize_vocab(self):
160			if self.vocab_spec is None:
161			raise ConfigurationException(
162			"vocab setting is missing", project_id=self.project_id
163			)
164			self._vocab, self._vocab_lang = self.registry.get_vocab(
165			self.vocab_spec, self.language
166			)
167
168			@property
169			def vocab(self):
170			if self._vocab is None:
171			self._initialize_vocab()
172			return self._vocab
173
174			@property
175			def vocab_lang(self):
176			if self._vocab_lang is None:
177			self._initialize_vocab()
178			return self._vocab_lang
179
180			@property
181			def subjects(self):
182			return self.vocab.subjects
183
184			def _get_info(self, key):
185			try:
186			be = self.backend
187			if be is not None:
188			return getattr(be, key)
189			except AnnifException as err:
190			logger.warning(err.format_message())
191			return None
192
193			@property
194			def is_trained(self):
195			return self._get_info("is_trained")
196
197			@property
198			def modification_time(self):
199			return self._get_info("modification_time")
200
201			def suggest_corpus(self, corpus, backend_params=None):
202			"""Suggest subjects for the given documents corpus in batches of documents."""
203			suggestions = (
204			self.suggest([doc.text for doc in doc_batch], backend_params)
205			for doc_batch in corpus.doc_batches
206			)
207			return itertools.chain.from_iterable(suggestions)
208
209			def suggest(self, texts, backend_params=None):
210			"""Suggest subjects for the given documents batch."""
211			if not self.is_trained:
212			if self.is_trained is None:
213			logger.warning("Could not get train state information.")
214			else:
215			raise NotInitializedException("Project is not trained.")
216			texts = [self.transform.transform_text(text) for text in texts]
217			return self._suggest_with_backend(texts, backend_params)
218
219			def train(self, corpus, backend_params=None, jobs=0):
220			"""train the project using documents from a metadata source"""
221			if corpus != "cached":
222			corpus = self.transform.transform_corpus(corpus)
223			if backend_params is None:
224			backend_params = {}
225			beparams = backend_params.get(self.backend.backend_id, {})
226			self.backend.train(corpus, beparams, jobs)
227
228			def learn(self, corpus, backend_params=None):
229			"""further train the project using documents from a metadata source"""
230			if backend_params is None:
231			backend_params = {}
232			beparams = backend_params.get(self.backend.backend_id, {})
233			corpus = self.transform.transform_corpus(corpus)
234			if isinstance(self.backend, annif.backend.backend.AnnifLearningBackend):
235			self.backend.learn(corpus, beparams)
236			else:
237			raise NotSupportedException(
238			"Learning not supported by backend", project_id=self.project_id
239			)
240
241			def hyperopt(self, corpus, trials, jobs, metric, results_file):
242			"""optimize the hyperparameters of the project using a validation
243			corpus against a given metric"""
244			if isinstance(self.backend, annif.backend.hyperopt.AnnifHyperoptBackend):
245			optimizer = self.backend.get_hp_optimizer(corpus, metric)
246			return optimizer.optimize(trials, jobs, results_file)
247
248			raise NotSupportedException(
249			"Hyperparameter optimization not supported " "by backend",
250			project_id=self.project_id,
251			)
252
253			def dump(self):
254			"""return this project as a dict"""
255			return {
256			"project_id": self.project_id,
257			"name": self.name,
258			"language": self.language,
259			"backend": {"backend_id": self.config.get("backend")},
260			"is_trained": self.is_trained,
261			"modification_time": self.modification_time,
262			}
263
264			def remove_model_data(self):
265			"""remove the data of this project"""
266			datadir_path = self._datadir_path
267			if os.path.isdir(datadir_path):
268			rmtree(datadir_path)
269			logger.info("Removed model data for project {}.".format(self.project_id))
270			else:
271			logger.warning(
272			"No model data to remove for project {}.".format(self.project_id)
273			)
274

NatLibFi / Annif

Pull Request — master (#663)

annif.project B

Complexity

Size/Duplication

Importance

24 Methods

How to fix Complexity

Complexity

Duplication Side-by-Side

Filter issues like