annif.project.AnnifProject._suggest_batch_with_backend() - Code Metrics - Inspection of "Support for batch suggest operations in suggest an..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#663)

by Juho

created 2023-01-24 14:02 UTC

AnnifProject._suggest_batch_with_backend() A

↳ Parent: annif.project

Complexity

Conditions

Size

Total Lines	9
Code Lines	8

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	2
eloc	8
nop	3
dl	0
loc	9
rs	10
c	0
b	0
f	0

"""Project management functionality for Annif"""

import enum
import logging
import os.path
from shutil import rmtree

import annif
import annif.analyzer
import annif.backend
import annif.corpus
import annif.suggestion
import annif.transform
from annif.datadir import DatadirMixin
from annif.exception import (
    AnnifException,
    ConfigurationException,
    NotInitializedException,
    NotSupportedException,
)

logger = annif.logger


class Access(enum.IntEnum):
    """Access levels that can be assigned to a project.

    The members are integers (``IntEnum``), so they can be compared and
    ordered numerically: ``private < hidden < public``. The member names
    are the values accepted for a project's ``access`` configuration
    setting.
    """

    # lowest numeric level
    private = 1
    # intermediate numeric level
    hidden = 2
    # highest numeric level; also the name used as the default access setting
    public = 3


class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project.

    The analyzer, transform, backend and vocabulary objects are created
    lazily on first access through the corresponding properties and then
    cached on the instance.
    """

    # defaults for uninitialized instances
    _transform = None
    _analyzer = None
    _backend = None
    _vocab = None
    _vocab_lang = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = "public"

    def __init__(self, project_id, config, datadir, registry):
        """Create a project from its configuration section.

        project_id -- identifier of the project
        config -- mapping-like object holding the project settings; the
            "language" key is required, while "name", "analyzer",
            "transform", "vocab", "access" and "backend" are read with
            fallbacks or on demand
        datadir -- base data directory, passed on to DatadirMixin
        registry -- object providing get_vocab(vocab_spec, language)

        Raises ConfigurationException if the "access" setting is not the
        name of an Access member.
        """
        DatadirMixin.__init__(self, datadir, "projects", project_id)
        self.project_id = project_id
        self.name = config.get("name", project_id)
        self.language = config["language"]
        self.analyzer_spec = config.get("analyzer", None)
        self.transform_spec = config.get("transform", "pass")
        self.vocab_spec = config.get("vocab", None)
        self.config = config
        self._base_datadir = datadir
        self.registry = registry
        self._init_access()

    def _init_access(self):
        """Resolve the "access" setting into an Access member.

        Raises ConfigurationException if the setting is not the name of an
        Access member.
        """
        access = self.config.get("access", self.DEFAULT_ACCESS)
        try:
            # Look the name up among the enum *members* only. getattr() on
            # the enum class would also accept unrelated class attributes
            # (e.g. "mro" or "__doc__") and store a non-Access object.
            self.access = Access[access]
        except KeyError:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id,
            )

    def _initialize_analyzer(self):
        """Instantiate the analyzer if one has been configured."""
        if not self.analyzer_spec:
            return  # not configured, so assume it's not needed
        analyzer = self.analyzer
        logger.debug(
            "Project '%s': initialized analyzer: %s", self.project_id, str(analyzer)
        )

    def _initialize_subjects(self):
        """Load the subjects of the vocabulary; AnnifException failures are
        logged as warnings instead of being raised."""
        try:
            subjects = self.subjects
            logger.debug(
                "Project '%s': initialized subjects: %s", self.project_id, str(subjects)
            )
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self, parallel):
        """Create and initialize the backend; AnnifException failures are
        logged as warnings instead of being raised."""
        logger.debug("Project '%s': initializing backend", self.project_id)
        try:
            if not self.backend:
                logger.debug("Cannot initialize backend: does not exist")
                return
            self.backend.initialize(parallel)
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self, parallel=False):
        """Initialize this project and its backend so that they are ready to
        be used. If parallel is True, expect that the project will be used
        for parallel processing."""

        if self.initialized:
            return

        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_backend(parallel)

        self.initialized = True

    def _suggest_with_backend(self, text, backend_params):
        """Pass a single text to the backend and return its hits.

        backend_params -- None, or a mapping from backend id to a dict of
            parameters; only the entry for this project's backend is used.
        """
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        hits = self.backend.suggest(text, beparams)
        logger.debug("Got %d hits from backend %s", len(hits), self.backend.backend_id)
        return hits

    def _suggest_batch_with_backend(self, corpus, backend_params):
        """Pass a document corpus to the backend's batch suggest operation
        and return the resulting hit sets.

        backend_params -- None, or a mapping from backend id to a dict of
            parameters; only the entry for this project's backend is used.
        """
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        hit_sets = self.backend.suggest_batch(corpus, beparams)
        logger.debug(
            "Got %d hit sets from backend %s", len(hit_sets), self.backend.backend_id
        )
        return hit_sets

    @property
    def analyzer(self):
        """The analyzer built from the "analyzer" setting (created on first
        access). Raises ConfigurationException if the setting is missing."""
        if self._analyzer is None:
            if self.analyzer_spec:
                self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
            else:
                raise ConfigurationException(
                    "analyzer setting is missing", project_id=self.project_id
                )
        return self._analyzer

    @property
    def transform(self):
        """The transform built from the "transform" setting (created on
        first access; the setting defaults to "pass")."""
        if self._transform is None:
            self._transform = annif.transform.get_transform(
                self.transform_spec, project=self
            )
        return self._transform

    @property
    def backend(self):
        """The backend instance for this project (created on first access).

        Raises ConfigurationException if the "backend" setting is missing.
        If creating the backend raises ValueError, a warning is logged and
        None is returned; callers must be prepared for a None backend.
        """
        if self._backend is None:
            if "backend" not in self.config:
                raise ConfigurationException(
                    "backend setting is missing", project_id=self.project_id
                )
            backend_id = self.config["backend"]
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, config_params=self.config, project=self
                )
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id,
                )
        return self._backend

    def _initialize_vocab(self):
        """Fetch the vocabulary and its language from the registry.

        Raises ConfigurationException if the "vocab" setting is missing.
        """
        if self.vocab_spec is None:
            raise ConfigurationException(
                "vocab setting is missing", project_id=self.project_id
            )
        self._vocab, self._vocab_lang = self.registry.get_vocab(
            self.vocab_spec, self.language
        )

    @property
    def vocab(self):
        """The vocabulary object returned by the registry (loaded on first
        access)."""
        if self._vocab is None:
            self._initialize_vocab()
        return self._vocab

    @property
    def vocab_lang(self):
        """The vocabulary language returned by the registry (loaded on
        first access)."""
        if self._vocab_lang is None:
            self._initialize_vocab()
        return self._vocab_lang

    @property
    def subjects(self):
        """The subjects of this project's vocabulary."""
        return self.vocab.subjects

    def _get_info(self, key):
        """Return attribute `key` of the backend.

        Returns None when there is no backend or when accessing the backend
        or the attribute raises AnnifException (which is logged as a
        warning).
        """
        try:
            be = self.backend
            if be is not None:
                return getattr(be, key)
        except AnnifException as err:
            logger.warning(err.format_message())
        return None

    @property
    def is_trained(self):
        """The backend's is_trained value, or None if it is unavailable."""
        return self._get_info("is_trained")

    @property
    def modification_time(self):
        """The backend's modification_time value, or None if it is
        unavailable."""
        return self._get_info("modification_time")

    def _require_trained(self):
        """Verify that the project can be used for suggesting.

        The train state is read only once, so the backend is queried (and a
        possible warning logged) a single time. An unknown state (None)
        only produces a warning; a known falsy state raises
        NotInitializedException.
        """
        trained = self.is_trained
        if trained:
            return
        if trained is None:
            logger.warning("Could not get train state information.")
        else:
            raise NotInitializedException("Project is not trained.")

    def suggest(self, text, backend_params=None):
        """Suggest subjects for the given text by passing it to the backend.
        Returns a list of SubjectSuggestion objects ordered by decreasing
        score.

        Raises NotInitializedException if the project is known to be
        untrained."""
        self._require_trained()
        logger.debug(
            'Suggesting subjects for text "%s..." (len=%d)', text[:20], len(text)
        )
        text = self.transform.transform_text(text)
        hits = self._suggest_with_backend(text, backend_params)
        logger.debug("%d hits from backend", len(hits))
        return hits

    def suggest_batch(self, corpus, backend_params=None):
        """Suggest subjects for the given documents using batches of documents in their
        operations when possible.

        Raises NotInitializedException if the project is known to be
        untrained."""
        self._require_trained()
        corpus = self.transform.transform_corpus(corpus)
        # Counting the documents iterates over the whole transformed corpus,
        # so only do it when the debug message will actually be emitted.
        if logger.isEnabledFor(logging.DEBUG):
            doc_count = sum(1 for _ in corpus.documents)
            logger.debug("Suggesting subjects for a batch of %d documents", doc_count)
        return self._suggest_batch_with_backend(corpus, backend_params)

    def train(self, corpus, backend_params=None, jobs=0):
        """train the project using documents from a metadata source

        If corpus is the string "cached" it is handed to the backend as is;
        any other corpus is first passed through the project's transform.
        """
        if corpus != "cached":
            corpus = self.transform.transform_corpus(corpus)
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        self.backend.train(corpus, beparams, jobs)

    def learn(self, corpus, backend_params=None):
        """further train the project using documents from a metadata source

        Raises NotSupportedException if the backend is not an
        AnnifLearningBackend.
        """
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        corpus = self.transform.transform_corpus(corpus)
        if isinstance(self.backend, annif.backend.backend.AnnifLearningBackend):
            self.backend.learn(corpus, beparams)
        else:
            raise NotSupportedException(
                "Learning not supported by backend", project_id=self.project_id
            )

    def hyperopt(self, corpus, trials, jobs, metric, results_file):
        """optimize the hyperparameters of the project using a validation
        corpus against a given metric

        Raises NotSupportedException if the backend is not an
        AnnifHyperoptBackend.
        """
        if isinstance(self.backend, annif.backend.hyperopt.AnnifHyperoptBackend):
            optimizer = self.backend.get_hp_optimizer(corpus, metric)
            return optimizer.optimize(trials, jobs, results_file)

        raise NotSupportedException(
            "Hyperparameter optimization not supported by backend",
            project_id=self.project_id,
        )

    def dump(self):
        """return this project as a dict"""
        return {
            "project_id": self.project_id,
            "name": self.name,
            "language": self.language,
            "backend": {"backend_id": self.config.get("backend")},
            "is_trained": self.is_trained,
            "modification_time": self.modification_time,
        }

    def remove_model_data(self):
        """remove the data of this project

        Deletes the project's data directory recursively if it exists;
        otherwise only logs a warning.
        """
        datadir_path = self._datadir_path
        if os.path.isdir(datadir_path):
            rmtree(datadir_path)
            logger.info("Removed model data for project %s.", self.project_id)
        else:
            logger.warning("No model data to remove for project %s.", self.project_id)


1			"""Project management functionality for Annif"""
2
3			import enum
4			import os.path
5			from shutil import rmtree
6
7			import annif
8			import annif.analyzer
9			import annif.backend
10			import annif.corpus
11			import annif.suggestion
12			import annif.transform
13			from annif.datadir import DatadirMixin
14			from annif.exception import (
15			AnnifException,
16			ConfigurationException,
17			NotInitializedException,
18			NotSupportedException,
19			)
20
21			logger = annif.logger
22
23
24			class Access(enum.IntEnum):
25			"""Enumeration of access levels for projects"""
26
27			private = 1
28			hidden = 2
29			public = 3
30
31
32			class AnnifProject(DatadirMixin):
33			"""Class representing the configuration of a single Annif project."""
34
35			# defaults for uninitialized instances
36			_transform = None
37			_analyzer = None
38			_backend = None
39			_vocab = None
40			_vocab_lang = None
41			initialized = False
42
43			# default values for configuration settings
44			DEFAULT_ACCESS = "public"
45
46			def __init__(self, project_id, config, datadir, registry):
47			DatadirMixin.__init__(self, datadir, "projects", project_id)
48			self.project_id = project_id
49			self.name = config.get("name", project_id)
50			self.language = config["language"]
51			self.analyzer_spec = config.get("analyzer", None)
52			self.transform_spec = config.get("transform", "pass")
53			self.vocab_spec = config.get("vocab", None)
54			self.config = config
55			self._base_datadir = datadir
56			self.registry = registry
57			self._init_access()
58
59			def _init_access(self):
60			access = self.config.get("access", self.DEFAULT_ACCESS)
61			try:
62			self.access = getattr(Access, access)
63			except AttributeError:
64			raise ConfigurationException(
65			"'{}' is not a valid access setting".format(access),
66			project_id=self.project_id,
67			)
68
69			def _initialize_analyzer(self):
70			if not self.analyzer_spec:
71			return # not configured, so assume it's not needed
72			analyzer = self.analyzer
73			logger.debug(
74			"Project '%s': initialized analyzer: %s", self.project_id, str(analyzer)
75			)
76
77			def _initialize_subjects(self):
78			try:
79			subjects = self.subjects
80			logger.debug(
81			"Project '%s': initialized subjects: %s", self.project_id, str(subjects)
82			)
83			except AnnifException as err:
84			logger.warning(err.format_message())
85
86			def _initialize_backend(self, parallel):
87			logger.debug("Project '%s': initializing backend", self.project_id)
88			try:
89			if not self.backend:
90			logger.debug("Cannot initialize backend: does not exist")
91			return
92			self.backend.initialize(parallel)
93			except AnnifException as err:
94			logger.warning(err.format_message())
95
96			def initialize(self, parallel=False):
97			"""Initialize this project and its backend so that they are ready to
98			be used. If parallel is True, expect that the project will be used
99			for parallel processing."""
100
101			if self.initialized:
102			return
103
104			logger.debug("Initializing project '%s'", self.project_id)
105
106			self._initialize_analyzer()
107			self._initialize_subjects()
108			self._initialize_backend(parallel)
109
110			self.initialized = True
111
112			def _suggest_with_backend(self, text, backend_params):
113			if backend_params is None:
114			backend_params = {}
115			beparams = backend_params.get(self.backend.backend_id, {})
116			hits = self.backend.suggest(text, beparams)
117			logger.debug("Got %d hits from backend %s", len(hits), self.backend.backend_id)
118			return hits
119
120			def _suggest_batch_with_backend(self, corpus, backend_params):
121			if backend_params is None:
122			backend_params = {}
123			beparams = backend_params.get(self.backend.backend_id, {})
124			hit_sets = self.backend.suggest_batch(corpus, beparams)
125			logger.debug(
126			"Got %d hit sets from backend %s", len(hit_sets), self.backend.backend_id
127			)
128			return hit_sets
129
130			@property
131			def analyzer(self):
132			if self._analyzer is None:
133			if self.analyzer_spec:
134			self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
135			else:
136			raise ConfigurationException(
137			"analyzer setting is missing", project_id=self.project_id
138			)
139			return self._analyzer
140
141			@property
142			def transform(self):
143			if self._transform is None:
144			self._transform = annif.transform.get_transform(
145			self.transform_spec, project=self
146			)
147			return self._transform
148
149			@property
150			def backend(self):
151			if self._backend is None:
152			if "backend" not in self.config:
153			raise ConfigurationException(
154			"backend setting is missing", project_id=self.project_id
155			)
156			backend_id = self.config["backend"]
157			try:
158			backend_class = annif.backend.get_backend(backend_id)
159			self._backend = backend_class(
160			backend_id, config_params=self.config, project=self
161			)
162			except ValueError:
163			logger.warning(
164			"Could not create backend %s, "
165			"make sure you've installed optional dependencies",
166			backend_id,
167			)
168			return self._backend
169
170			def _initialize_vocab(self):
171			if self.vocab_spec is None:
172			raise ConfigurationException(
173			"vocab setting is missing", project_id=self.project_id
174			)
175			self._vocab, self._vocab_lang = self.registry.get_vocab(
176			self.vocab_spec, self.language
177			)
178
179			@property
180			def vocab(self):
181			if self._vocab is None:
182			self._initialize_vocab()
183			return self._vocab
184
185			@property
186			def vocab_lang(self):
187			if self._vocab_lang is None:
188			self._initialize_vocab()
189			return self._vocab_lang
190
191			@property
192			def subjects(self):
193			return self.vocab.subjects
194
195			def _get_info(self, key):
196			try:
197			be = self.backend
198			if be is not None:
199			return getattr(be, key)
200			except AnnifException as err:
201			logger.warning(err.format_message())
202			return None
203
204			@property
205			def is_trained(self):
206			return self._get_info("is_trained")
207
208			@property
209			def modification_time(self):
210			return self._get_info("modification_time")
211
212			def suggest(self, text, backend_params=None):
213			"""Suggest subjects the given text by passing it to the backend. Returns a
214			list of SubjectSuggestion objects ordered by decreasing score."""
215			if not self.is_trained:
216			if self.is_trained is None:
217			logger.warning("Could not get train state information.")
218			else:
219			raise NotInitializedException("Project is not trained.")
220			logger.debug(
221			'Suggesting subjects for text "%s..." (len=%d)', text[:20], len(text)
222			)
223			text = self.transform.transform_text(text)
224			hits = self._suggest_with_backend(text, backend_params)
225			logger.debug("%d hits from backend", len(hits))
226			return hits
227
228			def suggest_batch(self, corpus, backend_params=None):
229			"""Suggest subjects for the given documents using batches of documents in their
230			operations when possible."""
231			if not self.is_trained:
232			if self.is_trained is None:
233			logger.warning("Could not get train state information.")
234			else:
235			raise NotInitializedException("Project is not trained.")
236			corpus = self.transform.transform_corpus(corpus)
237			logger.debug(
238			f"Suggesting subjects for a batch of {sum(1 for _ in corpus.documents)}"
239			" documents"
240			)
241			return self._suggest_batch_with_backend(corpus, backend_params)
242
243			def train(self, corpus, backend_params=None, jobs=0):
244			"""train the project using documents from a metadata source"""
245			if corpus != "cached":
246			corpus = self.transform.transform_corpus(corpus)
247			if backend_params is None:
248			backend_params = {}
249			beparams = backend_params.get(self.backend.backend_id, {})
250			self.backend.train(corpus, beparams, jobs)
251
252			def learn(self, corpus, backend_params=None):
253			"""further train the project using documents from a metadata source"""
254			if backend_params is None:
255			backend_params = {}
256			beparams = backend_params.get(self.backend.backend_id, {})
257			corpus = self.transform.transform_corpus(corpus)
258			if isinstance(self.backend, annif.backend.backend.AnnifLearningBackend):
259			self.backend.learn(corpus, beparams)
260			else:
261			raise NotSupportedException(
262			"Learning not supported by backend", project_id=self.project_id
263			)
264
265			def hyperopt(self, corpus, trials, jobs, metric, results_file):
266			"""optimize the hyperparameters of the project using a validation
267			corpus against a given metric"""
268			if isinstance(self.backend, annif.backend.hyperopt.AnnifHyperoptBackend):
269			optimizer = self.backend.get_hp_optimizer(corpus, metric)
270			return optimizer.optimize(trials, jobs, results_file)
271
272			raise NotSupportedException(
273			"Hyperparameter optimization not supported " "by backend",
274			project_id=self.project_id,
275			)
276
277			def dump(self):
278			"""return this project as a dict"""
279			return {
280			"project_id": self.project_id,
281			"name": self.name,
282			"language": self.language,
283			"backend": {"backend_id": self.config.get("backend")},
284			"is_trained": self.is_trained,
285			"modification_time": self.modification_time,
286			}
287
288			def remove_model_data(self):
289			"""remove the data of this project"""
290			datadir_path = self._datadir_path
291			if os.path.isdir(datadir_path):
292			rmtree(datadir_path)
293			logger.info("Removed model data for project {}.".format(self.project_id))
294			else:
295			logger.warning(
296			"No model data to remove for project {}.".format(self.project_id)
297			)
298

NatLibFi / Annif

Pull Request — master (#663)

AnnifProject._suggest_batch_with_backend() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like