Passed
Pull Request against master (#663) by Juho
created 03:19

annif.project (rating: C)

Complexity
    Total Complexity: 55

Size/Duplication
    Total Lines: 298
    Duplicated Lines: 0 %

Importance
    Changes: 0
Metric    Value
eloc      217
dl        0
loc       298
rs        6
c         0
b         0
f         0
wmc       55

26 Methods

Rating   Name   Duplication   Size   Complexity  
A AnnifProject._get_info() 0 8 3
A AnnifProject.subjects() 0 3 1
A AnnifProject.modification_time() 0 3 1
A AnnifProject._initialize_vocab() 0 7 2
A AnnifProject.suggest() 0 15 3
A AnnifProject.analyzer() 0 10 3
A AnnifProject.transform() 0 7 2
A AnnifProject.vocab_lang() 0 5 2
A AnnifProject.is_trained() 0 3 1
A AnnifProject.vocab() 0 5 2
A AnnifProject.backend() 0 20 4
A AnnifProject._initialize_subjects() 0 8 2
A AnnifProject._initialize_backend() 0 9 3
A AnnifProject._init_access() 0 8 2
A AnnifProject._suggest_with_backend() 0 7 2
A AnnifProject.__init__() 0 12 1
A AnnifProject._initialize_analyzer() 0 6 2
A AnnifProject.initialize() 0 15 2
A AnnifProject._suggest_batch_with_backend() 0 5 2
A AnnifProject.remove_model_data() 0 9 2
A AnnifProject.dump() 0 9 1
A AnnifProject.hyperopt() 0 10 2
A AnnifProject.suggest_corpus() 0 7 1
A AnnifProject.train() 0 8 3
A AnnifProject.learn() 0 11 3
A AnnifProject.suggest_batch() 0 9 3

How to fix: Complexity

Complex classes like annif.project often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
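In annif.project, one such cohesive group is the vocabulary access: the _vocab and _vocab_lang fields together with _initialize_vocab(), vocab, vocab_lang and subjects all share the vocab prefix or depend on it. The fragment below is a minimal, hypothetical sketch of what extracting that group could look like; the ProjectVocabAccess name and its constructor are illustrative assumptions, not code from the Annif repository.

# Hypothetical Extract Class sketch (ProjectVocabAccess is not part of Annif):
# the vocab-related state and lazy initialization move out of AnnifProject.
from annif.exception import ConfigurationException


class ProjectVocabAccess:
    """Lazily resolved access to the vocabulary configured for a project."""

    def __init__(self, project_id, vocab_spec, language, registry):
        self.project_id = project_id
        self.vocab_spec = vocab_spec
        self.language = language
        self.registry = registry
        self._vocab = None
        self._vocab_lang = None

    def _initialize(self):
        # same checks and registry lookup as AnnifProject._initialize_vocab()
        if self.vocab_spec is None:
            raise ConfigurationException(
                "vocab setting is missing", project_id=self.project_id
            )
        self._vocab, self._vocab_lang = self.registry.get_vocab(
            self.vocab_spec, self.language
        )

    @property
    def vocab(self):
        if self._vocab is None:
            self._initialize()
        return self._vocab

    @property
    def vocab_lang(self):
        if self._vocab_lang is None:
            self._initialize()
        return self._vocab_lang

    @property
    def subjects(self):
        return self.vocab.subjects

AnnifProject would then hold a single instance of this helper and delegate its vocab, vocab_lang and subjects properties to it, moving several lazily initialized fields and their branches out of the 55-point WMC without changing the class's public API. The annif.project module as analysed in this inspection follows below.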

"""Project management functionality for Annif"""

import enum
import itertools
import os.path
from shutil import rmtree

import annif
import annif.analyzer
import annif.backend
import annif.corpus
import annif.suggestion
import annif.transform
from annif.datadir import DatadirMixin
from annif.exception import (
    AnnifException,
    ConfigurationException,
    NotInitializedException,
    NotSupportedException,
)

logger = annif.logger


class Access(enum.IntEnum):
    """Enumeration of access levels for projects"""

    private = 1
    hidden = 2
    public = 3


class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project."""

    # defaults for uninitialized instances
    _transform = None
    _analyzer = None
    _backend = None
    _vocab = None
    _vocab_lang = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = "public"
    DOC_BATCH_SIZE = 32

    def __init__(self, project_id, config, datadir, registry):
        DatadirMixin.__init__(self, datadir, "projects", project_id)
        self.project_id = project_id
        self.name = config.get("name", project_id)
        self.language = config["language"]
        self.analyzer_spec = config.get("analyzer", None)
        self.transform_spec = config.get("transform", "pass")
        self.vocab_spec = config.get("vocab", None)
        self.config = config
        self._base_datadir = datadir
        self.registry = registry
        self._init_access()

    def _init_access(self):
        access = self.config.get("access", self.DEFAULT_ACCESS)
        try:
            self.access = getattr(Access, access)
        except AttributeError:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id,
            )

    def _initialize_analyzer(self):
        if not self.analyzer_spec:
            return  # not configured, so assume it's not needed
        analyzer = self.analyzer
        logger.debug(
            "Project '%s': initialized analyzer: %s", self.project_id, str(analyzer)
        )

    def _initialize_subjects(self):
        try:
            subjects = self.subjects
            logger.debug(
                "Project '%s': initialized subjects: %s", self.project_id, str(subjects)
            )
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self, parallel):
        logger.debug("Project '%s': initializing backend", self.project_id)
        try:
            if not self.backend:
                logger.debug("Cannot initialize backend: does not exist")
                return
            self.backend.initialize(parallel)
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self, parallel=False):
        """Initialize this project and its backend so that they are ready to
        be used. If parallel is True, expect that the project will be used
        for parallel processing."""

        if self.initialized:
            return

        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_backend(parallel)

        self.initialized = True

    def _suggest_with_backend(self, text, backend_params):
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        hits = self.backend.suggest(text, beparams)
        logger.debug("Got %d hits from backend %s", len(hits), self.backend.backend_id)
        return hits

    def _suggest_batch_with_backend(self, texts, backend_params):
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        return self.backend.suggest_batch(texts, beparams)

    @property
    def analyzer(self):
        if self._analyzer is None:
            if self.analyzer_spec:
                self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
            else:
                raise ConfigurationException(
                    "analyzer setting is missing", project_id=self.project_id
                )
        return self._analyzer

    @property
    def transform(self):
        if self._transform is None:
            self._transform = annif.transform.get_transform(
                self.transform_spec, project=self
            )
        return self._transform

    @property
    def backend(self):
        if self._backend is None:
            if "backend" not in self.config:
                raise ConfigurationException(
                    "backend setting is missing", project_id=self.project_id
                )
            backend_id = self.config["backend"]
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, config_params=self.config, project=self
                )
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id,
                )
        return self._backend

    def _initialize_vocab(self):
        if self.vocab_spec is None:
            raise ConfigurationException(
                "vocab setting is missing", project_id=self.project_id
            )
        self._vocab, self._vocab_lang = self.registry.get_vocab(
            self.vocab_spec, self.language
        )

    @property
    def vocab(self):
        if self._vocab is None:
            self._initialize_vocab()
        return self._vocab

    @property
    def vocab_lang(self):
        if self._vocab_lang is None:
            self._initialize_vocab()
        return self._vocab_lang

    @property
    def subjects(self):
        return self.vocab.subjects

    def _get_info(self, key):
        try:
            be = self.backend
            if be is not None:
                return getattr(be, key)
        except AnnifException as err:
            logger.warning(err.format_message())
            return None

    @property
    def is_trained(self):
        return self._get_info("is_trained")

    @property
    def modification_time(self):
        return self._get_info("modification_time")

    def suggest(self, text, backend_params=None):
        """Suggest subjects for the given text by passing it to the backend.
        Returns a list of SubjectSuggestion objects ordered by decreasing score."""
        if not self.is_trained:
            if self.is_trained is None:
                logger.warning("Could not get train state information.")
            else:
                raise NotInitializedException("Project is not trained.")
        logger.debug(
            'Suggesting subjects for text "%s..." (len=%d)', text[:20], len(text)
        )
        text = self.transform.transform_text(text)
        hits = self._suggest_with_backend(text, backend_params)
        logger.debug("%d hits from backend", len(hits))
        return hits

    def suggest_corpus(self, corpus, backend_params=None):
        """Suggest subjects for the given documents corpus in batches of documents."""
        suggestions = (
            self.suggest_batch([doc.text for doc in doc_batch], backend_params)
            for doc_batch in corpus.doc_batches(self.DOC_BATCH_SIZE)
        )
        return itertools.chain.from_iterable(suggestions)

    def suggest_batch(self, texts, backend_params=None):
        """Suggest subjects for the given documents batch."""
        if not self.is_trained:
            if self.is_trained is None:
                logger.warning("Could not get train state information.")
            else:
                raise NotInitializedException("Project is not trained.")
        texts = [self.transform.transform_text(text) for text in texts]
        return self._suggest_batch_with_backend(texts, backend_params)

    def train(self, corpus, backend_params=None, jobs=0):
        """train the project using documents from a metadata source"""
        if corpus != "cached":
            corpus = self.transform.transform_corpus(corpus)
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        self.backend.train(corpus, beparams, jobs)

    def learn(self, corpus, backend_params=None):
        """further train the project using documents from a metadata source"""
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        corpus = self.transform.transform_corpus(corpus)
        if isinstance(self.backend, annif.backend.backend.AnnifLearningBackend):
            self.backend.learn(corpus, beparams)
        else:
            raise NotSupportedException(
                "Learning not supported by backend", project_id=self.project_id
            )

    def hyperopt(self, corpus, trials, jobs, metric, results_file):
        """optimize the hyperparameters of the project using a validation
        corpus against a given metric"""
        if isinstance(self.backend, annif.backend.hyperopt.AnnifHyperoptBackend):
            optimizer = self.backend.get_hp_optimizer(corpus, metric)
            return optimizer.optimize(trials, jobs, results_file)

        raise NotSupportedException(
            "Hyperparameter optimization not supported by backend",
            project_id=self.project_id,
        )

    def dump(self):
        """return this project as a dict"""
        return {
            "project_id": self.project_id,
            "name": self.name,
            "language": self.language,
            "backend": {"backend_id": self.config.get("backend")},
            "is_trained": self.is_trained,
            "modification_time": self.modification_time,
        }

    def remove_model_data(self):
        """remove the data of this project"""
        datadir_path = self._datadir_path
        if os.path.isdir(datadir_path):
            rmtree(datadir_path)
            logger.info("Removed model data for project {}.".format(self.project_id))
        else:
            logger.warning(
                "No model data to remove for project {}.".format(self.project_id)
            )
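
For orientation, the public entry points above (initialize(), suggest(), suggest_corpus(), train(), learn() and dump()) are normally driven through Annif's CLI and REST API via the project registry, which builds AnnifProject instances from the projects configuration file. The fragment below is only a rough, hypothetical sketch of that flow using the constructor and methods shown in the listing; the configuration values, the tfidf backend, the example-vocab vocabulary and the registry object are assumptions made for illustration.

# Hypothetical usage sketch; in real deployments the Annif registry constructs
# and caches AnnifProject objects from the projects configuration file.
config = {
    "name": "Example project",       # optional, defaults to the project_id
    "language": "en",
    "backend": "tfidf",              # assumes this backend's dependencies are installed
    "analyzer": "snowball(english)",
    "vocab": "example-vocab",        # assumes this vocabulary is known to the registry
}
project = AnnifProject("example-en", config, "data/", registry)  # 'registry' assumed
project.initialize()

# Assuming the backend has already been trained (otherwise project.train(corpus)
# must be run first), suggest() returns SubjectSuggestion objects ordered by
# decreasing score.
suggestions = project.suggest("Text of the document to be indexed.")
for hit in suggestions:
    print(hit)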