annif.project - Code Metrics - Inspection of "Merge pull request #306 from NatLibFi/issue251-CLI..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 93695b...d2dff1 )

by Osma

created 2019-08-23 08:21 UTC

annif.project B

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	287
Duplicated Lines	0 %

Importance

Changes

Metric	Value
wmc	47
eloc	220
dl	0
loc	287
rs	8.64
c	0
b	0
f	0

19 Methods

Rating	Name	Size	Complexity
A	AnnifProject.analyzer()	5	3
A	AnnifProject.vectorizer()	12	3
A	AnnifProject.subjects()	3	1
A	AnnifProject.vocab()	9	3
A	AnnifProject._initialize_subjects()	8	2
A	AnnifProject.dump()	6	1
A	AnnifProject._initialize_backend()	9	3
A	AnnifProject._initialize_vectorizer()	8	2
A	AnnifProject._init_access()	8	2
A	AnnifProject.train()	6	1
A	AnnifProject._suggest_with_backend()	9	2
A	AnnifProject.suggest()	9	1
A	AnnifProject.__init__()	10	1
A	AnnifProject._initialize_analyzer()	5	1
A	AnnifProject.learn()	11	2
A	AnnifProject._create_vectorizer()	13	2
A	AnnifProject.backend()	14	3
A	AnnifProject.initialize()	12	1
A	AnnifProject.remove_model_data()	10	2

4 Functions

Rating	Name	Size	Complexity
A	get_projects()	12	2
A	get_project()	7	2
B	_create_projects()	23	6
A	initialize_projects()	6	1

How to fix Complexity

"""Project management functionality for Annif"""

import collections
import configparser
import enum
import os.path
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from flask import current_app
from shutil import rmtree
import annif
import annif.analyzer
import annif.corpus
import annif.suggestion
import annif.backend
import annif.util
import annif.vocab
from annif.datadir import DatadirMixin
from annif.exception import AnnifException, ConfigurationException, \
    NotInitializedException, NotSupportedException

logger = annif.logger


class Access(enum.IntEnum):
    """Enumeration of access levels for projects

    The members are integers, so levels can be compared with each other:
    get_projects() returns the projects whose level is greater than or
    equal to the requested minimum level.
    """
    # lowest level: only returned when the requested minimum is this level
    private = 1
    hidden = 2
    # highest level: returned for every requested minimum; this is also the
    # name used by AnnifProject.DEFAULT_ACCESS
    public = 3


class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project.

    The analyzer, backend, vocabulary and vectorizer are created (or loaded)
    on first access through the corresponding properties and then cached on
    the instance.
    """

    # defaults for uninitialized instances
    _analyzer = None
    _backend = None
    _vocab = None
    _vectorizer = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = 'public'

    def __init__(self, project_id, config, datadir):
        """Set up a project from its configuration settings.

        project_id -- identifier of the project
        config -- mapping of settings; 'name' and 'language' are read
            here and must be present, 'analyzer', 'vocab' and 'access' are
            optional, 'backend' is read later when the backend is needed
        datadir -- base data directory

        Raises ConfigurationException if the 'access' setting is invalid.
        """
        DatadirMixin.__init__(self, datadir, 'projects', project_id)
        self.project_id = project_id
        self.name = config['name']
        self.language = config['language']
        self.analyzer_spec = config.get('analyzer', None)
        self.vocab_id = config.get('vocab', None)
        self.config = config
        self._base_datadir = datadir
        self._init_access()

    def _init_access(self):
        """Resolve the 'access' setting into a member of Access.

        Falls back to DEFAULT_ACCESS when the setting is absent. Raises
        ConfigurationException if the value does not name an Access member.
        """
        access = self.config.get('access', self.DEFAULT_ACCESS)
        try:
            # Look the name up among the enum members only. getattr() would
            # also accept any other attribute of the enum class (such as
            # '__members__' or '__doc__') and store a value that is not an
            # access level at all.
            self.access = Access[access]
        except KeyError:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id)

    def _initialize_analyzer(self):
        """Trigger creation of the analyzer and log the result."""
        analyzer = self.analyzer
        logger.debug("Project '%s': initialized analyzer: %s",
                     self.project_id,
                     str(analyzer))

    def _initialize_subjects(self):
        """Trigger loading of the subjects; failures are logged as warnings
        instead of being raised."""
        try:
            subjects = self.subjects
            logger.debug("Project '%s': initialized subjects: %s",
                         self.project_id,
                         str(subjects))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_vectorizer(self):
        """Trigger loading of the vectorizer; failures (for example a
        missing vectorizer file) are logged as warnings instead of being
        raised."""
        try:
            vectorizer = self.vectorizer
            logger.debug("Project '%s': initialized vectorizer: %s",
                         self.project_id,
                         str(vectorizer))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self):
        """Create and initialize the backend; failures are logged instead
        of being raised."""
        logger.debug("Project '%s': initializing backend", self.project_id)
        # the backend property returns None when the backend could not be
        # created
        if not self.backend:
            logger.debug("Cannot initialize backend: does not exist")
            return
        try:
            self.backend.initialize()
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self):
        """initialize this project and its backend so that they are ready to
        be used"""

        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_vectorizer()
        self._initialize_backend()

        self.initialized = True

    def _suggest_with_backend(self, text, backend_params):
        """Pass the text to the backend and return its hits.

        backend_params -- optional mapping of backend_id -> parameters;
            only the entry for this project's backend is passed on
        """
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        hits = self.backend.suggest(text, project=self, params=beparams)
        logger.debug(
            'Got %d hits from backend %s',
            len(hits), self.backend.backend_id)
        return hits

    @property
    def analyzer(self):
        """The analyzer created from the 'analyzer' setting, or None if the
        project has no such setting."""
        if self._analyzer is None and self.analyzer_spec:
            self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
        return self._analyzer

    @property
    def backend(self):
        """The backend object named by the 'backend' setting.

        Returns None (after logging a warning) if creating the backend
        raised ValueError; nothing is cached in that case, so creation is
        attempted again on the next access.
        """
        if self._backend is None:
            backend_id = self.config['backend']
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, params=self.config, datadir=self.datadir)
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id)
        return self._backend

    @property
    def vocab(self):
        """The AnnifVocabulary named by the 'vocab' setting.

        Raises ConfigurationException if the project has no 'vocab' setting.
        """
        if self._vocab is None:
            if self.vocab_id is None:
                raise ConfigurationException("vocab setting is missing",
                                             project_id=self.project_id)
            self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
                                                      self._base_datadir)
        return self._vocab

    @property
    def subjects(self):
        """The subjects of the project's vocabulary."""
        return self.vocab.subjects

    @property
    def vectorizer(self):
        """The vectorizer stored in the project's data directory.

        Raises NotInitializedException if the vectorizer file does not
        exist.
        """
        if self._vectorizer is None:
            path = os.path.join(self.datadir, 'vectorizer')
            if os.path.exists(path):
                logger.debug('loading vectorizer from %s', path)
                # NOTE: joblib.load unpickles the file, so the data
                # directory must only contain trusted files
                self._vectorizer = joblib.load(path)
            else:
                raise NotInitializedException(
                    "vectorizer file '{}' not found".format(path),
                    project_id=self.project_id)
        return self._vectorizer

    def suggest(self, text, backend_params=None):
        """Suggest subjects for the given text by passing it to the backend.
        Returns a list of SubjectSuggestion objects ordered by decreasing
        score."""

        logger.debug('Suggesting subjects for text "%s..." (len=%d)',
                     text[:20], len(text))
        hits = self._suggest_with_backend(text, backend_params)
        logger.debug('%d hits from backend', len(hits))
        return hits

    def _create_vectorizer(self, subjectcorpus):
        """Fit a TF-IDF vectorizer on the subject texts of the corpus and
        save it in the project's data directory.

        Does nothing if the backend does not need a subject vectorizer.
        """
        if not self.backend.needs_subject_vectorizer:
            logger.debug('not creating vectorizer: not needed by backend')
            return
        logger.info('creating vectorizer')
        self._vectorizer = TfidfVectorizer(
            tokenizer=self.analyzer.tokenize_words)
        self._vectorizer.fit((subj.text for subj in subjectcorpus.subjects))

        annif.util.atomic_save(
            self._vectorizer,
            self.datadir,
            'vectorizer',
            method=joblib.dump)

    def train(self, corpus):
        """train the project using documents from a metadata source"""

        corpus.set_subject_index(self.subjects)
        self._create_vectorizer(corpus)
        self.backend.train(corpus, project=self)

    def learn(self, corpus):
        """further train the project using documents from a metadata source

        Raises NotSupportedException if the backend is not an
        AnnifLearningBackend."""

        corpus.set_subject_index(self.subjects)
        if isinstance(
                self.backend,
                annif.backend.backend.AnnifLearningBackend):
            self.backend.learn(corpus, project=self)
        else:
            raise NotSupportedException("Learning not supported by backend",
                                        project_id=self.project_id)

    def dump(self):
        """return this project as a dict"""
        return {'project_id': self.project_id,
                'name': self.name,
                'language': self.language,
                'backend': {'backend_id': self.config['backend']}
                }

    def remove_model_data(self):
        """remove the data of this project

        Deletes the project's data directory tree if it exists; otherwise
        only a warning is logged."""
        datadir_path = self._datadir_path
        if os.path.isdir(datadir_path):
            rmtree(datadir_path)
            # pass the id as a logger argument so that the message is only
            # formatted when it is actually emitted
            logger.info('Removed model data for project %s.',
                        self.project_id)
        else:
            logger.warning('No model data to remove for project %s.',
                           self.project_id)


def _create_projects(projects_file, datadir, init_projects):
    """Build AnnifProject objects from the sections of a configuration file.

    Returns an ordered mapping of project_id -> AnnifProject, one entry per
    section of projects_file. If the file does not exist, a warning is
    logged and an empty dict is returned. When init_projects is true, each
    project is initialized right after it has been created.
    """
    if not os.path.exists(projects_file):
        missing_msg = (
            'Project configuration file "%s" is missing. Please provide one.'
            ' You can set the path to the project configuration file using '
            'the ANNIF_PROJECTS environment variable or the command-line '
            'option "--projects".')
        logger.warning(missing_msg, projects_file)
        return {}

    def keep_as_written(option):
        # option names are used exactly as they appear in the file
        return option

    parser = configparser.ConfigParser()
    parser.optionxform = keep_as_written
    with open(projects_file, encoding='utf-8') as projf:
        parser.read_file(projf)

    # one AnnifProject per section, in the order the sections are listed
    created = collections.OrderedDict()
    for section in parser.sections():
        project = AnnifProject(section, parser[section], datadir)
        created[section] = project
        if init_projects:
            project.initialize()
    return created


def initialize_projects(app):
    """Create the projects described by the application configuration and
    store them on the application object as app.annif_projects.

    Reads the PROJECTS_FILE, DATADIR and INITIALIZE_PROJECTS settings."""
    settings = app.config
    app.annif_projects = _create_projects(
        settings['PROJECTS_FILE'],
        settings['DATADIR'],
        settings['INITIALIZE_PROJECTS'])


def get_projects(min_access=Access.private):
    """Return the available projects as an ordered dict of project_id ->
    AnnifProject, keeping only the projects whose access level is at least
    min_access. The projects are created first if the current application
    does not have them yet."""

    if not hasattr(current_app, 'annif_projects'):
        initialize_projects(current_app)

    visible = collections.OrderedDict()
    for project_id, project in current_app.annif_projects.items():
        if project.access >= min_access:
            visible[project_id] = project
    return visible


def get_project(project_id, min_access=Access.private):
    """Return the single project with the given project_id.

    Only projects whose access level is at least min_access are
    considered. Raises ValueError if no such project is available."""
    available = get_projects(min_access)
    try:
        found = available[project_id]
    except KeyError:
        raise ValueError("No such project {}".format(project_id))
    else:
        return found


1			"""Project management functionality for Annif"""
2
3			import collections
4			import configparser
5			import enum
6			import os.path
7			import joblib
8			from sklearn.feature_extraction.text import TfidfVectorizer
9			from flask import current_app
10			from shutil import rmtree
11			import annif
12			import annif.analyzer
13			import annif.corpus
14			import annif.suggestion
15			import annif.backend
16			import annif.util
17			import annif.vocab
18			from annif.datadir import DatadirMixin
19			from annif.exception import AnnifException, ConfigurationException, \
20			NotInitializedException, NotSupportedException
21
22			logger = annif.logger
23
24
25			class Access(enum.IntEnum):
26			"""Enumeration of access levels for projects"""
27			private = 1
28			hidden = 2
29			public = 3
30
31
32			class AnnifProject(DatadirMixin):
33			"""Class representing the configuration of a single Annif project."""
34
35			# defaults for uninitialized instances
36			_analyzer = None
37			_backend = None
38			_vocab = None
39			_vectorizer = None
40			initialized = False
41
42			# default values for configuration settings
43			DEFAULT_ACCESS = 'public'
44
45			def __init__(self, project_id, config, datadir):
46			DatadirMixin.__init__(self, datadir, 'projects', project_id)
47			self.project_id = project_id
48			self.name = config['name']
49			self.language = config['language']
50			self.analyzer_spec = config.get('analyzer', None)
51			self.vocab_id = config.get('vocab', None)
52			self.config = config
53			self._base_datadir = datadir
54			self._init_access()
55
56			def _init_access(self):
57			access = self.config.get('access', self.DEFAULT_ACCESS)
58			try:
59			self.access = getattr(Access, access)
60			except AttributeError:
61			raise ConfigurationException(
62			"'{}' is not a valid access setting".format(access),
63			project_id=self.project_id)
64
65			def _initialize_analyzer(self):
66			analyzer = self.analyzer
67			logger.debug("Project '%s': initialized analyzer: %s",
68			self.project_id,
69			str(analyzer))
70
71			def _initialize_subjects(self):
72			try:
73			subjects = self.subjects
74			logger.debug("Project '%s': initialized subjects: %s",
75			self.project_id,
76			str(subjects))
77			except AnnifException as err:
78			logger.warning(err.format_message())
79
80			def _initialize_vectorizer(self):
81			try:
82			vectorizer = self.vectorizer
83			logger.debug("Project '%s': initialized vectorizer: %s",
84			self.project_id,
85			str(vectorizer))
86			except AnnifException as err:
87			logger.warning(err.format_message())
88
89			def _initialize_backend(self):
90			logger.debug("Project '%s': initializing backend", self.project_id)
91			if not self.backend:
92			logger.debug("Cannot initialize backend: does not exist")
93			return
94			try:
95			self.backend.initialize()
96			except AnnifException as err:
97			logger.warning(err.format_message())
98
99			def initialize(self):
100			"""initialize this project and its backend so that they are ready to
101			be used"""
102
103			logger.debug("Initializing project '%s'", self.project_id)
104
105			self._initialize_analyzer()
106			self._initialize_subjects()
107			self._initialize_vectorizer()
108			self._initialize_backend()
109
110			self.initialized = True
111
112			def _suggest_with_backend(self, text, backend_params):
113			if backend_params is None:
114			backend_params = {}
115			beparams = backend_params.get(self.backend.backend_id, {})
116			hits = self.backend.suggest(text, project=self, params=beparams)
117			logger.debug(
118			'Got %d hits from backend %s',
119			len(hits), self.backend.backend_id)
120			return hits
121
122			@property
123			def analyzer(self):
124			if self._analyzer is None and self.analyzer_spec:
125			self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
126			return self._analyzer
127
128			@property
129			def backend(self):
130			if self._backend is None:
131			backend_id = self.config['backend']
132			try:
133			backend_class = annif.backend.get_backend(backend_id)
134			self._backend = backend_class(
135			backend_id, params=self.config, datadir=self.datadir)
136			except ValueError:
137			logger.warning(
138			"Could not create backend %s, "
139			"make sure you've installed optional dependencies",
140			backend_id)
141			return self._backend
142
143			@property
144			def vocab(self):
145			if self._vocab is None:
146			if self.vocab_id is None:
147			raise ConfigurationException("vocab setting is missing",
148			project_id=self.project_id)
149			self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
150			self._base_datadir)
151			return self._vocab
152
153			@property
154			def subjects(self):
155			return self.vocab.subjects
156
157			@property
158			def vectorizer(self):
159			if self._vectorizer is None:
160			path = os.path.join(self.datadir, 'vectorizer')
161			if os.path.exists(path):
162			logger.debug('loading vectorizer from %s', path)
163			self._vectorizer = joblib.load(path)
164			else:
165			raise NotInitializedException(
166			"vectorizer file '{}' not found".format(path),
167			project_id=self.project_id)
168			return self._vectorizer
169
170			def suggest(self, text, backend_params=None):
171			"""Suggest subjects the given text by passing it to the backend. Returns a
172			list of SubjectSuggestion objects ordered by decreasing score."""
173
174			logger.debug('Suggesting subjects for text "%s..." (len=%d)',
175			text[:20], len(text))
176			hits = self._suggest_with_backend(text, backend_params)
177			logger.debug('%d hits from backend', len(hits))
178			return hits
179
180			def _create_vectorizer(self, subjectcorpus):
181			if not self.backend.needs_subject_vectorizer:
182			logger.debug('not creating vectorizer: not needed by backend')
183			return
184			logger.info('creating vectorizer')
185			self._vectorizer = TfidfVectorizer(
186			tokenizer=self.analyzer.tokenize_words)
187			self._vectorizer.fit((subj.text for subj in subjectcorpus.subjects))
			1 ignored issue (Comprehensibility / Best Practice, introduced 2018-09-26 10:16 UTC): "The variable `subj` does not seem to be defined." This is a false positive: `subj` is bound by the generator expression on line 187 of the listing.
188			annif.util.atomic_save(
189			self._vectorizer,
190			self.datadir,
191			'vectorizer',
192			method=joblib.dump)
193
194			def train(self, corpus):
195			"""train the project using documents from a metadata source"""
196
197			corpus.set_subject_index(self.subjects)
198			self._create_vectorizer(corpus)
199			self.backend.train(corpus, project=self)
200
201			def learn(self, corpus):
202			"""further train the project using documents from a metadata source"""
203
204			corpus.set_subject_index(self.subjects)
205			if isinstance(
206			self.backend,
207			annif.backend.backend.AnnifLearningBackend):
208			self.backend.learn(corpus, project=self)
209			else:
210			raise NotSupportedException("Learning not supported by backend",
211			project_id=self.project_id)
212
213			def dump(self):
214			"""return this project as a dict"""
215			return {'project_id': self.project_id,
216			'name': self.name,
217			'language': self.language,
218			'backend': {'backend_id': self.config['backend']}
219			}
220
221			def remove_model_data(self):
222			"""remove the data of this project"""
223			datadir_path = self._datadir_path
224			if os.path.isdir(datadir_path):
225			rmtree(datadir_path)
226			logger.info('Removed model data for project {}.'
227			.format(self.project_id))
228			else:
229			logger.warning('No model data to remove for project {}.'
230			.format(self.project_id))
231
232
233			def _create_projects(projects_file, datadir, init_projects):
234			if not os.path.exists(projects_file):
235			logger.warning(
236			'Project configuration file "%s" is missing. Please provide one.' +
237			' You can set the path to the project configuration file using ' +
238			'the ANNIF_PROJECTS environment variable or the command-line ' +
239			'option "--projects".', projects_file)
240			return {}
241
242			config = configparser.ConfigParser()
243			config.optionxform = lambda option: option
244			with open(projects_file, encoding='utf-8') as projf:
245			config.read_file(projf)
246
247			# create AnnifProject objects from the configuration file
248			projects = collections.OrderedDict()
249			for project_id in config.sections():
250			projects[project_id] = AnnifProject(project_id,
251			config[project_id],
252			datadir)
253			if init_projects:
254			projects[project_id].initialize()
255			return projects
256
257
258			def initialize_projects(app):
259			projects_file = app.config['PROJECTS_FILE']
260			datadir = app.config['DATADIR']
261			init_projects = app.config['INITIALIZE_PROJECTS']
262			app.annif_projects = _create_projects(
263			projects_file, datadir, init_projects)
264
265
266			def get_projects(min_access=Access.private):
267			"""Return the available projects as a dict of project_id ->
268			AnnifProject. The min_access parameter may be used to set the minimum
269			access level required for the returned projects."""
270
271			if not hasattr(current_app, 'annif_projects'):
272			initialize_projects(current_app)
273
274			projects = [(project_id, project)
275			for project_id, project in current_app.annif_projects.items()
276			if project.access >= min_access]
277			return collections.OrderedDict(projects)
278
279
280			def get_project(project_id, min_access=Access.private):
281			"""return the definition of a single Project by project_id"""
282			projects = get_projects(min_access)
283			try:
284			return projects[project_id]
285			except KeyError:
286			raise ValueError("No such project {}".format(project_id))
287

NatLibFi / Annif

Push — master ( 93695b...d2dff1 )

annif.project B

Complexity

Size/Duplication

Importance

19 Methods

4 Functions

How to fix Complexity

Complexity

Duplication Side-by-Side

Filter issues like