annif.project - Code Metrics - Inspection of "Refactor: extract data directory handling into a m..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#259)

by Osma

created 2019-02-13 18:28 UTC

annif.project A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	258
Duplicated Lines	0 %

Importance

Changes

Metric	Value
wmc	42
eloc	200
dl	0
loc	258
rs	9.0399
c	0
b	0
f	0

17 Methods

Rating	Name	Size	Complexity
A	AnnifProject.vectorizer()	12	3
A	AnnifProject._initialize_subjects()	8	2
A	AnnifProject.dump()	6	1
A	AnnifProject._analyze_with_backend()	9	2
A	AnnifProject._initialize_backend()	9	3
A	AnnifProject._initialize_vectorizer()	8	2
A	AnnifProject.subjects()	3	1
A	AnnifProject.analyze()	9	1
A	AnnifProject._init_access()	8	2
A	AnnifProject.train()	6	1
A	AnnifProject.__init__()	10	1
A	AnnifProject._initialize_analyzer()	5	1
A	AnnifProject.analyzer()	5	3
A	AnnifProject._create_vectorizer()	13	2
A	AnnifProject.vocab()	9	3
A	AnnifProject.backend()	14	3
A	AnnifProject.initialize()	11	1

4 Functions

Rating	Name	Size	Complexity
A	get_projects()	9	1
A	get_project()	7	2
B	_create_projects()	22	6
A	initialize_projects()	6	1

How to fix Complexity

"""Project management functionality for Annif"""

import collections
import configparser
import enum
import os.path
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from flask import current_app
import annif
import annif.analyzer
import annif.corpus
import annif.hit
import annif.backend
import annif.util
import annif.vocab
from annif.datadir import DatadirMixin
from annif.exception import AnnifException, ConfigurationException, \
    NotInitializedException

# module-level shorthand for the logger object exposed by the annif package
logger = annif.logger


class Access(enum.IntEnum):
    """Enumeration of access levels for projects

    The members are integers, so access levels can be compared with each
    other; a higher value compares as greater than a lower one."""
    # lowest value of the three levels
    private = 1
    hidden = 2
    # highest value of the three levels
    public = 3


class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project.

    The analyzer, backend, vocabulary and vectorizer are created (or
    loaded) lazily the first time the corresponding property is read and
    are cached on the instance afterwards."""

    # defaults for uninitialized instances
    _analyzer = None
    _backend = None
    _vocab = None
    _vectorizer = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = 'public'

    def __init__(self, project_id, config, datadir):
        """Set up the project from one section of the configuration.

        project_id -- identifier of the project
        config -- mapping of settings for this project; 'name' and
            'language' are read here with item access (a missing key
            raises KeyError), 'analyzer', 'vocab' and 'access' are
            optional
        datadir -- base data directory; passed on to DatadirMixin and
            later to the vocabulary

        Raises ConfigurationException if the 'access' setting is invalid.
        """
        DatadirMixin.__init__(self, datadir, 'projects', project_id)
        self.project_id = project_id
        self.name = config['name']
        self.language = config['language']
        self.analyzer_spec = config.get('analyzer', None)
        self.vocab_id = config.get('vocab', None)
        self.config = config
        self._base_datadir = datadir
        self._init_access()

    def _init_access(self):
        """Resolve the 'access' setting into a member of the Access enum.

        Falls back to DEFAULT_ACCESS when the setting is absent. Raises
        ConfigurationException when the value is not the name of an
        Access member."""
        access = self.config.get('access', self.DEFAULT_ACCESS)
        try:
            # Look the name up among the enum members only. getattr()
            # would also succeed for other attributes of the enum class
            # (e.g. '__doc__' or 'mro'), leaving self.access holding
            # something that is not an access level.
            self.access = Access[access]
        except KeyError:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id)

    def _initialize_analyzer(self):
        """Trigger creation of the analyzer and log the result."""
        logger.debug("Project '%s': initialized analyzer: %s",
                     self.project_id,
                     self.analyzer)

    def _initialize_subjects(self):
        """Trigger loading of the subjects; failures reported as
        AnnifException are logged as warnings instead of propagating."""
        try:
            logger.debug("Project '%s': initialized subjects: %s",
                         self.project_id,
                         self.subjects)
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_vectorizer(self):
        """Trigger loading of the vectorizer; failures reported as
        AnnifException (such as a missing vectorizer file) are logged as
        warnings instead of propagating."""
        try:
            logger.debug("Project '%s': initialized vectorizer: %s",
                         self.project_id,
                         self.vectorizer)
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self):
        """Initialize the backend if one could be created; failures
        reported as AnnifException are logged as warnings."""
        logger.debug("Project '%s': initializing backend", self.project_id)
        if not self.backend:
            logger.debug("Cannot initialize backend: does not exist")
            return
        try:
            self.backend.initialize()
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self):
        """initialize this project and its backend so that they are ready to
        analyze"""
        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_vectorizer()
        self._initialize_backend()

        # set unconditionally: the helpers above only log their failures
        self.initialized = True

    def _analyze_with_backend(self, text, backend_params):
        """Pass the text to the backend and return the hits it produces.

        backend_params -- optional mapping of backend id -> parameters;
            only the entry for this project's backend is passed on."""
        if backend_params is None:
            backend_params = {}
        backend = self.backend
        beparams = backend_params.get(backend.backend_id, {})
        hits = backend.analyze(text, project=self, params=beparams)
        logger.debug(
            'Got %d hits from backend %s',
            len(hits), backend.backend_id)
        return hits

    @property
    def analyzer(self):
        """The analyzer created from the 'analyzer' setting, or None when
        the project has no such setting."""
        if self._analyzer is None and self.analyzer_spec:
            self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
        return self._analyzer

    @property
    def backend(self):
        """The backend object named by the 'backend' setting.

        Returns None (after logging a warning) when creating the backend
        raises ValueError. A missing 'backend' setting raises KeyError."""
        if self._backend is None:
            backend_id = self.config['backend']
            try:
                backend_class = annif.backend.get_backend(backend_id)
                # self.datadir is presumably provided by DatadirMixin
                self._backend = backend_class(
                    backend_id, params=self.config, datadir=self.datadir)
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id)
        return self._backend

    @property
    def vocab(self):
        """The AnnifVocabulary named by the 'vocab' setting.

        Raises ConfigurationException when the setting is missing."""
        if self._vocab is None:
            if self.vocab_id is None:
                raise ConfigurationException("vocab setting is missing",
                                             project_id=self.project_id)
            self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
                                                      self._base_datadir)
        return self._vocab

    @property
    def subjects(self):
        """The subjects of the project's vocabulary."""
        return self.vocab.subjects

    @property
    def vectorizer(self):
        """The vectorizer loaded from the 'vectorizer' file in the project
        data directory.

        Raises NotInitializedException when the file does not exist."""
        if self._vectorizer is None:
            path = os.path.join(self.datadir, 'vectorizer')
            if os.path.exists(path):
                logger.debug('loading vectorizer from %s', path)
                # NOTE(review): joblib.load unpickles the file, so the
                # data directory must only contain trusted content.
                self._vectorizer = joblib.load(path)
            else:
                raise NotInitializedException(
                    "vectorizer file '{}' not found".format(path),
                    project_id=self.project_id)
        return self._vectorizer

    def analyze(self, text, backend_params=None):
        """Analyze the given text by passing it to the backend.

        Returns the hits produced by the backend unchanged (expected to
        be AnalysisHit objects ordered by decreasing score; the ordering
        is up to the backend and is not enforced here)."""

        logger.debug('Analyzing text "%s..." (len=%d)',
                     text[:20], len(text))
        hits = self._analyze_with_backend(text, backend_params)
        logger.debug('%d hits from backend', len(hits))
        return hits

    def _create_vectorizer(self, subjectcorpus):
        """Fit a TfidfVectorizer on the subject texts of the corpus and
        save it into the project data directory.

        Does nothing when the backend does not need a subject
        vectorizer."""
        if not self.backend.needs_subject_vectorizer:
            logger.debug('not creating vectorizer: not needed by backend')
            return
        logger.info('creating vectorizer')
        self._vectorizer = TfidfVectorizer(
            tokenizer=self.analyzer.tokenize_words)
        self._vectorizer.fit(subj.text for subj in subjectcorpus.subjects)

        annif.util.atomic_save(
            self._vectorizer,
            self.datadir,
            'vectorizer',
            method=joblib.dump)

    def train(self, corpus):
        """train the project using documents from a metadata source

        Sets the project's subjects as the subject index of the corpus,
        creates the vectorizer if the backend needs one and then trains
        the backend."""

        corpus.set_subject_index(self.subjects)
        self._create_vectorizer(corpus)
        self.backend.train(corpus, project=self)

    def dump(self):
        """return this project as a dict

        The dict holds the project id, name, language and the id of the
        configured backend."""
        return {'project_id': self.project_id,
                'name': self.name,
                'language': self.language,
                'backend': {'backend_id': self.config['backend']}
                }


def _create_projects(projects_file, datadir, init_projects):
    """Create AnnifProject objects from a project configuration file.

    projects_file -- path of the configuration file; each section of the
        file defines one project, the section name being the project id
    datadir -- base data directory handed to every project
    init_projects -- when true, initialize() is called on each project
        right after it has been created

    Returns an ordered mapping of project id -> AnnifProject in the order
    of the sections in the file. The mapping is empty (and warnings are
    logged) when the file does not exist."""
    if not os.path.exists(projects_file):
        logger.warning("Project configuration file '%s' is missing. " +
                       'Please provide one.', projects_file)
        logger.warning('You can set the path to the project configuration ' +
                       'file using the ANNIF_PROJECTS environment variable.')
        # same mapping type as on the normal path below
        return collections.OrderedDict()

    config = configparser.ConfigParser()
    # keep option names as written; the default would lower-case them
    config.optionxform = str
    # Decode explicitly instead of relying on the platform default
    # encoding, so that non-ASCII project names and settings are read the
    # same way everywhere; 'utf-8-sig' also skips a leading byte order
    # mark if the file has one.
    with open(projects_file, encoding='utf-8-sig') as projf:
        config.read_file(projf)

    # create AnnifProject objects from the configuration file
    projects = collections.OrderedDict()
    for project_id in config.sections():
        projects[project_id] = AnnifProject(project_id,
                                            config[project_id],
                                            datadir)
        if init_projects:
            projects[project_id].initialize()
    return projects


def initialize_projects(app):
    """Create the projects described by the application configuration and
    store them on the application object as app.annif_projects.

    The settings PROJECTS_FILE, DATADIR and INITIALIZE_PROJECTS are read
    from app.config."""
    settings = app.config
    app.annif_projects = _create_projects(
        settings['PROJECTS_FILE'],
        settings['DATADIR'],
        settings['INITIALIZE_PROJECTS'])


def get_projects(min_access=Access.private):
    """Return the projects of the current application as an ordered
    mapping of project_id -> AnnifProject.

    Only projects whose access level is at least min_access are included;
    the order of the application's project mapping is kept."""

    visible = collections.OrderedDict()
    for project_id, project in current_app.annif_projects.items():
        if project.access >= min_access:
            visible[project_id] = project
    return visible


def get_project(project_id, min_access=Access.private):
    """Return the single project identified by project_id.

    Only projects with at least the min_access access level are
    considered. Raises ValueError when no such project is available."""
    candidates = get_projects(min_access)
    try:
        found = candidates[project_id]
    except KeyError:
        raise ValueError("No such project {}".format(project_id))
    return found


1			"""Project management functionality for Annif"""
2
3			import collections
4			import configparser
5			import enum
6			import os.path
7			from sklearn.externals import joblib
8			from sklearn.feature_extraction.text import TfidfVectorizer
9			from flask import current_app
10			import annif
11			import annif.analyzer
12			import annif.corpus
13			import annif.hit
14			import annif.backend
15			import annif.util
16			import annif.vocab
17			from annif.datadir import DatadirMixin
18			from annif.exception import AnnifException, ConfigurationException, \
19			NotInitializedException
20
21			logger = annif.logger
22
23
24			class Access(enum.IntEnum):
25			"""Enumeration of access levels for projects"""
26			private = 1
27			hidden = 2
28			public = 3
29
30
31			class AnnifProject(DatadirMixin):
32			"""Class representing the configuration of a single Annif project."""
33
34			# defaults for uninitialized instances
35			_analyzer = None
36			_backend = None
37			_vocab = None
38			_vectorizer = None
39			initialized = False
40
41			# default values for configuration settings
42			DEFAULT_ACCESS = 'public'
43
44			def __init__(self, project_id, config, datadir):
45			DatadirMixin.__init__(self, datadir, 'projects', project_id)
46			self.project_id = project_id
47			self.name = config['name']
48			self.language = config['language']
49			self.analyzer_spec = config.get('analyzer', None)
50			self.vocab_id = config.get('vocab', None)
51			self.config = config
52			self._base_datadir = datadir
53			self._init_access()
54
55			def _init_access(self):
56			access = self.config.get('access', self.DEFAULT_ACCESS)
57			try:
58			self.access = getattr(Access, access)
59			except AttributeError:
60			raise ConfigurationException(
61			"'{}' is not a valid access setting".format(access),
62			project_id=self.project_id)
63
64			def _initialize_analyzer(self):
65			analyzer = self.analyzer
66			logger.debug("Project '%s': initialized analyzer: %s",
67			self.project_id,
68			str(analyzer))
69
70			def _initialize_subjects(self):
71			try:
72			subjects = self.subjects
73			logger.debug("Project '%s': initialized subjects: %s",
74			self.project_id,
75			str(subjects))
76			except AnnifException as err:
77			logger.warning(err.format_message())
78
79			def _initialize_vectorizer(self):
80			try:
81			vectorizer = self.vectorizer
82			logger.debug("Project '%s': initialized vectorizer: %s",
83			self.project_id,
84			str(vectorizer))
85			except AnnifException as err:
86			logger.warning(err.format_message())
87
88			def _initialize_backend(self):
89			logger.debug("Project '%s': initializing backend", self.project_id)
90			if not self.backend:
91			logger.debug("Cannot initialize backend: does not exist")
92			return
93			try:
94			self.backend.initialize()
95			except AnnifException as err:
96			logger.warning(err.format_message())
97
98			def initialize(self):
99			"""initialize this project and its backend so that they are ready to
100			analyze"""
101			logger.debug("Initializing project '%s'", self.project_id)
102
103			self._initialize_analyzer()
104			self._initialize_subjects()
105			self._initialize_vectorizer()
106			self._initialize_backend()
107
108			self.initialized = True
109
110			def _analyze_with_backend(self, text, backend_params):
111			if backend_params is None:
112			backend_params = {}
113			beparams = backend_params.get(self.backend.backend_id, {})
114			hits = self.backend.analyze(text, project=self, params=beparams)
115			logger.debug(
116			'Got %d hits from backend %s',
117			len(hits), self.backend.backend_id)
118			return hits
119
120			@property
121			def analyzer(self):
122			if self._analyzer is None and self.analyzer_spec:
123			self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
124			return self._analyzer
125
126			@property
127			def backend(self):
128			if self._backend is None:
129			backend_id = self.config['backend']
130			try:
131			backend_class = annif.backend.get_backend(backend_id)
132			self._backend = backend_class(
133			backend_id, params=self.config, datadir=self.datadir)
134			except ValueError:
135			logger.warning(
136			"Could not create backend %s, "
137			"make sure you've installed optional dependencies",
138			backend_id)
139			return self._backend
140
141			@property
142			def vocab(self):
143			if self._vocab is None:
144			if self.vocab_id is None:
145			raise ConfigurationException("vocab setting is missing",
146			project_id=self.project_id)
147			self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
148			self._base_datadir)
149			return self._vocab
150
151			@property
152			def subjects(self):
153			return self.vocab.subjects
154
155			@property
156			def vectorizer(self):
157			if self._vectorizer is None:
158			path = os.path.join(self.datadir, 'vectorizer')
159			if os.path.exists(path):
160			logger.debug('loading vectorizer from %s', path)
161			self._vectorizer = joblib.load(path)
162			else:
163			raise NotInitializedException(
164			"vectorizer file '{}' not found".format(path),
165			project_id=self.project_id)
166			return self._vectorizer
167
168			def analyze(self, text, backend_params=None):
169			"""Analyze the given text by passing it to the backend. Returns a
170			list of AnalysisHit objects ordered by decreasing score."""
171
172			logger.debug('Analyzing text "%s..." (len=%d)',
173			text[:20], len(text))
174			hits = self._analyze_with_backend(text, backend_params)
175			logger.debug('%d hits from backend', len(hits))
176			return hits
177
178			def _create_vectorizer(self, subjectcorpus):
179			if not self.backend.needs_subject_vectorizer:
180			logger.debug('not creating vectorizer: not needed by backend')
181			return
182			logger.info('creating vectorizer')
183			self._vectorizer = TfidfVectorizer(
184			tokenizer=self.analyzer.tokenize_words)
185			self._vectorizer.fit((subj.text for subj in subjectcorpus.subjects))
			1 ignored issue (Comprehensibility, Best Practice; introduced 2018-09-26 10:16 UTC): The variable `subj` does not seem to be defined.
186			annif.util.atomic_save(
187			self._vectorizer,
188			self.datadir,
189			'vectorizer',
190			method=joblib.dump)
191
192			def train(self, corpus):
193			"""train the project using documents from a metadata source"""
194
195			corpus.set_subject_index(self.subjects)
196			self._create_vectorizer(corpus)
197			self.backend.train(corpus, project=self)
198
199			def dump(self):
200			"""return this project as a dict"""
201			return {'project_id': self.project_id,
202			'name': self.name,
203			'language': self.language,
204			'backend': {'backend_id': self.config['backend']}
205			}
206
207
208			def _create_projects(projects_file, datadir, init_projects):
209			if not os.path.exists(projects_file):
210			logger.warning("Project configuration file '%s' is missing. " +
211			'Please provide one.', projects_file)
212			logger.warning('You can set the path to the project configuration ' +
213			'file using the ANNIF_PROJECTS environment variable.')
214			return {}
215
216			config = configparser.ConfigParser()
217			config.optionxform = lambda option: option
218			with open(projects_file) as projf:
219			config.read_file(projf)
220
221			# create AnnifProject objects from the configuration file
222			projects = collections.OrderedDict()
223			for project_id in config.sections():
224			projects[project_id] = AnnifProject(project_id,
225			config[project_id],
226			datadir)
227			if init_projects:
228			projects[project_id].initialize()
229			return projects
230
231
232			def initialize_projects(app):
233			projects_file = app.config['PROJECTS_FILE']
234			datadir = app.config['DATADIR']
235			init_projects = app.config['INITIALIZE_PROJECTS']
236			app.annif_projects = _create_projects(
237			projects_file, datadir, init_projects)
238
239
240			def get_projects(min_access=Access.private):
241			"""Return the available projects as a dict of project_id ->
242			AnnifProject. The min_access parameter may be used to set the minimum
243			access level required for the returned projects."""
244
245			projects = [(project_id, project)
246			for project_id, project in current_app.annif_projects.items()
247			if project.access >= min_access]
248			return collections.OrderedDict(projects)
249
250
251			def get_project(project_id, min_access=Access.private):
252			"""return the definition of a single Project by project_id"""
253			projects = get_projects(min_access)
254			try:
255			return projects[project_id]
256			except KeyError:
257			raise ValueError("No such project {}".format(project_id))
258

NatLibFi / Annif

Pull Request — master (#259)

annif.project A

Complexity

Size/Duplication

Importance

17 Methods

4 Functions

How to fix Complexity

Complexity

Duplication Side-by-Side

Filter issues like