annif.project - Code Metrics - Inspection of "Initial support for online learning in vw_multi ba..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#257)

by Osma

created 2019-02-27 11:41 UTC

annif.project B

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	270
Duplicated Lines	0 %

Importance

Changes

Metric	Value
wmc	44
eloc	208
dl	0
loc	270
rs	8.8798
c	0
b	0
f	0

18 Methods

Rating	Name	Size	Complexity
A	AnnifProject.dump()	6	1
A	AnnifProject.vectorizer()	12	3
A	AnnifProject._initialize_subjects()	8	2
A	AnnifProject._analyze_with_backend()	9	2
A	AnnifProject._initialize_backend()	9	3
A	AnnifProject._initialize_vectorizer()	8	2
A	AnnifProject.subjects()	3	1
A	AnnifProject.analyze()	9	1
A	AnnifProject._init_access()	8	2
A	AnnifProject.train()	6	1
A	AnnifProject.__init__()	10	1
A	AnnifProject._initialize_analyzer()	5	1
A	AnnifProject.analyzer()	5	3
A	AnnifProject._create_vectorizer()	13	2
A	AnnifProject.vocab()	9	3
A	AnnifProject.backend()	14	3
A	AnnifProject.initialize()	11	1
A	AnnifProject.learn()	11	2

4 Functions

Rating	Name	Size	Complexity
A	get_projects()	9	1
A	get_project()	7	2
B	_create_projects()	22	6
A	initialize_projects()	6	1

How to fix Complexity

"""Project management functionality for Annif"""

import collections
import configparser
import enum
import os.path
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from flask import current_app
import annif
import annif.analyzer
import annif.corpus
import annif.hit
import annif.backend
import annif.util
import annif.vocab
from annif.datadir import DatadirMixin
from annif.exception import AnnifException, ConfigurationException, \
    NotInitializedException, NotSupportedException

# module-level logger: reuses the logger object exposed by the annif package
logger = annif.logger


class Access(enum.IntEnum):
    """Access levels that can be assigned to a project.

    The members are integers in increasing order (private < hidden <
    public), which lets callers filter projects with a plain ``>=``
    comparison against a minimum level, as get_projects() does.
    """

    private = 1
    hidden = 2
    public = 3


class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project.

    The analyzer, backend, vocabulary and vectorizer are created or loaded
    lazily by the corresponding properties and then cached on the
    instance.
    """

    # defaults for uninitialized instances
    _analyzer = None
    _backend = None
    _vocab = None
    _vectorizer = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = 'public'

    def __init__(self, project_id, config, datadir):
        """Create a project from its configuration settings.

        project_id -- identifier of the project
        config -- mapping of settings; 'name' and 'language' are read
            here and must be present, 'analyzer', 'vocab' and 'access'
            are optional
        datadir -- base data directory

        Raises ConfigurationException if the 'access' setting is not a
        valid access level.
        """
        DatadirMixin.__init__(self, datadir, 'projects', project_id)
        self.project_id = project_id
        self.name = config['name']
        self.language = config['language']
        self.analyzer_spec = config.get('analyzer', None)
        self.vocab_id = config.get('vocab', None)
        self.config = config
        # kept separately because the vocabulary is created from the base
        # data directory rather than from this project's own datadir
        self._base_datadir = datadir
        self._init_access()

    def _init_access(self):
        """Set self.access from the 'access' setting.

        Falls back to DEFAULT_ACCESS when the setting is absent. Raises
        ConfigurationException if the value is not the name of an Access
        member.
        """
        access = self.config.get('access', self.DEFAULT_ACCESS)
        try:
            # Look the level up by member name. getattr() would also accept
            # any other attribute of the enum class (e.g. 'mro' or
            # '__doc__') and let an invalid setting slip through.
            self.access = Access[access]
        except KeyError:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id)

    def _initialize_analyzer(self):
        """Trigger creation of the analyzer and log the result."""
        analyzer = self.analyzer
        logger.debug("Project '%s': initialized analyzer: %s",
                     self.project_id,
                     str(analyzer))

    def _initialize_subjects(self):
        """Trigger loading of the subjects and log the result.

        An AnnifException is logged as a warning instead of being raised.
        """
        try:
            subjects = self.subjects
            logger.debug("Project '%s': initialized subjects: %s",
                         self.project_id,
                         str(subjects))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_vectorizer(self):
        """Trigger loading of the vectorizer and log the result.

        An AnnifException (such as a missing vectorizer file) is logged
        as a warning instead of being raised.
        """
        try:
            vectorizer = self.vectorizer
            logger.debug("Project '%s': initialized vectorizer: %s",
                         self.project_id,
                         str(vectorizer))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self):
        """Initialize the backend, if one could be created.

        An AnnifException raised by the backend is logged as a warning
        instead of being raised.
        """
        logger.debug("Project '%s': initializing backend", self.project_id)
        backend = self.backend
        if not backend:
            logger.debug("Cannot initialize backend: does not exist")
            return
        try:
            backend.initialize()
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self):
        """initialize this project and its backend so that they are ready to
        analyze

        Problems reported as AnnifException by the subjects, vectorizer or
        backend steps are only logged, so the project is marked as
        initialized even if some of them failed."""
        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_vectorizer()
        self._initialize_backend()

        self.initialized = True

    def _analyze_with_backend(self, text, backend_params):
        """Pass the text to the backend and return its hits.

        backend_params -- optional mapping of backend id -> parameters;
            only the entry for this project's backend is passed on
        """
        if backend_params is None:
            backend_params = {}
        backend = self.backend
        beparams = backend_params.get(backend.backend_id, {})
        hits = backend.analyze(text, project=self, params=beparams)
        logger.debug(
            'Got %d hits from backend %s',
            len(hits), backend.backend_id)
        return hits

    @property
    def analyzer(self):
        """The analyzer built from the 'analyzer' setting, or None if that
        setting is missing or empty."""
        if self._analyzer is None and self.analyzer_spec:
            self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
        return self._analyzer

    @property
    def backend(self):
        """The backend object named by the 'backend' setting.

        Returns None, after logging a warning, if creating the backend
        fails with ValueError. Nothing is cached in that case, so the next
        access tries again.
        """
        if self._backend is None:
            backend_id = self.config['backend']
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, params=self.config, datadir=self.datadir)
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id)
        return self._backend

    @property
    def vocab(self):
        """The AnnifVocabulary named by the 'vocab' setting.

        Raises ConfigurationException if the setting is missing.
        """
        if self._vocab is None:
            if self.vocab_id is None:
                raise ConfigurationException("vocab setting is missing",
                                             project_id=self.project_id)
            self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
                                                      self._base_datadir)
        return self._vocab

    @property
    def subjects(self):
        """The subjects of this project's vocabulary."""
        return self.vocab.subjects

    @property
    def vectorizer(self):
        """The vectorizer loaded from the 'vectorizer' file in the datadir.

        Raises NotInitializedException if that file does not exist.
        """
        if self._vectorizer is None:
            path = os.path.join(self.datadir, 'vectorizer')
            if os.path.exists(path):
                logger.debug('loading vectorizer from %s', path)
                self._vectorizer = joblib.load(path)
            else:
                raise NotInitializedException(
                    "vectorizer file '{}' not found".format(path),
                    project_id=self.project_id)
        return self._vectorizer

    def analyze(self, text, backend_params=None):
        """Analyze the given text by passing it to the backend. Returns a
        list of AnalysisHit objects ordered by decreasing score."""

        logger.debug('Analyzing text "%s..." (len=%d)',
                     text[:20], len(text))
        hits = self._analyze_with_backend(text, backend_params)
        logger.debug('%d hits from backend', len(hits))
        return hits

    def _create_vectorizer(self, subjectcorpus):
        """Fit a TF-IDF vectorizer on the subject texts and save it.

        Does nothing if the backend reports that it does not need a
        subject vectorizer. Raises ConfigurationException if no analyzer
        is configured, because its tokenizer is required here.
        """
        if not self.backend.needs_subject_vectorizer:
            logger.debug('not creating vectorizer: not needed by backend')
            return
        analyzer = self.analyzer
        if analyzer is None:
            # fail with a configuration error instead of an AttributeError
            # on None when the tokenizer is looked up below
            raise ConfigurationException("analyzer setting is missing",
                                         project_id=self.project_id)
        logger.info('creating vectorizer')
        self._vectorizer = TfidfVectorizer(
            tokenizer=analyzer.tokenize_words)
        self._vectorizer.fit((subj.text for subj in subjectcorpus.subjects))

        # stored under the same file name that the vectorizer property loads
        annif.util.atomic_save(
            self._vectorizer,
            self.datadir,
            'vectorizer',
            method=joblib.dump)

    def train(self, corpus):
        """train the project using documents from a metadata source"""

        corpus.set_subject_index(self.subjects)
        self._create_vectorizer(corpus)
        self.backend.train(corpus, project=self)

    def learn(self, corpus):
        """further train the project using documents from a metadata source

        Raises NotSupportedException if the backend is not an
        AnnifLearningBackend."""

        corpus.set_subject_index(self.subjects)
        backend = self.backend
        if isinstance(
                backend,
                annif.backend.backend.AnnifLearningBackend):
            backend.learn(corpus, project=self)
        else:
            raise NotSupportedException("Learning not supported by backend",
                                        project_id=self.project_id)

    def dump(self):
        """return this project as a dict"""
        return {'project_id': self.project_id,
                'name': self.name,
                'language': self.language,
                'backend': {'backend_id': self.config['backend']}
                }


def _create_projects(projects_file, datadir, init_projects):
    """Create AnnifProject objects from a project configuration file.

    projects_file -- path of the configuration file; every section defines
        one project and the section name is used as the project id
    datadir -- base data directory passed on to every project
    init_projects -- if true, initialize() is called on each project right
        after it has been created

    Returns an ordered mapping of project id -> AnnifProject, in the order
    the sections are reported by the parser. If the file does not exist,
    warnings are logged and an empty dict is returned.
    """
    if not os.path.exists(projects_file):
        logger.warning("Project configuration file '%s' is missing. " +
                       'Please provide one.', projects_file)
        logger.warning('You can set the path to the project configuration ' +
                       'file using the ANNIF_PROJECTS environment variable.')
        return {}

    config = configparser.ConfigParser()
    # keep option names exactly as written instead of lower-casing them
    config.optionxform = lambda option: option
    # Decode explicitly as UTF-8 so that parsing does not depend on the
    # locale's default encoding; the "-sig" variant also skips a leading
    # byte order mark, which would otherwise break parsing of the first line.
    with open(projects_file, encoding='utf-8-sig') as projf:
        config.read_file(projf)

    # create AnnifProject objects from the configuration file
    projects = collections.OrderedDict()
    for project_id in config.sections():
        projects[project_id] = AnnifProject(project_id,
                                            config[project_id],
                                            datadir)
        if init_projects:
            projects[project_id].initialize()
    return projects


def initialize_projects(app):
    """Create the projects described by the application configuration and
    store them on the application object as app.annif_projects.

    Reads the PROJECTS_FILE, DATADIR and INITIALIZE_PROJECTS settings from
    app.config."""
    settings = app.config
    app.annif_projects = _create_projects(
        settings['PROJECTS_FILE'],
        settings['DATADIR'],
        settings['INITIALIZE_PROJECTS'])


def get_projects(min_access=Access.private):
    """Return the projects of the current application as an ordered
    mapping of project_id -> AnnifProject.

    Only projects whose access level is at least min_access are included;
    the order of the application's project mapping is kept."""

    accessible = collections.OrderedDict()
    for project_id, project in current_app.annif_projects.items():
        if project.access >= min_access:
            accessible[project_id] = project
    return accessible


def get_project(project_id, min_access=Access.private):
    """Return the single project with the given project_id.

    Only projects with at least the min_access level are considered.
    Raises ValueError if no such project is available."""
    candidates = get_projects(min_access)
    try:
        found = candidates[project_id]
    except KeyError:
        message = "No such project {}".format(project_id)
        raise ValueError(message)
    else:
        return found


1			"""Project management functionality for Annif"""
2
3			import collections
4			import configparser
5			import enum
6			import os.path
7			from sklearn.externals import joblib
8			from sklearn.feature_extraction.text import TfidfVectorizer
9			from flask import current_app
10			import annif
11			import annif.analyzer
12			import annif.corpus
13			import annif.hit
14			import annif.backend
15			import annif.util
16			import annif.vocab
17			from annif.datadir import DatadirMixin
18			from annif.exception import AnnifException, ConfigurationException, \
19			NotInitializedException, NotSupportedException
20
21			logger = annif.logger
22
23
24			class Access(enum.IntEnum):
25			"""Enumeration of access levels for projects"""
26			private = 1
27			hidden = 2
28			public = 3
29
30
31			class AnnifProject(DatadirMixin):
32			"""Class representing the configuration of a single Annif project."""
33
34			# defaults for uninitialized instances
35			_analyzer = None
36			_backend = None
37			_vocab = None
38			_vectorizer = None
39			initialized = False
40
41			# default values for configuration settings
42			DEFAULT_ACCESS = 'public'
43
44			def __init__(self, project_id, config, datadir):
45			DatadirMixin.__init__(self, datadir, 'projects', project_id)
46			self.project_id = project_id
47			self.name = config['name']
48			self.language = config['language']
49			self.analyzer_spec = config.get('analyzer', None)
50			self.vocab_id = config.get('vocab', None)
51			self.config = config
52			self._base_datadir = datadir
53			self._init_access()
54
55			def _init_access(self):
56			access = self.config.get('access', self.DEFAULT_ACCESS)
57			try:
58			self.access = getattr(Access, access)
59			except AttributeError:
60			raise ConfigurationException(
61			"'{}' is not a valid access setting".format(access),
62			project_id=self.project_id)
63
64			def _initialize_analyzer(self):
65			analyzer = self.analyzer
66			logger.debug("Project '%s': initialized analyzer: %s",
67			self.project_id,
68			str(analyzer))
69
70			def _initialize_subjects(self):
71			try:
72			subjects = self.subjects
73			logger.debug("Project '%s': initialized subjects: %s",
74			self.project_id,
75			str(subjects))
76			except AnnifException as err:
77			logger.warning(err.format_message())
78
79			def _initialize_vectorizer(self):
80			try:
81			vectorizer = self.vectorizer
82			logger.debug("Project '%s': initialized vectorizer: %s",
83			self.project_id,
84			str(vectorizer))
85			except AnnifException as err:
86			logger.warning(err.format_message())
87
88			def _initialize_backend(self):
89			logger.debug("Project '%s': initializing backend", self.project_id)
90			if not self.backend:
91			logger.debug("Cannot initialize backend: does not exist")
92			return
93			try:
94			self.backend.initialize()
95			except AnnifException as err:
96			logger.warning(err.format_message())
97
98			def initialize(self):
99			"""initialize this project and its backend so that they are ready to
100			analyze"""
101			logger.debug("Initializing project '%s'", self.project_id)
102
103			self._initialize_analyzer()
104			self._initialize_subjects()
105			self._initialize_vectorizer()
106			self._initialize_backend()
107
108			self.initialized = True
109
110			def _analyze_with_backend(self, text, backend_params):
111			if backend_params is None:
112			backend_params = {}
113			beparams = backend_params.get(self.backend.backend_id, {})
114			hits = self.backend.analyze(text, project=self, params=beparams)
115			logger.debug(
116			'Got %d hits from backend %s',
117			len(hits), self.backend.backend_id)
118			return hits
119
120			@property
121			def analyzer(self):
122			if self._analyzer is None and self.analyzer_spec:
123			self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
124			return self._analyzer
125
126			@property
127			def backend(self):
128			if self._backend is None:
129			backend_id = self.config['backend']
130			try:
131			backend_class = annif.backend.get_backend(backend_id)
132			self._backend = backend_class(
133			backend_id, params=self.config, datadir=self.datadir)
134			except ValueError:
135			logger.warning(
136			"Could not create backend %s, "
137			"make sure you've installed optional dependencies",
138			backend_id)
139			return self._backend
140
141			@property
142			def vocab(self):
143			if self._vocab is None:
144			if self.vocab_id is None:
145			raise ConfigurationException("vocab setting is missing",
146			project_id=self.project_id)
147			self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
148			self._base_datadir)
149			return self._vocab
150
151			@property
152			def subjects(self):
153			return self.vocab.subjects
154
155			@property
156			def vectorizer(self):
157			if self._vectorizer is None:
158			path = os.path.join(self.datadir, 'vectorizer')
159			if os.path.exists(path):
160			logger.debug('loading vectorizer from %s', path)
161			self._vectorizer = joblib.load(path)
162			else:
163			raise NotInitializedException(
164			"vectorizer file '{}' not found".format(path),
165			project_id=self.project_id)
166			return self._vectorizer
167
168			def analyze(self, text, backend_params=None):
169			"""Analyze the given text by passing it to the backend. Returns a
170			list of AnalysisHit objects ordered by decreasing score."""
171
172			logger.debug('Analyzing text "%s..." (len=%d)',
173			text[:20], len(text))
174			hits = self._analyze_with_backend(text, backend_params)
175			logger.debug('%d hits from backend', len(hits))
176			return hits
177
178			def _create_vectorizer(self, subjectcorpus):
179			if not self.backend.needs_subject_vectorizer:
180			logger.debug('not creating vectorizer: not needed by backend')
181			return
182			logger.info('creating vectorizer')
183			self._vectorizer = TfidfVectorizer(
184			tokenizer=self.analyzer.tokenize_words)
185			self._vectorizer.fit((subj.text for subj in subjectcorpus.subjects))
			1 ignored issue (Comprehensibility, Best Practice; introduced 2018-09-26 10:16 UTC): The variable `subj` does not seem to be defined.
186			annif.util.atomic_save(
187			self._vectorizer,
188			self.datadir,
189			'vectorizer',
190			method=joblib.dump)
191
192			def train(self, corpus):
193			"""train the project using documents from a metadata source"""
194
195			corpus.set_subject_index(self.subjects)
196			self._create_vectorizer(corpus)
197			self.backend.train(corpus, project=self)
198
199			def learn(self, corpus):
200			"""further train the project using documents from a metadata source"""
201
202			corpus.set_subject_index(self.subjects)
203			if isinstance(
204			self.backend,
205			annif.backend.backend.AnnifLearningBackend):
206			self.backend.learn(corpus, project=self)
207			else:
208			raise NotSupportedException("Learning not supported by backend",
209			project_id=self.project_id)
210
211			def dump(self):
212			"""return this project as a dict"""
213			return {'project_id': self.project_id,
214			'name': self.name,
215			'language': self.language,
216			'backend': {'backend_id': self.config['backend']}
217			}
218
219
220			def _create_projects(projects_file, datadir, init_projects):
221			if not os.path.exists(projects_file):
222			logger.warning("Project configuration file '%s' is missing. " +
223			'Please provide one.', projects_file)
224			logger.warning('You can set the path to the project configuration ' +
225			'file using the ANNIF_PROJECTS environment variable.')
226			return {}
227
228			config = configparser.ConfigParser()
229			config.optionxform = lambda option: option
230			with open(projects_file) as projf:
231			config.read_file(projf)
232
233			# create AnnifProject objects from the configuration file
234			projects = collections.OrderedDict()
235			for project_id in config.sections():
236			projects[project_id] = AnnifProject(project_id,
237			config[project_id],
238			datadir)
239			if init_projects:
240			projects[project_id].initialize()
241			return projects
242
243
244			def initialize_projects(app):
245			projects_file = app.config['PROJECTS_FILE']
246			datadir = app.config['DATADIR']
247			init_projects = app.config['INITIALIZE_PROJECTS']
248			app.annif_projects = _create_projects(
249			projects_file, datadir, init_projects)
250
251
252			def get_projects(min_access=Access.private):
253			"""Return the available projects as a dict of project_id ->
254			AnnifProject. The min_access parameter may be used to set the minimum
255			access level required for the returned projects."""
256
257			projects = [(project_id, project)
258			for project_id, project in current_app.annif_projects.items()
259			if project.access >= min_access]
260			return collections.OrderedDict(projects)
261
262
263			def get_project(project_id, min_access=Access.private):
264			"""return the definition of a single Project by project_id"""
265			projects = get_projects(min_access)
266			try:
267			return projects[project_id]
268			except KeyError:
269			raise ValueError("No such project {}".format(project_id))
270

NatLibFi / Annif

Pull Request — master (#257)

annif.project B

Complexity

Size/Duplication

Importance

18 Methods

4 Functions

How to fix Complexity

Complexity

Duplication Side-by-Side

Filter issues like