annif.project - Code Metrics - Inspection of "Merge pull request #246 from NatLibFi/issue237-acc..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 4e1d6c...3804bf )

by Osma

created 2019-01-29 11:07 UTC

annif.project B

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	264
Duplicated Lines	0 %

Importance

Changes

Metric	Value
wmc	44
eloc	203
dl	0
loc	264
rs	8.8798
c	0
b	0
f	0

18 Methods

Rating	Name	Size	Complexity
A	AnnifProject.vectorizer()	12	3
A	AnnifProject._initialize_subjects()	8	2
A	AnnifProject.dump()	6	1
A	AnnifProject._analyze_with_backend()	9	2
A	AnnifProject._initialize_backend()	9	3
A	AnnifProject._initialize_vectorizer()	8	2
A	AnnifProject.subjects()	3	1
A	AnnifProject.analyze()	9	1
A	AnnifProject._init_access()	8	2
A	AnnifProject.train()	6	1
A	AnnifProject.__init__()	10	1
A	AnnifProject._get_datadir()	6	2
A	AnnifProject._initialize_analyzer()	5	1
A	AnnifProject.analyzer()	5	3
A	AnnifProject._create_vectorizer()	13	2
A	AnnifProject.vocab()	9	3
A	AnnifProject.backend()	14	3
A	AnnifProject.initialize()	11	1

4 Functions

Rating	Name	Size	Complexity
A	get_projects()	9	1
A	get_project()	7	2
B	_create_projects()	22	6
A	initialize_projects()	6	1

How to fix Complexity

"""Project management functionality for Annif"""

import collections
import configparser
import enum
import os.path
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from flask import current_app
import annif
import annif.analyzer
import annif.corpus
import annif.hit
import annif.backend
import annif.util
import annif.vocab
from annif.exception import AnnifException, ConfigurationException, \
    NotInitializedException

logger = annif.logger


class Access(enum.IntEnum):
    """Enumeration of access levels for projects.

    The values are ordered integers so that levels can be compared with
    each other: get_projects() returns the projects whose access level is
    greater than or equal to the requested minimum. The member names are
    the values looked up for the 'access' project setting.
    """
    private = 1
    hidden = 2
    public = 3


class AnnifProject:
    """Class representing the configuration of a single Annif project.

    The analyzer, backend, vocabulary and vectorizer are created lazily by
    the properties of the same name and cached on the instance."""

    # defaults for uninitialized instances
    _analyzer = None
    _backend = None
    _vocab = None
    _vectorizer = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = 'public'

    def __init__(self, project_id, config, datadir):
        """store the settings of the project.

        project_id -- identifier of the project; also the name of its
            data directory
        config -- mapping of configuration settings; must contain 'name'
            and 'language', may contain 'analyzer', 'vocab' and 'access'
        datadir -- base data directory; the data files of the project go
            under <datadir>/projects/<project_id>

        Raises ConfigurationException if the 'access' setting is not the
        name of an Access level."""
        self.project_id = project_id
        self.name = config['name']
        self.language = config['language']
        self.analyzer_spec = config.get('analyzer', None)
        self.vocab_id = config.get('vocab', None)
        self._base_datadir = datadir
        self._datadir = os.path.join(datadir, 'projects', self.project_id)
        self.config = config
        self._init_access()

    def _init_access(self):
        """set self.access from the 'access' setting, falling back to
        DEFAULT_ACCESS when the setting is absent"""
        access = self.config.get('access', self.DEFAULT_ACCESS)
        try:
            # Look the name up among the enum members only. getattr() would
            # also accept attributes that are not access levels (e.g.
            # '__doc__' or 'real') and store a non-Access value.
            self.access = Access[access]
        except KeyError:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id) from None

    def _get_datadir(self):
        """return the path of the directory where this project can store its
        data files, creating the directory if necessary"""
        # exist_ok avoids the race between checking for the directory and
        # creating it when several processes start at the same time
        os.makedirs(self._datadir, exist_ok=True)
        return self._datadir

    def _require_backend(self):
        """return the backend of this project; raise NotInitializedException
        if the backend could not be created"""
        backend = self.backend
        if backend is None:
            raise NotInitializedException(
                "backend '{}' is not available".format(
                    self.config['backend']),
                project_id=self.project_id)
        return backend

    def _initialize_analyzer(self):
        """create the analyzer (if one is configured) and log the result"""
        analyzer = self.analyzer
        logger.debug("Project '%s': initialized analyzer: %s",
                     self.project_id,
                     str(analyzer))

    def _initialize_subjects(self):
        """load the subjects; a failure is logged as a warning, not raised"""
        try:
            subjects = self.subjects
            logger.debug("Project '%s': initialized subjects: %s",
                         self.project_id,
                         str(subjects))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_vectorizer(self):
        """load the vectorizer; a failure is logged as a warning, not
        raised"""
        try:
            vectorizer = self.vectorizer
            logger.debug("Project '%s': initialized vectorizer: %s",
                         self.project_id,
                         str(vectorizer))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self):
        """initialize the backend if it could be created; a failure is
        logged as a warning, not raised"""
        logger.debug("Project '%s': initializing backend", self.project_id)
        # read the property once: it retries (and logs a warning) on every
        # access for as long as the backend cannot be created
        backend = self.backend
        if backend is None:
            logger.debug("Cannot initialize backend: does not exist")
            return
        try:
            backend.initialize()
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self):
        """initialize this project and its backend so that they are ready to
        analyze"""
        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_vectorizer()
        self._initialize_backend()

        self.initialized = True

    def _analyze_with_backend(self, text, backend_params):
        """pass the text to the backend and return its hits.

        backend_params -- optional mapping of backend_id -> parameters; only
            the entry for the backend of this project is passed on

        Raises NotInitializedException if the backend is not available."""
        backend = self._require_backend()
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(backend.backend_id, {})
        hits = backend.analyze(text, project=self, params=beparams)
        logger.debug(
            'Got %d hits from backend %s',
            len(hits), backend.backend_id)
        return hits

    @property
    def analyzer(self):
        """the analyzer created from the 'analyzer' setting, or None when
        the setting is missing or empty"""
        if self._analyzer is None and self.analyzer_spec:
            self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
        return self._analyzer

    @property
    def backend(self):
        """the backend object named by the 'backend' setting, or None (with
        a logged warning) if creating it raised ValueError"""
        if self._backend is None:
            backend_id = self.config['backend']
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, params=self.config, datadir=self._datadir)
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id)
        return self._backend

    @property
    def vocab(self):
        """the vocabulary named by the 'vocab' setting; raises
        ConfigurationException if the setting is missing"""
        if self._vocab is None:
            if self.vocab_id is None:
                raise ConfigurationException("vocab setting is missing",
                                             project_id=self.project_id)
            self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
                                                      self._base_datadir)
        return self._vocab

    @property
    def subjects(self):
        """the subjects of the vocabulary of this project"""
        return self.vocab.subjects

    @property
    def vectorizer(self):
        """the vectorizer loaded from the 'vectorizer' file in the data
        directory; raises NotInitializedException if the file is missing"""
        if self._vectorizer is None:
            path = os.path.join(self._get_datadir(), 'vectorizer')
            if os.path.exists(path):
                logger.debug('loading vectorizer from %s', path)
                # NOTE: joblib.load unpickles the file, so the data
                # directory must only contain trusted files
                self._vectorizer = joblib.load(path)
            else:
                raise NotInitializedException(
                    "vectorizer file '{}' not found".format(path),
                    project_id=self.project_id)
        return self._vectorizer

    def analyze(self, text, backend_params=None):
        """Analyze the given text by passing it to the backend. Returns a
        list of AnalysisHit objects ordered by decreasing score.

        Raises NotInitializedException if the backend is not available."""

        logger.debug('Analyzing text "%s..." (len=%d)',
                     text[:20], len(text))
        hits = self._analyze_with_backend(text, backend_params)
        logger.debug('%d hits from backend', len(hits))
        return hits

    def _create_vectorizer(self, subjectcorpus):
        """fit a TF-IDF vectorizer on the subject texts of the corpus and
        save it in the data directory, unless the backend does not need
        one"""
        if not self._require_backend().needs_subject_vectorizer:
            logger.debug('not creating vectorizer: not needed by backend')
            return
        logger.info('creating vectorizer')
        self._vectorizer = TfidfVectorizer(
            tokenizer=self.analyzer.tokenize_words)
        self._vectorizer.fit((subj.text for subj in subjectcorpus.subjects))

        annif.util.atomic_save(
            self._vectorizer,
            self._get_datadir(),
            'vectorizer',
            method=joblib.dump)

    def train(self, corpus):
        """train the project using documents from a metadata source.

        Raises NotInitializedException if the backend is not available."""

        corpus.set_subject_index(self.subjects)
        self._create_vectorizer(corpus)
        self._require_backend().train(corpus, project=self)

    def dump(self):
        """return this project as a dict"""
        return {'project_id': self.project_id,
                'name': self.name,
                'language': self.language,
                'backend': {'backend_id': self.config['backend']}
                }


def _create_projects(projects_file, datadir, init_projects):
    if not os.path.exists(projects_file):
        logger.warning("Project configuration file '%s' is missing. " +
                       'Please provide one.', projects_file)
        logger.warning('You can set the path to the project configuration ' +
                       'file using the ANNIF_PROJECTS environment variable.')
        return {}

    config = configparser.ConfigParser()
    config.optionxform = lambda option: option
    with open(projects_file) as projf:
        config.read_file(projf)

    # create AnnifProject objects from the configuration file
    projects = collections.OrderedDict()
    for project_id in config.sections():
        projects[project_id] = AnnifProject(project_id,
                                            config[project_id],
                                            datadir)
        if init_projects:
            projects[project_id].initialize()
    return projects


def initialize_projects(app):
    """Create the projects described by the configuration of the given
    application and store them in its annif_projects attribute. The
    PROJECTS_FILE, DATADIR and INITIALIZE_PROJECTS settings are used."""
    settings = app.config
    app.annif_projects = _create_projects(
        settings['PROJECTS_FILE'],
        settings['DATADIR'],
        settings['INITIALIZE_PROJECTS'])


def get_projects(min_access=Access.private):
    """Return an ordered dict of project_id -> AnnifProject with the
    projects of the current application whose access level is at least
    min_access. The order of the projects is preserved."""

    selected = collections.OrderedDict()
    for project_id, project in current_app.annif_projects.items():
        if project.access >= min_access:
            selected[project_id] = project
    return selected


def get_project(project_id, min_access=Access.private):
    """Return the AnnifProject with the given project_id. Raises
    ValueError if there is no such project among those whose access level
    is at least min_access."""
    available = get_projects(min_access)
    try:
        project = available[project_id]
    except KeyError:
        message = "No such project {}".format(project_id)
        raise ValueError(message)
    return project


1			"""Project management functionality for Annif"""
2
3			import collections
4			import configparser
5			import enum
6			import os.path
7			from sklearn.externals import joblib
8			from sklearn.feature_extraction.text import TfidfVectorizer
9			from flask import current_app
10			import annif
11			import annif.analyzer
12			import annif.corpus
13			import annif.hit
14			import annif.backend
15			import annif.util
16			import annif.vocab
17			from annif.exception import AnnifException, ConfigurationException, \
18			NotInitializedException
19
20			logger = annif.logger
21
22
23			class Access(enum.IntEnum):
24			"""Enumeration of access levels for projects"""
25			private = 1
26			hidden = 2
27			public = 3
28
29
30			class AnnifProject:
31			"""Class representing the configuration of a single Annif project."""
32
33			# defaults for uninitialized instances
34			_analyzer = None
35			_backend = None
36			_vocab = None
37			_vectorizer = None
38			initialized = False
39
40			# default values for configuration settings
41			DEFAULT_ACCESS = 'public'
42
43			def __init__(self, project_id, config, datadir):
44			self.project_id = project_id
45			self.name = config['name']
46			self.language = config['language']
47			self.analyzer_spec = config.get('analyzer', None)
48			self.vocab_id = config.get('vocab', None)
49			self._base_datadir = datadir
50			self._datadir = os.path.join(datadir, 'projects', self.project_id)
51			self.config = config
52			self._init_access()
53
54			def _init_access(self):
55			access = self.config.get('access', self.DEFAULT_ACCESS)
56			try:
57			self.access = getattr(Access, access)
58			except AttributeError:
59			raise ConfigurationException(
60			"'{}' is not a valid access setting".format(access),
61			project_id=self.project_id)
62
63			def _get_datadir(self):
64			"""return the path of the directory where this project can store its
65			data files"""
66			if not os.path.exists(self._datadir):
67			os.makedirs(self._datadir)
68			return self._datadir
69
70			def _initialize_analyzer(self):
71			analyzer = self.analyzer
72			logger.debug("Project '%s': initialized analyzer: %s",
73			self.project_id,
74			str(analyzer))
75
76			def _initialize_subjects(self):
77			try:
78			subjects = self.subjects
79			logger.debug("Project '%s': initialized subjects: %s",
80			self.project_id,
81			str(subjects))
82			except AnnifException as err:
83			logger.warning(err.format_message())
84
85			def _initialize_vectorizer(self):
86			try:
87			vectorizer = self.vectorizer
88			logger.debug("Project '%s': initialized vectorizer: %s",
89			self.project_id,
90			str(vectorizer))
91			except AnnifException as err:
92			logger.warning(err.format_message())
93
94			def _initialize_backend(self):
95			logger.debug("Project '%s': initializing backend", self.project_id)
96			if not self.backend:
97			logger.debug("Cannot initialize backend: does not exist")
98			return
99			try:
100			self.backend.initialize()
101			except AnnifException as err:
102			logger.warning(err.format_message())
103
104			def initialize(self):
105			"""initialize this project and its backend so that they are ready to
106			analyze"""
107			logger.debug("Initializing project '%s'", self.project_id)
108
109			self._initialize_analyzer()
110			self._initialize_subjects()
111			self._initialize_vectorizer()
112			self._initialize_backend()
113
114			self.initialized = True
115
116			def _analyze_with_backend(self, text, backend_params):
117			if backend_params is None:
118			backend_params = {}
119			beparams = backend_params.get(self.backend.backend_id, {})
120			hits = self.backend.analyze(text, project=self, params=beparams)
121			logger.debug(
122			'Got %d hits from backend %s',
123			len(hits), self.backend.backend_id)
124			return hits
125
126			@property
127			def analyzer(self):
128			if self._analyzer is None and self.analyzer_spec:
129			self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
130			return self._analyzer
131
132			@property
133			def backend(self):
134			if self._backend is None:
135			backend_id = self.config['backend']
136			try:
137			backend_class = annif.backend.get_backend(backend_id)
138			self._backend = backend_class(
139			backend_id, params=self.config, datadir=self._datadir)
140			except ValueError:
141			logger.warning(
142			"Could not create backend %s, "
143			"make sure you've installed optional dependencies",
144			backend_id)
145			return self._backend
146
147			@property
148			def vocab(self):
149			if self._vocab is None:
150			if self.vocab_id is None:
151			raise ConfigurationException("vocab setting is missing",
152			project_id=self.project_id)
153			self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
154			self._base_datadir)
155			return self._vocab
156
157			@property
158			def subjects(self):
159			return self.vocab.subjects
160
161			@property
162			def vectorizer(self):
163			if self._vectorizer is None:
164			path = os.path.join(self._get_datadir(), 'vectorizer')
165			if os.path.exists(path):
166			logger.debug('loading vectorizer from %s', path)
167			self._vectorizer = joblib.load(path)
168			else:
169			raise NotInitializedException(
170			"vectorizer file '{}' not found".format(path),
171			project_id=self.project_id)
172			return self._vectorizer
173
174			def analyze(self, text, backend_params=None):
175			"""Analyze the given text by passing it to the backend. Returns a
176			list of AnalysisHit objects ordered by decreasing score."""
177
178			logger.debug('Analyzing text "%s..." (len=%d)',
179			text[:20], len(text))
180			hits = self._analyze_with_backend(text, backend_params)
181			logger.debug('%d hits from backend', len(hits))
182			return hits
183
184			def _create_vectorizer(self, subjectcorpus):
185			if not self.backend.needs_subject_vectorizer:
186			logger.debug('not creating vectorizer: not needed by backend')
187			return
188			logger.info('creating vectorizer')
189			self._vectorizer = TfidfVectorizer(
190			tokenizer=self.analyzer.tokenize_words)
191			self._vectorizer.fit((subj.text for subj in subjectcorpus.subjects))
			1 ignored issue (Comprehensibility, Best Practice; introduced 2018-09-26 10:16 UTC): The variable `subj` does not seem to be defined. (This is a false positive: `subj` is bound by the generator expression on line 191.)
192			annif.util.atomic_save(
193			self._vectorizer,
194			self._get_datadir(),
195			'vectorizer',
196			method=joblib.dump)
197
198			def train(self, corpus):
199			"""train the project using documents from a metadata source"""
200
201			corpus.set_subject_index(self.subjects)
202			self._create_vectorizer(corpus)
203			self.backend.train(corpus, project=self)
204
205			def dump(self):
206			"""return this project as a dict"""
207			return {'project_id': self.project_id,
208			'name': self.name,
209			'language': self.language,
210			'backend': {'backend_id': self.config['backend']}
211			}
212
213
214			def _create_projects(projects_file, datadir, init_projects):
215			if not os.path.exists(projects_file):
216			logger.warning("Project configuration file '%s' is missing. " +
217			'Please provide one.', projects_file)
218			logger.warning('You can set the path to the project configuration ' +
219			'file using the ANNIF_PROJECTS environment variable.')
220			return {}
221
222			config = configparser.ConfigParser()
223			config.optionxform = lambda option: option
224			with open(projects_file) as projf:
225			config.read_file(projf)
226
227			# create AnnifProject objects from the configuration file
228			projects = collections.OrderedDict()
229			for project_id in config.sections():
230			projects[project_id] = AnnifProject(project_id,
231			config[project_id],
232			datadir)
233			if init_projects:
234			projects[project_id].initialize()
235			return projects
236
237
238			def initialize_projects(app):
239			projects_file = app.config['PROJECTS_FILE']
240			datadir = app.config['DATADIR']
241			init_projects = app.config['INITIALIZE_PROJECTS']
242			app.annif_projects = _create_projects(
243			projects_file, datadir, init_projects)
244
245
246			def get_projects(min_access=Access.private):
247			"""Return the available projects as a dict of project_id ->
248			AnnifProject. The min_access parameter may be used to set the minimum
249			access level required for the returned projects."""
250
251			projects = [(project_id, project)
252			for project_id, project in current_app.annif_projects.items()
253			if project.access >= min_access]
254			return collections.OrderedDict(projects)
255
256
257			def get_project(project_id, min_access=Access.private):
258			"""return the definition of a single Project by project_id"""
259			projects = get_projects(min_access)
260			try:
261			return projects[project_id]
262			except KeyError:
263			raise ValueError("No such project {}".format(project_id))
264

NatLibFi / Annif

Push — master ( 4e1d6c...3804bf )

annif.project B

Complexity

Size/Duplication

Importance

18 Methods

4 Functions

How to fix Complexity

Complexity

Duplication Side-by-Side

Filter issues like