Passed
Pull Request — master (#418)
by Osma
01:38
created

annif.project   D

Complexity

Total Complexity 59

Size/Duplication

Total Lines 345
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 59
eloc 254
dl 0
loc 345
rs 4.08
c 0
b 0
f 0

25 Methods

Rating   Name   Duplication   Size   Complexity  
A AnnifProject._init_access() 0 8 2
A AnnifProject.subjects() 0 3 1
A AnnifProject.vocab() 0 9 3
A AnnifProject._initialize_subjects() 0 8 2
A AnnifProject.remove_model_data() 0 10 2
A AnnifProject._initialize_backend() 0 9 3
A AnnifProject._suggest_with_backend() 0 9 2
A AnnifProject.analyzer() 0 11 3
A AnnifProject.backend() 0 18 4
A AnnifProject.initialize() 0 11 1
A AnnifProject.dump() 0 8 1
A AnnifProject.train() 0 8 3
A AnnifProject.suggest() 0 13 3
A AnnifProject.learn() 0 13 3
A AnnifProject.__init__() 0 11 1
A AnnifProject._initialize_analyzer() 0 7 2
A AnnifRegistry.get_projects() 0 8 1
A AnnifRegistry.get_project() 0 8 2
A AnnifProject._get_info() 0 8 3
A AnnifProject.modification_time() 0 3 1
A AnnifProject.is_trained() 0 3 1
B AnnifRegistry._create_projects() 0 27 5
A AnnifRegistry.__init__() 0 7 3
A ProjectSuggestMap.suggest() 0 6 1
A ProjectSuggestMap.__init__() 0 6 1

3 Functions

Rating   Name   Duplication   Size   Complexity  
A get_projects() 0 8 2
A get_project() 0 8 2
A initialize_projects() 0 5 1

How to fix   Complexity   

Complexity

Complex classes like annif.project often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
"""Project management functionality for Annif"""
2
3
import collections
4
import configparser
5
import enum
6
import os.path
7
from flask import current_app
8
from shutil import rmtree
9
import annif
10
import annif.analyzer
11
import annif.corpus
12
import annif.suggestion
13
import annif.backend
14
import annif.util
15
import annif.vocab
16
from annif.datadir import DatadirMixin
17
from annif.exception import AnnifException, ConfigurationException, \
18
    NotSupportedException, NotInitializedException
19
20
# Logger object exposed by the annif package; used for all log output in
# this module.
logger = annif.logger
21
22
23
class Access(enum.IntEnum):
    """Access levels that can be assigned to a project.

    The members are integers, so levels can be compared with each other:
    private < hidden < public.
    """

    # lowest level
    private = 1
    # intermediate level
    hidden = 2
    # highest level
    public = 3
28
29
30
class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project.

    The analyzer, backend and vocabulary objects are created lazily the
    first time the corresponding property is read, and are then cached on
    the instance.
    """

    # defaults for uninitialized instances
    _analyzer = None
    _backend = None
    _vocab = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = 'public'

    def __init__(self, project_id, config, datadir, registry):
        """Set up the project from its configuration settings.

        project_id -- identifier of the project
        config -- mapping of settings; 'language' is required here, while
            'name', 'analyzer', 'vocab' and 'access' are optional
        datadir -- base data directory; passed on to DatadirMixin and later
            to the vocabulary
        registry -- the registry object that owns this project

        Raises KeyError if the 'language' setting is missing and
        ConfigurationException if the 'access' setting is invalid.
        """
        DatadirMixin.__init__(self, datadir, 'projects', project_id)
        self.project_id = project_id
        self.name = config.get('name', project_id)
        self.language = config['language']
        self.analyzer_spec = config.get('analyzer', None)
        self.vocab_id = config.get('vocab', None)
        self.config = config
        self._base_datadir = datadir
        self.registry = registry
        self._init_access()

    def _init_access(self):
        """Parse the 'access' setting into a member of the Access enum.

        Raises ConfigurationException if the configured value is not the
        name of an Access member.
        """
        access = self.config.get('access', self.DEFAULT_ACCESS)
        try:
            # Look the value up by member name. Unlike getattr(), this
            # cannot match non-member attributes of the enum class (for
            # example '__doc__'), which would otherwise be accepted here
            # and break access level comparisons later.
            self.access = Access[access]
        except KeyError:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id) from None

    def _initialize_analyzer(self):
        """Create the analyzer, if one has been configured."""
        if not self.analyzer_spec:
            return  # not configured, so assume it's not needed
        analyzer = self.analyzer
        logger.debug("Project '%s': initialized analyzer: %s",
                     self.project_id,
                     str(analyzer))

    def _initialize_subjects(self):
        """Load the subjects; an AnnifException is logged as a warning."""
        try:
            subjects = self.subjects
            logger.debug("Project '%s': initialized subjects: %s",
                         self.project_id,
                         str(subjects))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self):
        """Create and initialize the backend; an AnnifException is logged
        as a warning instead of being propagated."""
        logger.debug("Project '%s': initializing backend", self.project_id)
        try:
            if not self.backend:
                logger.debug("Cannot initialize backend: does not exist")
                return
            self.backend.initialize()
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self):
        """initialize this project and its backend so that they are ready to
        be used"""

        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_backend()

        self.initialized = True

    def _params_for_backend(self, backend_params):
        """Return the parameters meant for this project's backend.

        backend_params -- mapping of backend_id -> parameter dict, or None

        Returns an empty dict when nothing has been given for the backend.
        """
        params = {} if backend_params is None else backend_params
        return params.get(self.backend.backend_id, {})

    def _suggest_with_backend(self, text, backend_params):
        """Pass the text to the backend and return the hits it produces."""
        beparams = self._params_for_backend(backend_params)
        hits = self.backend.suggest(text, beparams)
        logger.debug(
            'Got %d hits from backend %s',
            len(hits), self.backend.backend_id)
        return hits

    @property
    def analyzer(self):
        """The analyzer of this project, created on first access.

        Raises ConfigurationException if no analyzer has been configured.
        """
        if self._analyzer is None:
            if self.analyzer_spec:
                self._analyzer = annif.analyzer.get_analyzer(
                    self.analyzer_spec)
            else:
                raise ConfigurationException(
                    "analyzer setting is missing (and needed by the backend)",
                    project_id=self.project_id)
        return self._analyzer

    @property
    def backend(self):
        """The backend of this project, created on first access.

        Raises ConfigurationException if the 'backend' setting is missing.
        If creating the backend raises ValueError, a warning is logged and
        None is returned.
        """
        if self._backend is None:
            if 'backend' not in self.config:
                raise ConfigurationException(
                    "backend setting is missing", project_id=self.project_id)
            backend_id = self.config['backend']
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, config_params=self.config,
                    project=self)
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id)
        return self._backend

    @property
    def vocab(self):
        """The vocabulary of this project, created on first access.

        Raises ConfigurationException if the 'vocab' setting is missing.
        """
        if self._vocab is None:
            if self.vocab_id is None:
                raise ConfigurationException("vocab setting is missing",
                                             project_id=self.project_id)
            self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
                                                      self._base_datadir)
        return self._vocab

    @property
    def subjects(self):
        """The subjects of the project's vocabulary."""
        return self.vocab.subjects

    def _get_info(self, key):
        """Return the attribute `key` of the backend.

        Returns None if there is no backend, or if an AnnifException is
        raised while obtaining the value (the error is logged as a warning).
        """
        try:
            be = self.backend
            if be is not None:
                return getattr(be, key)
        except AnnifException as err:
            logger.warning(err.format_message())
        return None

    @property
    def is_trained(self):
        """The 'is_trained' value of the backend, or None if unavailable."""
        return self._get_info('is_trained')

    @property
    def modification_time(self):
        """The 'modification_time' value of the backend, or None if
        unavailable."""
        return self._get_info('modification_time')

    def suggest(self, text, backend_params=None):
        """Suggest subjects the given text by passing it to the backend. Returns a
        list of SubjectSuggestion objects ordered by decreasing score.

        Raises NotInitializedException if the backend reports that it has
        not been trained. If the train state cannot be determined, a warning
        is logged and the suggest operation is attempted anyway."""
        # Read the property only once: every read queries the backend and
        # may log a warning of its own.
        is_trained = self.is_trained
        if is_trained is None:
            logger.warning('Could not get train state information.')
        elif not is_trained:
            raise NotInitializedException('Project is not trained.')
        logger.debug('Suggesting subjects for text "%s..." (len=%d)',
                     text[:20], len(text))
        hits = self._suggest_with_backend(text, backend_params)
        logger.debug('%d hits from backend', len(hits))
        return hits

    def train(self, corpus, backend_params=None):
        """train the project using documents from a metadata source

        The special value 'cached' may be given instead of a corpus object;
        it is passed to the backend as is."""
        if corpus != 'cached':
            corpus.set_subject_index(self.subjects)
        beparams = self._params_for_backend(backend_params)
        self.backend.train(corpus, beparams)

    def learn(self, corpus, backend_params=None):
        """further train the project using documents from a metadata source

        Raises NotSupportedException if the backend is not an
        AnnifLearningBackend."""
        corpus.set_subject_index(self.subjects)
        beparams = self._params_for_backend(backend_params)
        if isinstance(
                self.backend,
                annif.backend.backend.AnnifLearningBackend):
            self.backend.learn(corpus, beparams)
        else:
            raise NotSupportedException("Learning not supported by backend",
                                        project_id=self.project_id)

    def dump(self):
        """return this project as a dict"""
        return {'project_id': self.project_id,
                'name': self.name,
                'language': self.language,
                'backend': {'backend_id': self.config.get('backend')},
                'is_trained': self.is_trained,
                'modification_time': self.modification_time
                }

    def remove_model_data(self):
        """remove the data of this project"""
        # _datadir_path is not defined in this class; presumably it is
        # provided by DatadirMixin.
        datadir_path = self._datadir_path
        if os.path.isdir(datadir_path):
            rmtree(datadir_path)
            logger.info('Removed model data for project %s.',
                        self.project_id)
        else:
            logger.warning('No model data to remove for project %s.',
                           self.project_id)
231
232
233
class AnnifRegistry:
    """Registry holding the Annif projects defined in a configuration file."""

    # Note: the project objects are not stored on the registry instance but
    # in this shared class-level dict, keyed by the "registry ID" which is
    # unique to the registry instance. This makes it possible to serialize
    # AnnifRegistry instances without including the potentially huge project
    # objects (which contain backends with large models, vocabularies with
    # lots of concepts etc). Serialized AnnifRegistry instances can then be
    # passed between processes when using the multiprocessing module.
    _projects = {}

    def __init__(self, projects_file, datadir, init_projects):
        """Read the projects from projects_file and, when init_projects is
        true, initialize each of them."""
        self._rid = id(self)
        created = self._create_projects(projects_file, datadir)
        self._projects[self._rid] = created
        if not init_projects:
            return
        for project in created.values():
            project.initialize()

    def _create_projects(self, projects_file, datadir):
        """Parse the configuration file and return a mapping of project_id ->
        AnnifProject, in the order of the sections of the file.

        Returns an empty dict (after logging a warning) if the file does not
        exist. Raises ConfigurationException if the file contains duplicate
        sections or options."""
        if not os.path.exists(projects_file):
            logger.warning(
                'Project configuration file "%s" is missing. '
                'Please provide one. You can set the path to the project '
                'configuration file using the ANNIF_PROJECTS environment '
                'variable or the command-line option "--projects".',
                projects_file)
            return {}

        parser = configparser.ConfigParser()
        # replace the default option name transformation of ConfigParser
        parser.optionxform = annif.util.identity
        with open(projects_file, encoding='utf-8-sig') as config_file:
            try:
                parser.read_file(config_file)
            except (configparser.DuplicateOptionError,
                    configparser.DuplicateSectionError) as err:
                raise ConfigurationException(err)

        # one AnnifProject per section of the configuration file
        return collections.OrderedDict(
            (section, AnnifProject(section, parser[section], datadir, self))
            for section in parser.sections())

    def get_projects(self, min_access=Access.private):
        """Return the projects of this registry as a dict of project_id ->
        AnnifProject. Only projects whose access level is at least
        min_access are included."""

        own_projects = self._projects[self._rid]
        return {pid: proj
                for pid, proj in own_projects.items()
                if proj.access >= min_access}

    def get_project(self, project_id, min_access=Access.private):
        """Return the project with the given project_id. Raises ValueError
        if no such project is available at the given access level."""

        candidates = self.get_projects(min_access)
        if project_id not in candidates:
            raise ValueError("No such project {}".format(project_id))
        return candidates[project_id]
298
299
300
def initialize_projects(app):
    """Create an AnnifRegistry from the configuration of the given app and
    store it as app.annif_registry."""
    settings = app.config
    app.annif_registry = AnnifRegistry(
        settings['PROJECTS_FILE'],
        settings['DATADIR'],
        settings['INITIALIZE_PROJECTS'])
305
306
307
def get_projects(min_access=Access.private):
    """Return the projects of the current application as a dict of
    project_id -> AnnifProject, limited to those whose access level is at
    least min_access. The project registry is created first if the
    application does not have one yet."""
    app = current_app
    if not hasattr(app, 'annif_registry'):
        initialize_projects(app)

    registry = app.annif_registry
    return registry.get_projects(min_access)
315
316
317
def get_project(project_id, min_access=Access.private):
    """Return the project with the given project_id from the current
    application. Raises ValueError if no such project is available at the
    given access level."""

    candidates = get_projects(min_access)
    if project_id not in candidates:
        raise ValueError("No such project {}".format(project_id))
    return candidates[project_id]
325
326
327
class ProjectSuggestMap:
    """Wrapper around a project that maps Document objects to suggestions.

    Only the project ID and the registry are kept, not the project object
    itself; the project is looked up from the registry on every call.
    Intended to be used with the multiprocessing module."""

    def __init__(self, project, backend_params, limit, threshold):
        """Remember which project to use and how to filter its hits."""
        self.project_id, self.registry = project.project_id, project.registry
        self.backend_params = backend_params
        self.limit = limit
        self.threshold = threshold

    def suggest(self, doc):
        """Return a tuple (filtered hits, doc.uris, doc.labels) for the
        given document."""
        proj = self.registry.get_project(self.project_id)
        raw_hits = proj.suggest(doc.text, self.backend_params)
        kept = raw_hits.filter(proj.subjects, self.limit, self.threshold)
        return kept, doc.uris, doc.labels
345