Passed
Pull Request — master (#418)
by Osma
01:28
created

annif.project.ProjectSuggestMap.__init__()   A

Complexity

Conditions 1

Size

Total Lines 6
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 6
dl 0
loc 6
rs 10
c 0
b 0
f 0
cc 1
nop 5
1
"""Project management functionality for Annif"""
2
3
import collections
4
import configparser
5
import enum
6
import os.path
7
from flask import current_app
8
from shutil import rmtree
9
import annif
10
import annif.analyzer
11
import annif.corpus
12
import annif.suggestion
13
import annif.backend
14
import annif.util
15
import annif.vocab
16
from annif.datadir import DatadirMixin
17
from annif.exception import AnnifException, ConfigurationException, \
18
    NotSupportedException, NotInitializedException
19
20
logger = annif.logger
21
22
23
@enum.unique
class Access(enum.IntEnum):
    """Enumeration of access levels for projects.

    The members are integers so that access levels can be compared with
    the ordinary ordering operators (private < hidden < public).
    """

    private = 1
    hidden = 2
    public = 3
28
29
30
class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project.

    The analyzer, backend and vocabulary objects are created lazily the
    first time the corresponding property is read and are then cached on
    the instance.
    """

    # defaults for uninitialized instances
    _analyzer = None
    _backend = None
    _vocab = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = 'public'

    def __init__(self, project_id, config, datadir, registry):
        """Create a project from one section of the projects configuration.

        project_id -- identifier of the project
        config -- mapping of configuration settings; 'language' is
            required, 'name', 'analyzer', 'vocab' and 'access' are optional
        datadir -- base data directory
        registry -- the registry object this project belongs to

        Raises ConfigurationException if the 'access' setting is invalid.
        """
        DatadirMixin.__init__(self, datadir, 'projects', project_id)
        self.project_id = project_id
        self.name = config.get('name', project_id)
        self.language = config['language']
        self.analyzer_spec = config.get('analyzer', None)
        self.vocab_id = config.get('vocab', None)
        self.config = config
        self._base_datadir = datadir
        self.registry = registry
        self._init_access()

    def _init_access(self):
        """Set self.access from the 'access' setting (DEFAULT_ACCESS if the
        setting is absent). Raises ConfigurationException if the value is
        not the name of an Access member."""
        access = self.config.get('access', self.DEFAULT_ACCESS)
        try:
            # Look the name up among the enum members only. getattr() would
            # also accept unrelated attributes of the class (e.g. 'mro'),
            # leaving self.access set to something that is not a level.
            self.access = Access[access]
        except KeyError:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id) from None

    def _initialize_analyzer(self):
        """Create the analyzer ahead of time, if one is configured."""
        if not self.analyzer_spec:
            return  # not configured, so assume it's not needed
        analyzer = self.analyzer
        logger.debug("Project '%s': initialized analyzer: %s",
                     self.project_id,
                     str(analyzer))

    def _initialize_subjects(self):
        """Load the subjects ahead of time; an AnnifException is logged as
        a warning instead of being propagated."""
        try:
            subjects = self.subjects
            logger.debug("Project '%s': initialized subjects: %s",
                         self.project_id,
                         str(subjects))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self):
        """Create and initialize the backend; an AnnifException is logged
        as a warning instead of being propagated."""
        logger.debug("Project '%s': initializing backend", self.project_id)
        try:
            if not self.backend:
                logger.debug("Cannot initialize backend: does not exist")
                return
            self.backend.initialize()
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self):
        """initialize this project and its backend so that they are ready to
        be used"""

        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_backend()

        self.initialized = True

    def _suggest_with_backend(self, text, backend_params):
        """Pass the text to the backend and return its hits.

        backend_params maps backend IDs to parameter dicts; only the entry
        for this project's backend is forwarded (an empty dict if there is
        none)."""
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        hits = self.backend.suggest(text, beparams)
        logger.debug(
            'Got %d hits from backend %s',
            len(hits), self.backend.backend_id)
        return hits

    @property
    def analyzer(self):
        """The analyzer built from the 'analyzer' setting. Raises
        ConfigurationException if the setting is missing."""
        if self._analyzer is None:
            if self.analyzer_spec:
                self._analyzer = annif.analyzer.get_analyzer(
                    self.analyzer_spec)
            else:
                raise ConfigurationException(
                    "analyzer setting is missing (and needed by the backend)",
                    project_id=self.project_id)
        return self._analyzer

    @property
    def backend(self):
        """The backend object named by the 'backend' setting.

        Raises ConfigurationException if the setting is missing. If
        creating the backend raises ValueError, a warning is logged and
        None is returned; nothing is cached in that case, so the next
        access tries again."""
        if self._backend is None:
            if 'backend' not in self.config:
                raise ConfigurationException(
                    "backend setting is missing", project_id=self.project_id)
            backend_id = self.config['backend']
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, config_params=self.config,
                    project=self)
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id)
        return self._backend

    @property
    def vocab(self):
        """The vocabulary named by the 'vocab' setting. Raises
        ConfigurationException if the setting is missing."""
        if self._vocab is None:
            if self.vocab_id is None:
                raise ConfigurationException("vocab setting is missing",
                                             project_id=self.project_id)
            self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
                                                      self._base_datadir)
        return self._vocab

    @property
    def subjects(self):
        """The subjects of this project's vocabulary."""
        return self.vocab.subjects

    def _get_info(self, key):
        """Return the attribute `key` of the backend.

        Returns None if there is no backend, or if an AnnifException is
        raised while obtaining the value (the exception is logged as a
        warning)."""
        try:
            be = self.backend
            if be is not None:
                return getattr(be, key)
        except AnnifException as err:
            logger.warning(err.format_message())
        return None

    @property
    def is_trained(self):
        """The backend's 'is_trained' value, or None if unavailable."""
        return self._get_info('is_trained')

    @property
    def modification_time(self):
        """The backend's 'modification_time' value, or None if
        unavailable."""
        return self._get_info('modification_time')

    def suggest(self, text, backend_params=None):
        """Suggest subjects the given text by passing it to the backend. Returns a
        list of SubjectSuggestion objects ordered by decreasing score.

        Raises NotInitializedException if the project reports that it is
        not trained. If the train state cannot be determined, a warning is
        logged and the suggestion is attempted anyway."""
        # Read the property once: every access queries the backend and may
        # log a warning of its own.
        trained = self.is_trained
        if trained is None:
            logger.warning('Could not get train state information.')
        elif not trained:
            raise NotInitializedException('Project is not trained.')
        logger.debug('Suggesting subjects for text "%s..." (len=%d)',
                     text[:20], len(text))
        hits = self._suggest_with_backend(text, backend_params)
        logger.debug('%d hits from backend', len(hits))
        return hits

    def train(self, corpus, backend_params=None):
        """train the project using documents from a metadata source

        When corpus is the string 'cached' it is handed to the backend
        as-is; any other corpus first gets this project's subject index
        attached."""
        if corpus != 'cached':
            corpus.set_subject_index(self.subjects)
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        self.backend.train(corpus, beparams)

    def learn(self, corpus, backend_params=None):
        """further train the project using documents from a metadata source

        Raises NotSupportedException if the backend is not an
        AnnifLearningBackend."""
        corpus.set_subject_index(self.subjects)
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        if isinstance(
                self.backend,
                annif.backend.backend.AnnifLearningBackend):
            self.backend.learn(corpus, beparams)
        else:
            raise NotSupportedException("Learning not supported by backend",
                                        project_id=self.project_id)

    def dump(self):
        """return this project as a dict"""
        return {'project_id': self.project_id,
                'name': self.name,
                'language': self.language,
                'backend': {'backend_id': self.config.get('backend')},
                'is_trained': self.is_trained,
                'modification_time': self.modification_time
                }

    def remove_model_data(self):
        """remove the data of this project

        Deletes the project's data directory recursively if it exists;
        otherwise only logs a warning."""
        # NOTE(review): _datadir_path is presumably provided by
        # DatadirMixin; it is not defined in this class.
        datadir_path = self._datadir_path
        if os.path.isdir(datadir_path):
            rmtree(datadir_path)
            logger.info('Removed model data for project %s.',
                        self.project_id)
        else:
            logger.warning('No model data to remove for project %s.',
                           self.project_id)
231
232
233
class AnnifRegistry:
    """Class that keeps track of the Annif projects"""

    # Note: The individual projects are stored in a shared static variable,
    # keyed by the "registry ID" which is unique to the registry instance.
    # This is done to make it possible to serialize AnnifRegistry instances
    # without including the potentially huge project objects (which contain
    # backends with large models, vocabularies with lots of concepts etc).
    # Serialized AnnifRegistry instances can then be passed between
    # processes when using the multiprocessing module.
    _projects = {}

    def __init__(self, projects_file, datadir, init_projects):
        """Load the projects defined in projects_file.

        projects_file -- path of the project configuration file
        datadir -- base data directory handed to every project
        init_projects -- if true, each project is initialized right after
            it has been created
        """
        self._rid = id(self)
        self._projects[self._rid] = \
            self._create_projects(projects_file, datadir, init_projects)

    def _create_projects(self, projects_file, datadir, init_projects):
        """Parse the configuration file and return the projects as an
        ordered mapping of project_id -> AnnifProject.

        Returns an empty dict (after logging a warning) if the file does
        not exist. Raises ConfigurationException if the file contains a
        duplicate section or option."""
        if not os.path.exists(projects_file):
            logger.warning(
                'Project configuration file "%s" is missing. '
                'Please provide one. You can set the path to the project '
                'configuration file using the ANNIF_PROJECTS environment '
                'variable or the command-line option "--projects".',
                projects_file)
            return {}

        config = configparser.ConfigParser()
        # replace the parser's default option-name transformation
        config.optionxform = annif.util.identity
        with open(projects_file, encoding='utf-8-sig') as projf:
            try:
                config.read_file(projf)
            except (configparser.DuplicateOptionError,
                    configparser.DuplicateSectionError) as err:
                raise ConfigurationException(err) from err

        # create AnnifProject objects from the configuration file
        projects = collections.OrderedDict()
        for project_id in config.sections():
            project = AnnifProject(project_id,
                                   config[project_id],
                                   datadir,
                                   self)
            projects[project_id] = project
            if init_projects:
                project.initialize()
        return projects

    def get_projects(self, min_access=Access.private):
        """Return the available projects as a dict of project_id ->
        AnnifProject. The min_access parameter may be used to set the minimum
        access level required for the returned projects."""

        return {project_id: project
                for project_id, project in self._projects[self._rid].items()
                if project.access >= min_access}

    def get_project(self, project_id, min_access=Access.private):
        """return the definition of a single Project by project_id

        Raises ValueError if there is no project with that ID among the
        projects that satisfy min_access."""

        projects = self.get_projects(min_access)
        try:
            return projects[project_id]
        except KeyError:
            # the KeyError adds nothing to the message, so drop it from
            # the traceback
            raise ValueError(
                "No such project {}".format(project_id)) from None
297
298
299
def initialize_projects(app):
    """Create the project registry for the given application.

    Reads the 'PROJECTS_FILE', 'DATADIR' and 'INITIALIZE_PROJECTS' settings
    from app.config and stores the resulting AnnifRegistry as
    app.annif_registry."""
    projects_file = app.config['PROJECTS_FILE']
    datadir = app.config['DATADIR']
    init_projects = app.config['INITIALIZE_PROJECTS']
    app.annif_registry = AnnifRegistry(projects_file, datadir, init_projects)
304
305
306
def get_projects(min_access=Access.private):
    """Return the available projects as a dict of project_id ->
    AnnifProject. The min_access parameter may be used to set the minimum
    access level required for the returned projects.

    The registry of the current application is created on first use."""
    if not hasattr(current_app, 'annif_registry'):
        initialize_projects(current_app)

    return current_app.annif_registry.get_projects(min_access)
314
315
316
def get_project(project_id, min_access=Access.private):
    """return the definition of a single Project by project_id

    Raises ValueError if there is no project with that ID among the
    projects that satisfy min_access."""

    projects = get_projects(min_access)
    try:
        return projects[project_id]
    except KeyError:
        # the KeyError adds nothing to the message, so drop it from the
        # traceback
        raise ValueError("No such project {}".format(project_id)) from None
324
325
326
class ProjectSuggestMap:
    """A utility class that can be used to wrap a project and provide a
    mapping method that converts Document objects to suggestions. Intended
    to be used with the multiprocessing module."""

    def __init__(self, project, backend_params, limit, threshold):
        """Remember which project to use and how to filter its hits.

        Only the project's ID and its registry are stored, not the project
        object itself; the project is looked up again on every call to
        suggest()."""
        self.project_id = project.project_id
        self.registry = project.registry
        self.backend_params = backend_params
        self.limit = limit
        self.threshold = threshold

    def suggest(self, doc):
        """Return a tuple (filtered hits, doc.uris, doc.labels) for one
        document, where the hits come from the project's suggest() applied
        to doc.text and are filtered using the stored limit and
        threshold."""
        project = self.registry.get_project(self.project_id)
        hits = project.suggest(doc.text, self.backend_params)
        filtered_hits = hits.filter(
            project.subjects, self.limit, self.threshold)
        return (filtered_hits, doc.uris, doc.labels)
344