Passed
Pull Request — master (#418)
by Osma
02:22
created

annif.project.ProjectSuggestMap.suggest()   A

Complexity

Conditions 1

Size

Total Lines 6
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 6
dl 0
loc 6
rs 10
c 0
b 0
f 0
cc 1
nop 2
1
"""Project management functionality for Annif"""
2
3
import collections
4
import configparser
5
import enum
6
import os.path
7
from flask import current_app
8
from shutil import rmtree
9
import annif
10
import annif.analyzer
11
import annif.corpus
12
import annif.suggestion
13
import annif.backend
14
import annif.util
15
import annif.vocab
16
from annif.datadir import DatadirMixin
17
from annif.exception import AnnifException, ConfigurationException, \
18
    NotSupportedException
19
20
# Logger object exposed by the annif package; every class and function in
# this module logs through it.
logger = annif.logger
21
22
23
@enum.unique
class Access(enum.IntEnum):
    """Access levels that can be assigned to a project.

    This is an IntEnum, so levels can be compared numerically; a larger
    value passes a "minimum access" check that a smaller value fails.
    """

    private = 1  # lowest level
    hidden = 2
    public = 3   # highest level
28
29
30
class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project.

    The analyzer, backend and vocabulary objects are created lazily, the
    first time the corresponding property is read, and are then cached on
    the instance."""

    # defaults for uninitialized instances
    _analyzer = None
    _backend = None
    _vocab = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = 'public'

    def __init__(self, project_id, config, datadir, registry):
        """Create a project from its configuration settings.

        project_id is the identifier of the project, config is the mapping
        of settings for this project ('language' is mandatory; 'name',
        'analyzer', 'vocab' and 'access' are optional), datadir is the base
        data directory and registry is the registry object that owns this
        project.

        Raises ConfigurationException if the access setting is not valid."""
        DatadirMixin.__init__(self, datadir, 'projects', project_id)
        self.project_id = project_id
        self.name = config.get('name', project_id)
        self.language = config['language']
        self.analyzer_spec = config.get('analyzer', None)
        self.vocab_id = config.get('vocab', None)
        self.config = config
        # kept separately because the vocabulary is created from the base
        # data directory, not from the project directory
        self._base_datadir = datadir
        self.registry = registry
        self._init_access()

    def _init_access(self):
        """Set self.access from the 'access' setting (or the default).

        Raises ConfigurationException if the setting does not name a member
        of the Access enumeration."""
        access = self.config.get('access', self.DEFAULT_ACCESS)
        try:
            # Look the name up among the enum members only. getattr() would
            # also accept other attributes of the enum class (e.g. 'real' or
            # 'mro'), silently storing a value that is not an access level.
            self.access = Access[access]
        except KeyError:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id) from None

    def _select_backend_params(self, backend_params):
        """Return the parameters in backend_params that are meant for the
        backend of this project (an empty dict if there are none).

        backend_params maps backend IDs to parameter dicts, or is None."""
        if backend_params is None:
            backend_params = {}
        return backend_params.get(self.backend.backend_id, {})

    def _initialize_analyzer(self):
        """Create the analyzer, if one has been configured."""
        if not self.analyzer_spec:
            return  # not configured, so assume it's not needed
        analyzer = self.analyzer
        logger.debug("Project '%s': initialized analyzer: %s",
                     self.project_id,
                     str(analyzer))

    def _initialize_subjects(self):
        """Load the subjects; an AnnifException is logged as a warning
        instead of being propagated."""
        try:
            subjects = self.subjects
            logger.debug("Project '%s': initialized subjects: %s",
                         self.project_id,
                         str(subjects))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self):
        """Create and initialize the backend; an AnnifException is logged
        as a warning instead of being propagated."""
        logger.debug("Project '%s': initializing backend", self.project_id)
        try:
            # the backend property yields None when the backend object could
            # not be created
            if not self.backend:
                logger.debug("Cannot initialize backend: does not exist")
                return
            self.backend.initialize()
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self):
        """initialize this project and its backend so that they are ready to
        be used"""

        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_backend()

        self.initialized = True

    def _suggest_with_backend(self, text, backend_params):
        """Pass the text to the backend together with the parameters meant
        for that backend and return the hits it produces."""
        beparams = self._select_backend_params(backend_params)
        hits = self.backend.suggest(text, beparams)
        logger.debug(
            'Got %d hits from backend %s',
            len(hits), self.backend.backend_id)
        return hits

    @property
    def analyzer(self):
        """The analyzer of this project, created on first access.

        Raises ConfigurationException if no analyzer has been configured."""
        if self._analyzer is None:
            if self.analyzer_spec:
                self._analyzer = annif.analyzer.get_analyzer(
                    self.analyzer_spec)
            else:
                raise ConfigurationException(
                    "analyzer setting is missing (and needed by the backend)",
                    project_id=self.project_id)
        return self._analyzer

    @property
    def backend(self):
        """The backend of this project, created on first access.

        Returns None, after logging a warning, if creating the backend
        raises ValueError. Raises ConfigurationException if the backend
        setting is missing."""
        if self._backend is None:
            if 'backend' not in self.config:
                raise ConfigurationException(
                    "backend setting is missing", project_id=self.project_id)
            backend_id = self.config['backend']
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, config_params=self.config,
                    project=self)
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id)
        return self._backend

    @property
    def vocab(self):
        """The vocabulary of this project, created on first access.

        Raises ConfigurationException if the vocab setting is missing."""
        if self._vocab is None:
            if self.vocab_id is None:
                raise ConfigurationException("vocab setting is missing",
                                             project_id=self.project_id)
            self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
                                                      self._base_datadir)
        return self._vocab

    @property
    def subjects(self):
        """The subjects of the vocabulary of this project."""
        return self.vocab.subjects

    def suggest(self, text, backend_params=None):
        """Suggest subjects for the given text by passing it to the backend.
        Returns the hits produced by the backend (presumably
        SubjectSuggestion objects ordered by decreasing score - the ordering
        is up to the backend)."""
        logger.debug('Suggesting subjects for text "%s..." (len=%d)',
                     text[:20], len(text))
        hits = self._suggest_with_backend(text, backend_params)
        logger.debug('%d hits from backend', len(hits))
        return hits

    def train(self, corpus, backend_params=None):
        """train the project using documents from a metadata source"""
        # The string 'cached' is accepted in place of a corpus object; it is
        # passed to the backend as is (presumably a request to reuse
        # previously prepared training data - confirm in the backends).
        if corpus != 'cached':
            corpus.set_subject_index(self.subjects)
        beparams = self._select_backend_params(backend_params)
        self.backend.train(corpus, beparams)

    def learn(self, corpus, backend_params=None):
        """further train the project using documents from a metadata source

        Raises NotSupportedException if the backend is not an
        AnnifLearningBackend."""
        corpus.set_subject_index(self.subjects)
        beparams = self._select_backend_params(backend_params)
        if isinstance(
                self.backend,
                annif.backend.backend.AnnifLearningBackend):
            self.backend.learn(corpus, beparams)
        else:
            raise NotSupportedException("Learning not supported by backend",
                                        project_id=self.project_id)

    def dump(self):
        """return this project as a dict"""
        return {'project_id': self.project_id,
                'name': self.name,
                'language': self.language,
                'backend': {'backend_id': self.config.get('backend')}
                }

    def remove_model_data(self):
        """remove the data of this project"""
        # _datadir_path is not defined in this class; presumably provided by
        # DatadirMixin
        datadir_path = self._datadir_path
        if os.path.isdir(datadir_path):
            rmtree(datadir_path)
            logger.info('Removed model data for project %s.',
                        self.project_id)
        else:
            logger.warning('No model data to remove for project %s.',
                           self.project_id)
207
208
209
class AnnifRegistry:
    """Class that keeps track of the Annif projects"""

    # Note: The individual projects are stored in a shared static variable,
    # keyed by the "registry ID" which is unique to the registry instance.
    # This is done to make it possible to serialize AnnifRegistry instances
    # without including the potentially huge project objects (which contain
    # backends with large models, vocabularies with lots of concepts etc).
    # Serialized AnnifRegistry instances can then be passed between
    # processes when using the multiprocessing module.
    _projects = {}

    def __init__(self, projects_file, datadir, init_projects):
        """Read the projects from the configuration file projects_file.

        datadir is the base data directory handed to every project. If
        init_projects is true, each project is initialized right after it
        has been created."""
        self._rid = id(self)
        self._projects[self._rid] = \
            self._create_projects(projects_file, datadir, init_projects)

    def _create_projects(self, projects_file, datadir, init_projects):
        """Create the AnnifProject objects defined in projects_file and
        return them as a mapping of project_id -> AnnifProject, in the order
        of the sections in the file.

        Returns an empty dict, after logging a warning, if the file does not
        exist. Raises ConfigurationException if the file contains a
        duplicate section or option."""
        if not os.path.exists(projects_file):
            logger.warning(
                'Project configuration file "%s" is missing. '
                'Please provide one. You can set the path to the project '
                'configuration file using the ANNIF_PROJECTS environment '
                'variable or the command-line option "--projects".',
                projects_file)
            return {}

        config = configparser.ConfigParser()
        # ConfigParser lowercases option names by default; replace the
        # transformation (annif.util.identity presumably returns its
        # argument unchanged, keeping option names as written - confirm)
        config.optionxform = annif.util.identity
        # 'utf-8-sig' also accepts files that start with a byte order mark
        with open(projects_file, encoding='utf-8-sig') as projf:
            try:
                config.read_file(projf)
            except (configparser.DuplicateOptionError,
                    configparser.DuplicateSectionError) as err:
                # keep the parser error as the explicit cause
                raise ConfigurationException(err) from err

        # create AnnifProject objects from the configuration file
        projects = collections.OrderedDict()
        for project_id in config.sections():
            projects[project_id] = AnnifProject(project_id,
                                                config[project_id],
                                                datadir,
                                                self)
            if init_projects:
                projects[project_id].initialize()
        return projects

    def get_projects(self, min_access=Access.private):
        """Return the available projects as a dict of project_id ->
        AnnifProject. The min_access parameter may be used to set the minimum
        access level required for the returned projects."""

        return {project_id: project
                for project_id, project in self._projects[self._rid].items()
                if project.access >= min_access}

    def get_project(self, project_id, min_access=Access.private):
        """return the definition of a single Project by project_id

        Raises ValueError if there is no project with that ID among the
        projects that have at least the min_access access level."""

        projects = self.get_projects(min_access)
        try:
            return projects[project_id]
        except KeyError:
            # the KeyError carries no extra information, so it is left out of
            # the traceback
            raise ValueError(
                "No such project {}".format(project_id)) from None
273
274
275
def initialize_projects(app):
    """Create the project registry of the application.

    The PROJECTS_FILE, DATADIR and INITIALIZE_PROJECTS settings are read
    from app.config and the resulting AnnifRegistry is stored as
    app.annif_registry."""
    settings = app.config
    app.annif_registry = AnnifRegistry(
        settings['PROJECTS_FILE'],
        settings['DATADIR'],
        settings['INITIALIZE_PROJECTS'])
280
281
282
def get_projects(min_access=Access.private):
    """Return a dict of project_id -> AnnifProject containing the projects
    of the current application whose access level is at least min_access.

    The registry of the current application is created first if it does
    not exist yet."""
    registry_missing = not hasattr(current_app, 'annif_registry')
    if registry_missing:
        initialize_projects(current_app)

    registry = current_app.annif_registry
    return registry.get_projects(min_access)
290
291
292
def get_project(project_id, min_access=Access.private):
    """return the definition of a single Project by project_id

    Raises ValueError if there is no project with that ID among the
    projects that have at least the min_access access level."""

    projects = get_projects(min_access)
    try:
        return projects[project_id]
    except KeyError:
        # the KeyError carries no extra information, so it is left out of
        # the traceback
        raise ValueError("No such project {}".format(project_id)) from None
300
301
302
class ProjectSuggestMap:
    """Wrapper around a project that offers a mapping method turning
    Document objects into suggestions. Intended to be used with the
    multiprocessing module."""

    def __init__(self, project, backend_params, limit, threshold):
        """Remember which project to use and how to filter its results.

        Only the ID and the registry of the project are stored, not the
        project object itself; the project is looked up again in suggest()."""
        self.project_id = project.project_id
        self.registry = project.registry
        # settings applied to every document that is mapped
        self.backend_params = backend_params
        self.limit, self.threshold = limit, threshold

    def suggest(self, doc):
        """Return a tuple (filtered hits, doc.uris, doc.labels) for the
        given document, where the hits are the suggestions of the project
        for doc.text, filtered using the limit and the threshold."""
        target = self.registry.get_project(self.project_id)
        raw_hits = target.suggest(doc.text, self.backend_params)
        return (
            raw_hits.filter(target.subjects, self.limit, self.threshold),
            doc.uris,
            doc.labels,
        )
320