Passed
Pull Request — master (#417)
by Osma
01:40
created

annif.project.AnnifRegistry.get_projects()   A

Complexity

Conditions 1

Size

Total Lines 8
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 4
dl 0
loc 8
rs 10
c 0
b 0
f 0
cc 1
nop 2
1
"""Project management functionality for Annif"""
2
3
import collections
4
import configparser
5
import enum
6
import os.path
7
from flask import current_app
8
from shutil import rmtree
9
import annif
10
import annif.analyzer
11
import annif.corpus
12
import annif.suggestion
13
import annif.backend
14
import annif.util
15
import annif.vocab
16
from annif.datadir import DatadirMixin
17
from annif.exception import AnnifException, ConfigurationException, \
18
    NotSupportedException
19
20
logger = annif.logger
21
22
23
class Access(enum.IntEnum):
    """Access levels that can be assigned to a project.

    The levels are integers, so they can be compared with each other:
    private < hidden < public.
    """

    # lowest level
    private = 1
    # intermediate level
    hidden = 2
    # highest level
    public = 3
28
29
30
class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project.

    The analyzer, backend and vocabulary objects are created lazily the
    first time the corresponding property is read and are kept on the
    instance once they have been created successfully.
    """

    # defaults for uninitialized instances
    _analyzer = None
    _backend = None
    _vocab = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = 'public'

    def __init__(self, project_id, config, datadir, registry):
        """Set up the project from one section of the project configuration.

        project_id -- identifier of the project
        config -- mapping of configuration settings; must contain
            'language', may contain 'name', 'analyzer', 'vocab', 'access'
            and 'backend'
        datadir -- base data directory, passed on to DatadirMixin and to
            the vocabulary
        registry -- the registry object this project belongs to

        Raises ConfigurationException if the 'access' setting is invalid
        and KeyError if 'language' is missing.
        """
        DatadirMixin.__init__(self, datadir, 'projects', project_id)
        self.project_id = project_id
        self.name = config.get('name', project_id)
        self.language = config['language']
        self.analyzer_spec = config.get('analyzer', None)
        self.vocab_id = config.get('vocab', None)
        self.config = config
        self._base_datadir = datadir
        self.registry = registry
        self._init_access()

    def _init_access(self):
        """Resolve the 'access' setting into a member of Access.

        Falls back to DEFAULT_ACCESS when the setting is absent. Raises
        ConfigurationException when the value does not name an Access
        member.
        """
        access = self.config.get('access', self.DEFAULT_ACCESS)
        try:
            # Look the name up among the enum members only. getattr() would
            # also succeed for any other attribute of the enum class (e.g.
            # 'mro' or '__doc__') and silently store a non-Access value.
            self.access = Access[access]
        except KeyError:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id) from None

    def _initialize_analyzer(self):
        """Create the analyzer now, if one has been configured."""
        if not self.analyzer_spec:
            return  # not configured, so assume it's not needed
        analyzer = self.analyzer
        logger.debug("Project '%s': initialized analyzer: %s",
                     self.project_id,
                     str(analyzer))

    def _initialize_subjects(self):
        """Load the subjects now; an AnnifException is logged as a warning
        instead of being propagated."""
        try:
            subjects = self.subjects
            logger.debug("Project '%s': initialized subjects: %s",
                         self.project_id,
                         str(subjects))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self):
        """Create and initialize the backend now; an AnnifException is
        logged as a warning instead of being propagated."""
        logger.debug("Project '%s': initializing backend", self.project_id)
        try:
            # the backend property yields None when the backend could not
            # be created
            if not self.backend:
                logger.debug("Cannot initialize backend: does not exist")
                return
            self.backend.initialize()
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self):
        """initialize this project and its backend so that they are ready to
        be used"""

        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_backend()

        self.initialized = True

    def _params_for_backend(self, backend_params):
        """Return the entry of backend_params stored under the id of this
        project's backend, or an empty dict when backend_params is None or
        has no such entry."""
        if backend_params is None:
            backend_params = {}
        return backend_params.get(self.backend.backend_id, {})

    def _suggest_with_backend(self, text, backend_params):
        """Pass the text to the backend and return the hits it produces."""
        beparams = self._params_for_backend(backend_params)
        hits = self.backend.suggest(text, beparams)
        logger.debug(
            'Got %d hits from backend %s',
            len(hits), self.backend.backend_id)
        return hits

    @property
    def analyzer(self):
        """The analyzer built from the 'analyzer' setting.

        Raises ConfigurationException when no analyzer is configured.
        """
        if self._analyzer is None:
            if self.analyzer_spec:
                self._analyzer = annif.analyzer.get_analyzer(
                    self.analyzer_spec)
            else:
                raise ConfigurationException(
                    "analyzer setting is missing (and needed by the backend)",
                    project_id=self.project_id)
        return self._analyzer

    @property
    def backend(self):
        """The backend object named by the 'backend' setting.

        Raises ConfigurationException when the setting is missing. When
        creating the backend raises ValueError, a warning is logged and
        None is returned; nothing is cached in that case, so creation is
        attempted again on the next access.
        """
        if self._backend is None:
            if 'backend' not in self.config:
                raise ConfigurationException(
                    "backend setting is missing", project_id=self.project_id)
            backend_id = self.config['backend']
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, config_params=self.config,
                    project=self)
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id)
        return self._backend

    @property
    def vocab(self):
        """The AnnifVocabulary named by the 'vocab' setting.

        Raises ConfigurationException when the setting is missing.
        """
        if self._vocab is None:
            if self.vocab_id is None:
                raise ConfigurationException("vocab setting is missing",
                                             project_id=self.project_id)
            self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
                                                      self._base_datadir)
        return self._vocab

    @property
    def subjects(self):
        """The subjects of this project's vocabulary."""
        return self.vocab.subjects

    def suggest(self, text, backend_params=None):
        """Suggest subjects for the given text by passing it to the backend.
        Returns a list of SubjectSuggestion objects ordered by decreasing
        score."""
        logger.debug('Suggesting subjects for text "%s..." (len=%d)',
                     text[:20], len(text))
        hits = self._suggest_with_backend(text, backend_params)
        logger.debug('%d hits from backend', len(hits))
        return hits

    def train(self, corpus, backend_params=None):
        """train the project using documents from a metadata source

        The string 'cached' may be given instead of a corpus object; it is
        handed to the backend as is, without attaching the subject index.
        """
        if corpus != 'cached':
            corpus.set_subject_index(self.subjects)
        beparams = self._params_for_backend(backend_params)
        self.backend.train(corpus, beparams)

    def learn(self, corpus, backend_params=None):
        """further train the project using documents from a metadata source

        Raises NotSupportedException when the backend is not an
        AnnifLearningBackend.
        """
        corpus.set_subject_index(self.subjects)
        beparams = self._params_for_backend(backend_params)
        if isinstance(
                self.backend,
                annif.backend.backend.AnnifLearningBackend):
            self.backend.learn(corpus, beparams)
        else:
            raise NotSupportedException("Learning not supported by backend",
                                        project_id=self.project_id)

    def dump(self):
        """return this project as a dict"""
        return {'project_id': self.project_id,
                'name': self.name,
                'language': self.language,
                'backend': {'backend_id': self.config.get('backend')}
                }

    def remove_model_data(self):
        """remove the data of this project

        Deletes the directory at self._datadir_path (presumably provided by
        DatadirMixin) with everything in it; only logs a warning when that
        directory does not exist.
        """
        datadir_path = self._datadir_path
        if os.path.isdir(datadir_path):
            rmtree(datadir_path)
            logger.info('Removed model data for project {}.'
                        .format(self.project_id))
        else:
            logger.warning('No model data to remove for project {}.'
                           .format(self.project_id))
207
208
209
class AnnifRegistry:
    """Class that keeps track of the Annif projects"""

    # Projects of all registries, stored on the class and keyed by the id()
    # of the registry instance they belong to. The instances themselves hold
    # no reference to their projects.
    # NOTE(review): entries are never removed by this class.
    _projects = {}

    def __init__(self, projects_file, datadir, init_projects):
        """Load the projects defined in projects_file.

        projects_file -- path of the project configuration file
        datadir -- base data directory handed to every project
        init_projects -- when true, initialize() each project right after
            it has been created
        """
        self._projects[id(self)] = \
            self._create_projects(projects_file, datadir, init_projects)

    def _create_projects(self, projects_file, datadir, init_projects):
        """Parse the configuration file and return the projects it defines.

        Returns an ordered mapping of project_id -> AnnifProject, one per
        section, in file order. When the file does not exist a warning is
        logged and an empty dict is returned. Raises ConfigurationException
        when the file contains a duplicate section or option.
        """
        if not os.path.exists(projects_file):
            logger.warning(
                'Project configuration file "%s" is missing. ' +
                'Please provide one. You can set the path to the project ' +
                'configuration file using the ANNIF_PROJECTS environment ' +
                'variable or the command-line option "--projects".',
                projects_file)
            return {}

        config = configparser.ConfigParser()
        # keep option names exactly as written; the default transform
        # would lowercase them
        config.optionxform = str
        # utf-8-sig: accept files that start with a byte order mark
        with open(projects_file, encoding='utf-8-sig') as projf:
            try:
                config.read_file(projf)
            except (configparser.DuplicateOptionError,
                    configparser.DuplicateSectionError) as err:
                raise ConfigurationException(err) from err

        # create AnnifProject objects from the configuration file
        projects = collections.OrderedDict()
        for project_id in config.sections():
            projects[project_id] = AnnifProject(project_id,
                                                config[project_id],
                                                datadir,
                                                self)
            if init_projects:
                projects[project_id].initialize()
        return projects

    def get_projects(self, min_access=Access.private):
        """Return the available projects as a dict of project_id ->
        AnnifProject. The min_access parameter may be used to set the minimum
        access level required for the returned projects."""

        return {project_id: project
                for project_id, project in self._projects[id(self)].items()
                if project.access >= min_access}

    def get_project(self, project_id, min_access=Access.private):
        """return the definition of a single Project by project_id

        Raises ValueError when there is no project with that id at the
        requested access level.
        """

        projects = self.get_projects(min_access)
        try:
            return projects[project_id]
        except KeyError:
            # the KeyError carries no extra information, so keep it out of
            # the traceback
            raise ValueError(
                "No such project {}".format(project_id)) from None
265
266
267
def initialize_projects(app):
    """Create the project registry for app and attach it to the app.

    The PROJECTS_FILE, DATADIR and INITIALIZE_PROJECTS entries of app.config
    are read and passed to AnnifRegistry; the resulting registry is stored
    as app.annif_registry.
    """
    settings = app.config
    registry_args = (settings['PROJECTS_FILE'],
                     settings['DATADIR'],
                     settings['INITIALIZE_PROJECTS'])
    app.annif_registry = AnnifRegistry(*registry_args)
272
273
274
def get_projects(min_access=Access.private):
    """Return the projects of the current application as a dict of
    project_id -> AnnifProject.

    Only projects whose access level is at least min_access are included.
    The registry of the current application is created first if the
    application does not have one yet.
    """
    registry_missing = not hasattr(current_app, 'annif_registry')
    if registry_missing:
        initialize_projects(current_app)

    registry = current_app.annif_registry
    return registry.get_projects(min_access)
282
283
284
def get_project(project_id, min_access=Access.private):
    """return the definition of a single Project by project_id

    Only projects whose access level is at least min_access are considered.
    Raises ValueError when no such project is available.
    """

    projects = get_projects(min_access)
    try:
        return projects[project_id]
    except KeyError:
        # the KeyError carries no extra information, so keep it out of the
        # traceback
        raise ValueError("No such project {}".format(project_id)) from None
292