Completed
Push — master ( 244db9...8e90e2 )
by Osma
26s queued 11s
created

annif.project.AnnifProject._create_vectorizer()   A

Complexity

Conditions 3

Size

Total Lines 16
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 16
dl 0
loc 16
rs 9.6
c 0
b 0
f 0
cc 3
nop 2
1
"""Project management functionality for Annif"""
2
3
import collections
4
import configparser
5
import enum
6
import os.path
7
from flask import current_app
8
from shutil import rmtree
9
import annif
10
import annif.analyzer
11
import annif.corpus
12
import annif.suggestion
13
import annif.backend
14
import annif.util
15
import annif.vocab
16
from annif.datadir import DatadirMixin
17
from annif.exception import AnnifException, ConfigurationException, \
18
    NotInitializedException, NotSupportedException
19
20
logger = annif.logger
21
22
23
class Access(enum.IntEnum):
    """Enumeration of access levels for projects.

    The members are ordered integers so that access levels can be compared
    with each other, e.g. ``project.access >= min_access``.
    """

    private = 1
    hidden = 2
    public = 3
28
29
30
class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project.

    The analyzer, backend and vocabulary objects are created lazily, the
    first time the corresponding property is read, and are kept on the
    instance once they have been created successfully.
    """

    # defaults for uninitialized instances
    _analyzer = None
    _backend = None
    _vocab = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = 'public'

    def __init__(self, project_id, config, datadir):
        """Create a project from one section of the project configuration.

        project_id -- identifier of the project
        config -- mapping of configuration settings; 'language' is
            required, 'name', 'analyzer', 'vocab' and 'access' are optional
        datadir -- base data directory, passed on to DatadirMixin and
            later to the vocabulary

        Raises ConfigurationException if the 'access' setting is invalid.
        """
        DatadirMixin.__init__(self, datadir, 'projects', project_id)
        self.project_id = project_id
        self.name = config.get('name', project_id)
        self.language = config['language']
        self.analyzer_spec = config.get('analyzer', None)
        self.vocab_id = config.get('vocab', None)
        self.config = config
        self._base_datadir = datadir
        self._init_access()

    def _init_access(self):
        """Set self.access from the 'access' setting.

        Falls back to DEFAULT_ACCESS when the setting is absent. Raises
        ConfigurationException if the value does not name an Access member.
        """
        access = self.config.get('access', self.DEFAULT_ACCESS)
        try:
            # Look the name up among the enum members only: getattr() would
            # also accept other attributes of the class (e.g. '__doc__')
            # and silently store a value that is not an Access level.
            self.access = Access[access]
        except KeyError:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id)

    def _initialize_analyzer(self):
        """Create the analyzer; log a warning instead of failing."""
        try:
            # reading the property triggers the lazy creation
            analyzer = self.analyzer
            logger.debug("Project '%s': initialized analyzer: %s",
                         self.project_id,
                         str(analyzer))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_subjects(self):
        """Load the subjects; log a warning instead of failing."""
        try:
            # reading the property triggers the lazy creation of the vocab
            subjects = self.subjects
            logger.debug("Project '%s': initialized subjects: %s",
                         self.project_id,
                         str(subjects))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self):
        """Create and initialize the backend; log a warning on failure."""
        logger.debug("Project '%s': initializing backend", self.project_id)
        try:
            # the backend property yields None when the backend could not
            # be created
            if not self.backend:
                logger.debug("Cannot initialize backend: does not exist")
                return
            self.backend.initialize()
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self):
        """initialize this project and its backend so that they are ready to
        be used"""

        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_backend()

        # set even if one of the steps above only logged a warning
        self.initialized = True

    def _suggest_with_backend(self, text, backend_params):
        """Pass the text to the backend and return the hits it produced.

        backend_params -- None, or a dict keyed by backend id; only the
            entry for this project's backend is passed on as params
        """
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        hits = self.backend.suggest(text, project=self, params=beparams)
        logger.debug(
            'Got %d hits from backend %s',
            len(hits), self.backend.backend_id)
        return hits

    @property
    def analyzer(self):
        """The analyzer built from the 'analyzer' setting.

        Raises ConfigurationException if the setting is missing or empty.
        """
        if self._analyzer is None:
            if self.analyzer_spec:
                self._analyzer = annif.analyzer.get_analyzer(
                    self.analyzer_spec)
            else:
                raise ConfigurationException(
                    "analyzer setting is missing (and needed by the backend)",
                    project_id=self.project_id)
        return self._analyzer

    @property
    def backend(self):
        """The backend object named by the 'backend' setting.

        Raises ConfigurationException if the setting is missing. If
        looking up or creating the backend raises ValueError, a warning is
        logged and None is returned.
        """
        if self._backend is None:
            if 'backend' not in self.config:
                raise ConfigurationException(
                    "backend setting is missing", project_id=self.project_id)
            backend_id = self.config['backend']
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, config_params=self.config,
                    datadir=self.datadir)
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id)
        return self._backend

    @property
    def vocab(self):
        """The AnnifVocabulary named by the 'vocab' setting.

        Raises ConfigurationException if the setting is missing.
        """
        if self._vocab is None:
            if self.vocab_id is None:
                raise ConfigurationException("vocab setting is missing",
                                             project_id=self.project_id)
            self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
                                                      self._base_datadir)
        return self._vocab

    @property
    def subjects(self):
        """The subjects of this project's vocabulary."""
        return self.vocab.subjects

    def suggest(self, text, backend_params=None):
        """Suggest subjects for the given text by passing it to the backend.
        Returns the hits produced by the backend."""

        logger.debug('Suggesting subjects for text "%s..." (len=%d)',
                     text[:20], len(text))
        hits = self._suggest_with_backend(text, backend_params)
        logger.debug('%d hits from backend', len(hits))
        return hits

    def train(self, corpus):
        """train the project using documents from a metadata source"""

        corpus.set_subject_index(self.subjects)
        self.backend.train(corpus, project=self)

    def learn(self, corpus):
        """further train the project using documents from a metadata source

        Raises NotSupportedException if the backend is not an
        AnnifLearningBackend."""

        corpus.set_subject_index(self.subjects)
        if isinstance(
                self.backend,
                annif.backend.backend.AnnifLearningBackend):
            self.backend.learn(corpus, project=self)
        else:
            raise NotSupportedException("Learning not supported by backend",
                                        project_id=self.project_id)

    def dump(self):
        """return this project as a dict"""
        return {'project_id': self.project_id,
                'name': self.name,
                'language': self.language,
                'backend': {'backend_id': self.config.get('backend')}
                }

    def remove_model_data(self):
        """remove the data of this project"""
        datadir_path = self._datadir_path
        if os.path.isdir(datadir_path):
            rmtree(datadir_path)
            # lazy %-style arguments; the emitted message is unchanged
            logger.info('Removed model data for project %s.',
                        self.project_id)
        else:
            logger.warning('No model data to remove for project %s.',
                           self.project_id)
203
204
205
def _create_projects(projects_file, datadir, init_projects):
    """Read the project configuration file and create the projects.

    projects_file -- path of the configuration file (read as UTF-8)
    datadir -- base data directory passed to every AnnifProject
    init_projects -- if true, initialize() each project after creating it

    Returns an OrderedDict of project_id -> AnnifProject in the order of
    the sections of the file, or an empty dict (after logging a warning)
    if the file does not exist. Raises ConfigurationException if the file
    contains a duplicate section or option.
    """
    if not os.path.exists(projects_file):
        logger.warning(
            'Project configuration file "%s" is missing. Please provide one.' +
            ' You can set the path to the project configuration file using ' +
            'the ANNIF_PROJECTS environment variable or the command-line ' +
            'option "--projects".', projects_file)
        return {}

    config = configparser.ConfigParser()
    # keep option names exactly as written; ConfigParser lowercases them
    # by default
    config.optionxform = lambda option: option
    with open(projects_file, encoding='utf-8') as projf:
        try:
            config.read_file(projf)
        except (configparser.DuplicateOptionError,
                configparser.DuplicateSectionError) as err:
            # keep the parser error as the explicit cause
            raise ConfigurationException(err) from err

    # create AnnifProject objects from the configuration file
    projects = collections.OrderedDict()
    for project_id in config.sections():
        projects[project_id] = AnnifProject(project_id,
                                            config[project_id],
                                            datadir)
        if init_projects:
            projects[project_id].initialize()
    return projects
232
233
234
def initialize_projects(app):
    """Create the projects described by the configuration of the given
    application and store them on it as ``app.annif_projects``.

    Reads the PROJECTS_FILE, DATADIR and INITIALIZE_PROJECTS settings from
    ``app.config``."""
    projects_file = app.config['PROJECTS_FILE']
    datadir = app.config['DATADIR']
    init_projects = app.config['INITIALIZE_PROJECTS']
    app.annif_projects = _create_projects(
        projects_file, datadir, init_projects)
240
241
242
def get_projects(min_access=Access.private):
    """Return the available projects as a dict of project_id ->
    AnnifProject. The min_access parameter may be used to set the minimum
    access level required for the returned projects."""

    # the projects are created on first use and kept on the application
    if not hasattr(current_app, 'annif_projects'):
        initialize_projects(current_app)

    return collections.OrderedDict(
        (project_id, project)
        for project_id, project in current_app.annif_projects.items()
        if project.access >= min_access)
254
255
256
def get_project(project_id, min_access=Access.private):
    """return the definition of a single Project by project_id

    Only projects whose access level is at least min_access are
    considered. Raises ValueError if no such project is found."""
    projects = get_projects(min_access)
    try:
        return projects[project_id]
    except KeyError:
        # the KeyError carries no extra information, so it is not chained
        raise ValueError("No such project {}".format(project_id)) from None
263