Passed
Push — master ( c8c370...dee89b )
by Osma
03:14
created

annif.project.AnnifProject._analyze_with_backend()   A

Complexity

Conditions 2

Size

Total Lines 9
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 9
dl 0
loc 9
rs 9.95
c 0
b 0
f 0
cc 2
nop 3
1
"""Project management functionality for Annif"""
2
3
import collections
4
import configparser
5
import enum
6
import os.path
7
from sklearn.externals import joblib
8
from sklearn.feature_extraction.text import TfidfVectorizer
9
from flask import current_app
10
import annif
11
import annif.analyzer
12
import annif.corpus
13
import annif.suggestion
14
import annif.backend
15
import annif.util
16
import annif.vocab
17
from annif.datadir import DatadirMixin
18
from annif.exception import AnnifException, ConfigurationException, \
19
    NotInitializedException, NotSupportedException
20
21
# Reuse the logger object exposed by the annif package for all messages
# emitted from this module.
logger = annif.logger
22
23
24
@enum.unique
class Access(enum.IntEnum):
    """Enumeration of access levels for projects.

    The members are integers, so access levels can be compared with the
    ordinary ordering operators (private < hidden < public). The
    enum.unique decorator makes sure no two levels ever share a value,
    which would silently turn one of them into an alias of the other.
    """

    private = 1
    hidden = 2
    public = 3
29
30
31
class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project.

    The analyzer, backend, vocabulary and vectorizer are created or loaded
    lazily the first time the corresponding property is read and are then
    cached on the instance.
    """

    # defaults for uninitialized instances
    _analyzer = None
    _backend = None
    _vocab = None
    _vectorizer = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = 'public'

    def __init__(self, project_id, config, datadir):
        """Set up the project from its configuration settings.

        project_id -- identifier of the project
        config -- mapping of settings; 'name' and 'language' are required
            here, 'analyzer', 'vocab' and 'access' are optional, and
            'backend' is read later when the backend is first needed
        datadir -- base data directory

        Raises KeyError if 'name' or 'language' is missing and
        ConfigurationException if the 'access' setting is not valid.
        """
        DatadirMixin.__init__(self, datadir, 'projects', project_id)
        self.project_id = project_id
        self.name = config['name']
        self.language = config['language']
        self.analyzer_spec = config.get('analyzer', None)
        self.vocab_id = config.get('vocab', None)
        self.config = config
        # kept separately because the vocabulary is created from the base
        # data directory rather than from self.datadir
        self._base_datadir = datadir
        self._init_access()

    def _init_access(self):
        """Resolve the 'access' setting into a member of Access.

        Falls back to DEFAULT_ACCESS when the setting is absent. Raises
        ConfigurationException if the value is not the name of a member.
        """
        access = self.config.get('access', self.DEFAULT_ACCESS)
        try:
            # Look the name up among the enum members only. getattr() would
            # also accept any other attribute of the enum class (for
            # example 'mro' or 'bit_length') and store a non-member value.
            self.access = Access[access]
        except KeyError:
            # the KeyError adds nothing to the message below, so it is
            # deliberately left out of the traceback
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id) from None

    def _initialize_analyzer(self):
        """Create the analyzer (if one is configured) and log it."""
        analyzer = self.analyzer
        logger.debug("Project '%s': initialized analyzer: %s",
                     self.project_id,
                     str(analyzer))

    def _initialize_subjects(self):
        """Load the subjects; an AnnifException is logged as a warning
        instead of being propagated."""
        try:
            subjects = self.subjects
            logger.debug("Project '%s': initialized subjects: %s",
                         self.project_id,
                         str(subjects))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_vectorizer(self):
        """Load the vectorizer; an AnnifException (such as a missing
        vectorizer file) is logged as a warning instead of being
        propagated."""
        try:
            vectorizer = self.vectorizer
            logger.debug("Project '%s': initialized vectorizer: %s",
                         self.project_id,
                         str(vectorizer))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self):
        """Initialize the backend; does nothing but log when the backend
        could not be created, and logs an AnnifException raised by the
        backend as a warning."""
        logger.debug("Project '%s': initializing backend", self.project_id)
        if not self.backend:
            logger.debug("Cannot initialize backend: does not exist")
            return
        try:
            self.backend.initialize()
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self):
        """initialize this project and its backend so that they are ready to
        be used"""

        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_vectorizer()
        self._initialize_backend()

        # set even if some of the steps above only logged a warning
        self.initialized = True

    def _suggest_with_backend(self, text, backend_params):
        """Pass the text to the backend and return its hits.

        backend_params -- optional dict keyed by backend id; only the
            entry for this project's backend is forwarded to the backend
        """
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        hits = self.backend.suggest(text, project=self, params=beparams)
        logger.debug(
            'Got %d hits from backend %s',
            len(hits), self.backend.backend_id)
        return hits

    @property
    def analyzer(self):
        """The analyzer created from the 'analyzer' setting, or None when
        no analyzer has been configured."""
        if self._analyzer is None and self.analyzer_spec:
            self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
        return self._analyzer

    @property
    def backend(self):
        """The backend object named by the 'backend' setting.

        Returns None, after logging a warning, when creating the backend
        raises ValueError. Raises KeyError if the setting is missing.
        """
        if self._backend is None:
            backend_id = self.config['backend']
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, params=self.config, datadir=self.datadir)
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id)
        return self._backend

    @property
    def vocab(self):
        """The AnnifVocabulary named by the 'vocab' setting.

        Raises ConfigurationException if the setting is missing.
        """
        if self._vocab is None:
            if self.vocab_id is None:
                raise ConfigurationException("vocab setting is missing",
                                             project_id=self.project_id)
            self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
                                                      self._base_datadir)
        return self._vocab

    @property
    def subjects(self):
        """The subjects of the project's vocabulary."""
        return self.vocab.subjects

    @property
    def vectorizer(self):
        """The vectorizer loaded from the 'vectorizer' file in the
        project's data directory.

        Raises NotInitializedException if the file does not exist.
        """
        if self._vectorizer is None:
            path = os.path.join(self.datadir, 'vectorizer')
            if os.path.exists(path):
                logger.debug('loading vectorizer from %s', path)
                # NOTE(review): joblib.load is pickle-based, so this
                # presumably requires the data directory to be trusted
                self._vectorizer = joblib.load(path)
            else:
                raise NotInitializedException(
                    "vectorizer file '{}' not found".format(path),
                    project_id=self.project_id)
        return self._vectorizer

    def suggest(self, text, backend_params=None):
        """Suggest subjects for the given text by passing it to the backend.
        Returns a list of SubjectSuggestion objects ordered by decreasing
        score."""

        logger.debug('Suggesting subjects for text "%s..." (len=%d)',
                     text[:20], len(text))
        hits = self._suggest_with_backend(text, backend_params)
        logger.debug('%d hits from backend', len(hits))
        return hits

    def _create_vectorizer(self, subjectcorpus):
        """Fit a TfidfVectorizer on the subject texts of the corpus and
        save it as 'vectorizer' in the project's data directory.

        Nothing is created when the backend reports that it does not need
        a subject vectorizer.
        """
        if not self.backend.needs_subject_vectorizer:
            logger.debug('not creating vectorizer: not needed by backend')
            return
        logger.info('creating vectorizer')
        self._vectorizer = TfidfVectorizer(
            tokenizer=self.analyzer.tokenize_words)
        self._vectorizer.fit((subj.text for subj in subjectcorpus.subjects))
        annif.util.atomic_save(
            self._vectorizer,
            self.datadir,
            'vectorizer',
            method=joblib.dump)

    def train(self, corpus):
        """train the project using documents from a metadata source"""

        corpus.set_subject_index(self.subjects)
        self._create_vectorizer(corpus)
        self.backend.train(corpus, project=self)

    def learn(self, corpus):
        """further train the project using documents from a metadata source

        Raises NotSupportedException if the backend is not an
        AnnifLearningBackend."""

        corpus.set_subject_index(self.subjects)
        if isinstance(
                self.backend,
                annif.backend.backend.AnnifLearningBackend):
            self.backend.learn(corpus, project=self)
        else:
            raise NotSupportedException("Learning not supported by backend",
                                        project_id=self.project_id)

    def dump(self):
        """return this project as a dict"""
        return {'project_id': self.project_id,
                'name': self.name,
                'language': self.language,
                'backend': {'backend_id': self.config['backend']}
                }
219
220
221
def _create_projects(projects_file, datadir, init_projects):
    """Read the project configuration file and build the projects.

    Returns an OrderedDict mapping each section name of the file to an
    AnnifProject created from that section, in file order. When
    init_projects is true every project is initialized right after it has
    been created. If the file does not exist, two warnings are logged and
    an empty dict is returned.
    """
    if not os.path.exists(projects_file):
        logger.warning("Project configuration file '%s' is missing. "
                       'Please provide one.', projects_file)
        logger.warning('You can set the path to the project configuration '
                       'file using the ANNIF_PROJECTS environment variable.')
        return {}

    parser = configparser.ConfigParser()
    # identity transform: option names are kept exactly as written instead
    # of going through ConfigParser's default name normalization
    parser.optionxform = lambda option: option
    with open(projects_file, encoding='utf-8') as handle:
        parser.read_file(handle)

    # one AnnifProject per section of the configuration file
    projects = collections.OrderedDict()
    for section in parser.sections():
        project = AnnifProject(section, parser[section], datadir)
        projects[section] = project
        if init_projects:
            project.initialize()
    return projects
243
244
245
def initialize_projects(app):
    """Create the projects described by the application configuration
    (PROJECTS_FILE, DATADIR and INITIALIZE_PROJECTS settings) and store
    them on the application object as app.annif_projects."""
    settings = app.config
    app.annif_projects = _create_projects(
        projects_file=settings['PROJECTS_FILE'],
        datadir=settings['DATADIR'],
        init_projects=settings['INITIALIZE_PROJECTS'])
251
252
253
def get_projects(min_access=Access.private):
    """Return the projects of the current application as an ordered
    mapping of project_id -> AnnifProject. Only projects whose access
    level is at least min_access are included."""

    available = current_app.annif_projects
    return collections.OrderedDict(
        (ident, proj)
        for ident, proj in available.items()
        if proj.access >= min_access)
262
263
264
def get_project(project_id, min_access=Access.private):
    """return the definition of a single Project by project_id

    Only projects whose access level is at least min_access are
    considered. Raises ValueError if no such project is available."""
    projects = get_projects(min_access)
    try:
        return projects[project_id]
    except KeyError:
        # The KeyError carries no information beyond the message below, so
        # it is deliberately kept out of the traceback.
        raise ValueError("No such project {}".format(project_id)) from None
271