Passed
Push — master ( 93695b...d2dff1 )
by Osma
04:07 queued 11s
created

annif.project.AnnifProject.vectorizer()   A

Complexity

Conditions 3

Size

Total Lines 12
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 11
dl 0
loc 12
rs 9.85
c 0
b 0
f 0
cc 3
nop 1
1
"""Project management functionality for Annif"""
2
3
import collections
4
import configparser
5
import enum
6
import os.path
7
import joblib
8
from sklearn.feature_extraction.text import TfidfVectorizer
9
from flask import current_app
10
from shutil import rmtree
11
import annif
12
import annif.analyzer
13
import annif.corpus
14
import annif.suggestion
15
import annif.backend
16
import annif.util
17
import annif.vocab
18
from annif.datadir import DatadirMixin
19
from annif.exception import AnnifException, ConfigurationException, \
20
    NotInitializedException, NotSupportedException
21
22
# Module-level logger; this is the logger object exposed by the annif package.
logger = annif.logger
23
24
25
class Access(enum.IntEnum):
    """Access levels that can be assigned to a project.

    Members are integers, so levels can be compared with the ordinary
    ordering operators (a higher value means a higher access level).
    """

    # lowest level
    private = 1
    # intermediate level
    hidden = 2
    # highest level
    public = 3
30
31
32
class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project.

    The analyzer, backend, vocabulary and vectorizer are created or loaded
    lazily through the corresponding properties and cached on the instance.
    """

    # defaults for uninitialized instances
    _analyzer = None
    _backend = None
    _vocab = None
    _vectorizer = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = 'public'

    def __init__(self, project_id, config, datadir):
        """Create a project from its configuration section.

        project_id -- identifier of the project
        config -- mapping of settings; 'name' and 'language' are required
            (a missing key raises KeyError), while 'analyzer', 'vocab' and
            'access' are optional
        datadir -- base data directory; also handed to DatadirMixin

        Raises ConfigurationException if the 'access' setting is not the
        name of an Access member.
        """
        DatadirMixin.__init__(self, datadir, 'projects', project_id)
        self.project_id = project_id
        self.name = config['name']
        self.language = config['language']
        self.analyzer_spec = config.get('analyzer', None)
        self.vocab_id = config.get('vocab', None)
        self.config = config
        self._base_datadir = datadir
        self._init_access()

    def _init_access(self):
        """Resolve the 'access' setting (default DEFAULT_ACCESS) into an
        Access member and store it as self.access."""
        access = self.config.get('access', self.DEFAULT_ACCESS)
        try:
            # Look the member up by name. Unlike getattr(), this cannot
            # return unrelated attributes of the enum class (such as
            # '__doc__' or 'mro') for a mistyped setting.
            self.access = Access[access]
        except KeyError:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id) from None

    def _initialize_analyzer(self):
        """Access the analyzer property so that it gets created, and log
        the result."""
        analyzer = self.analyzer
        logger.debug("Project '%s': initialized analyzer: %s",
                     self.project_id,
                     str(analyzer))

    def _initialize_subjects(self):
        """Access the subjects property so that the vocabulary is set up.
        An AnnifException is logged as a warning instead of propagating."""
        try:
            subjects = self.subjects
            logger.debug("Project '%s': initialized subjects: %s",
                         self.project_id,
                         str(subjects))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_vectorizer(self):
        """Access the vectorizer property so that it gets loaded.
        An AnnifException (e.g. missing vectorizer file) is logged as a
        warning instead of propagating."""
        try:
            vectorizer = self.vectorizer
            logger.debug("Project '%s': initialized vectorizer: %s",
                         self.project_id,
                         str(vectorizer))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self):
        """Create the backend and call its initialize() method.
        Does nothing but log if the backend could not be created; an
        AnnifException from the backend is logged as a warning."""
        logger.debug("Project '%s': initializing backend", self.project_id)
        if not self.backend:
            logger.debug("Cannot initialize backend: does not exist")
            return
        try:
            self.backend.initialize()
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self):
        """initialize this project and its backend so that they are ready to
        be used"""

        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_vectorizer()
        self._initialize_backend()

        self.initialized = True

    def _suggest_with_backend(self, text, backend_params):
        """Ask the backend for suggestions for the given text.

        backend_params -- optional dict keyed by backend id; only the entry
            for this project's backend is passed on (empty dict if absent)
        """
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        hits = self.backend.suggest(text, project=self, params=beparams)
        logger.debug(
            'Got %d hits from backend %s',
            len(hits), self.backend.backend_id)
        return hits

    @property
    def analyzer(self):
        """The analyzer built from the 'analyzer' setting, created on first
        access. None if no analyzer has been configured."""
        if self._analyzer is None and self.analyzer_spec:
            self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
        return self._analyzer

    @property
    def backend(self):
        """The backend named by the 'backend' setting, created on first
        access.

        If creating the backend raises ValueError, a warning is logged and
        None is returned; creation is attempted again on the next access.
        """
        if self._backend is None:
            backend_id = self.config['backend']
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, params=self.config, datadir=self.datadir)
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id)
        return self._backend

    @property
    def vocab(self):
        """The AnnifVocabulary named by the 'vocab' setting, created on
        first access.

        Raises ConfigurationException if no vocabulary has been configured.
        """
        if self._vocab is None:
            if self.vocab_id is None:
                raise ConfigurationException("vocab setting is missing",
                                             project_id=self.project_id)
            self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
                                                      self._base_datadir)
        return self._vocab

    @property
    def subjects(self):
        """The subjects of this project's vocabulary."""
        return self.vocab.subjects

    @property
    def vectorizer(self):
        """The vectorizer stored in the file 'vectorizer' under the project
        data directory, loaded on first access.

        Raises NotInitializedException if the file does not exist.
        """
        if self._vectorizer is None:
            path = os.path.join(self.datadir, 'vectorizer')
            if os.path.exists(path):
                logger.debug('loading vectorizer from %s', path)
                # NOTE: joblib.load unpickles the file, so the data
                # directory must only contain trusted content.
                self._vectorizer = joblib.load(path)
            else:
                raise NotInitializedException(
                    "vectorizer file '{}' not found".format(path),
                    project_id=self.project_id)
        return self._vectorizer

    def suggest(self, text, backend_params=None):
        """Suggest subjects for the given text by passing it to the backend.
        Returns a list of SubjectSuggestion objects ordered by decreasing
        score."""

        logger.debug('Suggesting subjects for text "%s..." (len=%d)',
                     text[:20], len(text))
        hits = self._suggest_with_backend(text, backend_params)
        logger.debug('%d hits from backend', len(hits))
        return hits

    def _create_vectorizer(self, subjectcorpus):
        """Fit a TfidfVectorizer on the subject texts of the corpus and save
        it as 'vectorizer' in the project data directory.

        Skipped entirely when the backend reports that it does not need a
        subject vectorizer.
        """
        if not self.backend.needs_subject_vectorizer:
            logger.debug('not creating vectorizer: not needed by backend')
            return
        logger.info('creating vectorizer')
        self._vectorizer = TfidfVectorizer(
            tokenizer=self.analyzer.tokenize_words)
        # The generator expression feeds the subject texts to fit() one at
        # a time; 'subj' is its loop variable.
        self._vectorizer.fit((subj.text for subj in subjectcorpus.subjects))
        annif.util.atomic_save(
            self._vectorizer,
            self.datadir,
            'vectorizer',
            method=joblib.dump)

    def train(self, corpus):
        """train the project using documents from a metadata source"""

        corpus.set_subject_index(self.subjects)
        self._create_vectorizer(corpus)
        self.backend.train(corpus, project=self)

    def learn(self, corpus):
        """further train the project using documents from a metadata source

        Raises NotSupportedException if the backend is not an
        AnnifLearningBackend."""

        corpus.set_subject_index(self.subjects)
        if isinstance(
                self.backend,
                annif.backend.backend.AnnifLearningBackend):
            self.backend.learn(corpus, project=self)
        else:
            raise NotSupportedException("Learning not supported by backend",
                                        project_id=self.project_id)

    def dump(self):
        """return this project as a dict"""
        return {'project_id': self.project_id,
                'name': self.name,
                'language': self.language,
                'backend': {'backend_id': self.config['backend']}
                }

    def remove_model_data(self):
        """remove the data of this project

        Deletes the project's data directory tree if it exists; otherwise
        only logs a warning."""
        # _datadir_path is not defined in this class; presumably it is
        # provided by DatadirMixin.
        datadir_path = self._datadir_path
        if os.path.isdir(datadir_path):
            rmtree(datadir_path)
            logger.info('Removed model data for project %s.',
                        self.project_id)
        else:
            logger.warning('No model data to remove for project %s.',
                           self.project_id)
231
232
233
def _create_projects(projects_file, datadir, init_projects):
    """Build the AnnifProject objects described by a configuration file.

    Returns an ordered mapping of project_id -> AnnifProject with one entry
    per section of the file, in the order reported by the parser. When
    init_projects is true, every project is initialized as soon as it has
    been created. If the file does not exist, a warning is logged and an
    empty dict is returned.
    """
    if not os.path.exists(projects_file):
        logger.warning(
            'Project configuration file "%s" is missing. Please provide one.'
            ' You can set the path to the project configuration file using '
            'the ANNIF_PROJECTS environment variable or the command-line '
            'option "--projects".', projects_file)
        return {}

    parser = configparser.ConfigParser()
    # identity transform: keep option names exactly as written in the file
    parser.optionxform = lambda name: name
    with open(projects_file, encoding='utf-8') as cfg_file:
        parser.read_file(cfg_file)

    # one AnnifProject per configuration section
    registry = collections.OrderedDict()
    for section in parser.sections():
        project = AnnifProject(section, parser[section], datadir)
        registry[section] = project
        if init_projects:
            project.initialize()
    return registry
256
257
258
def initialize_projects(app):
    """Create the projects described by the application configuration.

    Reads the PROJECTS_FILE, DATADIR and INITIALIZE_PROJECTS settings from
    app.config and stores the resulting projects as app.annif_projects.
    """
    settings = app.config
    app.annif_projects = _create_projects(
        settings['PROJECTS_FILE'],
        settings['DATADIR'],
        settings['INITIALIZE_PROJECTS'])
264
265
266
def get_projects(min_access=Access.private):
    """Return the available projects as an ordered dict of project_id ->
    AnnifProject, keeping only projects whose access level is at least
    min_access. The projects are created first if the current application
    does not have them yet."""

    if not hasattr(current_app, 'annif_projects'):
        initialize_projects(current_app)

    visible = collections.OrderedDict()
    for project_id, project in current_app.annif_projects.items():
        if project.access >= min_access:
            visible[project_id] = project
    return visible
278
279
280
def get_project(project_id, min_access=Access.private):
    """Return the single project identified by project_id.

    Only projects with at least the min_access level are considered.
    Raises ValueError if no such project is available."""
    available = get_projects(min_access)
    try:
        found = available[project_id]
    except KeyError:
        raise ValueError("No such project {}".format(project_id))
    return found
287