Passed
Pull Request — master (#257)
by Osma
02:44
created

annif.project.AnnifProject.learn()   A

Complexity

Conditions 2

Size

Total Lines 11
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 8
dl 0
loc 11
rs 10
c 0
b 0
f 0
cc 2
nop 2
1
"""Project management functionality for Annif"""
2
3
import collections
4
import configparser
5
import enum
6
import os.path
7
from sklearn.externals import joblib
8
from sklearn.feature_extraction.text import TfidfVectorizer
9
from flask import current_app
10
import annif
11
import annif.analyzer
12
import annif.corpus
13
import annif.hit
14
import annif.backend
15
import annif.util
16
import annif.vocab
17
from annif.datadir import DatadirMixin
18
from annif.exception import AnnifException, ConfigurationException, \
19
    NotInitializedException, NotSupportedException
20
21
# Bind the logger object exposed by the annif package to a module-level
# name so the code below can call it simply as `logger`.
logger = annif.logger
22
23
24
class Access(enum.IntEnum):
    """Enumeration of access levels for projects.

    The members are integers so that access levels can be ordered and
    compared with each other, e.g. against a minimum required level.
    """

    private = 1
    hidden = 2
    public = 3
29
30
31
class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project.

    The analyzer, backend, vocabulary and vectorizer are created or loaded
    on first access of the corresponding property and then cached on the
    instance.
    """

    # defaults for uninitialized instances
    _analyzer = None
    _backend = None
    _vocab = None
    _vectorizer = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = 'public'

    def __init__(self, project_id, config, datadir):
        """Set up the project from its configuration settings.

        project_id -- identifier of the project
        config -- mapping of settings; 'name' and 'language' are required,
            'analyzer', 'vocab' and 'access' are optional
        datadir -- base data directory

        Raises ConfigurationException if the 'access' setting is not the
        name of an Access member.
        """
        DatadirMixin.__init__(self, datadir, 'projects', project_id)
        self.project_id = project_id
        self.name = config['name']
        self.language = config['language']
        self.analyzer_spec = config.get('analyzer', None)
        self.vocab_id = config.get('vocab', None)
        self.config = config
        self._base_datadir = datadir
        self._init_access()

    def _init_access(self):
        """Resolve the 'access' setting into an Access member."""
        access = self.config.get('access', self.DEFAULT_ACCESS)
        try:
            # Look the name up among the enum members only; getattr() would
            # also accept unrelated attributes such as '__doc__' or 'mro'.
            self.access = Access[access]
        except KeyError:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id) from None

    def _require_backend(self):
        """Return the backend, or raise ConfigurationException if the
        backend property could not create one."""
        backend = self.backend
        if backend is None:
            raise ConfigurationException(
                "backend '{}' is not available".format(
                    self.config['backend']),
                project_id=self.project_id)
        return backend

    def _initialize_analyzer(self):
        """Trigger creation of the analyzer and log the result."""
        analyzer = self.analyzer
        logger.debug("Project '%s': initialized analyzer: %s",
                     self.project_id,
                     str(analyzer))

    def _initialize_subjects(self):
        """Trigger loading of the subjects; failures are only logged."""
        try:
            subjects = self.subjects
            logger.debug("Project '%s': initialized subjects: %s",
                         self.project_id,
                         str(subjects))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_vectorizer(self):
        """Trigger loading of the vectorizer; failures are only logged."""
        try:
            vectorizer = self.vectorizer
            logger.debug("Project '%s': initialized vectorizer: %s",
                         self.project_id,
                         str(vectorizer))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self):
        """Initialize the backend if one exists; failures are only
        logged."""
        logger.debug("Project '%s': initializing backend", self.project_id)
        if not self.backend:
            logger.debug("Cannot initialize backend: does not exist")
            return
        try:
            self.backend.initialize()
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self):
        """initialize this project and its backend so that they are ready to
        analyze"""
        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_vectorizer()
        self._initialize_backend()

        self.initialized = True

    def _analyze_with_backend(self, text, backend_params):
        """Pass the text to the backend and return its hits.

        backend_params -- None, or a mapping from backend id to the
            parameters meant for that backend
        """
        backend = self._require_backend()
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(backend.backend_id, {})
        hits = backend.analyze(text, project=self, params=beparams)
        logger.debug(
            'Got %d hits from backend %s',
            len(hits), backend.backend_id)
        return hits

    @property
    def analyzer(self):
        """The analyzer built from the 'analyzer' setting, or None if that
        setting is missing or empty."""
        if self._analyzer is None and self.analyzer_spec:
            self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
        return self._analyzer

    @property
    def backend(self):
        """The backend object named by the 'backend' setting, or None if
        creating it failed with ValueError (a warning is logged)."""
        if self._backend is None:
            backend_id = self.config['backend']
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, params=self.config, datadir=self.datadir)
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id)
        return self._backend

    @property
    def vocab(self):
        """The AnnifVocabulary named by the 'vocab' setting.

        Raises ConfigurationException if the setting is missing.
        """
        if self._vocab is None:
            if self.vocab_id is None:
                raise ConfigurationException("vocab setting is missing",
                                             project_id=self.project_id)
            self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
                                                      self._base_datadir)
        return self._vocab

    @property
    def subjects(self):
        """The subjects of this project's vocabulary."""
        return self.vocab.subjects

    @property
    def vectorizer(self):
        """The vectorizer stored in the 'vectorizer' file of the project's
        data directory.

        Raises NotInitializedException if that file does not exist.
        """
        if self._vectorizer is None:
            path = os.path.join(self.datadir, 'vectorizer')
            if os.path.exists(path):
                logger.debug('loading vectorizer from %s', path)
                # NOTE: joblib files are pickle-based, so this must only
                # ever read files from a trusted data directory.
                self._vectorizer = joblib.load(path)
            else:
                raise NotInitializedException(
                    "vectorizer file '{}' not found".format(path),
                    project_id=self.project_id)
        return self._vectorizer

    def analyze(self, text, backend_params=None):
        """Analyze the given text by passing it to the backend. Returns a
        list of AnalysisHit objects ordered by decreasing score."""

        logger.debug('Analyzing text "%s..." (len=%d)',
                     text[:20], len(text))
        hits = self._analyze_with_backend(text, backend_params)
        logger.debug('%d hits from backend', len(hits))
        return hits

    def _create_vectorizer(self, subjectcorpus):
        """Fit a TF-IDF vectorizer on the subject texts of the corpus and
        save it to the data directory, unless the backend does not need
        one."""
        backend = self._require_backend()
        if not backend.needs_subject_vectorizer:
            logger.debug('not creating vectorizer: not needed by backend')
            return
        logger.info('creating vectorizer')
        self._vectorizer = TfidfVectorizer(
            tokenizer=self.analyzer.tokenize_words)
        self._vectorizer.fit((subj.text for subj in subjectcorpus.subjects))
        annif.util.atomic_save(
            self._vectorizer,
            self.datadir,
            'vectorizer',
            method=joblib.dump)

    def train(self, corpus):
        """train the project using documents from a metadata source"""

        corpus.set_subject_index(self.subjects)
        self._create_vectorizer(corpus)
        self._require_backend().train(corpus, project=self)

    def learn(self, corpus):
        """further train the project using documents from a metadata source

        Raises NotSupportedException if the backend is not an
        AnnifLearningBackend.
        """

        corpus.set_subject_index(self.subjects)
        if isinstance(
                self.backend,
                annif.backend.backend.AnnifLearningBackend):
            self.backend.learn(corpus, project=self)
        else:
            raise NotSupportedException("Learning not supported by backend",
                                        project_id=self.project_id)

    def dump(self):
        """return this project as a dict"""
        return {'project_id': self.project_id,
                'name': self.name,
                'language': self.language,
                'backend': {'backend_id': self.config['backend']}
                }
218
219
220
def _create_projects(projects_file, datadir, init_projects):
    """Build AnnifProject objects from a project configuration file.

    projects_file -- path of the INI-style configuration file; every
        section defines one project and the section name is its id
    datadir -- base data directory handed to every AnnifProject
    init_projects -- if true, initialize() each project right after it
        has been created

    Returns an OrderedDict mapping project ids to AnnifProject objects, or
    an empty dict (after logging warnings) if the file does not exist.
    """
    if not os.path.exists(projects_file):
        logger.warning("Project configuration file '%s' is missing. "
                       'Please provide one.', projects_file)
        logger.warning('You can set the path to the project configuration '
                       'file using the ANNIF_PROJECTS environment variable.')
        return {}

    config = configparser.ConfigParser()
    # keep option names exactly as written instead of lower-casing them
    config.optionxform = str
    # read as UTF-8 rather than whatever the platform default happens to be
    with open(projects_file, encoding='utf-8') as projf:
        config.read_file(projf)

    # create AnnifProject objects from the configuration file
    projects = collections.OrderedDict()
    for project_id in config.sections():
        projects[project_id] = AnnifProject(project_id,
                                            config[project_id],
                                            datadir)
        if init_projects:
            projects[project_id].initialize()
    return projects
242
243
244
def initialize_projects(app):
    """Create the projects for an application and attach them to it.

    Reads the PROJECTS_FILE, DATADIR and INITIALIZE_PROJECTS entries of
    app.config and stores the resulting mapping as app.annif_projects.
    """
    settings = app.config
    app.annif_projects = _create_projects(
        settings['PROJECTS_FILE'],
        settings['DATADIR'],
        settings['INITIALIZE_PROJECTS'])
250
251
252
def get_projects(min_access=Access.private):
    """Return the available projects as a dict of project_id ->
    AnnifProject. The min_access parameter may be used to set the minimum
    access level required for the returned projects."""

    # The projects are read from the current application; their relative
    # order is kept in the returned OrderedDict.
    return collections.OrderedDict(
        (project_id, project)
        for project_id, project in current_app.annif_projects.items()
        if project.access >= min_access)
261
262
263
def get_project(project_id, min_access=Access.private):
    """return the definition of a single Project by project_id

    Only projects whose access level is at least min_access are
    considered. Raises ValueError if no such project is found.
    """
    projects = get_projects(min_access)
    try:
        return projects[project_id]
    except KeyError:
        # the ValueError already names the project; drop the KeyError
        # context from the traceback
        raise ValueError("No such project {}".format(project_id)) from None
270