Passed
Pull Request — master (#414)
by Osma
01:58
created

annif.project.AnnifProject.suggest()   A

Complexity

Conditions 3

Size

Total Lines 13
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 10
dl 0
loc 13
rs 9.9
c 0
b 0
f 0
cc 3
nop 3
1
"""Project management functionality for Annif"""
2
3
import enum
4
import os.path
5
from shutil import rmtree
6
import annif
7
import annif.analyzer
8
import annif.corpus
9
import annif.suggestion
10
import annif.backend
11
import annif.vocab
12
from annif.datadir import DatadirMixin
13
from annif.exception import AnnifException, ConfigurationException, \
14
    NotSupportedException, NotInitializedException
15
16
# logger object exposed by the top-level annif package; every log message
# emitted from this module goes through it
logger = annif.logger
17
18
19
class Access(enum.IntEnum):
    """Enumeration of access levels for projects.

    The members are integers (IntEnum), so two levels can be compared with
    the ordinary comparison operators; the numeric value grows from
    ``private`` up to ``public``.  A member can be looked up by its name
    with ``Access[name]``.
    """

    private = 1
    hidden = 2
    public = 3
24
25
26
class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project.

    The analyzer, backend and vocabulary objects are created lazily the
    first time the corresponding property is read and are then cached on
    the instance.
    """

    # defaults for uninitialized instances
    _analyzer = None
    _backend = None
    _vocab = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = 'public'

    def __init__(self, project_id, config, datadir, registry):
        """Store the project settings and resolve its access level.

        project_id -- identifier of the project; also used as the name
            when the configuration has no 'name' setting
        config -- mapping with the project settings; 'language' is read
            with config[...] and therefore has to be present, while
            'name', 'analyzer', 'vocab' and 'access' are optional here
        datadir -- base data directory; passed to DatadirMixin and kept
            for creating the vocabulary later
        registry -- kept as self.registry

        Raises ConfigurationException if the 'access' setting is not the
        name of an Access member.
        """
        DatadirMixin.__init__(self, datadir, 'projects', project_id)
        self.project_id = project_id
        self.name = config.get('name', project_id)
        self.language = config['language']
        self.analyzer_spec = config.get('analyzer', None)
        self.vocab_id = config.get('vocab', None)
        self.config = config
        self._base_datadir = datadir
        self.registry = registry
        self._init_access()

    def _init_access(self):
        """Set self.access from the 'access' setting (DEFAULT_ACCESS if
        the setting is absent).

        Raises ConfigurationException if the value does not name an
        Access member.
        """
        access = self.config.get('access', self.DEFAULT_ACCESS)
        try:
            # Look the name up among the enum members only.  getattr() on
            # the enum class would also succeed for non-member attributes
            # such as '__doc__' or '__members__' and silently store them.
            self.access = Access[access]
        except KeyError:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id) from None

    def _initialize_analyzer(self):
        """Create the analyzer now, if one has been configured."""
        if not self.analyzer_spec:
            return  # not configured, so assume it's not needed
        analyzer = self.analyzer
        logger.debug("Project '%s': initialized analyzer: %s",
                     self.project_id,
                     str(analyzer))

    def _initialize_subjects(self):
        """Load the subjects of the vocabulary now.

        An AnnifException raised while doing so is logged as a warning
        instead of being propagated.
        """
        try:
            subjects = self.subjects
            logger.debug("Project '%s': initialized subjects: %s",
                         self.project_id,
                         str(subjects))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self):
        """Create the backend and call its initialize() method.

        An AnnifException raised while doing so is logged as a warning
        instead of being propagated.
        """
        logger.debug("Project '%s': initializing backend", self.project_id)
        try:
            if not self.backend:
                logger.debug("Cannot initialize backend: does not exist")
                return
            self.backend.initialize()
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self):
        """initialize this project and its backend so that they are ready to
        be used"""

        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_backend()

        self.initialized = True

    def _params_for_backend(self, backend_params):
        """Return the parameters meant for this project's backend.

        backend_params -- None, or a mapping from backend id to a dict of
            parameters for that backend

        Returns the entry stored under the id of this project's backend,
        or an empty dict when there is none.
        """
        all_params = {} if backend_params is None else backend_params
        return all_params.get(self.backend.backend_id, {})

    def _suggest_with_backend(self, text, backend_params):
        """Pass the text to the backend and return the hits it produces."""
        beparams = self._params_for_backend(backend_params)
        hits = self.backend.suggest(text, beparams)
        logger.debug(
            'Got %d hits from backend %s',
            len(hits), self.backend.backend_id)
        return hits

    @property
    def analyzer(self):
        """The analyzer of this project, created on first access.

        Raises ConfigurationException if no analyzer has been configured.
        """
        if self._analyzer is None:
            if self.analyzer_spec:
                self._analyzer = annif.analyzer.get_analyzer(
                    self.analyzer_spec)
            else:
                raise ConfigurationException(
                    "analyzer setting is missing (and needed by the backend)",
                    project_id=self.project_id)
        return self._analyzer

    @property
    def backend(self):
        """The backend of this project, created on first access.

        Returns None (after logging a warning) if creating the backend
        raised ValueError.  Raises ConfigurationException if the
        configuration has no 'backend' setting.
        """
        if self._backend is None:
            if 'backend' not in self.config:
                raise ConfigurationException(
                    "backend setting is missing", project_id=self.project_id)
            backend_id = self.config['backend']
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, config_params=self.config,
                    project=self)
            except ValueError:
                # self._backend stays None, so the next access tries again
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id)
        return self._backend

    @property
    def vocab(self):
        """The vocabulary of this project, created on first access.

        Raises ConfigurationException if no vocabulary has been configured.
        """
        if self._vocab is None:
            if self.vocab_id is None:
                raise ConfigurationException("vocab setting is missing",
                                             project_id=self.project_id)
            self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
                                                      self._base_datadir)
        return self._vocab

    @property
    def subjects(self):
        """The subjects attribute of the project vocabulary."""
        return self.vocab.subjects

    def _get_info(self, key):
        """Return the attribute named key of the backend.

        Returns None if there is no backend object, or if an
        AnnifException is raised while obtaining the backend or reading
        the attribute; in the latter case the error is logged as a warning.
        """
        try:
            be = self.backend
            if be is not None:
                return getattr(be, key)
        except AnnifException as err:
            logger.warning(err.format_message())
        return None

    @property
    def is_trained(self):
        """The is_trained attribute of the backend, or None if it could
        not be determined."""
        return self._get_info('is_trained')

    @property
    def modification_time(self):
        """The modification_time attribute of the backend, or None if it
        could not be determined."""
        return self._get_info('modification_time')

    def suggest(self, text, backend_params=None):
        """Suggest subjects for the given text by passing it to the backend.

        text -- the text to find subjects for
        backend_params -- None, or a mapping from backend id to a dict of
            parameters for that backend

        Returns the hits produced by the backend (presumably a list of
        SubjectSuggestion objects ordered by decreasing score -- the
        backend decides this, verify there).

        Raises NotInitializedException if the backend reports that it has
        not been trained.
        """
        # Read the property only once: each read queries the backend and
        # may log a warning of its own.
        is_trained = self.is_trained
        if is_trained is None:
            logger.warning('Could not get train state information.')
        elif not is_trained:
            raise NotInitializedException('Project is not trained.')
        logger.debug('Suggesting subjects for text "%s..." (len=%d)',
                     text[:20], len(text))
        hits = self._suggest_with_backend(text, backend_params)
        logger.debug('%d hits from backend', len(hits))
        return hits

    def train(self, corpus, backend_params=None):
        """train the project using documents from a metadata source

        corpus -- the corpus to train on, or the string 'cached', which is
            handed to the backend unchanged (presumably asking it to reuse
            previously prepared training data -- confirm in the backends)
        backend_params -- None, or a mapping from backend id to a dict of
            parameters for that backend
        """
        if corpus != 'cached':
            corpus.set_subject_index(self.subjects)
        beparams = self._params_for_backend(backend_params)
        self.backend.train(corpus, beparams)

    def learn(self, corpus, backend_params=None):
        """further train the project using documents from a metadata source

        corpus -- the corpus to learn from
        backend_params -- None, or a mapping from backend id to a dict of
            parameters for that backend

        Raises NotSupportedException if the backend is not an
        AnnifLearningBackend.
        """
        corpus.set_subject_index(self.subjects)
        beparams = self._params_for_backend(backend_params)
        if isinstance(
                self.backend,
                annif.backend.backend.AnnifLearningBackend):
            self.backend.learn(corpus, beparams)
        else:
            raise NotSupportedException("Learning not supported by backend",
                                        project_id=self.project_id)

    def hyperopt(self, corpus, trials, jobs, metric, results_file):
        """optimize the hyperparameters of the project using a validation
        corpus against a given metric

        corpus, metric -- passed to the backend's get_hp_optimizer()
        trials, jobs, results_file -- passed to the optimizer's optimize()

        Returns whatever the optimizer's optimize() call returns.

        Raises NotSupportedException if the backend is not an
        AnnifHyperoptBackend.
        """
        if isinstance(
                self.backend,
                annif.backend.hyperopt.AnnifHyperoptBackend):
            optimizer = self.backend.get_hp_optimizer(corpus, metric)
            return optimizer.optimize(trials, jobs, results_file)

        raise NotSupportedException(
            "Hyperparameter optimization not supported "
            "by backend", project_id=self.project_id)

    def dump(self):
        """return this project as a dict

        The dict has the keys 'project_id', 'name', 'language', 'backend'
        (itself a dict with the key 'backend_id'), 'is_trained' and
        'modification_time'.
        """
        return {'project_id': self.project_id,
                'name': self.name,
                'language': self.language,
                'backend': {'backend_id': self.config.get('backend')},
                'is_trained': self.is_trained,
                'modification_time': self.modification_time
                }

    def remove_model_data(self):
        """remove the data of this project

        Deletes the directory self._datadir_path with everything in it,
        or logs a warning if that directory does not exist.
        """
        datadir_path = self._datadir_path
        if os.path.isdir(datadir_path):
            rmtree(datadir_path)
            logger.info('Removed model data for project %s.',
                        self.project_id)
        else:
            logger.warning('No model data to remove for project %s.',
                           self.project_id)
240