Completed
Push — master ( f6c6f1...e349d9 )
by Juho
26s queued 19s
created

annif.project.AnnifProject.transform()   A

Complexity

Conditions 2

Size

Total Lines 6
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 6
nop 1
dl 0
loc 6
rs 10
c 0
b 0
f 0
1
"""Project management functionality for Annif"""
2
3
import enum
4
import os.path
5
from shutil import rmtree
6
import annif
7
import annif.transform
8
import annif.analyzer
9
import annif.corpus
10
import annif.suggestion
11
import annif.backend
12
import annif.vocab
13
from annif.datadir import DatadirMixin
14
from annif.exception import AnnifException, ConfigurationException, \
15
    NotSupportedException, NotInitializedException
16
17
# Module-level alias for the logger object exposed by the annif package;
# all logging in this module goes through it.
logger = annif.logger
18
19
20
class Access(enum.IntEnum):
    """Enumeration of access levels for projects.

    The members are integer-valued (IntEnum), so they can be compared
    numerically; the values increase from ``private`` to ``public``.
    Members are looked up by name when a project's ``access`` setting
    is parsed.
    """

    private = 1
    hidden = 2
    public = 3
25
26
27
class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project.

    The analyzer, transform, backend and vocabulary objects are created
    lazily, on first access of the corresponding property, and cached on
    the instance afterwards.
    """

    # defaults for uninitialized instances
    _transform = None
    _analyzer = None
    _backend = None
    _vocab = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = 'public'

    def __init__(self, project_id, config, datadir, registry):
        """Set up the project from its configuration.

        :param project_id: identifier of the project; also used as the
            default for the project name
        :param config: mapping of configuration settings; must contain
            ``language`` and may contain ``name``, ``analyzer``,
            ``transform``, ``vocab``, ``backend`` and ``access``
        :param datadir: base data directory; passed on to DatadirMixin and
            to the vocabulary
        :param registry: stored as-is on the instance
        :raises KeyError: if ``language`` is missing from the configuration
        :raises ConfigurationException: if the ``access`` setting is not
            the name of an Access member
        """
        DatadirMixin.__init__(self, datadir, 'projects', project_id)
        self.project_id = project_id
        self.name = config.get('name', project_id)
        self.language = config['language']
        self.analyzer_spec = config.get('analyzer', None)
        self.transform_spec = config.get('transform', 'pass')
        self.vocab_id = config.get('vocab', None)
        self.config = config
        self._base_datadir = datadir
        self.registry = registry
        self._init_access()

    def _init_access(self):
        """Parse the ``access`` setting into an Access member.

        :raises ConfigurationException: if the setting does not name a
            member of Access
        """
        access = self.config.get('access', self.DEFAULT_ACCESS)
        try:
            # Look up by member name; getattr() would also accept unrelated
            # class attributes such as '__doc__' or '__members__'.
            self.access = Access[access]
        except KeyError as err:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id) from err

    def _initialize_analyzer(self):
        """Instantiate the analyzer, if one has been configured."""
        if not self.analyzer_spec:
            return  # not configured, so assume it's not needed
        analyzer = self.analyzer
        logger.debug("Project '%s': initialized analyzer: %s",
                     self.project_id,
                     str(analyzer))

    def _initialize_subjects(self):
        """Load the subjects; an AnnifException is logged as a warning
        instead of being propagated."""
        try:
            subjects = self.subjects
            logger.debug("Project '%s': initialized subjects: %s",
                         self.project_id,
                         str(subjects))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self):
        """Create and initialize the backend; an AnnifException is logged
        as a warning instead of being propagated."""
        logger.debug("Project '%s': initializing backend", self.project_id)
        try:
            if not self.backend:
                logger.debug("Cannot initialize backend: does not exist")
                return
            self.backend.initialize()
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self):
        """initialize this project and its backend so that they are ready to
        be used"""

        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_backend()

        self.initialized = True

    def _suggest_with_backend(self, text, backend_params):
        """Pass the text to the backend and return the hits it produces.

        :param text: text to suggest subjects for
        :param backend_params: mapping from backend id to a mapping of
            parameters, or None; only the entry for this project's backend
            is forwarded
        """
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        hits = self.backend.suggest(text, beparams)
        logger.debug(
            'Got %d hits from backend %s',
            len(hits), self.backend.backend_id)
        return hits

    @property
    def analyzer(self):
        """The analyzer of this project, created on first access.

        :raises ConfigurationException: if no analyzer has been configured
        """
        if self._analyzer is None:
            if self.analyzer_spec:
                self._analyzer = annif.analyzer.get_analyzer(
                    self.analyzer_spec)
            else:
                raise ConfigurationException(
                    "analyzer setting is missing (and needed by the backend)",
                    project_id=self.project_id)
        return self._analyzer

    @property
    def transform(self):
        """The transform of this project, created on first access from the
        ``transform`` setting (``'pass'`` when not configured)."""
        if self._transform is None:
            self._transform = annif.transform.get_transform(
                self.transform_spec, project=self)
        return self._transform

    @property
    def backend(self):
        """The backend of this project, created on first access.

        If creating the backend raises ValueError, a warning is logged and
        None is returned; creation is attempted again on the next access.

        :raises ConfigurationException: if the ``backend`` setting is
            missing
        """
        if self._backend is None:
            if 'backend' not in self.config:
                raise ConfigurationException(
                    "backend setting is missing", project_id=self.project_id)
            backend_id = self.config['backend']
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, config_params=self.config,
                    project=self)
            except ValueError:
                # deliberate best effort: leave the backend unset
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id)
        return self._backend

    @property
    def vocab(self):
        """The vocabulary of this project, created on first access.

        :raises ConfigurationException: if the ``vocab`` setting is missing
        """
        if self._vocab is None:
            if self.vocab_id is None:
                raise ConfigurationException("vocab setting is missing",
                                             project_id=self.project_id)
            self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
                                                      self._base_datadir,
                                                      self.language)
        return self._vocab

    @property
    def subjects(self):
        """The ``subjects`` attribute of this project's vocabulary."""
        return self.vocab.subjects

    def _get_info(self, key):
        """Return the backend attribute named by key.

        Returns None when the backend could not be created or when reading
        the information raises an AnnifException (which is logged as a
        warning).
        """
        try:
            be = self.backend
            if be is not None:
                return getattr(be, key)
        except AnnifException as err:
            logger.warning(err.format_message())
        return None

    @property
    def is_trained(self):
        """The ``is_trained`` attribute of the backend, or None if it
        cannot be determined."""
        return self._get_info('is_trained')

    @property
    def modification_time(self):
        """The ``modification_time`` attribute of the backend, or None if
        it cannot be determined."""
        return self._get_info('modification_time')

    def suggest(self, text, backend_params=None):
        """Suggest subjects for the given text by passing it to the backend.
        Returns a list of SubjectSuggestion objects ordered by decreasing
        score.

        :raises NotInitializedException: if the backend reports that it is
            not trained
        """
        # Read the property once: each access queries the backend and may
        # log a warning.
        trained = self.is_trained
        if trained is None:
            logger.warning('Could not get train state information.')
        elif not trained:
            raise NotInitializedException('Project is not trained.')
        logger.debug('Suggesting subjects for text "%s..." (len=%d)',
                     text[:20], len(text))
        text = self.transform.transform_text(text)
        hits = self._suggest_with_backend(text, backend_params)
        logger.debug('%d hits from backend', len(hits))
        return hits

    def train(self, corpus, backend_params=None):
        """train the project using documents from a metadata source

        :param corpus: the training corpus, or the string ``'cached'``, in
            which case it is handed to the backend unchanged
        :param backend_params: mapping from backend id to a mapping of
            parameters, or None
        """
        if corpus != 'cached':
            corpus.set_subject_index(self.subjects)
            corpus = self.transform.transform_corpus(corpus)
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        self.backend.train(corpus, beparams)

    def learn(self, corpus, backend_params=None):
        """further train the project using documents from a metadata source

        :raises NotSupportedException: if the backend is not an
            AnnifLearningBackend
        """
        corpus.set_subject_index(self.subjects)
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        corpus = self.transform.transform_corpus(corpus)
        if isinstance(
                self.backend,
                annif.backend.backend.AnnifLearningBackend):
            self.backend.learn(corpus, beparams)
        else:
            raise NotSupportedException("Learning not supported by backend",
                                        project_id=self.project_id)

    def hyperopt(self, corpus, trials, jobs, metric, results_file):
        """optimize the hyperparameters of the project using a validation
        corpus against a given metric

        :raises NotSupportedException: if the backend is not an
            AnnifHyperoptBackend
        """
        if isinstance(
                self.backend,
                annif.backend.hyperopt.AnnifHyperoptBackend):
            optimizer = self.backend.get_hp_optimizer(corpus, metric)
            return optimizer.optimize(trials, jobs, results_file)

        raise NotSupportedException(
            "Hyperparameter optimization not supported "
            "by backend", project_id=self.project_id)

    def dump(self):
        """return this project as a dict"""
        return {'project_id': self.project_id,
                'name': self.name,
                'language': self.language,
                'backend': {'backend_id': self.config.get('backend')},
                'is_trained': self.is_trained,
                'modification_time': self.modification_time
                }

    def remove_model_data(self):
        """remove the data of this project

        Deletes the project's data directory if it exists; otherwise only
        logs a warning.
        """
        datadir_path = self._datadir_path
        if os.path.isdir(datadir_path):
            rmtree(datadir_path)
            logger.info('Removed model data for project %s.',
                        self.project_id)
        else:
            logger.warning('No model data to remove for project %s.',
                           self.project_id)
254