Passed
Pull Request — master (#414)
by Osma
02:14
created

annif.project   B

Complexity

Total Complexity 48

Size/Duplication

Total Lines 279
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 48
eloc 214
dl 0
loc 279
rs 8.5599
c 0
b 0
f 0

17 Methods

Rating   Name   Duplication   Size   Complexity  
A AnnifProject._initialize_subjects() 0 8 2
A AnnifProject.subjects() 0 3 1
A AnnifProject.vocab() 0 9 3
A AnnifProject._init_access() 0 8 2
A AnnifProject.__init__() 0 10 1
A AnnifProject._initialize_backend() 0 9 3
A AnnifProject._suggest_with_backend() 0 9 2
A AnnifProject._initialize_analyzer() 0 7 2
A AnnifProject.analyzer() 0 11 3
A AnnifProject.backend() 0 18 4
A AnnifProject.initialize() 0 11 1
A AnnifProject.suggest() 0 8 1
A AnnifProject.train() 0 8 3
A AnnifProject.learn() 0 13 3
A AnnifProject.remove_model_data() 0 10 2
A AnnifProject.dump() 0 6 1
A AnnifProject.hyperopt() 0 12 2

4 Functions

Rating   Name   Duplication   Size   Complexity  
A get_projects() 0 12 2
A get_project() 0 7 2
B _create_projects() 0 27 7
A initialize_projects() 0 6 1

How to fix   Complexity   

Complexity

Complex classes like annif.project often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
"""Project management functionality for Annif"""
2
3
import collections
4
import configparser
5
import enum
6
import os.path
7
from flask import current_app
8
from shutil import rmtree
9
import annif
10
import annif.analyzer
11
import annif.corpus
12
import annif.suggestion
13
import annif.backend
14
import annif.util
15
import annif.vocab
16
from annif.datadir import DatadirMixin
17
from annif.exception import AnnifException, ConfigurationException, \
18
    NotSupportedException
19
20
# module-level shorthand for the logger object exposed by the annif package
logger = annif.logger
21
22
23
class Access(enum.IntEnum):
    """Access levels that may be assigned to a project.

    The members are integers, so two levels can be compared directly
    (for example ``project.access >= min_access``).
    """

    # lowest level
    private = 1
    # intermediate level
    hidden = 2
    # highest level
    public = 3
28
29
30
class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project.

    The analyzer, backend and vocabulary objects are created on first
    access through the corresponding properties and then cached on the
    instance.
    """

    # defaults for uninitialized instances
    _analyzer = None
    _backend = None
    _vocab = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = 'public'

    def __init__(self, project_id, config, datadir):
        """Set up the project from its configuration section.

        project_id -- identifier of the project
        config -- mapping of configuration settings; 'language' is read
            with item access (so it must be present), while 'name',
            'analyzer', 'vocab' and 'access' are optional
        datadir -- base data directory, also used for the vocabulary

        Raises ConfigurationException if the 'access' setting is not the
        name of an Access member.
        """
        DatadirMixin.__init__(self, datadir, 'projects', project_id)
        self.project_id = project_id
        self.name = config.get('name', project_id)
        self.language = config['language']
        self.analyzer_spec = config.get('analyzer', None)
        self.vocab_id = config.get('vocab', None)
        self.config = config
        self._base_datadir = datadir
        self._init_access()

    def _init_access(self):
        """Resolve the 'access' setting (default: DEFAULT_ACCESS) into an
        Access member stored as self.access.

        Raises ConfigurationException if the value is not a member name.
        """
        access = self.config.get('access', self.DEFAULT_ACCESS)
        try:
            # Look the name up among the enum members only; getattr() would
            # also accept unrelated attributes of the enum class (such as
            # 'real' or '__doc__') and store a non-Access value.
            self.access = Access[access]
        except KeyError:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id)

    def _initialize_analyzer(self):
        """Create the analyzer, if one has been configured."""
        if not self.analyzer_spec:
            return  # not configured, so assume it's not needed
        analyzer = self.analyzer
        logger.debug("Project '%s': initialized analyzer: %s",
                     self.project_id,
                     str(analyzer))

    def _initialize_subjects(self):
        """Load the subjects; an AnnifException is logged as a warning
        instead of being propagated."""
        try:
            subjects = self.subjects
            logger.debug("Project '%s': initialized subjects: %s",
                         self.project_id,
                         str(subjects))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self):
        """Create and initialize the backend; an AnnifException is logged
        as a warning instead of being propagated."""
        logger.debug("Project '%s': initializing backend", self.project_id)
        try:
            # the backend property yields None when creation failed
            if not self.backend:
                logger.debug("Cannot initialize backend: does not exist")
                return
            self.backend.initialize()
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self):
        """initialize this project and its backend so that they are ready to
        be used"""

        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_backend()

        self.initialized = True

    def _suggest_with_backend(self, text, backend_params):
        """Pass the text to the backend and return its hits.

        backend_params, if given, is a mapping keyed by backend id; only
        the entry for this project's backend is handed to the backend.
        """
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        hits = self.backend.suggest(text, beparams)
        logger.debug(
            'Got %d hits from backend %s',
            len(hits), self.backend.backend_id)
        return hits

    @property
    def analyzer(self):
        """The analyzer of this project, created on first access.

        Raises ConfigurationException if no analyzer has been configured.
        """
        if self._analyzer is None:
            if self.analyzer_spec:
                self._analyzer = annif.analyzer.get_analyzer(
                    self.analyzer_spec)
            else:
                raise ConfigurationException(
                    "analyzer setting is missing (and needed by the backend)",
                    project_id=self.project_id)
        return self._analyzer

    @property
    def backend(self):
        """The backend of this project, created on first access.

        Raises ConfigurationException if the 'backend' setting is missing.
        If creating the backend raises ValueError, a warning is logged and
        None is returned.
        """
        if self._backend is None:
            if 'backend' not in self.config:
                raise ConfigurationException(
                    "backend setting is missing", project_id=self.project_id)
            backend_id = self.config['backend']
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, config_params=self.config,
                    project=self)
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id)
        return self._backend

    @property
    def vocab(self):
        """The vocabulary of this project, created on first access.

        Raises ConfigurationException if the 'vocab' setting is missing.
        """
        if self._vocab is None:
            if self.vocab_id is None:
                raise ConfigurationException("vocab setting is missing",
                                             project_id=self.project_id)
            self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
                                                      self._base_datadir)
        return self._vocab

    @property
    def subjects(self):
        """The subjects of the project vocabulary."""
        return self.vocab.subjects

    def suggest(self, text, backend_params=None):
        """Suggest subjects for the given text by passing it to the backend.
        Returns a list of SubjectSuggestion objects ordered by decreasing
        score."""
        logger.debug('Suggesting subjects for text "%s..." (len=%d)',
                     text[:20], len(text))
        hits = self._suggest_with_backend(text, backend_params)
        logger.debug('%d hits from backend', len(hits))
        return hits

    def train(self, corpus, backend_params=None):
        """train the project using documents from a metadata source"""
        # the string 'cached' is handed to the backend as is; any other
        # corpus first gets the subject index of this project
        if corpus != 'cached':
            corpus.set_subject_index(self.subjects)
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        self.backend.train(corpus, beparams)

    def learn(self, corpus, backend_params=None):
        """further train the project using documents from a metadata source

        Raises NotSupportedException if the backend is not an
        AnnifLearningBackend."""
        corpus.set_subject_index(self.subjects)
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        if isinstance(
                self.backend,
                annif.backend.backend.AnnifLearningBackend):
            self.backend.learn(corpus, beparams)
        else:
            raise NotSupportedException("Learning not supported by backend",
                                        project_id=self.project_id)

    def hyperopt(self, corpus, trials):
        """optimize the hyperparameters of the project using a validation
        corpus

        Returns the result of the optimizer obtained from the backend.
        Raises NotSupportedException if the backend is not an
        AnnifHyperoptBackend."""
        if isinstance(
                self.backend,
                annif.backend.hyperopt.AnnifHyperoptBackend):
            optimizer = self.backend.get_hp_optimizer(corpus)
            return optimizer.optimize(trials)

        # reached only when the backend lacks hyperopt support; this must
        # stay outside the if block, otherwise it can never be executed
        raise NotSupportedException(
            "Hyperparameter optimization not supported "
            "by backend", project_id=self.project_id)

    def dump(self):
        """return this project as a dict"""
        return {'project_id': self.project_id,
                'name': self.name,
                'language': self.language,
                'backend': {'backend_id': self.config.get('backend')}
                }

    def remove_model_data(self):
        """remove the data of this project"""
        # _datadir_path is not defined in this class; presumably it is
        # provided by DatadirMixin
        datadir_path = self._datadir_path
        if os.path.isdir(datadir_path):
            rmtree(datadir_path)
            logger.info('Removed model data for project {}.'
                        .format(self.project_id))
        else:
            logger.warning('No model data to remove for project {}.'
                           .format(self.project_id))
219
220
221
def _create_projects(projects_file, datadir, init_projects):
    """Read the project configuration file and build the projects.

    projects_file -- path of the configuration file
    datadir -- data directory handed to every AnnifProject
    init_projects -- when true, initialize() is called on each project

    Returns an ordered mapping of project_id -> AnnifProject, following
    the order of the sections in the file. If the file does not exist a
    warning is logged and an empty dict is returned. Raises
    ConfigurationException when the file contains a duplicate option or
    section.
    """
    if not os.path.exists(projects_file):
        missing_msg = (
            'Project configuration file "%s" is missing. Please provide one.'
            ' You can set the path to the project configuration file using '
            'the ANNIF_PROJECTS environment variable or the command-line '
            'option "--projects".')
        logger.warning(missing_msg, projects_file)
        return {}

    parser = configparser.ConfigParser()
    # keep option names exactly as written in the file
    parser.optionxform = lambda name: name
    try:
        with open(projects_file, encoding='utf-8-sig') as stream:
            parser.read_file(stream)
    except (configparser.DuplicateOptionError,
            configparser.DuplicateSectionError) as err:
        raise ConfigurationException(err)

    # one AnnifProject per section of the configuration file
    result = collections.OrderedDict()
    for section in parser.sections():
        project = AnnifProject(section, parser[section], datadir)
        result[section] = project
        if init_projects:
            project.initialize()
    return result
248
249
250
def initialize_projects(app):
    """Create the projects described by the application configuration
    (PROJECTS_FILE, DATADIR and INITIALIZE_PROJECTS settings) and store
    them on the application object as annif_projects."""
    settings = app.config
    app.annif_projects = _create_projects(
        settings['PROJECTS_FILE'],
        settings['DATADIR'],
        settings['INITIALIZE_PROJECTS'])
256
257
258
def get_projects(min_access=Access.private):
    """Return the available projects as an ordered mapping of
    project_id -> AnnifProject. Only projects whose access level is at
    least min_access are included."""

    # set up the projects the first time they are requested
    if not hasattr(current_app, 'annif_projects'):
        initialize_projects(current_app)

    accessible = collections.OrderedDict()
    for project_id, project in current_app.annif_projects.items():
        if project.access >= min_access:
            accessible[project_id] = project
    return accessible
270
271
272
def get_project(project_id, min_access=Access.private):
    """Return the single project identified by project_id.

    Only projects with an access level of at least min_access are
    considered. Raises ValueError if no such project is available.
    """
    candidates = get_projects(min_access)
    try:
        found = candidates[project_id]
    except KeyError:
        raise ValueError("No such project {}".format(project_id))
    else:
        return found
279