Passed
Pull Request — master (#259)
by Osma
02:45
created

annif.project   A

Complexity

Total Complexity 42

Size/Duplication

Total Lines 258
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 42
eloc 200
dl 0
loc 258
rs 9.0399
c 0
b 0
f 0

17 Methods

Rating   Name   Duplication   Size   Complexity  
A AnnifProject.vectorizer() 0 12 3
A AnnifProject._initialize_subjects() 0 8 2
A AnnifProject.dump() 0 6 1
A AnnifProject._analyze_with_backend() 0 9 2
A AnnifProject._initialize_backend() 0 9 3
A AnnifProject._initialize_vectorizer() 0 8 2
A AnnifProject.subjects() 0 3 1
A AnnifProject.analyze() 0 9 1
A AnnifProject._init_access() 0 8 2
A AnnifProject.train() 0 6 1
A AnnifProject.__init__() 0 10 1
A AnnifProject._initialize_analyzer() 0 5 1
A AnnifProject.analyzer() 0 5 3
A AnnifProject._create_vectorizer() 0 13 2
A AnnifProject.vocab() 0 9 3
A AnnifProject.backend() 0 14 3
A AnnifProject.initialize() 0 11 1

4 Functions

Rating   Name   Duplication   Size   Complexity  
A get_projects() 0 9 1
A get_project() 0 7 2
B _create_projects() 0 22 6
A initialize_projects() 0 6 1

How to fix   Complexity   

Complexity

Complex classes like annif.project often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
"""Project management functionality for Annif"""
2
3
import collections
4
import configparser
5
import enum
6
import os.path
7
from sklearn.externals import joblib
8
from sklearn.feature_extraction.text import TfidfVectorizer
9
from flask import current_app
10
import annif
11
import annif.analyzer
12
import annif.corpus
13
import annif.hit
14
import annif.backend
15
import annif.util
16
import annif.vocab
17
from annif.datadir import DatadirMixin
18
from annif.exception import AnnifException, ConfigurationException, \
19
    NotInitializedException
20
21
# reuse the logger object exposed by the annif package for all messages
# emitted from this module
logger = annif.logger
22
23
24
class Access(enum.IntEnum):
    """Enumeration of access levels for projects.

    The members are integers, so they can be compared with each other;
    get_projects() relies on this to select the projects whose access
    level is greater than or equal to a requested minimum."""

    # lowest level
    private = 1
    hidden = 2
    # highest level
    public = 3
29
30
31
class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project.

    The analyzer, backend, vocabulary and vectorizer objects are created
    on first access by the corresponding properties and then cached on
    the instance."""

    # defaults for uninitialized instances
    _analyzer = None
    _backend = None
    _vocab = None
    _vectorizer = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = 'public'

    def __init__(self, project_id, config, datadir):
        """Set up the project from its configuration.

        project_id -- identifier of the project
        config -- mapping of settings; 'name' and 'language' must be
            present, 'analyzer', 'vocab' and 'access' are optional
        datadir -- base data directory, also passed on to the vocabulary

        Raises ConfigurationException if the access setting is invalid."""
        DatadirMixin.__init__(self, datadir, 'projects', project_id)
        self.project_id = project_id
        self.name = config['name']
        self.language = config['language']
        self.analyzer_spec = config.get('analyzer', None)
        self.vocab_id = config.get('vocab', None)
        self.config = config
        self._base_datadir = datadir
        self._init_access()

    def _init_access(self):
        """Resolve the 'access' setting into a member of Access.

        Raises ConfigurationException if the setting does not name one of
        the access levels."""
        access = self.config.get('access', self.DEFAULT_ACCESS)
        try:
            # Look the name up among the enum members only. getattr() on
            # the enum class would also accept unrelated attributes such
            # as 'real' or 'bit_length' and store a non-Access value.
            self.access = Access[access]
        except KeyError:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id) from None

    def _initialize_analyzer(self):
        """Trigger creation of the analyzer and log the result."""
        analyzer = self.analyzer
        logger.debug("Project '%s': initialized analyzer: %s",
                     self.project_id,
                     str(analyzer))

    def _initialize_subjects(self):
        """Trigger loading of the subjects; an AnnifException is logged
        as a warning instead of being propagated."""
        try:
            subjects = self.subjects
            logger.debug("Project '%s': initialized subjects: %s",
                         self.project_id,
                         str(subjects))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_vectorizer(self):
        """Trigger loading of the vectorizer; an AnnifException is logged
        as a warning instead of being propagated."""
        try:
            vectorizer = self.vectorizer
            logger.debug("Project '%s': initialized vectorizer: %s",
                         self.project_id,
                         str(vectorizer))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self):
        """Initialize the backend if one could be created; an
        AnnifException is logged as a warning instead of being
        propagated."""
        logger.debug("Project '%s': initializing backend", self.project_id)
        if not self.backend:
            logger.debug("Cannot initialize backend: does not exist")
            return
        try:
            self.backend.initialize()
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self):
        """initialize this project and its backend so that they are ready to
        analyze"""
        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_vectorizer()
        self._initialize_backend()

        self.initialized = True

    def _analyze_with_backend(self, text, backend_params):
        """Pass the text to the backend and return the hits it produces.

        backend_params -- optional mapping of backend_id -> parameters;
            only the entry for this project's backend is passed on"""
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        hits = self.backend.analyze(text, project=self, params=beparams)
        logger.debug(
            'Got %d hits from backend %s',
            len(hits), self.backend.backend_id)
        return hits

    @property
    def analyzer(self):
        """The analyzer built from the 'analyzer' setting, or None if the
        setting is missing or empty."""
        if self._analyzer is None and self.analyzer_spec:
            self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
        return self._analyzer

    @property
    def backend(self):
        """The backend object named by the 'backend' setting.

        If looking up or creating the backend raises ValueError, a warning
        is logged and None is returned."""
        if self._backend is None:
            backend_id = self.config['backend']
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, params=self.config, datadir=self.datadir)
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id)
        return self._backend

    @property
    def vocab(self):
        """The AnnifVocabulary named by the 'vocab' setting.

        Raises ConfigurationException if the setting is missing."""
        if self._vocab is None:
            if self.vocab_id is None:
                raise ConfigurationException("vocab setting is missing",
                                             project_id=self.project_id)
            self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
                                                      self._base_datadir)
        return self._vocab

    @property
    def subjects(self):
        """The subjects of this project's vocabulary."""
        return self.vocab.subjects

    @property
    def vectorizer(self):
        """The vectorizer loaded from the 'vectorizer' file in the project
        data directory.

        Raises NotInitializedException if the file does not exist."""
        if self._vectorizer is None:
            path = os.path.join(self.datadir, 'vectorizer')
            if os.path.exists(path):
                logger.debug('loading vectorizer from %s', path)
                # NOTE(review): joblib.load unpickles the file, so the
                # data directory must only contain trusted content.
                self._vectorizer = joblib.load(path)
            else:
                raise NotInitializedException(
                    "vectorizer file '{}' not found".format(path),
                    project_id=self.project_id)
        return self._vectorizer

    def analyze(self, text, backend_params=None):
        """Analyze the given text by passing it to the backend. Returns a
        list of AnalysisHit objects ordered by decreasing score."""

        logger.debug('Analyzing text "%s..." (len=%d)',
                     text[:20], len(text))
        hits = self._analyze_with_backend(text, backend_params)
        logger.debug('%d hits from backend', len(hits))
        return hits

    def _create_vectorizer(self, subjectcorpus):
        """Fit a TF-IDF vectorizer on the subject texts of the corpus and
        save it to the project data directory.

        Does nothing if the backend does not need a subject vectorizer."""
        if not self.backend.needs_subject_vectorizer:
            logger.debug('not creating vectorizer: not needed by backend')
            return
        logger.info('creating vectorizer')
        self._vectorizer = TfidfVectorizer(
            tokenizer=self.analyzer.tokenize_words)
        # NOTE: a static-analysis tool reported "The variable subj does
        # not seem to be defined" for the next line (ignored issue). This
        # is a false positive: subj is bound by the generator expression.
        self._vectorizer.fit((subj.text for subj in subjectcorpus.subjects))
        annif.util.atomic_save(
            self._vectorizer,
            self.datadir,
            'vectorizer',
            method=joblib.dump)

    def train(self, corpus):
        """train the project using documents from a metadata source"""

        corpus.set_subject_index(self.subjects)
        self._create_vectorizer(corpus)
        self.backend.train(corpus, project=self)

    def dump(self):
        """return this project as a dict"""
        return {'project_id': self.project_id,
                'name': self.name,
                'language': self.language,
                'backend': {'backend_id': self.config['backend']}
                }
206
207
208
def _create_projects(projects_file, datadir, init_projects):
    """Create AnnifProject objects from a project configuration file.

    projects_file -- path of the configuration file; every section of
        the file defines one project and the section name is its id
    datadir -- data directory passed on to each project
    init_projects -- if true, initialize() is called on each project

    Returns an OrderedDict of project_id -> AnnifProject in the order of
    the sections in the file. If the file does not exist, warnings are
    logged and an empty dict is returned."""
    if not os.path.exists(projects_file):
        logger.warning("Project configuration file '%s' is missing. "
                       'Please provide one.', projects_file)
        logger.warning('You can set the path to the project configuration '
                       'file using the ANNIF_PROJECTS environment variable.')
        return {}

    config = configparser.ConfigParser()
    # keep option names exactly as written instead of lowercasing them
    config.optionxform = str
    # Read with an explicit encoding so the result does not depend on the
    # locale of the machine; utf-8-sig also accepts files that start with
    # a byte order mark.
    with open(projects_file, encoding='utf-8-sig') as projf:
        config.read_file(projf)

    # create AnnifProject objects from the configuration file
    projects = collections.OrderedDict()
    for project_id in config.sections():
        project = AnnifProject(project_id, config[project_id], datadir)
        projects[project_id] = project
        if init_projects:
            project.initialize()
    return projects
230
231
232
def initialize_projects(app):
    """Create the projects described by the application configuration and
    store them on the application object as app.annif_projects.

    The PROJECTS_FILE, DATADIR and INITIALIZE_PROJECTS settings are read
    from app.config."""
    settings = app.config
    projects_file = settings['PROJECTS_FILE']
    datadir = settings['DATADIR']
    init_projects = settings['INITIALIZE_PROJECTS']
    app.annif_projects = _create_projects(
        projects_file, datadir, init_projects)
238
239
240
def get_projects(min_access=Access.private):
    """Return the available projects as a dict of project_id ->
    AnnifProject. The min_access parameter may be used to set the minimum
    access level required for the returned projects."""

    # the projects are those stored on the current Flask application;
    # their original order is preserved
    return collections.OrderedDict(
        (project_id, project)
        for project_id, project in current_app.annif_projects.items()
        if project.access >= min_access)
249
250
251
def get_project(project_id, min_access=Access.private):
    """return the definition of a single Project by project_id

    Only projects whose access level is at least min_access are
    considered. Raises ValueError if no such project is found."""
    projects = get_projects(min_access)
    try:
        return projects[project_id]
    except KeyError:
        # the internal KeyError adds nothing for the caller, so it is not
        # chained to the ValueError
        raise ValueError("No such project {}".format(project_id)) from None
258