Passed
Pull Request — master (#246)
by Osma
03:18
created

annif.project   B

Complexity

Total Complexity 44

Size/Duplication

Total Lines 264
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 44
eloc 203
dl 0
loc 264
rs 8.8798
c 0
b 0
f 0

18 Methods

Rating   Name   Duplication   Size   Complexity  
A AnnifProject.vectorizer() 0 12 3
A AnnifProject._initialize_subjects() 0 8 2
A AnnifProject.dump() 0 6 1
A AnnifProject._analyze_with_backend() 0 9 2
A AnnifProject._initialize_backend() 0 9 3
A AnnifProject._initialize_vectorizer() 0 8 2
A AnnifProject.subjects() 0 3 1
A AnnifProject.analyze() 0 9 1
A AnnifProject._init_access() 0 8 2
A AnnifProject.train() 0 6 1
A AnnifProject.__init__() 0 10 1
A AnnifProject._get_datadir() 0 6 2
A AnnifProject._initialize_analyzer() 0 5 1
A AnnifProject.analyzer() 0 5 3
A AnnifProject._create_vectorizer() 0 13 2
A AnnifProject.vocab() 0 9 3
A AnnifProject.backend() 0 14 3
A AnnifProject.initialize() 0 11 1

4 Functions

Rating   Name   Duplication   Size   Complexity  
A get_projects() 0 9 1
A get_project() 0 7 2
B _create_projects() 0 22 6
A initialize_projects() 0 6 1

How to fix   Complexity   

Complexity

Complex classes like annif.project often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
"""Project management functionality for Annif"""
2
3
import collections
4
import configparser
5
import enum
6
import os.path
7
from sklearn.externals import joblib
8
from sklearn.feature_extraction.text import TfidfVectorizer
9
from flask import current_app
10
import annif
11
import annif.analyzer
12
import annif.corpus
13
import annif.hit
14
import annif.backend
15
import annif.util
16
import annif.vocab
17
from annif.exception import AnnifException, ConfigurationException, \
18
    NotInitializedException
19
20
logger = annif.logger
21
22
23
class Access(enum.IntEnum):
    """Access levels that can be assigned to a project.

    The members are ordered integers, so two levels can be compared with
    operators such as ``>=`` when filtering projects by a minimum level.
    """

    # lowest level
    private = 1
    hidden = 2
    # highest level
    public = 3
28
29
30
class AnnifProject:
    """Class representing the configuration of a single Annif project.

    The analyzer, backend, vocabulary and vectorizer are created or loaded
    on first access through the corresponding properties and are then
    cached on the instance.
    """

    # defaults for uninitialized instances
    _analyzer = None
    _backend = None
    _vocab = None
    _vectorizer = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = 'public'

    def __init__(self, project_id, config, datadir):
        """Set up the project from its configuration settings.

        project_id -- identifier of the project
        config -- mapping of settings; 'name' and 'language' are required
            here, 'analyzer', 'vocab' and 'access' are optional
        datadir -- base data directory; the project's own files are kept
            under <datadir>/projects/<project_id>

        Raises KeyError if a required setting is missing and
        ConfigurationException if the 'access' setting is not valid.
        """
        self.project_id = project_id
        self.name = config['name']
        self.language = config['language']
        self.analyzer_spec = config.get('analyzer', None)
        self.vocab_id = config.get('vocab', None)
        self._base_datadir = datadir
        self._datadir = os.path.join(datadir, 'projects', self.project_id)
        self.config = config
        self._init_access()

    def _init_access(self):
        """Resolve the 'access' setting into a member of Access and store it
        as self.access.

        Raises ConfigurationException if the value is not the name of an
        access level.
        """
        access = self.config.get('access', self.DEFAULT_ACCESS)
        try:
            # Look the value up among the enum members only. getattr() would
            # also accept any other attribute of the enum class (for example
            # '__doc__') and store a value that is not an access level.
            self.access = Access[access]
        except KeyError as err:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id) from err

    def _get_datadir(self):
        """return the path of the directory where this project can store its
        data files, creating the directory if it does not exist yet"""
        # exist_ok avoids the race between checking for the directory and
        # creating it when something else creates it in the meantime
        os.makedirs(self._datadir, exist_ok=True)
        return self._datadir

    def _require_backend(self):
        """return the backend of this project, raising
        NotInitializedException if it could not be created"""
        backend = self.backend
        if backend is None:
            raise NotInitializedException(
                "backend '{}' is not available".format(
                    self.config['backend']),
                project_id=self.project_id)
        return backend

    def _initialize_analyzer(self):
        """create the analyzer (if one is configured) and log it"""
        analyzer = self.analyzer
        logger.debug("Project '%s': initialized analyzer: %s",
                     self.project_id,
                     str(analyzer))

    def _initialize_subjects(self):
        """load the subjects of the vocabulary; an AnnifException is logged
        as a warning instead of being propagated"""
        try:
            subjects = self.subjects
            logger.debug("Project '%s': initialized subjects: %s",
                         self.project_id,
                         str(subjects))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_vectorizer(self):
        """load the vectorizer; an AnnifException (such as a missing
        vectorizer file) is logged as a warning instead of being
        propagated"""
        try:
            vectorizer = self.vectorizer
            logger.debug("Project '%s': initialized vectorizer: %s",
                         self.project_id,
                         str(vectorizer))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self):
        """initialize the backend if it could be created; an AnnifException
        raised by the backend is logged as a warning"""
        logger.debug("Project '%s': initializing backend", self.project_id)
        if not self.backend:
            logger.debug("Cannot initialize backend: does not exist")
            return
        try:
            self.backend.initialize()
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self):
        """initialize this project and its backend so that they are ready to
        analyze"""
        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_vectorizer()
        self._initialize_backend()

        self.initialized = True

    def _analyze_with_backend(self, text, backend_params):
        """Pass the text to the backend and return its hits.

        backend_params -- optional mapping of backend_id -> parameters; only
            the entry for this project's backend is passed on

        Raises NotInitializedException if the backend could not be created.
        """
        if backend_params is None:
            backend_params = {}
        # fail with a clear error rather than an AttributeError on None
        backend = self._require_backend()
        beparams = backend_params.get(backend.backend_id, {})
        hits = backend.analyze(text, project=self, params=beparams)
        logger.debug(
            'Got %d hits from backend %s',
            len(hits), backend.backend_id)
        return hits

    @property
    def analyzer(self):
        """the analyzer created from the 'analyzer' setting, or None if the
        setting is missing"""
        if self._analyzer is None and self.analyzer_spec:
            self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
        return self._analyzer

    @property
    def backend(self):
        """the backend object named by the 'backend' setting, or None if it
        could not be created (a warning is logged in that case)"""
        if self._backend is None:
            backend_id = self.config['backend']
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, params=self.config, datadir=self._datadir)
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id)
        return self._backend

    @property
    def vocab(self):
        """the vocabulary of this project; raises ConfigurationException if
        the 'vocab' setting is missing"""
        if self._vocab is None:
            if self.vocab_id is None:
                raise ConfigurationException("vocab setting is missing",
                                             project_id=self.project_id)
            self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
                                                      self._base_datadir)
        return self._vocab

    @property
    def subjects(self):
        """the subjects of the project vocabulary"""
        return self.vocab.subjects

    @property
    def vectorizer(self):
        """the vectorizer loaded from the project data directory; raises
        NotInitializedException if the vectorizer file does not exist"""
        if self._vectorizer is None:
            path = os.path.join(self._get_datadir(), 'vectorizer')
            if os.path.exists(path):
                logger.debug('loading vectorizer from %s', path)
                # NOTE: joblib.load unpickles the file, so the data
                # directory must only contain trusted files
                self._vectorizer = joblib.load(path)
            else:
                raise NotInitializedException(
                    "vectorizer file '{}' not found".format(path),
                    project_id=self.project_id)
        return self._vectorizer

    def analyze(self, text, backend_params=None):
        """Analyze the given text by passing it to the backend. Returns the
        hits produced by the backend.

        Raises NotInitializedException if the backend could not be created.
        """

        logger.debug('Analyzing text "%s..." (len=%d)',
                     text[:20], len(text))
        hits = self._analyze_with_backend(text, backend_params)
        logger.debug('%d hits from backend', len(hits))
        return hits

    def _create_vectorizer(self, subjectcorpus):
        """Fit a TF-IDF vectorizer on the texts of the subjects in the corpus
        and save it in the project data directory. Nothing is done if the
        backend does not need a subject vectorizer.

        Raises NotInitializedException if the backend could not be created.
        """
        if not self._require_backend().needs_subject_vectorizer:
            logger.debug('not creating vectorizer: not needed by backend')
            return
        logger.info('creating vectorizer')
        self._vectorizer = TfidfVectorizer(
            tokenizer=self.analyzer.tokenize_words)
        self._vectorizer.fit((subj.text for subj in subjectcorpus.subjects))
        annif.util.atomic_save(
            self._vectorizer,
            self._get_datadir(),
            'vectorizer',
            method=joblib.dump)

    def train(self, corpus):
        """train the project using documents from a metadata source

        Raises NotInitializedException if the backend could not be created.
        """

        corpus.set_subject_index(self.subjects)
        self._create_vectorizer(corpus)
        self._require_backend().train(corpus, project=self)

    def dump(self):
        """return this project as a dict"""
        return {'project_id': self.project_id,
                'name': self.name,
                'language': self.language,
                'backend': {'backend_id': self.config['backend']}
                }
212
213
214
def _create_projects(projects_file, datadir, init_projects):
215
    if not os.path.exists(projects_file):
216
        logger.warning("Project configuration file '%s' is missing. " +
217
                       'Please provide one.', projects_file)
218
        logger.warning('You can set the path to the project configuration ' +
219
                       'file using the ANNIF_PROJECTS environment variable.')
220
        return {}
221
222
    config = configparser.ConfigParser()
223
    config.optionxform = lambda option: option
224
    with open(projects_file) as projf:
225
        config.read_file(projf)
226
227
    # create AnnifProject objects from the configuration file
228
    projects = collections.OrderedDict()
229
    for project_id in config.sections():
230
        projects[project_id] = AnnifProject(project_id,
231
                                            config[project_id],
232
                                            datadir)
233
        if init_projects:
234
            projects[project_id].initialize()
235
    return projects
236
237
238
def initialize_projects(app):
    """Create the projects configured for the application.

    Reads the PROJECTS_FILE, DATADIR and INITIALIZE_PROJECTS settings from
    app.config and stores the resulting projects as app.annif_projects.
    """
    settings = app.config
    app.annif_projects = _create_projects(
        settings['PROJECTS_FILE'],
        settings['DATADIR'],
        settings['INITIALIZE_PROJECTS'])
244
245
246
def get_projects(min_access=Access.private):
    """Return the projects of the current application as an OrderedDict of
    project_id -> AnnifProject, keeping their original order. Only projects
    whose access level is at least min_access are included."""

    selected = collections.OrderedDict()
    for project_id, project in current_app.annif_projects.items():
        if project.access >= min_access:
            selected[project_id] = project
    return selected
255
256
257
def get_project(project_id, min_access=Access.private):
    """Return the project with the given project_id, considering only the
    projects whose access level is at least min_access.

    Raises ValueError if no such project is available.
    """
    available = get_projects(min_access)
    try:
        project = available[project_id]
    except KeyError:
        message = "No such project {}".format(project_id)
        raise ValueError(message)
    return project
264