Completed
Push — master ( ac148f...299d84 )
by Osma
06:16 queued 25s
created

annif.project.AnnifProject.load_documents()   A

Complexity

Conditions 1

Size

Total Lines 6
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 4
dl 0
loc 6
rs 10
c 0
b 0
f 0
cc 1
nop 2
1
"""Project management functionality for Annif"""
2
3
import collections
4
import configparser
5
import os.path
6
from sklearn.externals import joblib
7
from sklearn.feature_extraction.text import TfidfVectorizer
8
from flask import current_app
9
import annif
10
import annif.analyzer
11
import annif.corpus
12
import annif.hit
13
import annif.backend
14
import annif.util
15
import annif.vocab
16
from annif.exception import AnnifException, ConfigurationException, \
17
    NotInitializedException
18
19
logger = annif.logger
20
21
22
class AnnifProject:
    """Class representing the configuration of a single Annif project.

    The analyzer, backend, vocabulary and vectorizer are created or loaded
    lazily on first access through the corresponding properties and are
    then cached on the instance.
    """

    # defaults for uninitialized instances
    _analyzer = None
    _backend = None
    _vocab = None
    _vectorizer = None
    initialized = False

    def __init__(self, project_id, config, datadir):
        """Store the settings of one project.

        project_id -- identifier of the project; also used as the name of
            its data subdirectory
        config -- mapping of settings; 'name' and 'language' are required
            here, 'analyzer' and 'vocab' are optional, and 'backend' is
            read later by the backend property and by dump()
        datadir -- base data directory; project data is placed under
            <datadir>/projects/<project_id>
        """
        self.project_id = project_id
        self.name = config['name']
        self.language = config['language']
        self.analyzer_spec = config.get('analyzer', None)
        self.vocab_id = config.get('vocab', None)
        self._base_datadir = datadir
        self._datadir = os.path.join(datadir, 'projects', self.project_id)
        self.config = config

    def _get_datadir(self):
        """return the path of the directory where this project can store its
        data files, creating the directory if it does not exist yet"""
        # exist_ok avoids a check-then-create race when another process
        # creates the same directory at the same time
        os.makedirs(self._datadir, exist_ok=True)
        return self._datadir

    def _initialize_analyzer(self):
        """access the analyzer property so that it gets created, and log
        the result"""
        analyzer = self.analyzer
        logger.debug("Project '%s': initialized analyzer: %s",
                     self.project_id,
                     str(analyzer))

    def _initialize_subjects(self):
        """access the subjects property so that it gets loaded; an
        AnnifException is logged as a warning instead of being raised"""
        try:
            subjects = self.subjects
            logger.debug("Project '%s': initialized subjects: %s",
                         self.project_id,
                         str(subjects))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_vectorizer(self):
        """access the vectorizer property so that it gets loaded; an
        AnnifException is logged as a warning instead of being raised"""
        try:
            vectorizer = self.vectorizer
            logger.debug("Project '%s': initialized vectorizer: %s",
                         self.project_id,
                         str(vectorizer))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self):
        """initialize the backend if one could be created; an
        AnnifException from the backend is logged as a warning"""
        logger.debug("Project '%s': initializing backend", self.project_id)
        if not self.backend:
            logger.debug("Cannot initialize backend: does not exist")
            return
        try:
            self.backend.initialize()
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self):
        """initialize this project and its backend so that they are ready to
        analyze"""
        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_vectorizer()
        self._initialize_backend()

        self.initialized = True

    def _analyze_with_backend(self, text, backend_params):
        """pass the text to the backend and return the hits it produced

        backend_params -- optional dict keyed by backend id; only the entry
            for this project's backend is forwarded to the backend
        """
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        hits = self.backend.analyze(text, project=self, params=beparams)
        logger.debug(
            'Got %d hits from backend %s',
            len(hits), self.backend.backend_id)
        return hits

    @property
    def analyzer(self):
        """the analyzer built from analyzer_spec, created on first access;
        None when no analyzer has been configured"""
        if self._analyzer is None and self.analyzer_spec:
            self._analyzer = annif.analyzer.get_analyzer(self.analyzer_spec)
        return self._analyzer

    @property
    def backend(self):
        """the backend named by the 'backend' setting, created on first
        access; None if creating it raised ValueError"""
        if self._backend is None:
            backend_id = self.config['backend']
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, params=self.config, datadir=self._datadir)
            except ValueError:
                # nothing is cached in this case, so creation is attempted
                # again on the next access
                logger.debug("Could not create backend %s", backend_id)
        return self._backend

    @property
    def vocab(self):
        """the vocabulary named by the 'vocab' setting, created on first
        access; raises ConfigurationException if the setting is missing"""
        if self._vocab is None:
            if self.vocab_id is None:
                raise ConfigurationException("vocab setting is missing",
                                             project_id=self.project_id)
            self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
                                                      self._base_datadir)
        return self._vocab

    @property
    def subjects(self):
        """the subjects of this project's vocabulary"""
        return self.vocab.subjects

    @property
    def vectorizer(self):
        """the vectorizer stored in the file 'vectorizer' of the project
        data directory, loaded on first access; raises
        NotInitializedException if that file does not exist"""
        if self._vectorizer is None:
            path = os.path.join(self._get_datadir(), 'vectorizer')
            if os.path.exists(path):
                logger.debug('loading vectorizer from %s', path)
                # NOTE: joblib files are pickle-based, so the data
                # directory must only contain trusted files
                self._vectorizer = joblib.load(path)
            else:
                raise NotInitializedException(
                    "vectorizer file '{}' not found".format(path),
                    project_id=self.project_id)
        return self._vectorizer

    def analyze(self, text, backend_params=None):
        """Analyze the given text by passing it to the backend. Returns a
        list of AnalysisHit objects ordered by decreasing score."""

        logger.debug('Analyzing text "%s..." (len=%d)',
                     text[:20], len(text))
        hits = self._analyze_with_backend(text, backend_params)
        logger.debug('%d hits from backend', len(hits))
        return hits

    def _create_vectorizer(self, subjectcorpus):
        """fit a TfidfVectorizer on the subject texts of the corpus and save
        it as 'vectorizer' in the project data directory; does nothing when
        the backend reports that it does not need a subject vectorizer"""
        if not self.backend.needs_subject_vectorizer:
            logger.debug('not creating vectorizer: not needed by backend')
            return
        logger.info('creating vectorizer')
        self._vectorizer = TfidfVectorizer(
            tokenizer=self.analyzer.tokenize_words)
        # generator expression: the subject texts are handed to fit() one
        # at a time instead of being collected into a list first
        self._vectorizer.fit((subj.text for subj in subjectcorpus.subjects))
        annif.util.atomic_save(
            self._vectorizer,
            self._get_datadir(),
            'vectorizer',
            method=joblib.dump)

    def train(self, corpus):
        """train the project using documents from a metadata source"""

        corpus.set_subject_index(self.subjects)
        self._create_vectorizer(corpus)
        self.backend.train(corpus, project=self)

    def dump(self):
        """return this project as a dict"""
        return {'project_id': self.project_id,
                'name': self.name,
                'language': self.language,
                'backend': {'backend_id': self.config['backend']}
                }
188
189
190
def _create_projects(projects_file, datadir, init_projects):
191
    if not os.path.exists(projects_file):
192
        logger.warning("Project configuration file '%s' is missing. " +
193
                       'Please provide one.', projects_file)
194
        logger.warning('You can set the path to the project configuration ' +
195
                       'file using the ANNIF_PROJECTS environment variable.')
196
        return {}
197
198
    config = configparser.ConfigParser()
199
    config.optionxform = lambda option: option
200
    with open(projects_file) as projf:
201
        config.read_file(projf)
202
203
    # create AnnifProject objects from the configuration file
204
    projects = collections.OrderedDict()
205
    for project_id in config.sections():
206
        projects[project_id] = AnnifProject(project_id,
207
                                            config[project_id],
208
                                            datadir)
209
        if init_projects:
210
            projects[project_id].initialize()
211
    return projects
212
213
214
def initialize_projects(app):
    """create the projects described by the application configuration and
    store them on the application object as app.annif_projects

    The settings PROJECTS_FILE, DATADIR and INITIALIZE_PROJECTS are read
    from app.config.
    """
    projects_file = app.config['PROJECTS_FILE']
    datadir = app.config['DATADIR']
    init_projects = app.config['INITIALIZE_PROJECTS']
    app.annif_projects = _create_projects(
        projects_file, datadir, init_projects)
220
221
222
def get_projects():
    """return the available projects as a dict of project_id -> AnnifProject

    The projects are read from the annif_projects attribute of the current
    Flask application.
    """
    return current_app.annif_projects
225
226
227
def get_project(project_id):
    """return the definition of a single Project by project_id

    Raises ValueError if there is no project with the given id.
    """
    projects = get_projects()
    try:
        return projects[project_id]
    except KeyError as err:
        # chain explicitly so that the traceback shows the failed lookup as
        # the direct cause of the ValueError
        raise ValueError("No such project {}".format(project_id)) from err
234