Passed
Pull Request — master (#366)
by Osma
03:53
created

ChunkingBackend.default_params()   A

Complexity

Conditions 1

Size

Total Lines 2
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 2
rs 10
c 0
b 0
f 0
cc 1
nop 1
1
"""Annif backend mixins that can be used to implement features"""
2
3
4
import abc
5
import os.path
6
import joblib
7
from sklearn.feature_extraction.text import TfidfVectorizer
8
import annif.util
9
from annif.exception import NotInitializedException
10
from annif.suggestion import ListSuggestionResult
11
12
13
class ChunkingBackend(metaclass=abc.ABCMeta):
    """Annif backend mixin that implements chunking of input.

    The input text is split into sentences, consecutive sentences are
    joined into chunks of ``chunksize`` sentences each, and the resulting
    chunk texts are passed to ``_suggest_chunks``, which the inheriting
    class must implement.

    The mixin uses ``self.debug`` and ``self.project``, which are expected
    to be provided by the class it is mixed into.
    """

    DEFAULT_PARAMS = {'chunksize': 1}

    def default_params(self):
        """Return the default parameters of this mixin as a new dict.

        A copy is returned so that a caller modifying the result cannot
        alter the class-level DEFAULT_PARAMS shared by all instances.
        """
        return dict(self.DEFAULT_PARAMS)

    @abc.abstractmethod
    def _suggest_chunks(self, chunktexts):
        """Suggest subjects for the chunked text; should be implemented by
        the subclass inheriting this mixin"""

        pass  # pragma: no cover

    def _suggest(self, text, params):
        """Split the text into chunks of sentences and suggest subjects.

        ``params['chunksize']`` is converted with ``int()`` and gives the
        number of sentences joined (with a single space) into each chunk.

        Returns an empty ListSuggestionResult when the text yields no
        chunks; otherwise returns whatever ``_suggest_chunks`` returns.

        Raises ValueError if the chunk size is not a positive integer.
        """
        self.debug('Suggesting subjects for text "{}..." (len={})'.format(
            text[:20], len(text)))
        sentences = self.project.analyzer.tokenize_sentences(text)
        self.debug('Found {} sentences'.format(len(sentences)))
        chunksize = int(params['chunksize'])
        if chunksize < 1:
            # A zero step would make range() fail with an unrelated message
            # and a negative step would silently produce no chunks at all.
            raise ValueError(
                "chunksize must be a positive integer, got {}".format(
                    chunksize))
        chunktexts = [' '.join(sentences[start:start + chunksize])
                      for start in range(0, len(sentences), chunksize)]
        self.debug('Split sentences into {} chunks'.format(len(chunktexts)))
        if not chunktexts:  # no input, empty result
            return ListSuggestionResult(
                hits=[], subject_index=self.project.subjects)
        return self._suggest_chunks(chunktexts)
42
43
44
class TfidfVectorizerMixin:
    """Annif backend mixin that implements TfidfVectorizer functionality.

    The mixin uses ``self.datadir``, ``self.backend_id``, ``self.debug``
    and ``self.info``, which are expected to be provided by the class it
    is mixed into.
    """

    # name of the vectorizer file stored under self.datadir
    VECTORIZER_FILE = 'vectorizer'

    # the loaded or created vectorizer; None until initialized
    vectorizer = None

    def initialize_vectorizer(self):
        """Load the stored vectorizer unless one is already set.

        The vectorizer is read with joblib from VECTORIZER_FILE under
        ``self.datadir``.

        Raises NotInitializedException if the vectorizer file does not
        exist.
        """
        if self.vectorizer is not None:
            return  # already loaded or created, nothing to do
        path = os.path.join(self.datadir, self.VECTORIZER_FILE)
        if not os.path.exists(path):
            raise NotInitializedException(
                "vectorizer file '{}' not found".format(path),
                backend_id=self.backend_id)
        self.debug('loading vectorizer from {}'.format(path))
        self.vectorizer = joblib.load(path)

    def create_vectorizer(self, input, params=None):
        """Create, fit and save a new TfidfVectorizer.

        ``input`` is passed as-is to ``TfidfVectorizer.fit_transform``
        (the name shadows the builtin but is kept because it is part of
        the public signature). ``params`` is an optional dict of keyword
        arguments for the TfidfVectorizer constructor; omitting it or
        passing None means no extra arguments.

        The fitted vectorizer is stored in ``self.vectorizer`` and saved
        as VECTORIZER_FILE under ``self.datadir``. Returns the result of
        ``fit_transform``.
        """
        if params is None:
            # avoid a mutable default argument shared between calls
            params = {}
        self.info('creating vectorizer')
        self.vectorizer = TfidfVectorizer(**params)
        veccorpus = self.vectorizer.fit_transform(input)
        annif.util.atomic_save(
            self.vectorizer,
            self.datadir,
            self.VECTORIZER_FILE,
            method=joblib.dump)
        return veccorpus
72