Completed: Push to master ( 244db9...8e90e2 ) by Osma, 26s (queued 11s)

TFIDFBackend._initialize_index()   rating A

Complexity:    Conditions 3
Size:          Total Lines 11, Code Lines 10
Duplication:   Lines 0, Ratio 0 %
Importance:    Changes 0

Metric                           Value
eloc                             10
dl                               0
loc (lines of code)              11
rs                               9.9
c                                0
b                                0
f                                0
cc (cyclomatic complexity)       3
nop (number of parameters)       1
"""Backend that returns most similar subjects based on similarity in sparse
TF-IDF normalized bag-of-words vector space"""

import os.path
import joblib
import gensim.similarities
from gensim.matutils import Sparse2Corpus
from sklearn.feature_extraction.text import TfidfVectorizer
import annif.util
from annif.suggestion import VectorSuggestionResult
from annif.exception import NotInitializedException, NotSupportedException
from . import backend


class TFIDFBackend(backend.AnnifBackend):
    """TF-IDF vector space similarity based backend for Annif"""
    name = "tfidf"
    needs_subject_index = True

    # defaults for uninitialized instances
    _vectorizer = None
    _index = None

    VECTORIZER_FILE = 'vectorizer'
    INDEX_FILE = 'tfidf-index'

    def _initialize_vectorizer(self):
        if self._vectorizer is None:
            path = os.path.join(self.datadir, self.VECTORIZER_FILE)
            if os.path.exists(path):
                self.debug('loading vectorizer from {}'.format(path))
                self._vectorizer = joblib.load(path)
            else:
                raise NotInitializedException(
                    "vectorizer file '{}' not found".format(path),
                    backend_id=self.backend_id)

    def _initialize_index(self):
        if self._index is None:
            path = os.path.join(self.datadir, self.INDEX_FILE)
            self.debug('loading similarity index from {}'.format(path))
            if os.path.exists(path):
                self._index = gensim.similarities.SparseMatrixSimilarity.load(
                    path)
            else:
                raise NotInitializedException(
                    'similarity index {} not found'.format(path),
                    backend_id=self.backend_id)

    def initialize(self):
        self._initialize_vectorizer()
        self._initialize_index()

    def _create_index(self, veccorpus):
        self.info('creating similarity index')
        gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)
        self._index = gensim.similarities.SparseMatrixSimilarity(
            gscorpus, num_features=len(self._vectorizer.vocabulary_))
        annif.util.atomic_save(
            self._index,
            self.datadir,
            self.INDEX_FILE)

    def train(self, corpus, project):
        if corpus.is_empty():
            raise NotSupportedException(
                'Cannot train tfidf project with no documents')
        self.info('transforming subject corpus')
        subjects = corpus.subjects
        self.info('creating vectorizer')
        self._vectorizer = TfidfVectorizer(
            tokenizer=project.analyzer.tokenize_words)
        veccorpus = self._vectorizer.fit_transform(
            (subj.text for subj in subjects))
1 ignored issue (Comprehensibility Best Practice): the variable subj does not seem to be defined. This is a false positive: subj is bound by the generator expression itself (see the short illustration after the listing).
        annif.util.atomic_save(
            self._vectorizer,
            self.datadir,
            self.VECTORIZER_FILE,
            method=joblib.dump)
        self._create_index(veccorpus)

    def _suggest(self, text, project, params):
        self.debug('Suggesting subjects for text "{}..." (len={})'.format(
            text[:20], len(text)))
        vectors = self._vectorizer.transform([text])
        docsim = self._index[vectors[0]]
        fullresult = VectorSuggestionResult(docsim, project.subjects)
        return fullresult.filter(limit=int(self.params['limit']))
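
Regarding the ignored issue flagged inside train(): the name subj is introduced by the generator expression passed to fit_transform, not by any surrounding assignment, which is what confuses the checker. A minimal standalone sketch (plain Python, not part of tfidf.py) of the same construct:

# The loop variable of a generator expression is defined by the expression
# itself, so `subj` below needs no prior assignment.
subjects = ["cats", "dogs"]
texts = (subj.upper() for subj in subjects)
print(list(texts))  # ['CATS', 'DOGS']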
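
For context, train() and _suggest() above boil down to fitting a TfidfVectorizer on the subject corpus and then querying a gensim SparseMatrixSimilarity index with the vectorized input text. Below is a minimal standalone sketch of that pipeline using the same sklearn and gensim calls as the backend; the toy subject_texts list and the query string are illustrative only and not part of the Annif API:

import gensim.similarities
from gensim.matutils import Sparse2Corpus
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy "subject corpus": one text blob per subject (illustrative data).
subject_texts = [
    "cats felines pets purring",
    "dogs canines pets barking",
    "computers programming software",
]

# Training side: fit the vectorizer and index the sparse TF-IDF matrix.
vectorizer = TfidfVectorizer()
veccorpus = vectorizer.fit_transform(subject_texts)
gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)  # rows = documents
index = gensim.similarities.SparseMatrixSimilarity(
    gscorpus, num_features=len(vectorizer.vocabulary_))

# Suggestion side: vectorize the input text and rank subjects by similarity.
query = vectorizer.transform(["my pet cat keeps purring"])
similarities = index[query[0]]  # one score per subject, in corpus order
best = similarities.argmax()
print(subject_texts[best], similarities[best])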
89