Completed
Push — master ( ac148f...299d84 )
by Osma
06:16 queued 25s
created

annif.backend.tfidf.TFIDFBackend.load_corpus()   A

Complexity

Conditions 1

Size

Total Lines 11
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 11
dl 0
loc 11
rs 9.85
c 0
b 0
f 0
cc 1
nop 3
1
"""Backend that returns most similar subjects based on similarity in sparse
2
TF-IDF normalized bag-of-words vector space"""
3
4
import os.path
5
import gensim.similarities
6
from gensim.matutils import Sparse2Corpus
7
import annif.util
8
from annif.hit import VectorAnalysisResult
9
from annif.exception import NotInitializedException
10
from . import backend
11
12
13
class TFIDFBackend(backend.AnnifBackend):
    """TF-IDF vector space similarity based backend for Annif"""
    name = "tfidf"
    needs_subject_index = True
    needs_subject_vectorizer = True

    # Lazily populated gensim similarity index; None until initialize()
    # or train() has run.
    _index = None

    # File name of the persisted similarity index inside the data directory.
    INDEX_FILE = 'tfidf-index'

    def initialize(self):
        """Load the persisted similarity index from the data directory.

        Raises NotInitializedException when no index file exists. Does
        nothing if the index has already been loaded.
        """
        if self._index is not None:
            return
        index_path = os.path.join(self._get_datadir(), self.INDEX_FILE)
        self.debug('loading similarity index from {}'.format(index_path))
        if not os.path.exists(index_path):
            raise NotInitializedException(
                'similarity index {} not found'.format(index_path),
                backend_id=self.backend_id)
        self._index = gensim.similarities.SparseMatrixSimilarity.load(
            index_path)

    def train(self, corpus, project):
        """Build a sparse TF-IDF similarity index from the subject corpus
        and persist it atomically into the data directory."""
        self.info('creating similarity index')
        subject_texts = (subj.text for subj in corpus.subjects)
        tfidf_matrix = project.vectorizer.transform(subject_texts)
        # gensim expects documents as rows, so transpose the scikit-learn
        # sparse matrix via documents_columns=False.
        gensim_corpus = Sparse2Corpus(tfidf_matrix, documents_columns=False)
        vocab_size = len(project.vectorizer.vocabulary_)
        self._index = gensim.similarities.SparseMatrixSimilarity(
            gensim_corpus, num_features=vocab_size)
        annif.util.atomic_save(
            self._index, self._get_datadir(), self.INDEX_FILE)

    def _analyze(self, text, project, params):
        """Return the subjects most similar to the given text, limited to
        the backend's configured 'limit' parameter.

        NOTE(review): the `params` argument is unused here; the limit is
        read from self.params — confirm whether per-call params should
        take precedence.
        """
        self.initialize()
        self.debug('Analyzing text "{}..." (len={})'.format(
            text[:20], len(text)))
        query_vector = project.vectorizer.transform([text])[0]
        similarities = self._index[query_vector]
        result = VectorAnalysisResult(similarities, project.subjects)
        return result.filter(limit=int(self.params['limit']))
56