Passed
Pull Request — master (#366)
by Osma
03:53
created

annif.backend.tfidf   A

Complexity

Total Complexity 25

Size/Duplication

Total Lines 124
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 25
eloc 93
dl 0
loc 124
rs 10
c 0
b 0
f 0

10 Methods

Rating   Name   Duplication   Size   Complexity  
A TFIDFBackend._suggest() 0 8 1
A TFIDFBackend.initialize() 0 3 1
A TFIDFBackend.train() 0 8 2
A SubjectBuffer.write() 0 4 2
A TFIDFBackend._initialize_index() 0 11 3
B TFIDFBackend._generate_subjects_from_documents() 0 17 7
A SubjectBuffer.read() 0 7 3
A SubjectBuffer.__init__() 0 5 1
A TFIDFBackend._create_index() 0 9 1
A SubjectBuffer.flush() 0 12 4
1
"""Backend that returns most similar subjects based on similarity in sparse
2
TF-IDF normalized bag-of-words vector space"""
3
4
import os.path
5
import tempfile
6
import gensim.similarities
7
from gensim.matutils import Sparse2Corpus
8
import annif.util
9
from annif.suggestion import VectorSuggestionResult
10
from annif.exception import NotInitializedException, NotSupportedException
11
from . import backend
12
from . import mixins
13
14
15
class SubjectBuffer:
    """A file-backed buffer to store and retrieve subject text."""

    # number of buffered lines that triggers an automatic flush to disk
    BUFFER_SIZE = 100

    def __init__(self, tempdir, subject_id):
        # one backing file per subject, named by zero-padded subject id
        self._path = os.path.join(tempdir, '{:08d}.txt'.format(subject_id))
        self._buffer = []
        self._created = False

    def flush(self):
        """Write all buffered lines to the backing file and clear the
        buffer."""
        # overwrite on the first flush, append on every later one
        mode = 'a' if self._created else 'w'

        with open(self._path, mode, encoding='utf-8') as subjfile:
            for line in self._buffer:
                print(line, file=subjfile)

        self._buffer = []
        self._created = True

    def write(self, text):
        """Buffer one line of text, flushing to disk when the buffer is
        full."""
        self._buffer.append(text)
        if len(self._buffer) >= self.BUFFER_SIZE:
            self.flush()

    def read(self):
        """Return all text written so far: flushed file content followed by
        any still-buffered lines."""
        if not self._created:
            # nothing was ever flushed - the in-memory buffer holds everything
            return "\n".join(self._buffer)
        with open(self._path, 'r', encoding='utf-8') as subjfile:
            return subjfile.read() + "\n" + "\n".join(self._buffer)
51
52
53
class TFIDFBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
    """TF-IDF vector space similarity based backend for Annif"""
    name = "tfidf"
    needs_subject_index = True

    # defaults for uninitialized instances
    _index = None

    INDEX_FILE = 'tfidf-index'

    def _generate_subjects_from_documents(self, corpus):
        """Yield one text blob per subject id, accumulating the tokenized
        text of every document tagged with that subject."""
        nr_subjects = len(self.project.subjects)
        with tempfile.TemporaryDirectory() as tempdir:
            # one file-backed buffer per known subject, indexed by subject id
            buffers = [SubjectBuffer(tempdir, subject_id)
                       for subject_id in range(nr_subjects)]

            for doc in corpus.documents:
                tokens = self.project.analyzer.tokenize_words(doc.text)
                doc_text = " ".join(tokens)
                for uri in doc.uris:
                    subject_id = self.project.subjects.by_uri(uri)
                    # skip URIs that are not part of the subject vocabulary
                    if subject_id is not None:
                        buffers[subject_id].write(doc_text)

            for buffer in buffers:
                yield buffer.read()

    def _initialize_index(self):
        """Load the similarity index from the data directory unless it has
        already been loaded.

        Raises NotInitializedException when no index file exists (i.e. the
        backend has not been trained)."""
        if self._index is not None:
            return
        path = os.path.join(self.datadir, self.INDEX_FILE)
        self.debug('loading similarity index from {}'.format(path))
        if not os.path.exists(path):
            raise NotInitializedException(
                'similarity index {} not found'.format(path),
                backend_id=self.backend_id)
        self._index = gensim.similarities.SparseMatrixSimilarity.load(path)

    def initialize(self):
        """Make the backend ready for suggesting: load the vectorizer and
        the similarity index."""
        self.initialize_vectorizer()
        self._initialize_index()

    def _create_index(self, veccorpus):
        """Build the similarity index from a vectorized corpus and persist it
        atomically into the data directory."""
        self.info('creating similarity index')
        gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)
        index = gensim.similarities.SparseMatrixSimilarity(
            gscorpus, num_features=len(self.vectorizer.vocabulary_))
        self._index = index
        annif.util.atomic_save(index, self.datadir, self.INDEX_FILE)

    def train(self, corpus):
        """Train the backend: turn the document corpus into per-subject text,
        vectorize it and build the similarity index.

        Raises NotSupportedException for an empty corpus."""
        if corpus.is_empty():
            raise NotSupportedException(
                'Cannot train tfidf project with no documents')
        self.info('transforming subject corpus')
        subjects = self._generate_subjects_from_documents(corpus)
        veccorpus = self.create_vectorizer(subjects)
        self._create_index(veccorpus)

    def _suggest(self, text, params):
        """Return subject suggestions for the given text, limited to the
        configured number of results."""
        self.debug('Suggesting subjects for text "{}..." (len={})'.format(
            text[:20], len(text)))
        tokens = self.project.analyzer.tokenize_words(text)
        vectors = self.vectorizer.transform([" ".join(tokens)])
        docsim = self._index[vectors[0]]
        fullresult = VectorSuggestionResult(docsim, self.project.subjects)
        return fullresult.filter(limit=int(self.params['limit']))
124