Passed
Pull Request: master (#336) by Osma, created 03:41

annif.backend.tfidf.SubjectBuffer.flush()   (grade: A)

Complexity
    Conditions    4

Size
    Total Lines   12
    Code Lines    9

Duplication
    Lines         0
    Ratio         0 %

Importance
    Changes       0

Metric    Value
eloc      9
dl        0
loc       12
rs        9.95
c         0
b         0
f         0
cc        4
nop       1
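
The report does not name the tool behind these abbreviated metrics, although cc is evidently cyclomatic complexity and loc lines of code. As a rough illustration only, comparable per-method numbers can be produced with a Python complexity library such as radon; branch-counting rules differ between tools, so the values need not match this table exactly.

import inspect
from radon.complexity import cc_visit
import annif.backend.tfidf as tfidf_module

# Parse the module source and print a cyclomatic complexity figure for
# every method found in its classes (top-level classes expose .methods).
source = inspect.getsource(tfidf_module)
for block in cc_visit(source):
    for method in getattr(block, 'methods', []):
        print(method.name, method.complexity)

The file under review, annif/backend/tfidf.py, follows in full: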
"""Backend that returns most similar subjects based on similarity in sparse
TF-IDF normalized bag-of-words vector space"""

import os.path
import tempfile
import joblib
import gensim.similarities
from gensim.matutils import Sparse2Corpus
from sklearn.feature_extraction.text import TfidfVectorizer
import annif.util
from annif.suggestion import VectorSuggestionResult
from annif.exception import NotInitializedException, NotSupportedException
from . import backend


class SubjectBuffer:
    """A file-backed buffer to store and retrieve subject text."""

    BUFFER_SIZE = 100

    def __init__(self, tempdir, subject_id):
        filename = '{:08d}.txt'.format(subject_id)
        self._path = os.path.join(tempdir, filename)
        self._buffer = []
        self._created = False

    def flush(self):
        if self._created:
            mode = 'a'
        else:
            mode = 'w'

        with open(self._path, mode, encoding='utf-8') as subjfile:
            for text in self._buffer:
                print(text, file=subjfile)

        self._buffer = []
        self._created = True

    def write(self, text):
        self._buffer.append(text)
        if len(self._buffer) >= self.BUFFER_SIZE:
            self.flush()

    def read(self):
        if not self._created:
            # file was never created - we can simply return the buffer content
            return "\n".join(self._buffer)
        else:
            with open(self._path, 'r', encoding='utf-8') as subjfile:
                return subjfile.read() + "\n" + "\n".join(self._buffer)
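
# Illustration only, not part of the reviewed module: a minimal sketch of
# how SubjectBuffer behaves. Writes are buffered in memory and spilled to
# <tempdir>/<subject_id as 8 digits>.txt once BUFFER_SIZE lines accumulate;
# read() then returns the flushed file content plus anything still buffered.
def _demo_subject_buffer():
    with tempfile.TemporaryDirectory() as tmpdir:
        buf = SubjectBuffer(tmpdir, subject_id=0)
        for i in range(150):  # crosses BUFFER_SIZE (100), forcing one flush
            buf.write("token line {}".format(i))
        assert buf.read().startswith("token line 0")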

class TFIDFBackend(backend.AnnifBackend):
    """TF-IDF vector space similarity based backend for Annif"""
    name = "tfidf"
    needs_subject_index = True

    # defaults for uninitialized instances
    _vectorizer = None
    _index = None

    VECTORIZER_FILE = 'vectorizer'
    INDEX_FILE = 'tfidf-index'

    def _generate_subjects_from_documents(self, corpus, project):
        with tempfile.TemporaryDirectory() as tempdir:
            subject_buffer = {}
            for subject_id in range(len(project.subjects)):
                subject_buffer[subject_id] = SubjectBuffer(tempdir,
                                                           subject_id)

            for doc in corpus.documents:
                tokens = project.analyzer.tokenize_words(doc.text)
                for uri in doc.uris:
                    subject_id = project.subjects.by_uri(uri)
                    if subject_id is None:
                        continue
                    subject_buffer[subject_id].write(" ".join(tokens))

            for sid in range(len(project.subjects)):
                yield subject_buffer[sid].read()
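
    # Illustration (not in the original module): given a document "cats are
    # pets" tagged with subject A and "dogs are pets" tagged with subject B,
    # the generator above yields one space-joined token string per subject
    # id, so each TF-IDF training "document" is the combined text of one
    # subject rather than of one input document.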

    def _initialize_vectorizer(self):
        if self._vectorizer is None:
            path = os.path.join(self.datadir, self.VECTORIZER_FILE)
            if os.path.exists(path):
                self.debug('loading vectorizer from {}'.format(path))
                self._vectorizer = joblib.load(path)
            else:
                raise NotInitializedException(
                    "vectorizer file '{}' not found".format(path),
                    backend_id=self.backend_id)

    def _initialize_index(self):
        if self._index is None:
            path = os.path.join(self.datadir, self.INDEX_FILE)
            self.debug('loading similarity index from {}'.format(path))
            if os.path.exists(path):
                self._index = gensim.similarities.SparseMatrixSimilarity.load(
                    path)
            else:
                raise NotInitializedException(
                    'similarity index {} not found'.format(path),
                    backend_id=self.backend_id)

    def initialize(self):
        self._initialize_vectorizer()
        self._initialize_index()

    def _create_index(self, veccorpus):
        self.info('creating similarity index')
        gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)
        self._index = gensim.similarities.SparseMatrixSimilarity(
            gscorpus, num_features=len(self._vectorizer.vocabulary_))
        annif.util.atomic_save(
            self._index,
            self.datadir,
            self.INDEX_FILE)

    def train(self, corpus, project):
        if corpus.is_empty():
            raise NotSupportedException(
                'Cannot train tfidf project with no documents')
        self.info('transforming subject corpus')
        subjects = self._generate_subjects_from_documents(corpus, project)
        self.info('creating vectorizer')
        self._vectorizer = TfidfVectorizer()
        veccorpus = self._vectorizer.fit_transform(subjects)
        annif.util.atomic_save(
            self._vectorizer,
            self.datadir,
            self.VECTORIZER_FILE,
            method=joblib.dump)
        self._create_index(veccorpus)

    def _suggest(self, text, project, params):
        self.debug('Suggesting subjects for text "{}..." (len={})'.format(
            text[:20], len(text)))
        tokens = project.analyzer.tokenize_words(text)
        vectors = self._vectorizer.transform([" ".join(tokens)])
        docsim = self._index[vectors[0]]
        fullresult = VectorSuggestionResult(docsim, project.subjects)
        return fullresult.filter(limit=int(self.params['limit']))
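
For context, the core technique the backend implements (TF-IDF subject vectors queried through a gensim sparse similarity index) can be sketched outside Annif with the same two libraries. The subject texts and query below are made-up placeholders; only the sklearn/gensim calls mirror the code above.

import gensim.similarities
from gensim.matutils import Sparse2Corpus
from sklearn.feature_extraction.text import TfidfVectorizer

# One combined text per subject, as _generate_subjects_from_documents yields.
subject_texts = ["cat feline pet", "dog canine pet", "python code programming"]

vectorizer = TfidfVectorizer()
veccorpus = vectorizer.fit_transform(subject_texts)

# Rows of the sparse matrix are subjects, hence documents_columns=False.
gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)
index = gensim.similarities.SparseMatrixSimilarity(
    gscorpus, num_features=len(vectorizer.vocabulary_))

# Vectorize a query the same way and read off cosine similarities,
# one score per subject, just as _suggest does before filtering.
query = vectorizer.transform(["my pet cat"])
print(index[query[0]])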