TFIDFBackend._generate_subjects_from_documents()   B
last analyzed

Complexity

Conditions 6

Size

Total Lines 15
Code Lines 12

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 6
eloc 12
nop 2
dl 0
loc 15
rs 8.6666
c 0
b 0
f 0
"""Backend that returns most similar subjects based on similarity in sparse
TF-IDF normalized bag-of-words vector space"""

from __future__ import annotations

import os.path
import tempfile
from typing import TYPE_CHECKING, Any

from scipy.sparse import load_npz, save_npz
from sklearn.preprocessing import normalize

import annif.util
from annif.exception import NotInitializedException, NotSupportedException
from annif.suggestion import vector_to_suggestions

from . import backend, mixins

if TYPE_CHECKING:
    from collections.abc import Iterator

    from annif.corpus import Document, DocumentCorpus

class SubjectBuffer:
    """A file-backed buffer to store and retrieve subject text."""

    # number of buffered texts that triggers an automatic flush to disk
    BUFFER_SIZE = 100

    def __init__(self, tempdir: str, subject_id: int) -> None:
        """Prepare a buffer backed by a per-subject file under tempdir."""
        self._path = os.path.join(tempdir, "{:08d}.txt".format(subject_id))
        self._buffer = []
        # becomes True once the backing file has been written at least once
        self._created = False

    def flush(self) -> None:
        """Append the buffered texts to the backing file and clear the buffer."""
        # the first flush creates the file; subsequent flushes append to it
        mode = "a" if self._created else "w"
        with open(self._path, mode, encoding="utf-8") as outfile:
            outfile.write("".join(text + "\n" for text in self._buffer))
        self._buffer = []
        self._created = True

    def write(self, text: str) -> None:
        """Add one text to the buffer, flushing once BUFFER_SIZE is reached."""
        self._buffer.append(text)
        if len(self._buffer) >= self.BUFFER_SIZE:
            self.flush()

    def read(self) -> str:
        """Return all text written so far, both flushed and still buffered."""
        if not self._created:
            # the file was never created, so the buffer holds everything
            return "\n".join(self._buffer)
        with open(self._path, "r", encoding="utf-8") as infile:
            return infile.read() + "\n" + "\n".join(self._buffer)
class TFIDFBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
    """TF-IDF vector space similarity based backend for Annif"""

    name = "tfidf"

    # defaults for uninitialized instances
    _tfidf_matrix = None

    MATRIX_FILE = "tfidf-matrix.npz"

    def _generate_subjects_from_documents(
        self, corpus: DocumentCorpus
    ) -> Iterator[str]:
        """Yield one combined text per subject, collected from the corpus
        documents via temporary file-backed buffers."""
        with tempfile.TemporaryDirectory() as tempdir:
            n_subjects = len(self.project.subjects)
            buffers = {
                sid: SubjectBuffer(tempdir, sid) for sid in range(n_subjects)
            }

            for doc in corpus.documents:
                # tokenize once per document; the same text goes to every
                # subject the document is assigned to
                text = " ".join(self.project.analyzer.tokenize_words(doc.text))
                for sid in doc.subject_set:
                    buffers[sid].write(text)

            for sid in range(n_subjects):
                yield buffers[sid].read()

    def _initialize_index(self) -> None:
        """Load the tf-idf matrix from the data directory if not yet loaded.

        Raises NotInitializedException when the matrix file is missing."""
        if self._tfidf_matrix is not None:
            return
        path = os.path.join(self.datadir, self.MATRIX_FILE)
        self.debug("loading tf-idf matrix from {}".format(path))
        if not os.path.exists(path):
            raise NotInitializedException(
                "tf-idf matrix {} not found".format(path),
                backend_id=self.backend_id,
            )
        self._tfidf_matrix = load_npz(path)

    def initialize(self, parallel: bool = False) -> None:
        """Prepare the backend for use: the vectorizer first, then the
        tf-idf matrix."""
        self.initialize_vectorizer()
        self._initialize_index()

    def _train(
        self,
        corpus: DocumentCorpus,
        params: dict[str, Any],
        jobs: int = 0,
    ) -> None:
        """Build the normalized tf-idf subject matrix from the corpus and
        persist it atomically to the data directory.

        Raises NotSupportedException for cached or empty corpora."""
        if corpus == "cached":
            raise NotSupportedException(
                "Training tfidf project from cached data not supported."
            )
        if corpus.is_empty():
            raise NotSupportedException("Cannot train tfidf project with no documents")
        self.info("transforming subject corpus")
        subject_texts = self._generate_subjects_from_documents(corpus)
        self._tfidf_matrix = normalize(self.create_vectorizer(subject_texts))
        self.info("saving tf-idf matrix")
        annif.util.atomic_save(
            self._tfidf_matrix,
            self.datadir,
            self.MATRIX_FILE,
            lambda obj, filename: save_npz(filename, obj),
        )

    def _suggest(self, doc: Document, params: dict[str, Any]) -> Iterator:
        """Return subject suggestions for the document, ranked by cosine
        similarity against the indexed subject matrix."""
        self.debug(
            'Suggesting subjects for text "{}..." (len={})'.format(
                doc.text[:20], len(doc.text)
            )
        )
        words = self.project.analyzer.tokenize_words(doc.text)
        query = normalize(self.vectorizer.transform([" ".join(words)]))

        # both operands are L2-normalized, so the dot product between the
        # query vector and each matrix row is the cosine similarity
        scores = (query @ self._tfidf_matrix.T).toarray().flatten()

        return vector_to_suggestions(scores, int(params["limit"]))