annif.backend.tfidf.TFIDFBackend._generate_subjects_from_documents() - Code Metrics - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

TFIDFBackend._generate_subjects_from_documents() B
last analyzed 2025-08-19 08:37 UTC

↳ Parent: annif.backend.tfidf

Complexity

Conditions

Size

Total Lines	15
Code Lines	12

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	6
eloc	12
nop	2
dl	0
loc	15
rs	8.6666
c	0
b	0
f	0

"""Backend that returns most similar subjects based on similarity in sparse
TF-IDF normalized bag-of-words vector space"""

from __future__ import annotations

import os.path
import tempfile
from typing import TYPE_CHECKING, Any

from scipy.sparse import load_npz, save_npz
from sklearn.preprocessing import normalize

import annif.util
from annif.exception import NotInitializedException, NotSupportedException
from annif.suggestion import vector_to_suggestions

from . import backend, mixins

if TYPE_CHECKING:
    from collections.abc import Iterator

    from annif.corpus import Document, DocumentCorpus


class SubjectBuffer:
    """Collects text lines for one subject, spilling them to a file when full.

    Lines are kept in an in-memory list. Once the list holds BUFFER_SIZE
    entries it is written to a per-subject text file inside the directory
    given at construction time, and the list is emptied.
    """

    BUFFER_SIZE = 100

    def __init__(self, tempdir: str, subject_id: int) -> None:
        """Set up an empty buffer backed by a file in *tempdir*.

        The file name is the subject id zero-padded to eight digits with a
        ".txt" suffix. The file itself is not created until the first flush.
        """
        self._path = os.path.join(tempdir, "{:08d}.txt".format(subject_id))
        self._created = False
        self._buffer = []

    def flush(self) -> None:
        """Write all buffered lines to the backing file and empty the buffer."""
        # The first flush creates (or truncates) the file; later ones append.
        open_mode = "a" if self._created else "w"

        with open(self._path, open_mode, encoding="utf-8") as handle:
            for line in self._buffer:
                print(line, file=handle)

        self._created = True
        self._buffer = []

    def write(self, text: str) -> None:
        """Add one line of text, flushing to disk when the buffer is full."""
        self._buffer.append(text)
        if len(self._buffer) < self.BUFFER_SIZE:
            return
        self.flush()

    def read(self) -> str:
        """Return everything written so far: flushed lines, then pending ones."""
        pending = "\n".join(self._buffer)
        if self._created:
            # Already-flushed lines live in the file; the in-memory remainder
            # is appended after a separating newline.
            with open(self._path, "r", encoding="utf-8") as handle:
                return handle.read() + "\n" + pending
        # Nothing has been flushed yet, so the buffer holds all the content.
        return pending


class TFIDFBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
    """TF-IDF vector space similarity based backend for Annif.

    Training builds one pseudo-document per subject from the texts of the
    training documents tagged with that subject, vectorizes them and stores
    the normalized matrix on disk. Suggesting vectorizes the query text and
    scores it against every row of that matrix.
    """

    name = "tfidf"

    # defaults for uninitialized instances
    _tfidf_matrix = None

    # file name of the saved matrix inside the backend's data directory
    MATRIX_FILE = "tfidf-matrix.npz"

    def _generate_subjects_from_documents(
        self, corpus: DocumentCorpus
    ) -> Iterator[str]:
        """Yield one aggregated text per subject, in subject id order.

        Every document in the corpus is tokenized once and its
        space-joined tokens are written to the buffer of each subject in
        the document's subject set. Buffers spill to files in a temporary
        directory, which is removed when the generator finishes or is
        closed.
        """
        with tempfile.TemporaryDirectory() as tempdir:
            subject_buffer = {
                subject_id: SubjectBuffer(tempdir, subject_id)
                for subject_id in range(len(self.project.subjects))
            }

            for doc in corpus.documents:
                tokens = self.project.analyzer.tokenize_words(doc.text)
                # The joined text is the same for every subject of this
                # document, so build it once instead of once per subject.
                doc_text = " ".join(tokens)
                for subject_id in doc.subject_set:
                    subject_buffer[subject_id].write(doc_text)

            # dicts preserve insertion order, so this yields ids 0..n-1
            for buffer in subject_buffer.values():
                yield buffer.read()

    def _initialize_index(self) -> None:
        """Load the saved tf-idf matrix from disk unless already in memory.

        Raises:
            NotInitializedException: if the matrix file does not exist in
                the data directory.
        """
        if self._tfidf_matrix is None:
            path = os.path.join(self.datadir, self.MATRIX_FILE)
            self.debug("loading tf-idf matrix from {}".format(path))
            if os.path.exists(path):
                self._tfidf_matrix = load_npz(path)
            else:
                raise NotInitializedException(
                    "tf-idf matrix {} not found".format(path),
                    backend_id=self.backend_id,
                )

    def initialize(self, parallel: bool = False) -> None:
        """Prepare the backend for use: vectorizer first, then the matrix.

        The *parallel* argument is accepted but not used by this backend.
        """
        self.initialize_vectorizer()
        self._initialize_index()

    def _train(
        self,
        corpus: DocumentCorpus,
        params: dict[str, Any],
        jobs: int = 0,
    ) -> None:
        """Build the subject tf-idf matrix from *corpus* and save it.

        The *params* and *jobs* arguments are not used here.

        Raises:
            NotSupportedException: if *corpus* is the string "cached" or
                the corpus is empty.
        """
        if corpus == "cached":
            raise NotSupportedException(
                "Training tfidf project from cached data not supported."
            )
        if corpus.is_empty():
            raise NotSupportedException("Cannot train tfidf project with no documents")
        self.info("transforming subject corpus")
        subjects = self._generate_subjects_from_documents(corpus)
        # create_vectorizer comes from the mixin; presumably it fits the
        # vectorizer on the subject texts and returns the resulting matrix,
        # which is normalized here so _suggest can use a plain dot product.
        self._tfidf_matrix = normalize(self.create_vectorizer(subjects))
        self.info("saving tf-idf matrix")
        annif.util.atomic_save(
            self._tfidf_matrix,
            self.datadir,
            self.MATRIX_FILE,
            # save_npz takes (file, matrix), the reverse of the callback order
            lambda obj, filename: save_npz(filename, obj),
        )

    def _suggest(self, doc: Document, params: dict[str, Any]) -> Iterator:
        """Score *doc* against every subject row and return suggestions.

        The result is produced by vector_to_suggestions from the similarity
        vector and the integer value of params["limit"].
        """
        self.debug(
            'Suggesting subjects for text "{}..." (len={})'.format(
                doc.text[:20], len(doc.text)
            )
        )
        tokens = self.project.analyzer.tokenize_words(doc.text)
        query_vector = normalize(self.vectorizer.transform([" ".join(tokens)]))

        # Compute cosine similarity between query and indexed corpus:
        # both sides are normalized, so the dot product is the cosine.
        similarities = (query_vector @ self._tfidf_matrix.T).toarray().flatten()

        return vector_to_suggestions(similarities, int(params["limit"]))


1			"""Backend that returns most similar subjects based on similarity in sparse
2			TF-IDF normalized bag-of-words vector space"""
3
4			from __future__ import annotations
5
6			import os.path
7			import tempfile
8			from typing import TYPE_CHECKING, Any
9
10			from scipy.sparse import load_npz, save_npz
11			from sklearn.preprocessing import normalize
12
13			import annif.util
14			from annif.exception import NotInitializedException, NotSupportedException
15			from annif.suggestion import vector_to_suggestions
16
17			from . import backend, mixins
18
19			if TYPE_CHECKING:
20			from collections.abc import Iterator
21
22			from annif.corpus import Document, DocumentCorpus
23
24
25			class SubjectBuffer:
26			"""A file-backed buffer to store and retrieve subject text."""
27
28			BUFFER_SIZE = 100
29
30			def __init__(self, tempdir: str, subject_id: int) -> None:
31			filename = "{:08d}.txt".format(subject_id)
32			self._path = os.path.join(tempdir, filename)
33			self._buffer = []
34			self._created = False
35
36			def flush(self) -> None:
37			if self._created:
38			mode = "a"
39			else:
40			mode = "w"
41
42			with open(self._path, mode, encoding="utf-8") as subjfile:
43			for text in self._buffer:
44			print(text, file=subjfile)
45
46			self._buffer = []
47			self._created = True
48
49			def write(self, text: str) -> None:
50			self._buffer.append(text)
51			if len(self._buffer) >= self.BUFFER_SIZE:
52			self.flush()
53
54			def read(self) -> str:
55			if not self._created:
56			# file was never created - we can simply return the buffer content
57			return "\n".join(self._buffer)
58			else:
59			with open(self._path, "r", encoding="utf-8") as subjfile:
60			return subjfile.read() + "\n" + "\n".join(self._buffer)
61
62
63			class TFIDFBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
64			"""TF-IDF vector space similarity based backend for Annif"""
65
66			name = "tfidf"
67
68			# defaults for uninitialized instances
69			_tfidf_matrix = None
70
71			MATRIX_FILE = "tfidf-matrix.npz"
72
73			def _generate_subjects_from_documents(
74			self, corpus: DocumentCorpus
75			) -> Iterator[str]:
76			with tempfile.TemporaryDirectory() as tempdir:
77			subject_buffer = {}
78			for subject_id in range(len(self.project.subjects)):
79			subject_buffer[subject_id] = SubjectBuffer(tempdir, subject_id)
80
81			for doc in corpus.documents:
82			tokens = self.project.analyzer.tokenize_words(doc.text)
83			for subject_id in doc.subject_set:
84			subject_buffer[subject_id].write(" ".join(tokens))
85
86			for sid in range(len(self.project.subjects)):
87			yield subject_buffer[sid].read()
88
89			def _initialize_index(self) -> None:
90			if self._tfidf_matrix is None:
91			path = os.path.join(self.datadir, self.MATRIX_FILE)
92			self.debug("loading tf-idf matrix from {}".format(path))
93			if os.path.exists(path):
94			self._tfidf_matrix = load_npz(path)
95			else:
96			raise NotInitializedException(
97			"tf-idf matrix {} not found".format(path),
98			backend_id=self.backend_id,
99			)
100
101			def initialize(self, parallel: bool = False) -> None:
102			self.initialize_vectorizer()
103			self._initialize_index()
104
105			def _train(
106			self,
107			corpus: DocumentCorpus,
108			params: dict[str, Any],
109			jobs: int = 0,
110			) -> None:
111			if corpus == "cached":
112			raise NotSupportedException(
113			"Training tfidf project from cached data not supported."
114			)
115			if corpus.is_empty():
116			raise NotSupportedException("Cannot train tfidf project with no documents")
117			self.info("transforming subject corpus")
118			subjects = self._generate_subjects_from_documents(corpus)
119			self._tfidf_matrix = normalize(self.create_vectorizer(subjects))
120			self.info("saving tf-idf matrix")
121			annif.util.atomic_save(
122			self._tfidf_matrix,
123			self.datadir,
124			self.MATRIX_FILE,
125			lambda obj, filename: save_npz(filename, obj),
126			)
127
128			def _suggest(self, doc: Document, params: dict[str, Any]) -> Iterator:
129			self.debug(
130			'Suggesting subjects for text "{}..." (len={})'.format(
131			doc.text[:20], len(doc.text)
132			)
133			)
134			tokens = self.project.analyzer.tokenize_words(doc.text)
135			query_vector = normalize(self.vectorizer.transform([" ".join(tokens)]))
136
137			# Compute cosine similarity between query and indexed corpus
138			similarities = (query_vector @ self._tfidf_matrix.T).toarray().flatten()
139
140			return vector_to_suggestions(similarities, int(params["limit"]))
141

NatLibFi / Annif

TFIDFBackend._generate_subjects_from_documents() B last analyzed 2025-08-19 08:37 UTC

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

TFIDFBackend._generate_subjects_from_documents() B
last analyzed 2025-08-19 08:37 UTC