annif.backend.mixins (rating: A)

Complexity

Total Complexity 11

Size/Duplication

Total Lines 95
Duplicated Lines 0%

Importance

Changes 0

Metric  Value
eloc    61
dl      0
loc     95
rs      10
c       0
b       0
f       0
wmc     11

5 Methods

Rating  Name                                           Duplication  Size  Complexity
A       ChunkingBackend._suggest()                     0            18    3
A       TfidfVectorizerMixin.create_vectorizer()       0            15    3
A       ChunkingBackend._suggest_chunks()              0            8     1
A       TfidfVectorizerMixin.initialize_vectorizer()   0            10    3
A       ChunkingBackend.default_params()               0            2     1
"""Annif backend mixins that can be used to implement features"""

from __future__ import annotations

import abc
import os.path
from typing import TYPE_CHECKING, Any

import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

import annif.util
from annif.exception import NotInitializedException

if TYPE_CHECKING:
    from collections.abc import Iterable

    from scipy.sparse._csr import csr_matrix

    from annif.corpus import Document
    from annif.suggestion import SubjectSuggestion

class ChunkingBackend(metaclass=abc.ABCMeta):
    """Annif backend mixin that implements chunking of input"""

    DEFAULT_PARAMETERS = {"chunksize": 1}

    def default_params(self) -> dict[str, Any]:
        return self.DEFAULT_PARAMETERS

    @abc.abstractmethod
    def _suggest_chunks(
        self, chunktexts: list[str], params: dict[str, Any]
    ) -> list[SubjectSuggestion]:
        """Suggest subjects for the chunked text; should be implemented by
        the subclass inheriting this mixin"""

        pass  # pragma: no cover

    def _suggest(
        self, doc: Document, params: dict[str, Any]
    ) -> list[SubjectSuggestion]:
        self.debug(
            'Suggesting subjects for text "{}..." (len={})'.format(
                doc.text[:20], len(doc.text)
            )
        )
        sentences = self.project.analyzer.tokenize_sentences(doc.text)
        self.debug("Found {} sentences".format(len(sentences)))
        chunksize = int(params["chunksize"])
        # join each consecutive group of `chunksize` sentences into one chunk
        chunktexts = []
        for i in range(0, len(sentences), chunksize):
            chunktexts.append(" ".join(sentences[i : i + chunksize]))
        self.debug("Split sentences into {} chunks".format(len(chunktexts)))
        if len(chunktexts) == 0:  # no input, empty result
            return []
        return self._suggest_chunks(chunktexts, params)

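ChunkingBackend leaves the actual scoring to _suggest_chunks(). As a minimal sketch of how a subclass could plug in (the backend name and the _score_chunk() helper below are hypothetical, not part of Annif), one strategy is to score each chunk independently and average the scores per subject:

# Hypothetical sketch: a backend built on ChunkingBackend that scores each
# chunk separately and averages the scores per subject. _score_chunk() is a
# stand-in for a real model query.
from __future__ import annotations

from typing import Any

from annif.backend import backend, mixins
from annif.suggestion import SubjectSuggestion


class AveragingDemoBackend(mixins.ChunkingBackend, backend.AnnifBackend):
    name = "averaging-demo"

    def _score_chunk(self, chunktext: str) -> list[SubjectSuggestion]:
        # stand-in scorer: a real backend would query its model here
        return []

    def _suggest_chunks(
        self, chunktexts: list[str], params: dict[str, Any]
    ) -> list[SubjectSuggestion]:
        totals: dict[int, float] = {}
        for chunktext in chunktexts:
            for hit in self._score_chunk(chunktext):
                totals[hit.subject_id] = totals.get(hit.subject_id, 0.0) + hit.score
        # average each subject's accumulated score over all chunks
        return [
            SubjectSuggestion(subject_id=subject_id, score=total / len(chunktexts))
            for subject_id, total in totals.items()
        ]

With a subclass like this, the chunksize parameter (default 1, i.e. one sentence per chunk) can be raised in a project's configuration to score longer passages at a time.
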
class TfidfVectorizerMixin:
    """Annif backend mixin that implements TfidfVectorizer functionality"""

    VECTORIZER_FILE = "vectorizer"

    vectorizer = None

    def initialize_vectorizer(self) -> None:
        # lazy-load the persisted vectorizer on first use
        if self.vectorizer is None:
            path = os.path.join(self.datadir, self.VECTORIZER_FILE)
            if os.path.exists(path):
                self.debug("loading vectorizer from {}".format(path))
                self.vectorizer = joblib.load(path)
            else:
                raise NotInitializedException(
                    "vectorizer file '{}' not found".format(path),
                    backend_id=self.backend_id,
                )

    def create_vectorizer(
        self, input: Iterable[str], params: dict[str, Any] | None = None
    ) -> csr_matrix:
        self.info("creating vectorizer")
        if params is None:
            params = {}
        # avoid UserWarning when overriding tokenizer
        if "tokenizer" in params:
            params["token_pattern"] = None
        self.vectorizer = TfidfVectorizer(**params)
        veccorpus = self.vectorizer.fit_transform(input)
        # persist the fitted vectorizer atomically into the data directory
        annif.util.atomic_save(
            self.vectorizer, self.datadir, self.VECTORIZER_FILE, method=joblib.dump
        )
        return veccorpus
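
In use, the two halves of the mixin bracket a backend's lifecycle: create_vectorizer() is called once at training time and persists the fitted vectorizer, while initialize_vectorizer() reloads it before suggesting. A hypothetical backend wiring the two together (the class and its elided model steps are illustrative only, not an existing Annif backend) could look like this:

# Hypothetical sketch of a backend using TfidfVectorizerMixin; the actual
# model training and scoring steps are elided.
from __future__ import annotations

from annif.backend import backend, mixins


class VectorDemoBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
    name = "vector-demo"

    def _train(self, corpus, params, jobs=0):
        # fit the vectorizer on all document texts; create_vectorizer()
        # also saves it into the backend's data directory
        texts = (doc.text for doc in corpus.documents)
        veccorpus = self.create_vectorizer(
            texts, {"tokenizer": self.project.analyzer.tokenize_words}
        )
        ...  # train the actual model on veccorpus

    def initialize(self, parallel=False):
        # reload the persisted vectorizer; raises NotInitializedException
        # if the backend has not been trained yet
        self.initialize_vectorizer()

    def _suggest(self, doc, params):
        vector = self.vectorizer.transform([doc.text])
        ...  # score the vector against the model and return suggestions

Note that passing a "tokenizer" entry triggers the token_pattern override shown above, which silences scikit-learn's UserWarning about an unused token_pattern.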
95