"""Annif backend mixins that can be used to implement features"""

from __future__ import annotations

import abc
import os.path
from typing import TYPE_CHECKING, Any

import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

import annif.util
from annif.exception import NotInitializedException

if TYPE_CHECKING:
    from collections.abc import Iterable

    from scipy.sparse._csr import csr_matrix

    from annif.corpus import Document
    from annif.suggestion import SubjectSuggestion


class ChunkingBackend(metaclass=abc.ABCMeta):
    """Annif backend mixin that implements chunking of input"""

    DEFAULT_PARAMETERS = {"chunksize": 1}

    def default_params(self) -> dict[str, Any]:
        return self.DEFAULT_PARAMETERS

    @abc.abstractmethod
    def _suggest_chunks(
        self, chunktexts: list[str], params: dict[str, Any]
    ) -> list[SubjectSuggestion]:
        """Suggest subjects for the chunked text; should be implemented by
        the subclass inheriting this mixin"""

        pass  # pragma: no cover

    def _suggest(
        self, doc: Document, params: dict[str, Any]
    ) -> list[SubjectSuggestion]:
        self.debug(
            'Suggesting subjects for text "{}..." (len={})'.format(
                doc.text[:20], len(doc.text)
            )
        )
        sentences = self.project.analyzer.tokenize_sentences(doc.text)
        self.debug("Found {} sentences".format(len(sentences)))
        chunksize = int(params["chunksize"])
        # group consecutive sentences into chunks of up to chunksize sentences
        chunktexts = []
        for i in range(0, len(sentences), chunksize):
            chunktexts.append(" ".join(sentences[i : i + chunksize]))
        self.debug("Split sentences into {} chunks".format(len(chunktexts)))
        if len(chunktexts) == 0:  # no input, empty result
            return []
        return self._suggest_chunks(chunktexts, params)
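

# An illustrative sketch, not part of Annif itself: a minimal concrete
# subclass showing the contract a backend must fulfil to use the mixin.
# The class name and its trivial scoring are assumptions made up for this
# example; a real backend would score each chunk with its model and merge
# the per-chunk results (e.g. by averaging per-subject scores).
class _ExampleChunkingBackend(ChunkingBackend):
    """Toy subclass kept purely as documentation of _suggest_chunks"""

    def _suggest_chunks(
        self, chunktexts: list[str], params: dict[str, Any]
    ) -> list[SubjectSuggestion]:
        # a real implementation would query its model once per chunk here
        return []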


class TfidfVectorizerMixin:
    """Annif backend mixin that implements TfidfVectorizer functionality"""

    VECTORIZER_FILE = "vectorizer"

    vectorizer = None  # set by initialize_vectorizer or create_vectorizer

    def initialize_vectorizer(self) -> None:
        if self.vectorizer is None:
            path = os.path.join(self.datadir, self.VECTORIZER_FILE)
            if os.path.exists(path):
                self.debug("loading vectorizer from {}".format(path))
                self.vectorizer = joblib.load(path)
            else:
                raise NotInitializedException(
                    "vectorizer file '{}' not found".format(path),
                    backend_id=self.backend_id,
                )

    def create_vectorizer(
        self, input: Iterable[str], params: dict[str, Any] | None = None
    ) -> csr_matrix:
        self.info("creating vectorizer")
        if params is None:
            params = {}
        # avoid UserWarning when overriding tokenizer
        if "tokenizer" in params:
            params["token_pattern"] = None
        self.vectorizer = TfidfVectorizer(**params)
        veccorpus = self.vectorizer.fit_transform(input)
        annif.util.atomic_save(
            self.vectorizer, self.datadir, self.VECTORIZER_FILE, method=joblib.dump
        )
        return veccorpus
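

# An illustrative sketch, not part of Annif itself: how a host class might
# drive the vectorizer mixin. The class name, the fake logging methods and
# the train/vectorize helpers are assumptions made up for this example; in
# real use ``datadir``, ``debug`` and ``info`` come from the Annif backend
# base class.
class _ExampleVectorizerUser(TfidfVectorizerMixin):
    backend_id = "example"

    def __init__(self, datadir: str) -> None:
        self.datadir = datadir

    def debug(self, message: str) -> None:
        print(message)  # stand-in for the backend logging hooks

    info = debug

    def train(self, texts: Iterable[str]) -> csr_matrix:
        # fit a TF-IDF vectorizer on the training texts; create_vectorizer
        # also persists it atomically under self.datadir
        return self.create_vectorizer(texts)

    def vectorize(self, text: str) -> csr_matrix:
        # load the persisted vectorizer on first use, then transform
        self.initialize_vectorizer()
        return self.vectorizer.transform([text])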