Passed
Pull Request — main (#798), created by unknown, 03:25

PecosTfidfVectorizerMixin.vectorizer_dict()   Rating: A

Complexity: Conditions 1
Size: Total Lines 14, Code Lines 8
Duplication: Lines 0, Ratio 0 %
Importance: Changes 0

Metric   Value
cc       1
eloc     8
nop      2
dl       0
loc      14
rs       10
c        0
b        0
f        0

"""Annif backend mixins that can be used to implement features"""

from __future__ import annotations

import abc
import os.path
from typing import TYPE_CHECKING, Any

import joblib
import numpy as np
from pecos.utils.featurization.text.vectorizers import Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import annif.util
from annif.exception import NotInitializedException

if TYPE_CHECKING:
    from collections.abc import Iterable

    from scipy.sparse._csr import csr_matrix

    from annif.corpus import Document
    from annif.suggestion import SubjectSuggestion


class ChunkingBackend(metaclass=abc.ABCMeta):
    """Annif backend mixin that implements chunking of input"""

    DEFAULT_PARAMETERS = {"chunksize": 1}

    def default_params(self) -> dict[str, Any]:
        return self.DEFAULT_PARAMETERS

    @abc.abstractmethod
    def _suggest_chunks(
        self, chunktexts: list[str], params: dict[str, Any]
    ) -> list[SubjectSuggestion]:
        """Suggest subjects for the chunked text; should be implemented by
        the subclass inheriting this mixin"""

        pass  # pragma: no cover

    def _suggest(
        self, doc: Document, params: dict[str, Any]
    ) -> list[SubjectSuggestion]:
        self.debug(
            'Suggesting subjects for text "{}..." (len={})'.format(
                doc.text[:20], len(doc.text)
            )
        )
        sentences = self.project.analyzer.tokenize_sentences(doc.text)
        self.debug("Found {} sentences".format(len(sentences)))
        chunksize = int(params["chunksize"])
        chunktexts = []
        # group consecutive sentences into chunks of at most `chunksize`
        for i in range(0, len(sentences), chunksize):
            chunktexts.append(" ".join(sentences[i : i + chunksize]))
        self.debug("Split sentences into {} chunks".format(len(chunktexts)))
        if len(chunktexts) == 0:  # no input, empty result
            return []
        return self._suggest_chunks(chunktexts, params)
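
# Illustrative sketch (comments only, not part of the file): a minimal
# backend using this mixin only needs to implement _suggest_chunks();
# _suggest() above already splits the text into sentence chunks. The class
# name, base class and scoring logic here are hypothetical.
#
#     class ToyChunkingBackend(ChunkingBackend, backend.AnnifBackend):
#         name = "toy-chunking"
#
#         def _suggest_chunks(self, chunktexts, params):
#             # toy logic: give subject 0 a score based on the chunk count
#             score = min(1.0, len(chunktexts) / 10)
#             return [SubjectSuggestion(subject_id=0, score=score)]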


class TfidfVectorizerMixin:
    """Annif backend mixin that implements TfidfVectorizer functionality"""

    VECTORIZER_FILE = "vectorizer"

    vectorizer = None

    def initialize_vectorizer(self) -> None:
        if self.vectorizer is None:
            path = os.path.join(self.datadir, self.VECTORIZER_FILE)
            if os.path.exists(path):
                self.debug("loading vectorizer from {}".format(path))
                self.vectorizer = joblib.load(path)
            else:
                raise NotInitializedException(
                    "vectorizer file '{}' not found".format(path),
                    backend_id=self.backend_id,
                )

    def create_vectorizer(
        self, input: Iterable[str], params: dict[str, Any] | None = None
    ) -> csr_matrix:
        self.info("creating vectorizer")
        if params is None:
            params = {}
        # avoid UserWarning when overriding tokenizer
        if "tokenizer" in params:
            params["token_pattern"] = None
        self.vectorizer = TfidfVectorizer(**params)
        veccorpus = self.vectorizer.fit_transform(input)
        annif.util.atomic_save(
            self.vectorizer, self.datadir, self.VECTORIZER_FILE, method=joblib.dump
        )
        return veccorpus
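
# Illustrative sketch (comments only, not part of the file): how a backend
# might train the vectorizer and later reload it in a fresh process. The
# corpus variable and the parameter choices are hypothetical; passing a
# tokenizer exercises the token_pattern workaround above.
#
#     veccorpus = self.create_vectorizer(
#         (doc.text for doc in corpus.documents),
#         {"tokenizer": self.project.analyzer.tokenize_words},
#     )
#     # ... later, e.g. at suggestion time:
#     self.initialize_vectorizer()
#     vector = self.vectorizer.transform(["some new text"])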


class PecosTfidfVectorizerMixin:
    """Annif backend mixin that implements TfidfVectorizer functionality from Pecos"""

    VECTORIZER_FILE = "vectorizer"

    vectorizer = None

    def initialize_vectorizer(self) -> None:
        if self.vectorizer is None:
            path = os.path.join(self.datadir, self.VECTORIZER_FILE)
            if os.path.exists(path):
                self.debug("loading vectorizer from {}".format(path))
                self.vectorizer = Vectorizer.load(path)
            else:
                raise NotInitializedException(
                    "vectorizer file '{}' not found".format(path),
                    backend_id=self.backend_id,
                )

    def vectorizer_dict(self, params: dict[str, Any]) -> dict[str, Any]:
        """Create a vectorizer configuration dictionary from the given parameters."""
        config = {
            "base_vect_configs": [
                {
                    "ngram_range": params.get("ngram_range", [1, 1]),
                    "max_df_ratio": 0.98,
                    "analyzer": "word",
                    "min_df_cnt": params.get("min_df", 1),
                }
            ]
        }
        return {"type": "tfidf", "kwargs": config}

    def create_vectorizer(
        self, input: Iterable[str], params: dict[str, Any] | None = None
    ) -> csr_matrix:
        self.info("creating Pecos vectorizer")
        if params is None:
            params = {}
        data = list(input)
        vectorizer_config = self.vectorizer_dict(params)
        self.vectorizer = Vectorizer.train(data, vectorizer_config, np.float32)
        self.vectorizer.save(os.path.join(self.datadir, self.VECTORIZER_FILE))
        veccorpus = self.vectorizer.predict(
            data, threads=params.get("threads", -1)
        )
        return veccorpus
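
For context, a brief sketch of how a backend built on PecosTfidfVectorizerMixin might be exercised end to end. This is not part of the pull request; the backend instance, its datadir and the training texts are hypothetical:

    texts = ["first training document", "second training document"]
    # trains the PECOS tf-idf model, saves it under datadir and returns
    # the vectorized corpus (one sparse row per input text)
    veccorpus = backend.create_vectorizer(texts, {"min_df": 1, "threads": 4})
    # in a later process, reload the saved vectorizer before predicting
    backend.initialize_vectorizer()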