Passed
Push — upgrade-to-connexion3 ( e417e0...5d7ec9 )
by Juho
09:39 queued 05:10
created

AnnifBackend._model_file_paths()   A

Complexity

Conditions 1

Size

Total Lines 10
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 9
nop 1
dl 0
loc 10
rs 9.95
c 0
b 0
f 0
1
"""Common functionality for backends."""
2
from __future__ import annotations
3
4
import abc
5
import os.path
6
from datetime import datetime, timezone
7
from glob import glob
8
from typing import TYPE_CHECKING, Any
9
10
from annif import logger
11
from annif.suggestion import SuggestionBatch
12
13
if TYPE_CHECKING:
14
    from configparser import SectionProxy
15
16
    from annif.corpus.document import DocumentCorpus
17
    from annif.project import AnnifProject
18
19
20
class AnnifBackend(metaclass=abc.ABCMeta):
    """Base class for Annif backends that perform analysis. The
    non-implemented methods should be overridden in subclasses."""

    name = None

    # Parameters shared by all backends; subclasses may define their own
    # DEFAULT_PARAMETERS, which default_params() layers on top of these.
    DEFAULT_PARAMETERS = {"limit": 100}

    def __init__(
        self,
        backend_id: str,
        config_params: dict[str, Any] | SectionProxy,
        project: AnnifProject,
    ) -> None:
        """Initialize backend with specific parameters. The
        parameters are a dict. Keys and values depend on the specific
        backend type."""
        self.backend_id = backend_id
        self.config_params = config_params
        self.project = project
        self.datadir = project.datadir

    def default_params(self) -> dict[str, Any]:
        """Return a new dict with the base class defaults overlaid by the
        DEFAULT_PARAMETERS of the concrete backend class."""
        params = AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)  # Optional backend specific parameters
        return params

    @property
    def params(self) -> dict[str, Any]:
        """Effective parameters: the defaults overridden by the configured
        parameters. A new dict is built on every access."""
        params = dict(self.default_params())
        params.update(self.config_params)
        return params

    @property
    def _model_file_paths(self) -> list[str]:
        """Paths of the entries directly under the data directory, excluding
        those that match the ignore patterns. The order of the returned list
        is unspecified."""
        all_paths = set(glob(os.path.join(self.datadir, "*")))
        ignore_patterns = ("*-train*", "tmp-*", "vectorizer")
        ignored_paths = {
            path
            for pattern in ignore_patterns
            for path in glob(os.path.join(self.datadir, pattern))
        }
        return list(all_paths - ignored_paths)

    @property
    def is_trained(self) -> bool:
        """True if the data directory contains at least one model file."""
        return bool(self._model_file_paths)

    @property
    def modification_time(self) -> datetime | None:
        """Most recent modification time of the model files as a
        timezone-aware UTC datetime, or None if there are no model files."""
        mtimes = [os.path.getmtime(path) for path in self._model_file_paths]
        if not mtimes:
            return None
        # Build the aware datetime directly; datetime.utcfromtimestamp() is
        # deprecated and only produces naive objects.
        return datetime.fromtimestamp(max(mtimes), tz=timezone.utc)

    def _get_backend_params(
        self,
        params: dict[str, Any] | None,
    ) -> dict[str, Any]:
        """Return the effective parameters of this backend, overridden by the
        given params (if any)."""
        backend_params = dict(self.params)
        if params is not None:
            backend_params.update(params)
        return backend_params

    def _train(
        self,
        corpus: DocumentCorpus,
        params: dict[str, Any],
        jobs: int = 0,
    ) -> None:
        """This method can be overridden by backends. It implements
        the train functionality, with pre-processed parameters."""
        pass  # default is to do nothing, subclasses may override

    def train(
        self,
        corpus: DocumentCorpus,
        params: dict[str, Any] | None = None,
        jobs: int = 0,
    ) -> None:
        """Train the model on the given document or subject corpus."""
        beparams = self._get_backend_params(params)
        return self._train(corpus, params=beparams, jobs=jobs)

    def initialize(self, parallel: bool = False) -> None:
        """This method can be overridden by backends. It should cause the
        backend to pre-load all data it needs during operation.
        If parallel is True, the backend should expect to be used for
        parallel operation."""
        pass

    def _suggest(self, text: str, params: dict[str, Any]):
        """Either this method or _suggest_batch should be implemented by
        backends.  It implements the suggest functionality for a single
        document, with pre-processed parameters."""
        pass  # pragma: no cover

    def _suggest_batch(
        self, texts: list[str], params: dict[str, Any]
    ) -> SuggestionBatch:
        """This method can be implemented by backends to use batching of documents in
        their operations. This default implementation uses the regular suggest
        functionality."""
        return SuggestionBatch.from_sequence(
            [self._suggest(text, params) for text in texts],
            self.project.subjects,
            # values read from a configuration section are strings
            limit=int(params.get("limit")),
        )

    def suggest(
        self,
        texts: list[str],
        params: dict[str, Any] | None = None,
    ) -> SuggestionBatch:
        """Suggest subjects for the input documents and return the results
        as a SuggestionBatch."""
        beparams = self._get_backend_params(params)
        self.initialize()
        return self._suggest_batch(texts, params=beparams)

    def debug(self, message: str) -> None:
        """Log a debug message from this backend"""
        logger.debug("Backend %s: %s", self.backend_id, message)

    def info(self, message: str) -> None:
        """Log an info message from this backend"""
        logger.info("Backend %s: %s", self.backend_id, message)

    def warning(self, message: str) -> None:
        """Log a warning message from this backend"""
        logger.warning("Backend %s: %s", self.backend_id, message)
156
157
158
class AnnifLearningBackend(AnnifBackend):
    """Base class for Annif backends that support online learning."""

    @abc.abstractmethod
    def _learn(self, corpus, params):
        """Backend-specific learning step that subclasses must implement.
        It is called by learn() with the corpus and the pre-processed
        parameters."""
        pass  # pragma: no cover

    def learn(
        self, corpus: DocumentCorpus, params: dict[str, Any] | None = None
    ) -> None:
        """Continue training the model with the given document or subject
        corpus. The given params, if any, override the backend's own
        parameters before being handed to _learn()."""
        return self._learn(corpus, params=self._get_backend_params(params))
175