| 1 |  |  | """Common functionality for backends.""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  | from __future__ import annotations | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  | import abc | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  | import os.path | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  | from datetime import datetime, timezone | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  | from glob import glob | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  | from typing import TYPE_CHECKING, Any | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  | from annif import logger | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  | from annif.suggestion import SuggestionBatch | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  | if TYPE_CHECKING: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  |     from configparser import SectionProxy | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  |     from annif.corpus.document import DocumentCorpus | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 |  |  |     from annif.project import AnnifProject | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 19 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 20 |  |  | class AnnifBackend(metaclass=abc.ABCMeta): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 21 |  |  |     """Base class for Annif backends that perform analysis. The | 
            
                                                                                                            
                            
            
                                    
            
            
                | 22 |  |  |     non-implemented methods should be overridden in subclasses.""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 23 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 24 |  |  |     name = None | 
            
                                                                                                            
                            
            
                                    
            
            
                | 25 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 26 |  |  |     DEFAULT_PARAMETERS = {"limit": 100} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 27 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 28 |  |  |     def __init__( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 29 |  |  |         self, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 30 |  |  |         backend_id: str, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 31 |  |  |         config_params: dict[str, Any] | SectionProxy, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 32 |  |  |         project: AnnifProject, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 33 |  |  |     ) -> None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 34 |  |  |         """Initialize backend with specific parameters. The | 
            
                                                                                                            
                            
            
                                    
            
            
                | 35 |  |  |         parameters are a dict. Keys and values depend on the specific | 
            
                                                                                                            
                            
            
                                    
            
            
                | 36 |  |  |         backend type.""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 37 |  |  |         self.backend_id = backend_id | 
            
                                                                                                            
                            
            
                                    
            
            
                | 38 |  |  |         self.config_params = config_params | 
            
                                                                                                            
                            
            
                                    
            
            
                | 39 |  |  |         self.project = project | 
            
                                                                                                            
                            
            
                                    
            
            
                | 40 |  |  |         self.datadir = project.datadir | 
            
                                                                                                            
                            
            
                                    
            
            
                | 41 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 42 |  |  |     def default_params(self) -> dict[str, Any]: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 43 |  |  |         params = AnnifBackend.DEFAULT_PARAMETERS.copy() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 44 |  |  |         params.update(self.DEFAULT_PARAMETERS)  # Optional backend specific parameters | 
            
                                                                                                            
                            
            
                                    
            
            
                | 45 |  |  |         return params | 
            
                                                                                                            
                            
            
                                    
            
            
                | 46 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 47 |  |  |     @property | 
            
                                                                                                            
                            
            
                                    
            
            
                | 48 |  |  |     def params(self) -> dict[str, Any]: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 49 |  |  |         params = {} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 50 |  |  |         params.update(self.default_params()) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 51 |  |  |         params.update(self.config_params) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 52 |  |  |         return params | 
            
                                                                                                            
                            
            
                                    
            
            
                | 53 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 54 |  |  |     @property | 
            
                                                                                                            
                            
            
                                    
            
            
                | 55 |  |  |     def _model_file_paths(self) -> list: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 56 |  |  |         all_paths = glob(os.path.join(self.datadir, "*")) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 57 |  |  |         ignore_patterns = ("*-train*", "vectorizer") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 58 |  |  |         ignore_paths = [ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 59 |  |  |             path | 
            
                                                                                                            
                            
            
                                    
            
            
                | 60 |  |  |             for igp in ignore_patterns | 
            
                                                                                                            
                            
            
                                    
            
            
                | 61 |  |  |             for path in glob(os.path.join(self.datadir, igp)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 62 |  |  |         ] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 63 |  |  |         return list(set(all_paths) - set(ignore_paths)) | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 64 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 65 |  |  |     @property | 
            
                                                                        
                            
            
                                    
            
            
                | 66 |  |  |     def is_trained(self) -> bool: | 
            
                                                                        
                            
            
                                    
            
            
                | 67 |  |  |         return bool(self._model_file_paths) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 68 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 69 |  |  |     @property | 
            
                                                                                                            
                            
            
                                    
            
            
                | 70 |  |  |     def modification_time(self) -> datetime | None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 71 |  |  |         mtimes = [ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 72 |  |  |             datetime.utcfromtimestamp(os.path.getmtime(p)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 73 |  |  |             for p in self._model_file_paths | 
            
                                                                                                            
                            
            
                                    
            
            
                | 74 |  |  |         ] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 75 |  |  |         most_recent = max(mtimes, default=None) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 76 |  |  |         if most_recent is None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 77 |  |  |             return None | 
            
                                                                                                            
                            
            
                                    
            
            
                | 78 |  |  |         return most_recent.replace(tzinfo=timezone.utc) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 79 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 80 |  |  |     def _get_backend_params( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 81 |  |  |         self, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 82 |  |  |         params: dict[str, Any] | None, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 83 |  |  |     ) -> dict[str, Any]: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 84 |  |  |         backend_params = dict(self.params) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 85 |  |  |         if params is not None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 86 |  |  |             backend_params.update(params) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 87 |  |  |         return backend_params | 
            
                                                                                                            
                            
            
                                    
            
            
                | 88 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 89 |  |  |     def _train( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 90 |  |  |         self, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 91 |  |  |         corpus: DocumentCorpus, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 92 |  |  |         params: dict[str, Any], | 
            
                                                                                                            
                            
            
                                    
            
            
                | 93 |  |  |         jobs: int = 0, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 94 |  |  |     ) -> None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 95 |  |  |         """This method can be overridden by backends. It implements | 
            
                                                                                                            
                            
            
                                    
            
            
                | 96 |  |  |         the train functionality, with pre-processed parameters.""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 97 |  |  |         pass  # default is to do nothing, subclasses may override | 
            
                                                                                                            
                            
            
                                    
            
            
                | 98 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 99 |  |  |     def train( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 100 |  |  |         self, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 101 |  |  |         corpus: DocumentCorpus, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 102 |  |  |         params: dict[str, Any] | None = None, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 103 |  |  |         jobs: int = 0, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 104 |  |  |     ) -> None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 105 |  |  |         """Train the model on the given document or subject corpus.""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 106 |  |  |         beparams = self._get_backend_params(params) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 107 |  |  |         return self._train(corpus, params=beparams, jobs=jobs) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 108 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 109 |  |  |     def initialize(self, parallel: bool = False) -> None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 110 |  |  |         """This method can be overridden by backends. It should cause the | 
            
                                                                                                            
                            
            
                                    
            
            
                | 111 |  |  |         backend to pre-load all data it needs during operation. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 112 |  |  |         If parallel is True, the backend should expect to be used for | 
            
                                                                                                            
                            
            
                                    
            
            
                | 113 |  |  |         parallel operation.""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 114 |  |  |         pass | 
            
                                                                                                            
                            
            
                                    
            
            
                | 115 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 116 |  |  |     def _suggest(self, text, params): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 117 |  |  |         """Either this method or _suggest_batch should be implemented by by | 
            
                                                                                                            
                            
            
                                    
            
            
                | 118 |  |  |         backends.  It implements the suggest functionality for a single | 
            
                                                                                                            
                            
            
                                    
            
            
                | 119 |  |  |         document, with pre-processed parameters.""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 120 |  |  |         pass  # pragma: no cover | 
            
                                                                                                            
                            
            
                                    
            
            
                | 121 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 122 |  |  |     def _suggest_batch( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 123 |  |  |         self, texts: list[str], params: dict[str, Any] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 124 |  |  |     ) -> SuggestionBatch: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 125 |  |  |         """This method can be implemented by backends to use batching of documents in | 
            
                                                                                                            
                            
            
                                    
            
            
                | 126 |  |  |         their operations. This default implementation uses the regular suggest | 
            
                                                                                                            
                            
            
                                    
            
            
                | 127 |  |  |         functionality.""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 128 |  |  |         return SuggestionBatch.from_sequence( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 129 |  |  |             [self._suggest(text, params) for text in texts], | 
            
                                                                                                            
                            
            
                                    
            
            
                | 130 |  |  |             self.project.subjects, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 131 |  |  |             limit=int(params.get("limit")), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 132 |  |  |         ) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 133 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 134 |  |  |     def suggest( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 135 |  |  |         self, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 136 |  |  |         texts: list[str], | 
            
                                                                                                            
                            
            
                                    
            
            
                | 137 |  |  |         params: dict[str, Any] | None = None, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 138 |  |  |     ) -> SuggestionBatch: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 139 |  |  |         """Suggest subjects for the input documents and return a list of subject sets | 
            
                                                                                                            
                            
            
                                    
            
            
                | 140 |  |  |         represented as a list of SubjectSuggestion objects.""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 141 |  |  |         beparams = self._get_backend_params(params) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 142 |  |  |         self.initialize() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 143 |  |  |         return self._suggest_batch(texts, params=beparams) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 144 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 145 |  |  |     def debug(self, message: str) -> None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 146 |  |  |         """Log a debug message from this backend""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 147 |  |  |         logger.debug("Backend {}: {}".format(self.backend_id, message)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 148 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 149 |  |  |     def info(self, message: str) -> None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 150 |  |  |         """Log an info message from this backend""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 151 |  |  |         logger.info("Backend {}: {}".format(self.backend_id, message)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 152 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 153 |  |  |     def warning(self, message: str) -> None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 154 |  |  |         """Log a warning message from this backend""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 155 |  |  |         logger.warning("Backend {}: {}".format(self.backend_id, message)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 156 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 157 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 158 |  |  | class AnnifLearningBackend(AnnifBackend): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 159 |  |  |     """Base class for Annif backends that can perform online learning""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 160 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 161 |  |  |     @abc.abstractmethod | 
            
                                                                                                            
                            
            
                                    
            
            
                | 162 |  |  |     def _learn(self, corpus, params): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 163 |  |  |         """This method should implemented by backends. It implements the learn | 
            
                                                                                                            
                            
            
                                    
            
            
                | 164 |  |  |         functionality, with pre-processed parameters.""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 165 |  |  |         pass  # pragma: no cover | 
            
                                                                                                            
                            
            
                                    
            
            
                | 166 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 167 |  |  |     def learn( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 168 |  |  |         self, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 169 |  |  |         corpus: DocumentCorpus, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 170 |  |  |         params: dict[str, Any] | None = None, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 171 |  |  |     ) -> None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 172 |  |  |         """Further train the model on the given document or subject corpus.""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 173 |  |  |         beparams = self._get_backend_params(params) | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 174 |  |  |         return self._learn(corpus, params=beparams) | 
            
                                                        
            
                                    
            
            
                | 175 |  |  |  |