Passed
Push — issue703-python-3.11-support ( f59527...05d52a )
by Juho
04:06 queued 14s
created

annif.backend.mllm.MLLMBackend.default_params()   A

Complexity

Conditions 1

Size

Total Lines 4
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 4
dl 0
loc 4
rs 10
c 0
b 0
f 0
cc 1
nop 1
1
"""Maui-like Lexical Matching backend"""
2
from __future__ import annotations
3
4
import os.path
5
from typing import TYPE_CHECKING, Any
6
7
import joblib
8
import numpy as np
9
10
import annif.eval
11
import annif.util
12
from annif.exception import NotInitializedException, NotSupportedException
13
from annif.lexical.mllm import MLLMModel
14
from annif.suggestion import vector_to_suggestions
15
16
from . import hyperopt
17
18
if TYPE_CHECKING:
19
    from collections.abc import Iterator
20
21
    from optuna.study.study import Study
22
    from optuna.trial import Trial
23
24
    from annif.backend.hyperopt import HPRecommendation
25
    from annif.corpus.document import DocumentCorpus
26
    from annif.lexical.mllm import Candidate
27
28
29
class MLLMOptimizer(hyperopt.HyperparameterOptimizer):
    """Hyperparameter optimizer for the MLLM backend"""

    def _prepare(self, n_jobs: int = 1) -> None:
        """Initialize the backend and precompute, for every document in the
        corpus, its candidate subjects and its gold-standard subject set.

        The precomputed data is stored on the instance so that each
        optimization trial only needs to retrain and re-score the
        classifier, not regenerate candidates."""
        self._backend.initialize()
        self._train_x, self._train_y = self._backend._load_train_data()
        self._candidates = []
        self._gold_subjects = []

        # TODO parallelize generation of candidates
        for doc in self._corpus.documents:
            candidates = self._backend._generate_candidates(doc.text)
            self._candidates.append(candidates)
            self._gold_subjects.append(doc.subject_set)

    def _objective(self, trial: Trial) -> float:
        """Train a classifier with the hyperparameters suggested by the
        trial and return its score (self._metric) over the prepared corpus."""
        params = {
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 5, 30),
            "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 100, 2000),
            "max_samples": trial.suggest_float("max_samples", 0.5, 1.0),
            "limit": 100,
        }
        model = self._backend._model._create_classifier(params)
        model.fit(self._train_x, self._train_y)

        batch = annif.eval.EvaluationBatch(self._backend.project.subjects)
        for goldsubj, candidates in zip(self._gold_subjects, self._candidates):
            if candidates:
                features = self._backend._model._candidates_to_features(candidates)
                scores = model.predict_proba(features)
                ranking = self._backend._model._prediction_to_list(scores, candidates)
            else:
                # no candidates were found for this document: empty prediction
                ranking = []
            results = self._backend._prediction_to_result(ranking, params)
            batch.evaluate_many([results], [goldsubj])
        results = batch.results(metrics=[self._metric])
        return results[self._metric]

    def _postprocess(self, study: Study) -> HPRecommendation:
        """Format the best hyperparameters found by the study as
        configuration lines of a recommendation."""
        bp = study.best_params
        lines = [
            f"min_samples_leaf={bp['min_samples_leaf']}",
            f"max_leaf_nodes={bp['max_leaf_nodes']}",
            f"max_samples={bp['max_samples']:.4f}",
        ]
        return hyperopt.HPRecommendation(lines=lines, score=study.best_value)
75
76
77
class MLLMBackend(hyperopt.AnnifHyperoptBackend):
    """Maui-like Lexical Matching backend for Annif"""

    name = "mllm"

    # defaults for uninitialized instances
    _model = None

    MODEL_FILE = "mllm-model.gz"
    TRAIN_FILE = "mllm-train.gz"

    DEFAULT_PARAMETERS = {
        "min_samples_leaf": 20,
        "max_leaf_nodes": 1000,
        "max_samples": 0.9,
        "use_hidden_labels": False,
    }

    def get_hp_optimizer(self, corpus: DocumentCorpus, metric: str) -> MLLMOptimizer:
        """Return a hyperparameter optimizer for this backend, evaluated
        against the given corpus using the given metric."""
        return MLLMOptimizer(self, corpus, metric)

    def _load_model(self) -> MLLMModel:
        """Load the trained MLLM model from the data directory.

        Raises NotInitializedException if the model file does not exist."""
        path = os.path.join(self.datadir, self.MODEL_FILE)
        self.debug("loading model from {}".format(path))
        if os.path.exists(path):
            return MLLMModel.load(path)
        else:
            raise NotInitializedException(
                "model {} not found".format(path), backend_id=self.backend_id
            )

    def _load_train_data(self) -> tuple[np.ndarray, np.ndarray]:
        """Load cached training data (features, labels) from the data
        directory.

        Raises NotInitializedException if the train data file does not
        exist."""
        path = os.path.join(self.datadir, self.TRAIN_FILE)
        if os.path.exists(path):
            return joblib.load(path)
        else:
            raise NotInitializedException(
                "train data file {} not found".format(path), backend_id=self.backend_id
            )

    def initialize(self, parallel: bool = False) -> None:
        """Load the model lazily; a no-op if it is already loaded."""
        if self._model is None:
            self._model = self._load_model()

    def _train(
        self,
        corpus: DocumentCorpus,
        params: dict[str, Any],
        jobs: int = 0,
    ) -> None:
        """Train the MLLM model on the given corpus (or on cached training
        data when corpus == "cached") and save both the training data and
        the trained model atomically into the data directory.

        Raises NotSupportedException when the corpus contains no documents."""
        self.info("starting train")
        if corpus != "cached":
            if corpus.is_empty():
                raise NotSupportedException(
                    "training backend {} with no documents".format(self.backend_id)
                )
            self.info("preparing training data")
            self._model = MLLMModel()
            train_data = self._model.prepare_train(
                corpus, self.project.vocab, self.project.analyzer, params, jobs
            )
            # cache the prepared training data so later runs can reuse it
            annif.util.atomic_save(
                train_data, self.datadir, self.TRAIN_FILE, method=joblib.dump
            )
        else:
            self.info("reusing cached training data from previous run")
            self._model = self._load_model()
            train_data = self._load_train_data()

        self.info("training model")
        self._model.train(train_data[0], train_data[1], params)

        self.info("saving model")
        annif.util.atomic_save(self._model, self.datadir, self.MODEL_FILE)

    def _generate_candidates(self, text: str) -> list[Candidate]:
        """Generate candidate subjects for the given text using the model."""
        return self._model.generate_candidates(text, self.project.analyzer)

    def _prediction_to_result(
        self,
        prediction: list[tuple[np.float64, int]],
        params: dict[str, Any],
    ) -> Iterator:
        """Convert (score, subject_id) pairs into a suggestion iterator,
        limited to params["limit"] suggestions."""
        vector = np.zeros(len(self.project.subjects), dtype=np.float32)
        for score, subject_id in prediction:
            vector[subject_id] = score
        return vector_to_suggestions(vector, int(params["limit"]))

    def _suggest(self, text: str, params: dict[str, Any]) -> Iterator:
        """Suggest subjects for the given text."""
        candidates = self._generate_candidates(text)
        prediction = self._model.predict(candidates)
        return self._prediction_to_result(prediction, params)
169