Passed
Pull Request — main (#859)
by Juho
04:03 queued 28s

annif.backend.llm_ensemble   A

Complexity

Total Complexity 32

Size/Duplication

Total Lines 347
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 235
dl 0
loc 347
rs 9.84
c 0
b 0
f 0
wmc 32

13 Methods

Rating   Name   Duplication   Size   Complexity  
A BaseLLMBackend._truncate_text() 0 5 1
A BaseLLMBackend.default_params() 0 5 1
B BaseLLMBackend.initialize() 0 35 6
A LLMEnsembleBackend._get_labels_batch() 0 9 1
A BaseLLMBackend._call_llm() 0 30 2
A LLMEnsembleBackend.get_hp_optimizer() 0 2 1
A LLMEnsembleBackend._suggest_batch() 0 24 4
A BaseLLMBackend._get_tokenizer() 0 8 2
B LLMEnsembleBackend._llm_suggest_batch() 0 63 5
A BaseLLMBackend._verify_connection() 0 11 2
A LLMEnsembleOptimizer._postprocess() 0 7 1
A LLMEnsembleOptimizer._objective() 0 24 2
A LLMEnsembleOptimizer._prepare() 0 50 4
"""Backend utilizing a large-language model."""

from __future__ import annotations

import concurrent.futures
import json
import os
from typing import TYPE_CHECKING, Any, Optional

import tiktoken
from openai import AzureOpenAI, BadRequestError, OpenAI, OpenAIError
from transformers import AutoTokenizer

import annif.eval
import annif.parallel
import annif.util
from annif.exception import ConfigurationException, OperationFailedException
from annif.suggestion import SubjectSuggestion, SuggestionBatch

from . import backend, ensemble, hyperopt

if TYPE_CHECKING:
    from annif.corpus.document import DocumentCorpus


class BaseLLMBackend(backend.AnnifBackend):
    """Base class for LLM backends"""

    _client = None

    DEFAULT_PARAMETERS = {
        "api_version": "2024-10-21",
        "temperature": 0.0,
        "top_p": 1.0,
        "seed": 0,
        "max_completion_tokens": 2000,
    }

    def initialize(self, parallel: bool = False) -> None:
        if self._client is not None:
            return

        azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
        api_base_url = os.getenv("LLM_API_BASE_URL")
        try:
            self.model = self.params["model"]
        except KeyError as err:
            raise ConfigurationException(
                "model setting is missing", project_id=self.project.project_id
            ) from err
        if api_base_url is not None:
            self._client = OpenAI(
                base_url=api_base_url,
                api_key=os.getenv("LLM_API_KEY", "dummy-key"),
            )
        elif azure_endpoint is not None:
            self._client = AzureOpenAI(
                azure_endpoint=azure_endpoint,
                api_key=os.getenv("AZURE_OPENAI_KEY"),
                api_version=self.params["api_version"],
            )
        else:
            raise OperationFailedException(
                "Please set the AZURE_OPENAI_ENDPOINT or LLM_API_BASE_URL "
                "environment variable for LLM API access."
            )

        # Tokenizer is unnecessary if truncation is not performed
        if int(self.params["max_prompt_tokens"]) > 0:
            self.tokenizer = self._get_tokenizer()
        self._verify_connection()
        super().initialize(parallel)

    def _get_tokenizer(self):
        try:
            # Try OpenAI tokenizer
            base_model = self.model.rsplit("-", 1)[0]
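            # e.g. a deployment name with a suffix, such as "gpt-4o-mini-dev"
            # (hypothetical example), is reduced to "gpt-4o-mini" for the
            # tiktoken lookup below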
            return tiktoken.encoding_for_model(base_model)
        except KeyError:
            # Fallback to Hugging Face tokenizer
            return AutoTokenizer.from_pretrained(self.model)

    def _verify_connection(self):
        try:
            self._call_llm(
                system_prompt="You are a helpful assistant.",
                prompt="This is a test prompt to verify the connection.",
                params=self.params,
            )
        except OpenAIError as err:
            raise OperationFailedException(
                f"Failed to connect to LLM API: {err}"
            ) from err
        # print(f"Successfully connected to endpoint {self.params['endpoint']}")

    def default_params(self):
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(BaseLLMBackend.DEFAULT_PARAMETERS.copy())
        params.update(self.DEFAULT_PARAMETERS)
        return params

    def _truncate_text(self, text, max_prompt_tokens):
        """Truncate text so it contains at most max_prompt_tokens according to the
        tokenizer in use (tiktoken or the Hugging Face fallback)"""
        tokens = self.tokenizer.encode(text)
        return self.tokenizer.decode(tokens[:max_prompt_tokens])
    def _call_llm(
        self,
        system_prompt: str,
        prompt: str,
        params: dict[str, Any],
        response_format: Optional[dict] = None,
    ) -> str:
        temperature = float(params["temperature"])
        top_p = float(params["top_p"])
        seed = int(params["seed"])
        max_completion_tokens = int(params["max_completion_tokens"])

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ]
        try:
            completion = self._client.chat.completions.create(
                model=self.model,
                messages=messages,
                temperature=temperature,
                seed=seed,
                top_p=top_p,
                max_completion_tokens=max_completion_tokens,
                response_format=response_format,
            )
        except OpenAIError as err:
            print(err)
            return "{}"
        return completion.choices[0].message.content


class LLMEnsembleBackend(BaseLLMBackend, ensemble.EnsembleBackend):
    """Ensemble backend that combines results from multiple projects and scores them
    with an LLM"""

    name = "llm_ensemble"

    DEFAULT_PARAMETERS = {
        "max_prompt_tokens": 0,
        "llm_weight": 0.7,
        "llm_exponent": 1.0,
        "labels_language": "en",
        "sources_limit": 10,
    }
    def get_hp_optimizer(
        self, corpus: DocumentCorpus, metric: str
    ) -> LLMEnsembleOptimizer:
        return LLMEnsembleOptimizer(self, corpus, metric)

    def _suggest_batch(
        self, texts: list[str], params: dict[str, Any]
    ) -> SuggestionBatch:
        sources = annif.util.parse_sources(params["sources"])
        llm_weight = float(params["llm_weight"])
        llm_exponent = float(params["llm_exponent"])
        if llm_weight < 0.0 or llm_weight > 1.0:
            raise ValueError("llm_weight must be between 0.0 and 1.0")
        if llm_exponent < 0.0:
            raise ValueError("llm_exponent must be greater than or equal to 0.0")
        batch_by_source = self._suggest_with_sources(texts, sources)
        merged_source_batch = self._merge_source_batches(
            batch_by_source, sources, {"limit": params["sources_limit"]}
        )

        # Score the suggestion labels with the LLM
        llm_results_batch = self._llm_suggest_batch(texts, merged_source_batch, params)

        batches = [merged_source_batch, llm_results_batch]
        weights = [1.0 - llm_weight, llm_weight]
        exponents = [1.0, llm_exponent]
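        # Assumption about from_averaged: each batch's scores are raised to the
        # corresponding exponent and then combined as a weighted average, roughly
        # (1 - llm_weight) * source_score + llm_weight * llm_score ** llm_exponent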
        return SuggestionBatch.from_averaged(batches, weights, exponents).filter(
            limit=int(params["limit"])
        )

    def _llm_suggest_batch(
        self,
        texts: list[str],
        suggestion_batch: SuggestionBatch,
        params: dict[str, Any],
    ) -> SuggestionBatch:

        max_prompt_tokens = int(params["max_prompt_tokens"])
        system_prompt = """
            You will be given text and a list of keywords to describe it. Your task is
            to score the keywords with a value between 0 and 100. The score value
            should depend on how well the keyword represents the text: a perfect
            keyword should have score 100 and a completely unrelated keyword score
            0. You must output JSON with keywords as field names and add their scores
            as field values.
            There must be the same number of objects in the JSON as there are lines in
            the input keyword list; do not skip scoring any keywords.
        """
        labels_batch = self._get_labels_batch(suggestion_batch)

        def process_single_prompt(text, labels):
            prompt = "Here are the keywords:\n" + "\n".join(labels) + "\n" * 3
            if max_prompt_tokens > 0:
                text = self._truncate_text(text, max_prompt_tokens)
            prompt += "Here is the text:\n" + text + "\n"

            response = self._call_llm(
                system_prompt,
                prompt,
                params,
                response_format={"type": "json_object"},
            )
            try:
                llm_result = json.loads(response)
            except (TypeError, json.decoder.JSONDecodeError) as err:
                print(f"Error decoding JSON response from LLM: '{response[:100]}...'")
                print(f"{str(err)}")
                return [SubjectSuggestion(subject_id=None, score=0.0) for _ in labels]

            return [
                (
                    SubjectSuggestion(
                        subject_id=self.project.subjects.by_label(
                            llm_label, self.params["labels_language"]
                        ),
                        score=score / 100.0,  # LLM scores are between 0 and 100
                    )
                    if llm_label in labels
                    else SubjectSuggestion(subject_id=None, score=0.0)
                )
                for llm_label, score in llm_result.items()
            ]
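        # executor.map yields results in input order, so the per-document suggestion
        # lists below stay aligned with texts and labels_batch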
        with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
            llm_batch_suggestions = list(
                executor.map(process_single_prompt, texts, labels_batch)
            )

        return SuggestionBatch.from_sequence(
            llm_batch_suggestions,
            self.project.subjects,
        )

    def _get_labels_batch(self, suggestion_batch: SuggestionBatch) -> list[list[str]]:
        return [
            [
                self.project.subjects[suggestion.subject_id].labels[
                    self.params["labels_language"]
                ]
                for suggestion in suggestion_result
            ]
            for suggestion_result in suggestion_batch
        ]


class LLMEnsembleOptimizer(ensemble.EnsembleOptimizer):
    """Hyperparameter optimizer for the LLM ensemble backend"""

    def _prepare(self, n_jobs=1):
        sources = annif.util.parse_sources(self._backend.params["sources"])
        project_ids = [source[0] for source in sources]

        # initialize the source projects before forking, to save memory
        # for project_id in sources.keys():
        #     project = self._backend.project.registry.get_project(project_id)
        #     project.initialize(parallel=True)
        self._backend.initialize(parallel=True)

        psmap = annif.parallel.ProjectSuggestMap(
            self._backend.project.registry,
            project_ids,
            backend_params=None,
            limit=None,
            threshold=0.0,
        )

        jobs, pool_class = annif.parallel.get_pool(n_jobs)

        self._gold_batches = []
        self._source_batches = []

        print("Generating source batches")
        with pool_class(jobs) as pool:
            for suggestions_batch, gold_batch in pool.imap_unordered(
                psmap.suggest_batch, self._corpus.doc_batches
            ):
                self._source_batches.append(suggestions_batch)
                self._gold_batches.append(gold_batch)

        # get the llm batches
        print("Generating LLM batches")
        self._merged_source_batches = []
        self._llm_batches = []
        for batch_by_source, docs_batch in zip(
            self._source_batches, self._corpus.doc_batches
        ):
            merged_source_batch = self._backend._merge_source_batches(
                batch_by_source,
                sources,
                {"limit": self._backend.params["sources_limit"]},
            )
            llm_batch = self._backend._llm_suggest_batch(
                [doc.text for doc in docs_batch],
                merged_source_batch,
                self._backend.params,
            )
            self._merged_source_batches.append(merged_source_batch)
            self._llm_batches.append(llm_batch)

    def _objective(self, trial) -> float:
        eval_batch = annif.eval.EvaluationBatch(self._backend.project.subjects)
        params = {
            "llm_weight": trial.suggest_float("llm_weight", 0.0, 1.0),
            "llm_exponent": trial.suggest_float("llm_exponent", 0.25, 10.0, log=True),
        }
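        # Optuna draws llm_weight uniformly from [0.0, 1.0] and llm_exponent
        # log-uniformly from [0.25, 10.0] (log=True) for each trial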
        for merged_source_batch, llm_batch, gold_batch in zip(
            self._merged_source_batches, self._llm_batches, self._gold_batches
        ):
            batches = [merged_source_batch, llm_batch]
            weights = [
                1.0 - params["llm_weight"],
                params["llm_weight"],
            ]
            exponents = [
                1.0,
                params["llm_exponent"],
            ]
            avg_batch = SuggestionBatch.from_averaged(
                batches, weights, exponents
            ).filter(limit=int(self._backend.params["limit"]))
            eval_batch.evaluate_many(avg_batch, gold_batch)
        results = eval_batch.results(metrics=[self._metric])
        return results[self._metric]

    def _postprocess(self, study):
        bp = study.best_params
        lines = [
            f"llm_weight={bp['llm_weight']}",
            f"llm_exponent={bp['llm_exponent']}",
        ]
        return hyperopt.HPRecommendation(lines=lines, score=study.best_value)
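
For context, a hypothetical projects.cfg entry using this backend could look roughly as follows. The backend name and parameter names come from the code above; the project id, vocabulary, source project ids and model name are placeholder assumptions, and the API endpoint and key are read from the AZURE_OPENAI_ENDPOINT/AZURE_OPENAI_KEY or LLM_API_BASE_URL/LLM_API_KEY environment variables as in initialize():

[llm-ensemble-en]
name=LLM ensemble (English)
language=en
backend=llm_ensemble
vocab=example-vocab
sources=source-project-a:1,source-project-b:1
model=gpt-4o-mini
llm_weight=0.7
llm_exponent=1.0
labels_language=en
limit=100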