annif.backend.llm_ensemble.LLMEnsembleBackend._suggest_batch() - Code Metrics - Inspection of "Rename file to adhere naming convention" - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — experiment-llm-ensemble-backen... ( fc5cf8...212700 )

by Juho

created 2025-05-06 07:14 UTC

LLMEnsembleBackend._suggest_batch() A

↳ Parent: annif.backend.llm_ensemble

Complexity

Conditions

Size

Total Lines	20
Code Lines	14

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	3
eloc	14
nop	3
dl	0
loc	20
rs	9.7
c	0
b	0
f	0

"""Language model based ensemble backend that combines results from multiple
projects."""

from __future__ import annotations

import json
import os
from typing import TYPE_CHECKING, Any

import tiktoken
from openai import AzureOpenAI, BadRequestError

import annif.eval
import annif.parallel
import annif.util
from annif.exception import NotSupportedException
from annif.suggestion import SubjectSuggestion, SuggestionBatch

from . import backend, ensemble

# from openai import AsyncAzureOpenAI


if TYPE_CHECKING:
    from annif.corpus.document import DocumentCorpus


class BaseLLMBackend(backend.AnnifBackend):
    """Common base for backends that query a model via the Azure OpenAI client.

    The client is built in ``initialize()`` from the ``endpoint`` and
    ``api_version`` backend parameters; the API key is read from the
    ``AZURE_OPENAI_KEY`` environment variable.
    """

    # Defaults shared by every LLM backend; subclasses layer their own
    # DEFAULT_PARAMETERS on top of these (see default_params()).
    DEFAULT_PARAMETERS = {
        "api_version": "2024-10-21",
        "temperature": 0.0,
        "top_p": 1.0,
        "seed": 0,
    }

    def initialize(self, parallel: bool = False) -> None:
        """Run the parent initialization, then create the Azure OpenAI client.

        The client is stored on ``self.client``. There is no default for the
        ``endpoint`` parameter in this class, so it has to come from the
        backend configuration.
        """
        super().initialize(parallel)
        client_options = {
            "azure_endpoint": self.params["endpoint"],
            "api_version": self.params["api_version"],
            "api_key": os.environ.get("AZURE_OPENAI_KEY"),
        }
        self.client = AzureOpenAI(**client_options)
        # TODO: Verify the connection?

    def default_params(self):
        """Return the effective default parameters as a new dict.

        Three layers are merged, later ones overriding earlier ones: the
        generic backend defaults, the LLM defaults of this base class, and
        the ``DEFAULT_PARAMETERS`` of the concrete class of ``self``.
        """
        return {
            **backend.AnnifBackend.DEFAULT_PARAMETERS,
            **BaseLLMBackend.DEFAULT_PARAMETERS,
            **self.DEFAULT_PARAMETERS,
        }


class LLMEnsembleBackend(BaseLLMBackend, ensemble.EnsembleBackend):
    """Ensemble backend that lets an LLM re-score the merged source suggestions.

    For every document the suggestions of the source projects are merged, the
    labels of the merged suggestions are sent to the LLM together with the
    (truncated) document text, and the scores returned by the LLM are averaged
    with the merged source scores using the weights
    ``(1 - llm_weight, llm_weight)``.
    """

    name = "llm_ensemble"

    DEFAULT_PARAMETERS = {
        "max_prompt_tokens": 127000,
        "llm_weight": 0.7,
        "labels_language": "en",
        "sources_limit": 10,
    }

    # System message sent with every request.
    # NOTE(review): "intput" below is a typo for "input". It is left untouched
    # here because changing the prompt changes what the model receives;
    # fix it deliberately together with a re-evaluation.
    system_prompt = """
        You will be given text and a list of keywords to describe it. Your task is to
        score the keywords with a value between 0.0 and 1.0. The score value
        should depend on how well the keyword represents the text: a perfect
        keyword should have score 1.0 and completely unrelated keyword score
        0.0. You must output JSON with keywords as field names and add their scores
        as field values.
        There must be the same number of objects in the JSON as there are lines in the
        intput keyword list; do not skip scoring any keywords.
    """
    # Candidate extra prompt sentence, currently NOT part of the prompt:
    # Give zero or very low score to the keywords that do not describe the text.

    def get_hp_optimizer(self, corpus: DocumentCorpus, metric: str) -> None:
        """Always raise: this backend has no hyperparameter optimizer.

        Raises:
            NotSupportedException: unconditionally.
        """
        raise NotSupportedException(
            "Hyperparameter optimization for LLM ensemble backend is not possible."
        )

    def _suggest_batch(
        self, texts: list[str], params: dict[str, Any]
    ) -> SuggestionBatch:
        """Suggest subjects for a batch of texts.

        Args:
            texts: the documents to annotate.
            params: effective parameters; reads ``sources``, ``llm_weight``,
                ``sources_limit`` and ``limit`` here, and passes the dict on
                to the LLM scoring step.

        Returns:
            The weighted average of the merged source suggestions and the
            LLM scores, filtered to at most ``limit`` suggestions.

        Raises:
            ValueError: if ``llm_weight`` is not a number within [0.0, 1.0].
        """
        sources = annif.util.parse_sources(params["sources"])
        llm_weight = float(params["llm_weight"])
        # Chained comparison on purpose: it also rejects NaN, which would
        # otherwise slip through and poison the averaged scores.
        if not 0.0 <= llm_weight <= 1.0:
            raise ValueError("llm_weight must be between 0.0 and 1.0")

        batch_by_source = self._suggest_with_sources(texts, sources)
        merged_source_batch = self._merge_source_batches(
            batch_by_source, sources, {"limit": params["sources_limit"]}
        )

        # Score the suggestion labels with the LLM
        llm_results_batch = self._llm_suggest_batch(texts, merged_source_batch, params)

        batches = [merged_source_batch, llm_results_batch]
        weights = [1.0 - llm_weight, llm_weight]
        return SuggestionBatch.from_averaged(batches, weights).filter(
            limit=int(params["limit"])
        )

    def _llm_suggest_batch(
        self,
        texts: list[str],
        suggestion_batch: SuggestionBatch,
        params: dict[str, Any],
    ) -> SuggestionBatch:
        """Ask the LLM to score the labels of ``suggestion_batch`` per document.

        Args:
            texts: the documents, in the same order as ``suggestion_batch``.
            suggestion_batch: suggestions whose labels are to be scored.
            params: effective parameters; reads ``model`` and
                ``max_prompt_tokens`` and forwards the dict to ``_call_llm``.

        Returns:
            A SuggestionBatch built from the LLM scores, one result per text.
        """
        model = params["model"]
        encoding = self._get_encoding(model)
        max_prompt_tokens = int(params["max_prompt_tokens"])

        labels_batch = self._get_labels_batch(suggestion_batch)

        llm_batch_suggestions = []
        for text, labels in zip(texts, labels_batch):
            # Only the document text is truncated; the keyword list and the
            # system prompt come on top of max_prompt_tokens.
            text = self._truncate_text(text, encoding, max_prompt_tokens)
            prompt = (
                "Here are the keywords:\n"
                + "\n".join(labels)
                + "\n" * 3
                + "Here is the text:\n"
                + text
                + "\n"
            )
            response = self._call_llm(prompt, model, params)
            llm_batch_suggestions.append(self._parse_llm_response(response, labels))

        return SuggestionBatch.from_sequence(
            llm_batch_suggestions,
            self.project.subjects,
        )

    @staticmethod
    def _get_encoding(model: str):
        """Return the tiktoken encoding to use for ``model``.

        The name with its last ``-suffix`` removed is tried first (the
        original behaviour). If tiktoken cannot map that name (it raises
        KeyError, e.g. "gpt-4o" would be stripped to "gpt"), the full model
        name is tried; a KeyError from that second attempt propagates.
        """
        try:
            return tiktoken.encoding_for_model(model.rsplit("-", 1)[0])
        except KeyError:
            return tiktoken.encoding_for_model(model)

    @staticmethod
    def _normalize_score(raw_score: Any) -> float:
        """Coerce a score taken from the LLM reply to a float in [0.0, 1.0].

        Values that cannot be converted to float, and NaN, become 0.0;
        everything else is clamped into the valid range.
        """
        try:
            value = float(raw_score)
        except (TypeError, ValueError):
            return 0.0
        if value != value:  # NaN is the only float not equal to itself
            return 0.0
        return min(max(value, 0.0), 1.0)

    def _parse_llm_response(
        self, response: str | None, labels: list[str]
    ) -> list[SubjectSuggestion]:
        """Turn one raw LLM reply into suggestions for a single document.

        Args:
            response: the text returned by ``_call_llm``.
            labels: the keyword labels that were sent for this document.

        Returns:
            One suggestion per key of the JSON object in the reply. Keys
            that were not among ``labels`` yield a zero-score suggestion
            without subject. If the reply cannot be decoded, or is not a JSON
            object, one such zero-score suggestion per label is returned.
        """
        # NOTE(review): self.warning() is the logging helper expected from the
        # backend base class (defined outside this file). It replaces print(),
        # which wrote diagnostics to stdout.
        try:
            llm_result = json.loads(response)
        except (TypeError, json.JSONDecodeError) as err:
            self.warning(f"Error decoding JSON response from LLM: {response}")
            self.warning(f"Error: {err}")
            return [SubjectSuggestion(subject_id=None, score=0.0) for _ in labels]
        if not isinstance(llm_result, dict):
            # Valid JSON but not an object (e.g. a list): nothing to look up.
            self.warning(f"Unexpected JSON structure in LLM response: {response}")
            return [SubjectSuggestion(subject_id=None, score=0.0) for _ in labels]

        requested_labels = set(labels)
        language = self.params["labels_language"]
        subjects = self.project.subjects
        suggestions = []
        for llm_label, raw_score in llm_result.items():
            if llm_label not in requested_labels:
                # The model invented or altered a keyword; ignore its score.
                suggestions.append(SubjectSuggestion(subject_id=None, score=0.0))
                continue
            suggestions.append(
                SubjectSuggestion(
                    subject_id=subjects.by_label(llm_label, language),
                    score=self._normalize_score(raw_score),
                )
            )
        return suggestions

    def _get_labels_batch(self, suggestion_batch: SuggestionBatch) -> list[list[str]]:
        """Return, per document, the labels of its suggested subjects.

        Labels are looked up in the language given by the backend parameter
        ``labels_language``.
        """
        language = self.params["labels_language"]
        subjects = self.project.subjects
        return [
            [subjects[suggestion.subject_id].labels[language] for suggestion in result]
            for result in suggestion_batch
        ]

    def _truncate_text(
        self, text: str, encoding: tiktoken.Encoding, max_prompt_tokens: int
    ) -> str:
        """Truncate text so it contains at most max_prompt_tokens according to
        the OpenAI tokenizer.

        Text that already fits is returned unchanged, which avoids a needless
        decode round trip.
        """
        tokens = encoding.encode(text)
        if len(tokens) <= max_prompt_tokens:
            return text
        return encoding.decode(tokens[:max_prompt_tokens])

    def _call_llm(self, prompt: str, model: str, params: dict[str, Any]) -> str:
        """Send one chat completion request and return the reply text.

        Args:
            prompt: the user message (keywords plus document text).
            model: model name passed to the chat completions call.
            params: effective parameters; reads ``temperature``, ``top_p``
                and ``seed``.

        Returns:
            The content of the first choice. ``"{}"`` is returned when the
            request is rejected with BadRequestError or when the reply has no
            content, so the result can always be passed to ``json.loads``.
        """
        temperature = float(params["temperature"])
        top_p = float(params["top_p"])
        seed = int(params["seed"])

        messages = [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": prompt},
        ]
        try:
            completion = self.client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
                seed=seed,
                top_p=top_p,
                response_format={"type": "json_object"},
            )
        except BadRequestError as err:
            # TODO: openai.RateLimitError is not handled here and propagates.
            self.warning(f"LLM request failed: {err}")
            return "{}"

        content = completion.choices[0].message.content
        if content is None:
            self.warning("LLM returned a message without content")
            return "{}"
        return content


1			"""Language model based ensemble backend that combines results from multiple
2			projects."""
3
4			from __future__ import annotations
5
6			import json
7			import os
8			from typing import TYPE_CHECKING, Any
9
10			import tiktoken
11			from openai import AzureOpenAI, BadRequestError
12
13			import annif.eval
14			import annif.parallel
15			import annif.util
16			from annif.exception import NotSupportedException
17			from annif.suggestion import SubjectSuggestion, SuggestionBatch
18
19			from . import backend, ensemble
20
21			# from openai import AsyncAzureOpenAI
22
23
24			if TYPE_CHECKING:
25			from annif.corpus.document import DocumentCorpus
26
27
28			class BaseLLMBackend(backend.AnnifBackend):
29			# """Base class for TODO backends"""
30
31			DEFAULT_PARAMETERS = {
32			"api_version": "2024-10-21",
33			"temperature": 0.0,
34			"top_p": 1.0,
35			"seed": 0,
36			}
37
38			def initialize(self, parallel: bool = False) -> None:
39			super().initialize(parallel)
40			self.client = AzureOpenAI(
41			azure_endpoint=self.params["endpoint"],
42			api_version=self.params["api_version"],
43			api_key=os.getenv("AZURE_OPENAI_KEY"),
44			)
45			# TODO: Verify the connection?
46
47			def default_params(self):
48			params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
49			params.update(BaseLLMBackend.DEFAULT_PARAMETERS.copy())
50			params.update(self.DEFAULT_PARAMETERS)
51			return params
52
53
54			class LLMEnsembleBackend(BaseLLMBackend, ensemble.EnsembleBackend):
55			# """TODO backend that combines results from multiple projects"""
56
57			name = "llm_ensemble"
58
59			DEFAULT_PARAMETERS = {
60			"max_prompt_tokens": 127000,
61			"llm_weight": 0.7,
62			"labels_language": "en",
63			"sources_limit": 10,
64			}
65
66			system_prompt = """
67			You will be given text and a list of keywords to describe it. Your task is to
68			score the keywords with a value between 0.0 and 1.0. The score value
69			should depend on how well the keyword represents the text: a perfect
70			keyword should have score 1.0 and completely unrelated keyword score
71			0.0. You must output JSON with keywords as field names and add their scores
72			as field values.
73			There must be the same number of objects in the JSON as there are lines in the
74			intput keyword list; do not skip scoring any keywords.
75			"""
76			# Give zero or very low score to the keywords that do not describe the text.
77
78			def get_hp_optimizer(self, corpus: DocumentCorpus, metric: str) -> None:
79			raise NotSupportedException(
80			"Hyperparameter optimization for LLM ensemble backend is not possible."
81			)
82
83			def _suggest_batch(
84			self, texts: list[str], params: dict[str, Any]
85			) -> SuggestionBatch:
86			sources = annif.util.parse_sources(params["sources"])
87			llm_weight = float(params["llm_weight"])
88			if llm_weight < 0.0 or llm_weight > 1.0:
89			raise ValueError("llm_weight must be between 0.0 and 1.0")
90
91			batch_by_source = self._suggest_with_sources(texts, sources)
92			merged_source_batch = self._merge_source_batches(
93			batch_by_source, sources, {"limit": params["sources_limit"]}
94			)
95
96			# Score the suggestion labels with the LLM
97			llm_results_batch = self._llm_suggest_batch(texts, merged_source_batch, params)
98
99			batches = [merged_source_batch, llm_results_batch]
100			weights = [1.0 - llm_weight, llm_weight]
101			return SuggestionBatch.from_averaged(batches, weights).filter(
102			limit=int(params["limit"])
103			)
104
105			def _llm_suggest_batch(
106			self,
107			texts: list[str],
108			suggestion_batch: SuggestionBatch,
109			params: dict[str, Any],
110			) -> SuggestionBatch:
111
112			model = params["model"]
113			encoding = tiktoken.encoding_for_model(model.rsplit("-", 1)[0])
114			max_prompt_tokens = int(params["max_prompt_tokens"])
115
116			labels_batch = self._get_labels_batch(suggestion_batch)
117
118			llm_batch_suggestions = []
119			for text, labels in zip(texts, labels_batch):
120			prompt = "Here are the keywords:\n" + "\n".join(labels) + "\n" * 3
121			text = self._truncate_text(text, encoding, max_prompt_tokens)
122			prompt += "Here is the text:\n" + text + "\n"
123
124			response = self._call_llm(prompt, model, params)
125			try:
126			llm_result = json.loads(response)
127			except (TypeError, json.decoder.JSONDecodeError) as err:
128			print(f"Error decoding JSON response from LLM: {response}")
129			print(f"Error: {err}")
130			llm_batch_suggestions.append(
131			[SubjectSuggestion(subject_id=None, score=0.0) for _ in labels]
132			)
133			continue
134			llm_batch_suggestions.append(
135			[
136			(
137			SubjectSuggestion(
138			subject_id=self.project.subjects.by_label(
139			llm_label, self.params["labels_language"]
140			),
141			score=score,
142			)
143			if llm_label in labels
144			else SubjectSuggestion(subject_id=None, score=0.0)
145			)
146			for llm_label, score in llm_result.items()
147			]
148			)
149
150			return SuggestionBatch.from_sequence(
151			llm_batch_suggestions,
152			self.project.subjects,
153			)
154
155			def _get_labels_batch(self, suggestion_batch: SuggestionBatch) -> list[list[str]]:
156			return [
157			[
158			self.project.subjects[suggestion.subject_id].labels[
159			self.params["labels_language"]
160			]
161			for suggestion in suggestion_result
162			]
163			for suggestion_result in suggestion_batch
164			]
165
166			def _truncate_text(self, text, encoding, max_prompt_tokens):
167			"""truncate text so it contains at most max_prompt_tokens according to the
168			OpenAI tokenizer"""
169			tokens = encoding.encode(text)
170			return encoding.decode(tokens[:max_prompt_tokens])
171
172			def _call_llm(self, prompt: str, model: str, params: dict[str, Any]) -> str:
173			temperature = float(params["temperature"])
174			top_p = float(params["top_p"])
175			seed = int(params["seed"])
176
177			messages = [
178			{"role": "system", "content": self.system_prompt},
179			{"role": "user", "content": prompt},
180			]
181			try:
182			completion = self.client.chat.completions.create(
183			model=model,
184			messages=messages,
185			temperature=temperature,
186			seed=seed,
187			top_p=top_p,
188			response_format={"type": "json_object"},
189			)
190
191			completion = completion.choices[0].message.content
192			return completion
193			except BadRequestError as err: # openai.RateLimitError
194			print(err)
195			return "{}"
196

NatLibFi / Annif

Push — experiment-llm-ensemble-backen... ( fc5cf8...212700 )

LLMEnsembleBackend._suggest_batch() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like