| Metric | Value |
| --- | --- |
| Conditions | 5 |
| Total Lines | 63 |
| Code Lines | 43 |
| Comment Lines | 0 |
| Comment Ratio | 0 % |
| Changes | 0 |
Small methods make your code easier to understand, especially when combined with a good name. And when a method is small, finding a good name is usually much easier.
For example, if you find yourself adding a comment above part of a method's body, that is usually a sign that you should extract the commented part into a new method, using the comment as a starting point for its name.
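A minimal before/after sketch of that comment-to-method refactoring (the order-validation names are invented for illustration, not taken from the code reviewed below):

```python
# Before: a comment explains what the block does.
def process_order(order):
    # validate that the order has items and a shipping address
    if not order.items:
        raise ValueError("order has no items")
    if order.address is None:
        raise ValueError("order has no shipping address")
    ...  # continue processing


# After: the comment has become the method name.
def validate_order(order):
    if not order.items:
        raise ValueError("order has no items")
    if order.address is None:
        raise ValueError("order has no shipping address")


def process_order(order):
    validate_order(order)
    ...  # continue processing
```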
Commonly applied refactorings include:

- Extract Method: move a coherent fragment of the body into its own well-named method.
- If many parameters/temporary variables are present: introduce a Parameter Object, or move the whole method into a dedicated class via Replace Method with Method Object, turning the locals into fields (see the sketch below).
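When parameters and temporaries block extraction, Replace Method with Method Object turns them into fields of a small class, after which the body can be split into helper methods with short signatures. A hypothetical sketch (names invented):

```python
class ScoreCalculation:
    """Method object: former parameters and temporaries become fields."""

    def __init__(self, base: float, bonus: float, penalty: float):
        self.base = base
        self.bonus = bonus
        self.penalty = penalty

    def compute(self) -> float:
        # Helpers can now be extracted freely; no arguments to thread through.
        return max(self._raw_score(), 0.0)

    def _raw_score(self) -> float:
        return self.base + self.bonus - self.penalty


# Instead of calculate_score(base, bonus, penalty) juggling temporaries inline:
score = ScoreCalculation(base=10.0, bonus=3.0, penalty=5.0).compute()
```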
| 1 | """Language model based ensemble backend that combines results from multiple |
||
| 74 | |||
| 75 | # Add LLM suggestions to the source batches |
||
| 76 | batch_by_source[self.project.project_id] = self._llm_suggest_batch( |
||
| 77 | texts, merged_source_batch, params |
||
| 78 | ) |
||
| 79 | new_sources = sources + [(self.project.project_id, float(params["llm_weight"]))] |
||
| 80 | return self._merge_source_batches(batch_by_source, new_sources, params) |
||
| 81 | |||
| 82 | def _llm_suggest_batch( |
||
| 83 | self, |
||
| 84 | texts: list[str], |
||
| 85 | suggestion_batch: SuggestionBatch, |
||
| 86 | params: dict[str, Any], |
||
| 87 | ) -> SuggestionBatch: |
||
| 88 | model = params["model"] |
||
| 89 | encoding = tiktoken.encoding_for_model(model.rsplit("-", 1)[0]) |
||
| 90 | labels_batch = self._get_labels_batch(suggestion_batch) |
||
| 91 | |||
| 92 | llm_batch_suggestions = [] |
||
| 93 | for text, labels in zip(texts, labels_batch): |
||
| 94 | prompt = "Here are the keywords:\n" + "\n".join(labels) + "\n" * 3 |
||
| 95 | text = self._truncate_text(text, encoding) |
||
| 96 | prompt += "Here is the text:\n" + text + "\n" |
||
| 97 | |||
| 98 | response = self._call_llm(prompt, model) |
||
| 99 | try: |
||
| 100 | llm_result = json.loads(response) |
||
| 101 | except (TypeError, json.decoder.JSONDecodeError) as err: |
||
| 102 | print(err) |
||
| 103 | llm_result = None |
||
| 104 | continue # TODO: handle this error |
||
| 105 | llm_suggestions = [ |
||
| 106 | SubjectSuggestion( |
||
| 107 | subject_id=self.project.subjects.by_label(llm_label, "en"), |
||
| 108 | score=score, |
||
| 109 | ) |
||
| 110 | for llm_label, score in llm_result.items() |
||
| 111 | ] |
||
| 112 | llm_batch_suggestions.append(llm_suggestions) |
||
| 113 | return SuggestionBatch.from_sequence( |
||
| 114 | llm_batch_suggestions, |
||
| 115 | self.project.subjects, |
||
| 116 | ) |
||
| 117 | |||
| 118 | def _get_labels_batch(self, suggestion_batch: SuggestionBatch) -> list[list[str]]: |
||
| 119 | return [ |
||
| 120 | [ |
||
| 121 | self.project.subjects[suggestion.subject_id].labels[ |
||
| 122 | "en" |
||
| 123 | ] # TODO: make language selectable |
||
| 124 | for suggestion in suggestion_result |
||
| 125 | ] |
||
| 126 | for suggestion_result in suggestion_batch |
||
| 127 | ] |
||
| 128 | |||
| 129 | def _truncate_text(self, text, encoding): |
||
| 130 | """truncate text so it contains at most MAX_PROMPT_TOKENS according to the |
||
| 131 | OpenAI tokenizer""" |
||
| 132 | |||
| 133 | MAX_PROMPT_TOKENS = 14000 |
||
| 134 | tokens = encoding.encode(text) |
||
| 135 | return encoding.decode(tokens[:MAX_PROMPT_TOKENS]) |
||
| 136 | |||
| 137 | def _call_llm(self, prompt: str, model: str): |
||
| 161 |
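The body of `_call_llm` is not included in the listing. As a rough illustration only (assuming the backend talks to the OpenAI chat completions API; the system instruction, temperature, and function shape below are guesses, not the project's code), such a call might look like:

```python
from openai import OpenAI  # assumption: official openai client (v1+)

client = OpenAI()  # reads OPENAI_API_KEY from the environment


def call_llm(prompt: str, model: str) -> str:
    # Ask for a JSON object mapping labels to scores, matching the
    # json.loads() parsing in _llm_suggest_batch above.
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "Score each keyword for relevance to the text. "
                'Reply with JSON like {"keyword": 0.75, ...}.',
            },
            {"role": "user", "content": prompt},
        ],
        temperature=0.0,
    )
    return completion.choices[0].message.content
```

Whatever `_call_llm` returns is fed straight to `json.loads`, which is why the loop above records an empty suggestion list on parse failures: `SuggestionBatch.from_sequence` needs one result per input text.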