Mirror of https://github.com/danswer-ai/danswer.git, synced 2025-06-26 16:01:09 +02:00
prompt piece optimizations
commit a96728ff4d
parent eaffdee0dc
@@ -60,17 +60,19 @@ def build_sub_question_answer_prompt(
 
 
 def trim_prompt_piece(config: LLMConfig, prompt_piece: str, reserved_str: str) -> str:
-    # TODO: this truncating might add latency. We could do a rougher + faster check
-    # first to determine whether truncation is needed
-
-    # TODO: maybe save the tokenizer and max input tokens if this is getting called multiple times?
-    llm_tokenizer = get_tokenizer(
-        provider_type=config.model_provider,
+    # TODO: save the max input tokens in LLMConfig
+    max_tokens = get_max_input_tokens(
+        model_provider=config.model_provider,
         model_name=config.model_name,
     )
 
-    max_tokens = get_max_input_tokens(
-        model_provider=config.model_provider,
+    # no need to trim if a conservative estimate of one token
+    # per character is already less than the max tokens
+    if len(prompt_piece) + len(reserved_str) < max_tokens:
+        return prompt_piece
+
+    llm_tokenizer = get_tokenizer(
+        provider_type=config.model_provider,
         model_name=config.model_name,
     )
 
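The reordering pays off because fetching the max input token count is cheap compared to constructing a tokenizer and encoding the whole prompt piece, so the common case (the piece already fits) never touches the tokenizer. Below is a minimal, self-contained sketch of the same early-exit pattern; trim_piece, encode, and decode are illustrative placeholders rather than Danswer's actual helpers, and the one-token-per-character bound mirrors the comment in the commit.

# Sketch of the early-exit trimming pattern; names are illustrative,
# not Danswer's actual helpers.
from typing import Callable, List


def trim_piece(
    prompt_piece: str,
    reserved_str: str,
    max_tokens: int,
    encode: Callable[[str], List[int]],
    decode: Callable[[List[int]], str],
) -> str:
    # Conservative bound: assume one token per character. If even that
    # over-estimate fits within the budget, skip tokenization entirely.
    if len(prompt_piece) + len(reserved_str) < max_tokens:
        return prompt_piece

    # Otherwise pay for real tokenization and trim to the remaining budget.
    reserved_tokens = len(encode(reserved_str))
    budget = max(max_tokens - reserved_tokens, 0)
    return decode(encode(prompt_piece)[:budget])

Since ordinary text averages several characters per token, one token per character over-counts; the check therefore only skips trimming when it is definitely unnecessary and never lets a piece that might overflow through untrimmed.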