use max_tokens to do better rate limit handling (#4224)

* use max_tokens to do better rate limit handling * fix unit tests * address greptile comment, thanks greptile
2025-07-28 13:53:28 +02:00 · 2025-03-06 18:12:05 -08:00
parent 08b2421947
commit 0c29743538
19 changed files with 121 additions and 22 deletions
--- a/backend/tests/integration/common_utils/reset.py
+++ b/backend/tests/integration/common_utils/reset.py
@@ -25,7 +25,7 @@ from onyx.indexing.models import IndexingSetting
 from onyx.setup import setup_postgres
 from onyx.setup import setup_vespa
 from onyx.utils.logger import setup_logger
-from tests.integration.common_utils.timeout import run_with_timeout
+from tests.integration.common_utils.timeout import run_with_timeout_multiproc

 logger = setup_logger()

@@ -161,7 +161,7 @@ def reset_postgres(
    for _ in range(NUM_TRIES):
        logger.info(f"Downgrading Postgres... ({_ + 1}/{NUM_TRIES})")
        try:
-            run_with_timeout(
+            run_with_timeout_multiproc(
                downgrade_postgres,
                TIMEOUT,
                kwargs={
--- a/backend/tests/integration/common_utils/timeout.py
+++ b/backend/tests/integration/common_utils/timeout.py
@@ -6,7 +6,9 @@ from typing import TypeVar
 T = TypeVar("T")


-def run_with_timeout(task: Callable[..., T], timeout: int, kwargs: dict[str, Any]) -> T:
+def run_with_timeout_multiproc(
+    task: Callable[..., T], timeout: int, kwargs: dict[str, Any]
+) -> T:
    # Use multiprocessing to prevent a thread from blocking the main thread
    with multiprocessing.Pool(processes=1) as pool:
        async_result = pool.apply_async(task, kwds=kwargs)
--- a/backend/tests/unit/onyx/llm/test_chat_llm.py
+++ b/backend/tests/unit/onyx/llm/test_chat_llm.py
@@ -145,6 +145,7 @@ def test_multiple_tool_calls(default_multi_llm: DefaultMultiLLM) -> None:
            timeout=30,
            parallel_tool_calls=False,
            mock_response=MOCK_LLM_RESPONSE,
+            max_tokens=None,
        )


@@ -290,4 +291,5 @@ def test_multiple_tool_calls_streaming(default_multi_llm: DefaultMultiLLM) -> No
            timeout=30,
            parallel_tool_calls=False,
            mock_response=MOCK_LLM_RESPONSE,
+            max_tokens=None,
        )