use max_tokens to do better rate limit handling (#4224)

* use max_tokens to do better rate limit handling

* fix unit tests

* address greptile comment, thanks greptile
This commit is contained in:
evan-danswer
2025-03-06 18:12:05 -08:00
committed by GitHub
parent 08b2421947
commit 0c29743538
19 changed files with 121 additions and 22 deletions

View File

@@ -25,7 +25,7 @@ from onyx.indexing.models import IndexingSetting
from onyx.setup import setup_postgres
from onyx.setup import setup_vespa
from onyx.utils.logger import setup_logger
from tests.integration.common_utils.timeout import run_with_timeout
from tests.integration.common_utils.timeout import run_with_timeout_multiproc
logger = setup_logger()
@@ -161,7 +161,7 @@ def reset_postgres(
for _ in range(NUM_TRIES):
logger.info(f"Downgrading Postgres... ({_ + 1}/{NUM_TRIES})")
try:
run_with_timeout(
run_with_timeout_multiproc(
downgrade_postgres,
TIMEOUT,
kwargs={

View File

@@ -6,7 +6,9 @@ from typing import TypeVar
T = TypeVar("T")
def run_with_timeout(task: Callable[..., T], timeout: int, kwargs: dict[str, Any]) -> T:
def run_with_timeout_multiproc(
task: Callable[..., T], timeout: int, kwargs: dict[str, Any]
) -> T:
# Use multiprocessing to prevent a thread from blocking the main thread
with multiprocessing.Pool(processes=1) as pool:
async_result = pool.apply_async(task, kwds=kwargs)

View File

@@ -145,6 +145,7 @@ def test_multiple_tool_calls(default_multi_llm: DefaultMultiLLM) -> None:
timeout=30,
parallel_tool_calls=False,
mock_response=MOCK_LLM_RESPONSE,
max_tokens=None,
)
@@ -290,4 +291,5 @@ def test_multiple_tool_calls_streaming(default_multi_llm: DefaultMultiLLM) -> No
timeout=30,
parallel_tool_calls=False,
mock_response=MOCK_LLM_RESPONSE,
max_tokens=None,
)