Add env variable to disable streaming for the DefaultMultiLLM class

Weves 2024-04-25 09:20:20 -07:00 committed by Chris Weaver
parent 66d95690cb
commit 648f2d06bf
3 changed files with 12 additions and 0 deletions


@@ -91,3 +91,9 @@ GEN_AI_HISTORY_CUTOFF = 3000
# error if the total # of tokens exceeds the max input tokens.
GEN_AI_SINGLE_USER_MESSAGE_EXPECTED_MAX_TOKENS = 512
GEN_AI_TEMPERATURE = float(os.environ.get("GEN_AI_TEMPERATURE") or 0)

# should be used if you are using a custom LLM inference provider that doesn't support
# streaming format AND you are still using the langchain/litellm LLM class
DISABLE_LITELLM_STREAMING = (
    os.environ.get("DISABLE_LITELLM_STREAMING") or "false"
).lower() == "true"
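Not part of the commit, but for context: the parsing idiom above treats only a case-insensitive "true" as enabled; any other value, including an unset variable (where os.environ.get returns None) or an empty string, falls through to "false". A minimal sketch of the same expression against a few values:

import os

# Illustration only: exercises the exact parsing expression from the hunk above.
for value in ("true", "TRUE", "1", "", None):
    if value is None:
        os.environ.pop("DISABLE_LITELLM_STREAMING", None)
    else:
        os.environ["DISABLE_LITELLM_STREAMING"] = value
    flag = (os.environ.get("DISABLE_LITELLM_STREAMING") or "false").lower() == "true"
    print(f"{value!r} -> {flag}")  # only "true" and "TRUE" print True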


@@ -7,6 +7,7 @@ from langchain.schema.language_model import LanguageModelInput
from langchain_community.chat_models import ChatLiteLLM
from danswer.configs.app_configs import LOG_ALL_MODEL_INTERACTIONS
from danswer.configs.model_configs import DISABLE_LITELLM_STREAMING
from danswer.configs.model_configs import GEN_AI_API_ENDPOINT
from danswer.configs.model_configs import GEN_AI_API_VERSION
from danswer.configs.model_configs import GEN_AI_LLM_PROVIDER_TYPE
@@ -70,6 +71,9 @@ class LangChainChatLLM(LLM, abc.ABC):
        if LOG_ALL_MODEL_INTERACTIONS:
            self._log_prompt(prompt)

        if DISABLE_LITELLM_STREAMING:
            return [self.invoke(prompt)]

        output_tokens = []
        for token in message_generator_to_string_generator(self.llm.stream(prompt)):
            output_tokens.append(token)
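From a caller's point of view, the intent of the fallback is that stream() still hands back an iterable: a one-element list holding the full response from invoke() instead of a token-by-token stream. A hedged usage sketch; the hunk is truncated, so the assumption that the list is returned directly to the caller, the DefaultMultiLLM class name (taken from the commit title), and its constructor arguments are all unverified here:

# Hypothetical setup; real constructor arguments may differ.
llm = DefaultMultiLLM(model_provider="custom", model_name="my-model")

# With DISABLE_LITELLM_STREAMING=true this loop should run exactly once,
# receiving the full response from self.invoke(prompt); otherwise it
# consumes tokens produced by self.llm.stream(prompt).
for chunk in llm.stream("Why is the sky blue?"):
    print(chunk, end="", flush=True)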


@@ -44,6 +44,7 @@ services:
      - DISABLE_LLM_CHOOSE_SEARCH=${DISABLE_LLM_CHOOSE_SEARCH:-}
      - DISABLE_LLM_QUERY_REPHRASE=${DISABLE_LLM_QUERY_REPHRASE:-}
      - DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-}
      - DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-}
      # if set, allows for the use of the token budget system
      - TOKEN_BUDGET_GLOBALLY_ENABLED=${TOKEN_BUDGET_GLOBALLY_ENABLED:-}
      # Enables the use of bedrock models
@@ -117,6 +118,7 @@ services:
      - DISABLE_LLM_QUERY_REPHRASE=${DISABLE_LLM_QUERY_REPHRASE:-}
      - DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-}
      - GENERATIVE_MODEL_ACCESS_CHECK_FREQ=${GENERATIVE_MODEL_ACCESS_CHECK_FREQ:-}
      - DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-}
      # Query Options
      - DOC_TIME_DECAY=${DOC_TIME_DECAY:-} # Recency Bias for search results, decay at 1 / (1 + DOC_TIME_DECAY * x years)
      - HYBRID_ALPHA=${HYBRID_ALPHA:-} # Hybrid Search Alpha (0 for entirely keyword, 1 for entirely vector)
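Both compose hunks forward the same variable into their respective containers. The ${DISABLE_LITELLM_STREAMING:-} substitution resolves to the host's value, or to an empty string when the variable is unset; the empty string then falls through to "false" in the Python parsing above, so existing deployments keep streaming enabled by default. Setting DISABLE_LITELLM_STREAMING=true in the host environment (or in a compose .env file) before starting the stack switches both services to the non-streaming path.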