From 648f2d06bfdecc781c400f9723356887e2a7719c Mon Sep 17 00:00:00 2001
From: Weves
Date: Thu, 25 Apr 2024 09:20:20 -0700
Subject: [PATCH] Add env variable to disable streaming for the DefaultMultiLLM class

---
 backend/danswer/configs/model_configs.py         | 6 ++++++
 backend/danswer/llm/chat_llm.py                  | 4 ++++
 deployment/docker_compose/docker-compose.dev.yml | 2 ++
 3 files changed, 12 insertions(+)

diff --git a/backend/danswer/configs/model_configs.py b/backend/danswer/configs/model_configs.py
index e0d774c82b..1e60931519 100644
--- a/backend/danswer/configs/model_configs.py
+++ b/backend/danswer/configs/model_configs.py
@@ -91,3 +91,9 @@ GEN_AI_HISTORY_CUTOFF = 3000
 # error if the total # of tokens exceeds the max input tokens.
 GEN_AI_SINGLE_USER_MESSAGE_EXPECTED_MAX_TOKENS = 512
 GEN_AI_TEMPERATURE = float(os.environ.get("GEN_AI_TEMPERATURE") or 0)
+
+# should be used if you are using a custom LLM inference provider that doesn't support
+# streaming format AND you are still using the langchain/litellm LLM class
+DISABLE_LITELLM_STREAMING = (
+    os.environ.get("DISABLE_LITELLM_STREAMING") or "false"
+).lower() == "true"
diff --git a/backend/danswer/llm/chat_llm.py b/backend/danswer/llm/chat_llm.py
index 63024c1880..706559d779 100644
--- a/backend/danswer/llm/chat_llm.py
+++ b/backend/danswer/llm/chat_llm.py
@@ -7,6 +7,7 @@ from langchain.schema.language_model import LanguageModelInput
 from langchain_community.chat_models import ChatLiteLLM
 
 from danswer.configs.app_configs import LOG_ALL_MODEL_INTERACTIONS
+from danswer.configs.model_configs import DISABLE_LITELLM_STREAMING
 from danswer.configs.model_configs import GEN_AI_API_ENDPOINT
 from danswer.configs.model_configs import GEN_AI_API_VERSION
 from danswer.configs.model_configs import GEN_AI_LLM_PROVIDER_TYPE
@@ -70,6 +71,9 @@ class LangChainChatLLM(LLM, abc.ABC):
         if LOG_ALL_MODEL_INTERACTIONS:
             self._log_prompt(prompt)
 
+        if DISABLE_LITELLM_STREAMING:
+            return [self.invoke(prompt)]
+
         output_tokens = []
         for token in message_generator_to_string_generator(self.llm.stream(prompt)):
             output_tokens.append(token)
diff --git a/deployment/docker_compose/docker-compose.dev.yml b/deployment/docker_compose/docker-compose.dev.yml
index 4cb7196f7d..08bbf7043c 100644
--- a/deployment/docker_compose/docker-compose.dev.yml
+++ b/deployment/docker_compose/docker-compose.dev.yml
@@ -44,6 +44,7 @@ services:
       - DISABLE_LLM_CHOOSE_SEARCH=${DISABLE_LLM_CHOOSE_SEARCH:-}
       - DISABLE_LLM_QUERY_REPHRASE=${DISABLE_LLM_QUERY_REPHRASE:-}
       - DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-}
+      - DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-}
       # if set, allows for the use of the token budget system
       - TOKEN_BUDGET_GLOBALLY_ENABLED=${TOKEN_BUDGET_GLOBALLY_ENABLED:-}
       # Enables the use of bedrock models
@@ -117,6 +118,7 @@ services:
       - DISABLE_LLM_QUERY_REPHRASE=${DISABLE_LLM_QUERY_REPHRASE:-}
      - DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-}
       - GENERATIVE_MODEL_ACCESS_CHECK_FREQ=${GENERATIVE_MODEL_ACCESS_CHECK_FREQ:-}
+      - DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-}
       # Query Options
       - DOC_TIME_DECAY=${DOC_TIME_DECAY:-} # Recency Bias for search results, decay at 1 / (1 + DOC_TIME_DECAY * x years)
       - HYBRID_ALPHA=${HYBRID_ALPHA:-} # Hybrid Search Alpha (0 for entirely keyword, 1 for entirely vector)
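
Usage sketch: the compose entries above only pass the variable through via ${DISABLE_LITELLM_STREAMING:-}, so one assumed way to exercise the new flag against the dev stack is to export it in the shell before bringing the services up (assuming the stack is started with docker compose from deployment/docker_compose/):

    # hedged example; adjust to however you normally launch the dev stack
    export DISABLE_LITELLM_STREAMING=true
    docker compose -f docker-compose.dev.yml up -d

When the variable is unset, the :- default leaves it empty inside the container and the `or "false"` fallback in model_configs.py keeps streaming enabled; only the literal string "true" (any casing) switches the LLM class to the non-streaming path.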