From 648f2d06bfdecc781c400f9723356887e2a7719c Mon Sep 17 00:00:00 2001
From: Weves
Date: Thu, 25 Apr 2024 09:20:20 -0700
Subject: [PATCH] Add env variable to disable streaming for the DefaultMultiLLM class

---
 backend/danswer/configs/model_configs.py         | 6 ++++++
 backend/danswer/llm/chat_llm.py                  | 4 ++++
 deployment/docker_compose/docker-compose.dev.yml | 2 ++
 3 files changed, 12 insertions(+)

diff --git a/backend/danswer/configs/model_configs.py b/backend/danswer/configs/model_configs.py
index e0d774c82b..1e60931519 100644
--- a/backend/danswer/configs/model_configs.py
+++ b/backend/danswer/configs/model_configs.py
@@ -91,3 +91,9 @@ GEN_AI_HISTORY_CUTOFF = 3000
 # error if the total # of tokens exceeds the max input tokens.
 GEN_AI_SINGLE_USER_MESSAGE_EXPECTED_MAX_TOKENS = 512
 GEN_AI_TEMPERATURE = float(os.environ.get("GEN_AI_TEMPERATURE") or 0)
+
+# should be used if you are using a custom LLM inference provider that doesn't support
+# streaming format AND you are still using the langchain/litellm LLM class
+DISABLE_LITELLM_STREAMING = (
+    os.environ.get("DISABLE_LITELLM_STREAMING") or "false"
+).lower() == "true"
diff --git a/backend/danswer/llm/chat_llm.py b/backend/danswer/llm/chat_llm.py
index 63024c1880..706559d779 100644
--- a/backend/danswer/llm/chat_llm.py
+++ b/backend/danswer/llm/chat_llm.py
@@ -7,6 +7,7 @@ from langchain.schema.language_model import LanguageModelInput
 from langchain_community.chat_models import ChatLiteLLM
 
 from danswer.configs.app_configs import LOG_ALL_MODEL_INTERACTIONS
+from danswer.configs.model_configs import DISABLE_LITELLM_STREAMING
 from danswer.configs.model_configs import GEN_AI_API_ENDPOINT
 from danswer.configs.model_configs import GEN_AI_API_VERSION
 from danswer.configs.model_configs import GEN_AI_LLM_PROVIDER_TYPE
@@ -70,6 +71,9 @@ class LangChainChatLLM(LLM, abc.ABC):
         if LOG_ALL_MODEL_INTERACTIONS:
             self._log_prompt(prompt)
 
+        if DISABLE_LITELLM_STREAMING:
+            return [self.invoke(prompt)]
+
         output_tokens = []
         for token in message_generator_to_string_generator(self.llm.stream(prompt)):
             output_tokens.append(token)
diff --git a/deployment/docker_compose/docker-compose.dev.yml b/deployment/docker_compose/docker-compose.dev.yml
index 4cb7196f7d..08bbf7043c 100644
--- a/deployment/docker_compose/docker-compose.dev.yml
+++ b/deployment/docker_compose/docker-compose.dev.yml
@@ -44,6 +44,7 @@ services:
       - DISABLE_LLM_CHOOSE_SEARCH=${DISABLE_LLM_CHOOSE_SEARCH:-}
       - DISABLE_LLM_QUERY_REPHRASE=${DISABLE_LLM_QUERY_REPHRASE:-}
       - DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-}
+      - DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-}
       # if set, allows for the use of the token budget system
       - TOKEN_BUDGET_GLOBALLY_ENABLED=${TOKEN_BUDGET_GLOBALLY_ENABLED:-}
       # Enables the use of bedrock models
@@ -117,6 +118,7 @@ services:
       - DISABLE_LLM_QUERY_REPHRASE=${DISABLE_LLM_QUERY_REPHRASE:-}
      - DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-}
       - GENERATIVE_MODEL_ACCESS_CHECK_FREQ=${GENERATIVE_MODEL_ACCESS_CHECK_FREQ:-}
+      - DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-}
       # Query Options
       - DOC_TIME_DECAY=${DOC_TIME_DECAY:-} # Recency Bias for search results, decay at 1 / (1 + DOC_TIME_DECAY * x years)
       - HYBRID_ALPHA=${HYBRID_ALPHA:-} # Hybrid Search Alpha (0 for entirely keyword, 1 for entirely vector)
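
Usage sketch: the compose entries above only pass the variable through via ${DISABLE_LITELLM_STREAMING:-}, so one assumed way to exercise the new flag against the dev stack is to export it in the shell before bringing the services up (assuming the stack is started with docker compose from deployment/docker_compose/):

    # hedged example; adjust to however you normally launch the dev stack
    export DISABLE_LITELLM_STREAMING=true
    docker compose -f docker-compose.dev.yml up -d

When the variable is unset, the :- default leaves it empty inside the container and the `or "false"` fallback in model_configs.py keeps streaming enabled; only the literal string "true" (any casing) switches the LLM class to the non-streaming path.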