Add env variable to disable streaming for the DefaultMultiLLM class

Authored by Weves on 2024-04-25 09:20:20 -07:00; committed by Chris Weaver
parent 66d95690cb
commit 648f2d06bf
3 changed files with 12 additions and 0 deletions

View File

@@ -91,3 +91,9 @@ GEN_AI_HISTORY_CUTOFF = 3000
 # error if the total # of tokens exceeds the max input tokens.
 GEN_AI_SINGLE_USER_MESSAGE_EXPECTED_MAX_TOKENS = 512
 GEN_AI_TEMPERATURE = float(os.environ.get("GEN_AI_TEMPERATURE") or 0)
+# should be used if you are using a custom LLM inference provider that doesn't support
+# streaming format AND you are still using the langchain/litellm LLM class
+DISABLE_LITELLM_STREAMING = (
+    os.environ.get("DISABLE_LITELLM_STREAMING") or "false"
+).lower() == "true"

View File

@@ -7,6 +7,7 @@ from langchain.schema.language_model import LanguageModelInput
 from langchain_community.chat_models import ChatLiteLLM
 from danswer.configs.app_configs import LOG_ALL_MODEL_INTERACTIONS
+from danswer.configs.model_configs import DISABLE_LITELLM_STREAMING
 from danswer.configs.model_configs import GEN_AI_API_ENDPOINT
 from danswer.configs.model_configs import GEN_AI_API_VERSION
 from danswer.configs.model_configs import GEN_AI_LLM_PROVIDER_TYPE
@@ -70,6 +71,9 @@ class LangChainChatLLM(LLM, abc.ABC):
         if LOG_ALL_MODEL_INTERACTIONS:
             self._log_prompt(prompt)
+        if DISABLE_LITELLM_STREAMING:
+            return [self.invoke(prompt)]
         output_tokens = []
         for token in message_generator_to_string_generator(self.llm.stream(prompt)):
             output_tokens.append(token)
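
The change above short-circuits the streaming path when the flag is set. As a rough sketch of the intended behavior (assumed interface and names, not the repo's exact code): a generator-based stream() can keep the same calling convention by emitting the full non-streaming completion as a single chunk.

from collections.abc import Iterator

DISABLE_LITELLM_STREAMING = True  # stand-in for the imported config value

class SketchLLM:
    def invoke(self, prompt: str) -> str:
        # placeholder for one blocking, non-streaming completion call
        return "full response"

    def _stream_tokens(self, prompt: str) -> Iterator[str]:
        # placeholder for token-by-token output from a streaming provider
        yield from ["full", " ", "response"]

    def stream(self, prompt: str) -> Iterator[str]:
        if DISABLE_LITELLM_STREAMING:
            # provider can't stream: yield the whole completion at once
            yield self.invoke(prompt)
            return
        for token in self._stream_tokens(prompt):
            yield token

assert list(SketchLLM().stream("hi")) == ["full response"]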

View File

@@ -44,6 +44,7 @@ services:
       - DISABLE_LLM_CHOOSE_SEARCH=${DISABLE_LLM_CHOOSE_SEARCH:-}
       - DISABLE_LLM_QUERY_REPHRASE=${DISABLE_LLM_QUERY_REPHRASE:-}
       - DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-}
+      - DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-}
       # if set, allows for the use of the token budget system
       - TOKEN_BUDGET_GLOBALLY_ENABLED=${TOKEN_BUDGET_GLOBALLY_ENABLED:-}
       # Enables the use of bedrock models
@@ -117,6 +118,7 @@ services:
       - DISABLE_LLM_QUERY_REPHRASE=${DISABLE_LLM_QUERY_REPHRASE:-}
       - DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-}
       - GENERATIVE_MODEL_ACCESS_CHECK_FREQ=${GENERATIVE_MODEL_ACCESS_CHECK_FREQ:-}
+      - DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-}
       # Query Options
       - DOC_TIME_DECAY=${DOC_TIME_DECAY:-} # Recency Bias for search results, decay at 1 / (1 + DOC_TIME_DECAY * x years)
       - HYBRID_ALPHA=${HYBRID_ALPHA:-} # Hybrid Search Alpha (0 for entirely keyword, 1 for entirely vector)
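
Both services receive the variable through the ${DISABLE_LITELLM_STREAMING:-} passthrough above, so it only needs to be set in the host environment (or the .env file) used by docker compose. A hypothetical one-off check, not part of the commit, that can be run inside either container to confirm the value arrived and parses the way the config code above expects:

import os

raw = os.environ.get("DISABLE_LITELLM_STREAMING")
print(f"raw value: {raw!r}")
print("streaming disabled:", (raw or "false").lower() == "true")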