Add env variable to disable streaming for the DefaultMultiLLM class

Weves 2024-04-25 09:20:20 -07:00 committed by Chris Weaver
parent 66d95690cb
commit 648f2d06bf
3 changed files with 12 additions and 0 deletions


@@ -91,3 +91,9 @@ GEN_AI_HISTORY_CUTOFF = 3000
# error if the total # of tokens exceeds the max input tokens.
GEN_AI_SINGLE_USER_MESSAGE_EXPECTED_MAX_TOKENS = 512
GEN_AI_TEMPERATURE = float(os.environ.get("GEN_AI_TEMPERATURE") or 0)

# should be used if you are using a custom LLM inference provider that doesn't support
# streaming format AND you are still using the langchain/litellm LLM class
DISABLE_LITELLM_STREAMING = (
    os.environ.get("DISABLE_LITELLM_STREAMING") or "false"
).lower() == "true"
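Not part of the commit, but for context: the parsing idiom above treats only a case-insensitive "true" as enabled; any other value, including an unset variable (where os.environ.get returns None) or an empty string, falls through to "false". A minimal sketch of the same expression against a few values:

import os

# Illustration only: exercises the exact parsing expression from the hunk above.
for value in ("true", "TRUE", "1", "", None):
    if value is None:
        os.environ.pop("DISABLE_LITELLM_STREAMING", None)
    else:
        os.environ["DISABLE_LITELLM_STREAMING"] = value
    flag = (os.environ.get("DISABLE_LITELLM_STREAMING") or "false").lower() == "true"
    print(f"{value!r} -> {flag}")  # only "true" and "TRUE" print True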


@@ -7,6 +7,7 @@ from langchain.schema.language_model import LanguageModelInput
from langchain_community.chat_models import ChatLiteLLM
from danswer.configs.app_configs import LOG_ALL_MODEL_INTERACTIONS
from danswer.configs.model_configs import DISABLE_LITELLM_STREAMING
from danswer.configs.model_configs import GEN_AI_API_ENDPOINT
from danswer.configs.model_configs import GEN_AI_API_VERSION
from danswer.configs.model_configs import GEN_AI_LLM_PROVIDER_TYPE
@@ -70,6 +71,9 @@ class LangChainChatLLM(LLM, abc.ABC):
        if LOG_ALL_MODEL_INTERACTIONS:
            self._log_prompt(prompt)

        if DISABLE_LITELLM_STREAMING:
            return [self.invoke(prompt)]

        output_tokens = []
        for token in message_generator_to_string_generator(self.llm.stream(prompt)):
            output_tokens.append(token)
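From a caller's point of view, the intent of the fallback is that stream() still hands back an iterable: a one-element list holding the full response from invoke() instead of a token-by-token stream. A hedged usage sketch; the hunk is truncated, so the assumption that the list is returned directly to the caller, the DefaultMultiLLM class name (taken from the commit title), and its constructor arguments are all unverified here:

# Hypothetical setup; real constructor arguments may differ.
llm = DefaultMultiLLM(model_provider="custom", model_name="my-model")

# With DISABLE_LITELLM_STREAMING=true this loop should run exactly once,
# receiving the full response from self.invoke(prompt); otherwise it
# consumes tokens produced by self.llm.stream(prompt).
for chunk in llm.stream("Why is the sky blue?"):
    print(chunk, end="", flush=True)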


@@ -44,6 +44,7 @@ services:
      - DISABLE_LLM_CHOOSE_SEARCH=${DISABLE_LLM_CHOOSE_SEARCH:-}
      - DISABLE_LLM_QUERY_REPHRASE=${DISABLE_LLM_QUERY_REPHRASE:-}
      - DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-}
      - DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-}
      # if set, allows for the use of the token budget system
      - TOKEN_BUDGET_GLOBALLY_ENABLED=${TOKEN_BUDGET_GLOBALLY_ENABLED:-}
      # Enables the use of bedrock models
@@ -117,6 +118,7 @@ services:
      - DISABLE_LLM_QUERY_REPHRASE=${DISABLE_LLM_QUERY_REPHRASE:-}
      - DISABLE_GENERATIVE_AI=${DISABLE_GENERATIVE_AI:-}
      - GENERATIVE_MODEL_ACCESS_CHECK_FREQ=${GENERATIVE_MODEL_ACCESS_CHECK_FREQ:-}
      - DISABLE_LITELLM_STREAMING=${DISABLE_LITELLM_STREAMING:-}
      # Query Options
      - DOC_TIME_DECAY=${DOC_TIME_DECAY:-} # Recency Bias for search results, decay at 1 / (1 + DOC_TIME_DECAY * x years)
      - HYBRID_ALPHA=${HYBRID_ALPHA:-} # Hybrid Search Alpha (0 for entirely keyword, 1 for entirely vector)
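Both compose hunks forward the same variable into their respective containers. The ${DISABLE_LITELLM_STREAMING:-} substitution resolves to the host's value, or to an empty string when the variable is unset; the empty string then falls through to "false" in the Python parsing above, so existing deployments keep streaming enabled by default. Setting DISABLE_LITELLM_STREAMING=true in the host environment (or in a compose .env file) before starting the stack switches both services to the non-streaming path.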