Mirror of https://github.com/danswer-ai/danswer.git, synced 2025-07-28 13:53:28 +02:00
Multilingual Docs Updates (#856)
@@ -21,6 +21,7 @@ NORMALIZE_EMBEDDINGS = (
     os.environ.get("NORMALIZE_EMBEDDINGS") or "False"
 ).lower() == "true"
 # These are only used if reranking is turned off, to normalize the direct retrieval scores for display
+# Currently unused
 SIM_SCORE_RANGE_LOW = float(os.environ.get("SIM_SCORE_RANGE_LOW") or 0.0)
 SIM_SCORE_RANGE_HIGH = float(os.environ.get("SIM_SCORE_RANGE_HIGH") or 1.0)
 # Certain models like e5, BGE, etc use a prefix for asymmetric retrievals (query generally shorter than docs)
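
The two SIM_SCORE_RANGE values above exist only to rescale raw retrieval scores into a 0 to 1 range for display when reranking is off (and are marked unused by this commit). As a minimal sketch of that kind of rescaling, with a hypothetical helper name and clamping behavior that are assumptions rather than the repository's actual code:

import os

SIM_SCORE_RANGE_LOW = float(os.environ.get("SIM_SCORE_RANGE_LOW") or 0.0)
SIM_SCORE_RANGE_HIGH = float(os.environ.get("SIM_SCORE_RANGE_HIGH") or 1.0)


def rescale_for_display(raw_score: float) -> float:
    # Map a raw similarity score into [0, 1] using the configured range.
    # Illustrative only; the real project may clamp or scale differently.
    span = SIM_SCORE_RANGE_HIGH - SIM_SCORE_RANGE_LOW
    if span <= 0:
        return raw_score
    return max(0.0, min(1.0, (raw_score - SIM_SCORE_RANGE_LOW) / span))


print(rescale_for_display(0.85))  # 0.85 with the defaults; 0.625 if LOW=0.6, HIGH=1.0
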
@@ -42,8 +42,6 @@ services:
       # Don't change the NLP model configs unless you know what you're doing
       - DOCUMENT_ENCODER_MODEL=${DOCUMENT_ENCODER_MODEL:-}
       - NORMALIZE_EMBEDDINGS=${NORMALIZE_EMBEDDINGS:-}
-      - SIM_SCORE_RANGE_LOW=${SIM_SCORE_RANGE_LOW:-}
-      - SIM_SCORE_RANGE_HIGH=${SIM_SCORE_RANGE_HIGH:-}
       - ASYM_QUERY_PREFIX=${ASYM_QUERY_PREFIX:-}
       - ASYM_PASSAGE_PREFIX=${ASYM_PASSAGE_PREFIX:-}
       - ENABLE_RERANKING_REAL_TIME_FLOW=${ENABLE_RERANKING_REAL_TIME_FLOW:-}
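
One detail behind the ${VAR:-} pass-through style in this compose file: when a variable is unset on the host, Compose substitutes an empty string rather than omitting the variable, and the Python config in the first hunk treats an empty string as "not set" via its `or` fallback. A small standalone sketch of that interaction, not the project's code:

import os

# Simulate what NORMALIZE_EMBEDDINGS=${NORMALIZE_EMBEDDINGS:-} injects when the
# host variable is not set: an empty string, not an absent variable.
os.environ["NORMALIZE_EMBEDDINGS"] = ""

# The `or` fallback treats the empty string as unset, so the default still applies.
NORMALIZE_EMBEDDINGS = (
    os.environ.get("NORMALIZE_EMBEDDINGS") or "False"
).lower() == "true"

print(NORMALIZE_EMBEDDINGS)  # False
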
@@ -4,6 +4,9 @@
 # Feel free to combine it with the other templates to suit your needs
 
 
+# Rephrase the user query in specified languages using LLM, use comma separated values
+MULTILINGUAL_QUERY_EXPANSION="English, French"
+
 # A recent MIT license multilingual model: https://huggingface.co/intfloat/multilingual-e5-small
 DOCUMENT_ENCODER_MODEL="intfloat/multilingual-e5-small"
 
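
To make MULTILINGUAL_QUERY_EXPANSION concrete: the comma-separated value lists the languages the user query is rephrased into before retrieval. A rough sketch of how such a setting could be consumed; the expand_query helper, the prompt wording, and the rephrase_with_llm callable are assumptions for illustration, not Danswer's actual implementation:

import os
from typing import Callable

MULTILINGUAL_QUERY_EXPANSION = os.environ.get("MULTILINGUAL_QUERY_EXPANSION") or ""


def expand_query(query: str, rephrase_with_llm: Callable[[str], str]) -> list[str]:
    # Return the original query plus one LLM rephrasing per configured language.
    languages = [lang.strip() for lang in MULTILINGUAL_QUERY_EXPANSION.split(",") if lang.strip()]
    expansions = [query]
    for language in languages:
        prompt = f"Rephrase the following search query in {language}, keeping its meaning:\n{query}"
        expansions.append(rephrase_with_llm(prompt))
    return expansions


# Usage with a stand-in for the LLM call:
print(expand_query("horaires d'ouverture", lambda prompt: f"<LLM output for: {prompt}>"))
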
@@ -12,27 +15,26 @@ DOCUMENT_ENCODER_MODEL="intfloat/multilingual-e5-small"
 ASYM_QUERY_PREFIX="query: "
 ASYM_PASSAGE_PREFIX="passage: "
 
-# Depends model by model, this one is tuned with this as True
+# Depends model by model, the one shown above is tuned with this as True
 NORMALIZE_EMBEDDINGS="True"
 
-# Due to the loss function used in training, this model outputs similarity scores from range ~0.6 to 1
-SIM_SCORE_RANGE_LOW="0.6"
-SIM_SCORE_RANGE_LOW="0.8"
-
 # Use LLM to determine if chunks are relevant to the query
-# may not work well for languages that do not have much training data in the LLM training set
+# May not work well for languages that do not have much training data in the LLM training set
+# If using a common language like Spanish, French, Chinese, etc. this can be kept turned on
 DISABLE_LLM_CHUNK_FILTER="True"
 
-# Rephrase the user query in specified languages using LLM, use comma separated values
-MULTILINGUAL_QUERY_EXPANSION="English, French"
+# The default reranking models are English first
+# There are no great quality French/English reranking models currently so turning this off
+ENABLE_RERANKING_ASYNC_FLOW="False"
+ENABLE_RERANKING_REAL_TIME_FLOW="False"
 
 # Enables fine-grained embeddings for better retrieval
 # At the cost of indexing speed (~5x slower), query time is same speed
+# Since reranking is turned off and multilingual retrieval is generally harder
+# it is advised to turn this one on
 ENABLE_MINI_CHUNK="True"
 
-# Stronger model will help with multilingual tasks
+# Using a stronger LLM will help with multilingual tasks
+# Since documents may be in multiple languages, and there are additional instructions to respond
+# in the user query's language, it is advised to use the best model possible
 GEN_AI_MODEL_VERSION="gpt-4"
-GEN_AI_API_KEY=<provide your api key>
-
-# More verbose logging if desired
-LOG_LEVEL="debug"
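
As a rough illustration of why the template pairs ASYM_QUERY_PREFIX, ASYM_PASSAGE_PREFIX, and NORMALIZE_EMBEDDINGS for the e5 family: queries and passages get different prefixes, and with unit-normalized embeddings the dot product is the cosine similarity used as the retrieval score. A minimal standalone sketch with sentence-transformers, not the project's indexing code:

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("intfloat/multilingual-e5-small")

# Asymmetric retrieval: a short query vs. longer passages, each with its e5 prefix.
query = "query: store opening hours"
passages = [
    "passage: Our store is open from 9am to 6pm on weekdays.",
    "passage: La boutique est ouverte de 9h à 18h en semaine.",
]

# normalize_embeddings=True mirrors NORMALIZE_EMBEDDINGS="True" in the template;
# with unit-length vectors, the dot product equals cosine similarity.
query_emb = model.encode(query, normalize_embeddings=True)
passage_embs = model.encode(passages, normalize_embeddings=True)

for passage, score in zip(passages, passage_embs @ query_emb):
    print(round(float(score), 3), passage)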