Multilingual Docs Updates (#856)

2025-07-12 14:12:53 +02:00 · 2023-12-22 00:26:00 -08:00
parent 962e3f726a
commit 6650f01dc6
3 changed files with 16 additions and 15 deletions
--- a/backend/danswer/configs/model_configs.py
+++ b/backend/danswer/configs/model_configs.py
@@ -21,6 +21,7 @@ NORMALIZE_EMBEDDINGS = (
    os.environ.get("NORMALIZE_EMBEDDINGS") or "False"
 ).lower() == "true"
 # These are only used if reranking is turned off, to normalize the direct retrieval scores for display
+# Currently unused
 SIM_SCORE_RANGE_LOW = float(os.environ.get("SIM_SCORE_RANGE_LOW") or 0.0)
 SIM_SCORE_RANGE_HIGH = float(os.environ.get("SIM_SCORE_RANGE_HIGH") or 1.0)
 # Certain models like e5, BGE, etc use a prefix for asymmetric retrievals (query generally shorter than docs)
--- a/deployment/docker_compose/docker-compose.dev.yml
+++ b/deployment/docker_compose/docker-compose.dev.yml
@@ -42,8 +42,6 @@ services:
      # Don't change the NLP model configs unless you know what you're doing
      - DOCUMENT_ENCODER_MODEL=${DOCUMENT_ENCODER_MODEL:-}
      - NORMALIZE_EMBEDDINGS=${NORMALIZE_EMBEDDINGS:-}
-      - SIM_SCORE_RANGE_LOW=${SIM_SCORE_RANGE_LOW:-}
-      - SIM_SCORE_RANGE_HIGH=${SIM_SCORE_RANGE_HIGH:-}
      - ASYM_QUERY_PREFIX=${ASYM_QUERY_PREFIX:-}
      - ASYM_PASSAGE_PREFIX=${ASYM_PASSAGE_PREFIX:-}
      - ENABLE_RERANKING_REAL_TIME_FLOW=${ENABLE_RERANKING_REAL_TIME_FLOW:-}
--- a/deployment/docker_compose/env.multilingual.template
+++ b/deployment/docker_compose/env.multilingual.template
@@ -4,6 +4,9 @@
 # Feel free to combine it with the other templates to suit your needs


+# Rephrase the user query in the specified languages using the LLM; use comma-separated values
+MULTILINGUAL_QUERY_EXPANSION="English, French"
+
 # A recent MIT license multilingual model: https://huggingface.co/intfloat/multilingual-e5-small
 DOCUMENT_ENCODER_MODEL="intfloat/multilingual-e5-small"

@@ -12,27 +15,26 @@ DOCUMENT_ENCODER_MODEL="intfloat/multilingual-e5-small"
 ASYM_QUERY_PREFIX="query: "
 ASYM_PASSAGE_PREFIX="passage: "

-# Depends model by model, this one is tuned with this as True
+# This varies from model to model; the one shown above is tuned with this set to True
 NORMALIZE_EMBEDDINGS="True"

-# Due to the loss function used in training, this model outputs similarity scores from range ~0.6 to 1
-SIM_SCORE_RANGE_LOW="0.6"
-SIM_SCORE_RANGE_LOW="0.8"
-
 # Use LLM to determine if chunks are relevant to the query
-# may not work well for languages that do not have much training data in the LLM training set
+# May not work well for languages that do not have much training data in the LLM training set
+# If using a common language like Spanish, French, Chinese, etc., the LLM chunk filter can be left enabled (set this to "False")
 DISABLE_LLM_CHUNK_FILTER="True"

-# Rephrase the user query in specified languages using LLM, use comma separated values
-MULTILINGUAL_QUERY_EXPANSION="English, French"
+# The default reranking models are English first
+# There are currently no high-quality French/English reranking models, so reranking is turned off
+ENABLE_RERANKING_ASYNC_FLOW="False"
+ENABLE_RERANKING_REAL_TIME_FLOW="False"

 # Enables fine-grained embeddings for better retrieval
 # At the cost of indexing speed (~5x slower), query time is same speed
+# Since reranking is turned off and multilingual retrieval is generally harder
+# it is advised to turn this one on
 ENABLE_MINI_CHUNK="True"

-# Stronger model will help with multilingual tasks
+# Using a stronger LLM will help with multilingual tasks
+# Since documents may be in multiple languages, and there are additional instructions to respond
+# in the user query's language, it is advised to use the best model possible
 GEN_AI_MODEL_VERSION="gpt-4"
-GEN_AI_API_KEY=<provide your api key>
-
-# More verbose logging if desired
-LOG_LEVEL="debug"