Multilingual Query Expansion (#737)

Yuhong Sun
2023-11-19 10:55:55 -08:00
committed by GitHub
parent b258ec1bed
commit 6fb07d20cc
9 changed files with 196 additions and 11 deletions

@@ -47,6 +47,7 @@ services:
- SKIP_RERANKING=${SKIP_RERANKING:-}
- QA_PROMPT_OVERRIDE=${QA_PROMPT_OVERRIDE:-}
- EDIT_KEYWORD_QUERY=${EDIT_KEYWORD_QUERY:-}
- MULTILINGUAL_QUERY_EXPANSION=${MULTILINGUAL_QUERY_EXPANSION:-}
- MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-}
- MODEL_SERVER_PORT=${MODEL_SERVER_PORT:-}
# Leave this on pretty please? Nothing sensitive is collected!
@@ -54,6 +55,8 @@ services:
- DISABLE_TELEMETRY=${DISABLE_TELEMETRY:-}
# Set to debug to get more fine-grained logs
- LOG_LEVEL=${LOG_LEVEL:-info}
# Log all of the prompts to the LLM
- LOG_ALL_MODEL_INTERACTIONS=${LOG_ALL_MODEL_INTERACTIONS:-info}
volumes:
- local_dynamic_storage:/home/storage
- file_connector_tmp_storage:/home/file_connector_storage
@@ -106,11 +109,17 @@ services:
- SKIP_RERANKING=${SKIP_RERANKING:-}
- QA_PROMPT_OVERRIDE=${QA_PROMPT_OVERRIDE:-}
- EDIT_KEYWORD_QUERY=${EDIT_KEYWORD_QUERY:-}
- MULTILINGUAL_QUERY_EXPANSION=${MULTILINGUAL_QUERY_EXPANSION:-}
- MIN_THREADS_ML_MODELS=${MIN_THREADS_ML_MODELS:-}
- MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-}
- MODEL_SERVER_PORT=${MODEL_SERVER_PORT:-}
# Leave this on pretty please? Nothing sensitive is collected!
# https://docs.danswer.dev/more/telemetry
- DISABLE_TELEMETRY=${DISABLE_TELEMETRY:-}
# Set to debug to get more fine-grained logs
- LOG_LEVEL=${LOG_LEVEL:-info}
# Log all of the prompts to the LLM
- LOG_ALL_MODEL_INTERACTIONS=${LOG_ALL_MODEL_INTERACTIONS:-info}
volumes:
- local_dynamic_storage:/home/storage
- file_connector_tmp_storage:/home/file_connector_storage

@@ -0,0 +1,41 @@
# This env template shows how to configure Danswer for multilingual use
# In this case, it is configured for French and English
# To use it, copy it to .env in the docker_compose directory.
# Feel free to combine it with the other templates to suit your needs
# A recent MIT-licensed multilingual model: https://huggingface.co/intfloat/multilingual-e5-small
DOCUMENT_ENCODER_MODEL="intfloat/multilingual-e5-small"
# The model above is trained with the following prefixes for queries and passages to improve retrieval
# by letting the model know which of the two types is currently being embedded (see the embedding sketch after this template)
ASYM_QUERY_PREFIX="query: "
ASYM_PASSAGE_PREFIX="passage: "
# This varies model by model; this one is tuned to be used with normalization enabled
NORMALIZE_EMBEDDINGS="True"
# Due to the loss function used in training, this model outputs similarity scores in the range ~0.6 to 1 (see the rescaling sketch after this template)
SIM_SCORE_RANGE_LOW="0.6"
SIM_SCORE_RANGE_LOW="0.8"
# There are no recent multilingual reranking models small enough to run on CPU, so reranking is turned off
SKIP_RERANKING="True"
# The LLM chunk filter uses the LLM to determine if chunks are relevant to the query;
# this may not work well for languages with little coverage in the LLM's training data, so it is disabled here
DISABLE_LLM_CHUNK_FILTER="True"
# Rephrase the user query into the specified languages using the LLM; use comma-separated values (see the query expansion sketch after this template)
MULTILINGUAL_QUERY_EXPANSION="English, French"
# Enables fine-grained (mini-chunk) embeddings for better retrieval
# at the cost of indexing speed (~5x slower); query-time speed is unchanged (see the mini-chunk sketch after this template)
ENABLE_MINI_CHUNK="True"
# A stronger model will help with multilingual tasks
GEN_AI_MODEL_VERSION="gpt-4"
GEN_AI_API_KEY=<provide your api key>
# More verbose logging if desired
LOG_LEVEL="debug"
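
The following is a minimal sketch, assuming the sentence-transformers library, of how the ASYM_QUERY_PREFIX, ASYM_PASSAGE_PREFIX, and NORMALIZE_EMBEDDINGS settings above are typically applied with an e5-style model; it is not Danswer's actual embedding code.

# Sketch: applying the query/passage prefixes and normalization from the template above.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("intfloat/multilingual-e5-small")

# The prefix tells the model whether a query or a passage is being embedded.
query_vec = model.encode(
    "query: " + "Comment configurer Danswer ?",
    normalize_embeddings=True,  # NORMALIZE_EMBEDDINGS="True"
)
passage_vec = model.encode(
    "passage: " + "Danswer can be configured for multilingual retrieval.",
    normalize_embeddings=True,
)

# With normalized vectors, the dot product equals the cosine similarity.
similarity = float(query_vec @ passage_vec)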
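
A small sketch of how SIM_SCORE_RANGE_LOW and SIM_SCORE_RANGE_HIGH can be used to clamp and rescale raw similarity scores into a 0-to-1 relevance score; the exact use inside Danswer may differ.

# Sketch: rescale a raw cosine similarity using the configured bounds.
# Scores at or below SIM_SCORE_RANGE_LOW map to 0, scores at or above
# SIM_SCORE_RANGE_HIGH map to 1.
SIM_SCORE_RANGE_LOW = 0.6
SIM_SCORE_RANGE_HIGH = 0.8

def rescale_similarity(raw_score: float) -> float:
    span = SIM_SCORE_RANGE_HIGH - SIM_SCORE_RANGE_LOW
    scaled = (raw_score - SIM_SCORE_RANGE_LOW) / span
    return min(1.0, max(0.0, scaled))

print(rescale_similarity(0.7))  # 0.5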
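
A hedged sketch of the general idea behind MULTILINGUAL_QUERY_EXPANSION: the comma-separated value is split into languages, the LLM rephrases the query once per language, and retrieval runs over every variant. The rephrase_with_llm callable is hypothetical; Danswer's actual prompt and plumbing differ.

# Sketch: expand a query into one rephrasing per configured language.
import os

def expand_query(query: str, rephrase_with_llm) -> list[str]:
    # rephrase_with_llm(query, language) stands in for an LLM call.
    setting = os.environ.get("MULTILINGUAL_QUERY_EXPANSION", "")
    languages = [lang.strip() for lang in setting.split(",") if lang.strip()]
    if not languages:
        return [query]
    return [rephrase_with_llm(query, language) for language in languages]

# With MULTILINGUAL_QUERY_EXPANSION="English, French", a user query is searched
# as both an English and a French rephrasing, so documents in either language can match.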
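
A sketch of the mini-chunk idea behind ENABLE_MINI_CHUNK, with assumed sizes rather than Danswer's actual chunker: each indexed chunk is also split into smaller overlapping windows that get their own embeddings, which multiplies indexing work but leaves query-time cost unchanged.

# Sketch: split a chunk into smaller overlapping mini-chunks for fine-grained embeddings.
def make_mini_chunks(chunk_text: str, size: int = 150, overlap: int = 30) -> list[str]:
    words = chunk_text.split()
    step = size - overlap
    mini_chunks = [" ".join(words[i:i + size]) for i in range(0, len(words), step)]
    return mini_chunks or [chunk_text]

# At index time, both the full chunk and every mini-chunk are embedded (hence the
# ~5x slower indexing); at query time only one query embedding is needed.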