Multilingual Query Expansion (#737)

2025-09-26 20:08:38 +02:00 · 2023-11-19 10:55:55 -08:00
parent b258ec1bed
commit 6fb07d20cc
9 changed files with 196 additions and 11 deletions
--- a/deployment/docker_compose/env.multilingual.template
+++ b/deployment/docker_compose/env.multilingual.template
@@ -0,0 +1,41 @@
+# This env template shows how to configure Danswer for multilingual use
+# In this case, it is configured for French and English
+# To use it, copy it to .env in the docker_compose directory.
+# Feel free to combine it with the other templates to suit your needs
+
+
+# A recent MIT license multilingual model: https://huggingface.co/intfloat/multilingual-e5-small
+DOCUMENT_ENCODER_MODEL="intfloat/multilingual-e5-small"
+
+# The model above is trained with the following prefix for queries and passages to improve retrieval
+# by letting the model know which of the two type is currently being embedded
+ASYM_QUERY_PREFIX="query: "
+ASYM_PASSAGE_PREFIX="passage: "
+
+# Depends model by model, this one is tuned with this as True
+NORMALIZE_EMBEDDINGS="True"
+
+# Due to the loss function used in training, this model outputs similarity scores from range ~0.6 to 1
+SIM_SCORE_RANGE_LOW="0.6"
+SIM_SCORE_RANGE_LOW="0.8"
+
+# No recent multilingual reranking models small enough to run on CPU, so turning it off
+SKIP_RERANKING="True"
+
+# Use LLM to determine if chunks are relevant to the query
+# may not work well for languages that do not have much training data in the LLM training set
+DISABLE_LLM_CHUNK_FILTER="True"
+
+# Rephrase the user query in specified languages using LLM, use comma separated values
+MULTILINGUAL_QUERY_EXPANSION="English, French"
+
+# Enables fine-grained embeddings for better retrieval
+# At the cost of indexing speed (~5x slower), query time is same speed
+ENABLE_MINI_CHUNK="True"
+
+# Stronger model will help with multilingual tasks
+GEN_AI_MODEL_VERSION="gpt-4"
+GEN_AI_API_KEY=<provide your api key>
+
+# More verbose logging if desired
+LOG_LEVEL="debug"