From 6012a7cbd9e703e4b4321035ae5d36b1e6f88bd5 Mon Sep 17 00:00:00 2001 From: Bart Schuller Date: Sun, 8 Sep 2024 19:25:07 +0200 Subject: [PATCH] Fix multilingual .env embedding dimension (#1976) --- .../docker_compose/env.multilingual.template | 37 +++---------------- 1 file changed, 6 insertions(+), 31 deletions(-) diff --git a/deployment/docker_compose/env.multilingual.template b/deployment/docker_compose/env.multilingual.template index 4225ac016eba..1a66dbfbbdee 100644 --- a/deployment/docker_compose/env.multilingual.template +++ b/deployment/docker_compose/env.multilingual.template @@ -1,33 +1,8 @@ -# This env template shows how to configure Danswer for multilingual use -# In this case, it is configured for French and English -# To use it, copy it to .env in the docker_compose directory. -# Feel free to combine it with the other templates to suit your needs +# This env template shows how to configure Danswer for custom multilingual use +# Note that for most use cases it will be enough to configure Danswer multilingual purely through the UI +# See "Search Settings" -> "Advanced" for UI options. +# To use it, copy it to .env in the docker_compose directory (or the equivalent environment settings file for your deployment) - -# Rephrase the user query in specified languages using LLM, use comma separated values -MULTILINGUAL_QUERY_EXPANSION="English, French" -# Change the below to suit your specific needs, can be more explicit about the language of the response -LANGUAGE_HINT="IMPORTANT: Respond in the same language as my query!" +# The following is included with the user prompt. Here's one example but feel free to customize it to your needs: +LANGUAGE_HINT="IMPORTANT: ALWAYS RESPOND IN FRENCH! Even if the documents and the user query are in English, your response must be in French." LANGUAGE_CHAT_NAMING_HINT="The name of the conversation must be in the same language as the user query." - -# A recent MIT license multilingual model: https://huggingface.co/intfloat/multilingual-e5-small -DOCUMENT_ENCODER_MODEL="intfloat/multilingual-e5-small" - -# The model above is trained with the following prefix for queries and passages to improve retrieval -# by letting the model know which of the two type is currently being embedded -ASYM_QUERY_PREFIX="query: " -ASYM_PASSAGE_PREFIX="passage: " - -# Depends model by model, the one shown above is tuned with this as True -NORMALIZE_EMBEDDINGS="True" - -# Use LLM to determine if chunks are relevant to the query -# May not work well for languages that do not have much training data in the LLM training set -# If using a common language like Spanish, French, Chinese, etc. this can be kept turned on -DISABLE_LLM_DOC_RELEVANCE="True" - -# Enables fine-grained embeddings for better retrieval -# At the cost of indexing speed (~5x slower), query time is same speed -# Since reranking is turned off and multilingual retrieval is generally harder -# it is advised to turn this one on -ENABLE_MULTIPASS_INDEXING="True"