mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-10-03 18:08:58 +02:00
Multilingual Query Expansion (#737)
This commit is contained in:
@@ -47,6 +47,7 @@ services:
|
||||
- SKIP_RERANKING=${SKIP_RERANKING:-}
|
||||
- QA_PROMPT_OVERRIDE=${QA_PROMPT_OVERRIDE:-}
|
||||
- EDIT_KEYWORD_QUERY=${EDIT_KEYWORD_QUERY:-}
|
||||
- MULTILINGUAL_QUERY_EXPANSION=${MULTILINGUAL_QUERY_EXPANSION:-}
|
||||
- MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-}
|
||||
- MODEL_SERVER_PORT=${MODEL_SERVER_PORT:-}
|
||||
# Leave this on pretty please? Nothing sensitive is collected!
|
||||
@@ -54,6 +55,8 @@ services:
|
||||
- DISABLE_TELEMETRY=${DISABLE_TELEMETRY:-}
|
||||
# Set to debug to get more fine-grained logs
|
||||
- LOG_LEVEL=${LOG_LEVEL:-info}
|
||||
# Log all of the prompts to the LLM
|
||||
- LOG_ALL_MODEL_INTERACTIONS=${LOG_ALL_MODEL_INTERACTIONS:-}
|
||||
volumes:
|
||||
- local_dynamic_storage:/home/storage
|
||||
- file_connector_tmp_storage:/home/file_connector_storage
|
||||
@@ -106,11 +109,17 @@ services:
|
||||
- SKIP_RERANKING=${SKIP_RERANKING:-}
|
||||
- QA_PROMPT_OVERRIDE=${QA_PROMPT_OVERRIDE:-}
|
||||
- EDIT_KEYWORD_QUERY=${EDIT_KEYWORD_QUERY:-}
|
||||
- MULTILINGUAL_QUERY_EXPANSION=${MULTILINGUAL_QUERY_EXPANSION:-}
|
||||
- MIN_THREADS_ML_MODELS=${MIN_THREADS_ML_MODELS:-}
|
||||
- MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-}
|
||||
- MODEL_SERVER_PORT=${MODEL_SERVER_PORT:-}
|
||||
# Leave this on pretty please? Nothing sensitive is collected!
|
||||
# https://docs.danswer.dev/more/telemetry
|
||||
- DISABLE_TELEMETRY=${DISABLE_TELEMETRY:-}
|
||||
# Set to debug to get more fine-grained logs
|
||||
- LOG_LEVEL=${LOG_LEVEL:-info}
|
||||
# Log all of the prompts to the LLM
|
||||
- LOG_ALL_MODEL_INTERACTIONS=${LOG_ALL_MODEL_INTERACTIONS:-}
|
||||
volumes:
|
||||
- local_dynamic_storage:/home/storage
|
||||
- file_connector_tmp_storage:/home/file_connector_storage
|
||||
|
41
deployment/docker_compose/env.multilingual.template
Normal file
41
deployment/docker_compose/env.multilingual.template
Normal file
@@ -0,0 +1,41 @@
|
||||
# This env template shows how to configure Danswer for multilingual use
|
||||
# In this case, it is configured for French and English
|
||||
# To use it, copy it to .env in the docker_compose directory.
|
||||
# Feel free to combine it with the other templates to suit your needs
|
||||
|
||||
|
||||
# A recent MIT license multilingual model: https://huggingface.co/intfloat/multilingual-e5-small
|
||||
DOCUMENT_ENCODER_MODEL="intfloat/multilingual-e5-small"
|
||||
|
||||
# The model above is trained with the following prefix for queries and passages to improve retrieval
|
||||
# by letting the model know which of the two types is currently being embedded
|
||||
ASYM_QUERY_PREFIX="query: "
|
||||
ASYM_PASSAGE_PREFIX="passage: "
|
||||
|
||||
# Whether to normalize embeddings depends on the model; this one is tuned with it set to True
|
||||
NORMALIZE_EMBEDDINGS="True"
|
||||
|
||||
# Due to the loss function used in training, this model outputs similarity scores from range ~0.6 to 1
|
||||
SIM_SCORE_RANGE_LOW="0.6"
|
||||
SIM_SCORE_RANGE_HIGH="0.8"
|
||||
|
||||
# No recent multilingual reranking models are small enough to run on CPU, so reranking is turned off
|
||||
SKIP_RERANKING="True"
|
||||
|
||||
# Use LLM to determine if chunks are relevant to the query
|
||||
# This may not work well for languages that do not have much training data in the LLM training set, so it is disabled here
|
||||
DISABLE_LLM_CHUNK_FILTER="True"
|
||||
|
||||
# Rephrase the user query in specified languages using LLM, use comma separated values
|
||||
MULTILINGUAL_QUERY_EXPANSION="English, French"
|
||||
|
||||
# Enables fine-grained embeddings for better retrieval
|
||||
# at the cost of indexing speed (~5x slower); query-time speed is unchanged
|
||||
ENABLE_MINI_CHUNK="True"
|
||||
|
||||
# Stronger model will help with multilingual tasks
|
||||
GEN_AI_MODEL_VERSION="gpt-4"
|
||||
GEN_AI_API_KEY=<provide your api key>
|
||||
|
||||
# More verbose logging if desired
|
||||
LOG_LEVEL="debug"
|
Reference in New Issue
Block a user