Multilingual Query Expansion (#737)

Yuhong Sun
2023-11-19 10:55:55 -08:00
committed by GitHub
parent b258ec1bed
commit 6fb07d20cc
9 changed files with 196 additions and 11 deletions

@@ -47,6 +47,7 @@ services:
- SKIP_RERANKING=${SKIP_RERANKING:-}
- QA_PROMPT_OVERRIDE=${QA_PROMPT_OVERRIDE:-}
- EDIT_KEYWORD_QUERY=${EDIT_KEYWORD_QUERY:-}
- MULTILINGUAL_QUERY_EXPANSION=${MULTILINGUAL_QUERY_EXPANSION:-}
- MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-}
- MODEL_SERVER_PORT=${MODEL_SERVER_PORT:-}
# Leave this on pretty please? Nothing sensitive is collected!
@@ -54,6 +55,8 @@ services:
- DISABLE_TELEMETRY=${DISABLE_TELEMETRY:-}
# Set to debug to get more fine-grained logs
- LOG_LEVEL=${LOG_LEVEL:-info}
# Log all of the prompts to the LLM
- LOG_ALL_MODEL_INTERACTIONS=${LOG_ALL_MODEL_INTERACTIONS:-info}
volumes:
- local_dynamic_storage:/home/storage
- file_connector_tmp_storage:/home/file_connector_storage
@@ -106,11 +109,17 @@ services:
- SKIP_RERANKING=${SKIP_RERANKING:-}
- QA_PROMPT_OVERRIDE=${QA_PROMPT_OVERRIDE:-}
- EDIT_KEYWORD_QUERY=${EDIT_KEYWORD_QUERY:-}
- MULTILINGUAL_QUERY_EXPANSION=${MULTILINGUAL_QUERY_EXPANSION:-}
- MIN_THREADS_ML_MODELS=${MIN_THREADS_ML_MODELS:-}
- MODEL_SERVER_HOST=${MODEL_SERVER_HOST:-}
- MODEL_SERVER_PORT=${MODEL_SERVER_PORT:-}
# Leave this on pretty please? Nothing sensitive is collected!
# https://docs.danswer.dev/more/telemetry
- DISABLE_TELEMETRY=${DISABLE_TELEMETRY:-}
# Set to debug to get more fine-grained logs
- LOG_LEVEL=${LOG_LEVEL:-info}
# Log all of the prompts to the LLM
- LOG_ALL_MODEL_INTERACTIONS=${LOG_ALL_MODEL_INTERACTIONS:-info}
volumes:
- local_dynamic_storage:/home/storage
- file_connector_tmp_storage:/home/file_connector_storage

@@ -0,0 +1,41 @@
# This env template shows how to configure Danswer for multilingual use
# In this case, it is configured for French and English
# To use it, copy it to .env in the docker_compose directory.
# Feel free to combine it with the other templates to suit your needs
# A recent MIT-licensed multilingual model: https://huggingface.co/intfloat/multilingual-e5-small
DOCUMENT_ENCODER_MODEL="intfloat/multilingual-e5-small"
# The model above is trained with the following prefixes for queries and passages to improve retrieval
# by letting the model know which of the two types is currently being embedded (see the embedding sketch after this template)
ASYM_QUERY_PREFIX="query: "
ASYM_PASSAGE_PREFIX="passage: "
# This varies model by model; this one is tuned to be used with normalization enabled
NORMALIZE_EMBEDDINGS="True"
# Due to the loss function used in training, this model outputs similarity scores in the range ~0.6 to 1 (see the rescaling sketch after this template)
SIM_SCORE_RANGE_LOW="0.6"
SIM_SCORE_RANGE_LOW="0.8"
# There are no recent multilingual reranking models small enough to run on CPU, so reranking is turned off
SKIP_RERANKING="True"
# The LLM chunk filter uses the LLM to determine if chunks are relevant to the query;
# this may not work well for languages with little coverage in the LLM's training data, so it is disabled here
DISABLE_LLM_CHUNK_FILTER="True"
# Rephrase the user query into the specified languages using the LLM; use comma-separated values (see the query expansion sketch after this template)
MULTILINGUAL_QUERY_EXPANSION="English, French"
# Enables fine-grained (mini-chunk) embeddings for better retrieval
# at the cost of indexing speed (~5x slower); query-time speed is unchanged (see the mini-chunk sketch after this template)
ENABLE_MINI_CHUNK="True"
# A stronger model will help with multilingual tasks
GEN_AI_MODEL_VERSION="gpt-4"
GEN_AI_API_KEY=<provide your api key>
# More verbose logging if desired
LOG_LEVEL="debug"
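
The following is a minimal sketch, assuming the sentence-transformers library, of how the ASYM_QUERY_PREFIX, ASYM_PASSAGE_PREFIX, and NORMALIZE_EMBEDDINGS settings above are typically applied with an e5-style model; it is not Danswer's actual embedding code.

# Sketch: applying the query/passage prefixes and normalization from the template above.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("intfloat/multilingual-e5-small")

# The prefix tells the model whether a query or a passage is being embedded.
query_vec = model.encode(
    "query: " + "Comment configurer Danswer ?",
    normalize_embeddings=True,  # NORMALIZE_EMBEDDINGS="True"
)
passage_vec = model.encode(
    "passage: " + "Danswer can be configured for multilingual retrieval.",
    normalize_embeddings=True,
)

# With normalized vectors, the dot product equals the cosine similarity.
similarity = float(query_vec @ passage_vec)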
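
A small sketch of how SIM_SCORE_RANGE_LOW and SIM_SCORE_RANGE_HIGH can be used to clamp and rescale raw similarity scores into a 0-to-1 relevance score; the exact use inside Danswer may differ.

# Sketch: rescale a raw cosine similarity using the configured bounds.
# Scores at or below SIM_SCORE_RANGE_LOW map to 0, scores at or above
# SIM_SCORE_RANGE_HIGH map to 1.
SIM_SCORE_RANGE_LOW = 0.6
SIM_SCORE_RANGE_HIGH = 0.8

def rescale_similarity(raw_score: float) -> float:
    span = SIM_SCORE_RANGE_HIGH - SIM_SCORE_RANGE_LOW
    scaled = (raw_score - SIM_SCORE_RANGE_LOW) / span
    return min(1.0, max(0.0, scaled))

print(rescale_similarity(0.7))  # 0.5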
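
A hedged sketch of the general idea behind MULTILINGUAL_QUERY_EXPANSION: the comma-separated value is split into languages, the LLM rephrases the query once per language, and retrieval runs over every variant. The rephrase_with_llm callable is hypothetical; Danswer's actual prompt and plumbing differ.

# Sketch: expand a query into one rephrasing per configured language.
import os

def expand_query(query: str, rephrase_with_llm) -> list[str]:
    # rephrase_with_llm(query, language) stands in for an LLM call.
    setting = os.environ.get("MULTILINGUAL_QUERY_EXPANSION", "")
    languages = [lang.strip() for lang in setting.split(",") if lang.strip()]
    if not languages:
        return [query]
    return [rephrase_with_llm(query, language) for language in languages]

# With MULTILINGUAL_QUERY_EXPANSION="English, French", a user query is searched
# as both an English and a French rephrasing, so documents in either language can match.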
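
A sketch of the mini-chunk idea behind ENABLE_MINI_CHUNK, with assumed sizes rather than Danswer's actual chunker: each indexed chunk is also split into smaller overlapping windows that get their own embeddings, which multiplies indexing work but leaves query-time cost unchanged.

# Sketch: split a chunk into smaller overlapping mini-chunks for fine-grained embeddings.
def make_mini_chunks(chunk_text: str, size: int = 150, overlap: int = 30) -> list[str]:
    words = chunk_text.split()
    step = size - overlap
    mini_chunks = [" ".join(words[i:i + size]) for i in range(0, len(words), step)]
    return mini_chunks or [chunk_text]

# At index time, both the full chunk and every mini-chunk are embedded (hence the
# ~5x slower indexing); at query time only one query embedding is needed.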