Multilingual Query Expansion (#737)

This commit is contained in:
Yuhong Sun
2023-11-19 10:55:55 -08:00
committed by GitHub
parent b258ec1bed
commit 6fb07d20cc
9 changed files with 196 additions and 11 deletions

View File

@@ -0,0 +1,41 @@
# This env template shows how to configure Danswer for multilingual use
# In this case, it is configured for French and English
# To use it, copy it to .env in the docker_compose directory.
# Feel free to combine it with the other templates to suit your needs
# A recent MIT license multilingual model: https://huggingface.co/intfloat/multilingual-e5-small
DOCUMENT_ENCODER_MODEL="intfloat/multilingual-e5-small"
# The model above is trained with the following prefix for queries and passages to improve retrieval
# by letting the model know which of the two type is currently being embedded
ASYM_QUERY_PREFIX="query: "
ASYM_PASSAGE_PREFIX="passage: "
# Depends model by model, this one is tuned with this as True
NORMALIZE_EMBEDDINGS="True"
# Due to the loss function used in training, this model outputs similarity scores from range ~0.6 to 1
SIM_SCORE_RANGE_LOW="0.6"
SIM_SCORE_RANGE_LOW="0.8"
# No recent multilingual reranking models small enough to run on CPU, so turning it off
SKIP_RERANKING="True"
# Use LLM to determine if chunks are relevant to the query
# may not work well for languages that do not have much training data in the LLM training set
DISABLE_LLM_CHUNK_FILTER="True"
# Rephrase the user query in specified languages using LLM, use comma separated values
MULTILINGUAL_QUERY_EXPANSION="English, French"
# Enables fine-grained embeddings for better retrieval
# At the cost of indexing speed (~5x slower), query time is same speed
ENABLE_MINI_CHUNK="True"
# Stronger model will help with multilingual tasks
GEN_AI_MODEL_VERSION="gpt-4"
GEN_AI_API_KEY=<provide your api key>
# More verbose logging if desired
LOG_LEVEL="debug"