mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-05-30 17:50:27 +02:00
Improve English rephrasing for multilingual use case (#808)
This commit is contained in:
parent
5629ca7d96
commit
651de071f7
@ -1,11 +1,11 @@
|
||||
# Prompts that aren't part of a particular configurable feature
|
||||
|
||||
LANGUAGE_REPHRASE_PROMPT = """
|
||||
Rephrase the query in {target_language}.
|
||||
If the query is already in the correct language, \
|
||||
simply repeat the ORIGINAL query back to me, EXACTLY as is with no rephrasing.
|
||||
NEVER change proper nouns, technical terms, acronyms, or terms you are not familiar with.
|
||||
IMPORTANT, if the query is already in the target language, DO NOT REPHRASE OR EDIT the query!
|
||||
Translate query to {target_language}.
|
||||
If the query at the end is already in {target_language}, \
|
||||
simply repeat the ORIGINAL query back to me, EXACTLY as is with no edits.
|
||||
|
||||
If the query below is not in {target_language}, translate it into {target_language}.
|
||||
|
||||
Query:
|
||||
{query}
|
||||
|
@ -332,6 +332,12 @@ def apply_boost(
|
||||
return final_chunks
|
||||
|
||||
|
||||
def _simplify_text(text: str) -> str:
|
||||
return "".join(
|
||||
char for char in text if char not in string.punctuation and not char.isspace()
|
||||
).lower()
|
||||
|
||||
|
||||
def retrieve_chunks(
|
||||
query: SearchQuery,
|
||||
document_index: DocumentIndex,
|
||||
@ -347,13 +353,22 @@ def retrieve_chunks(
|
||||
query=query, document_index=document_index, hybrid_alpha=hybrid_alpha
|
||||
)
|
||||
else:
|
||||
simplified_queries = set()
|
||||
run_queries: list[tuple[Callable, tuple]] = []
|
||||
|
||||
# Currently only uses query expansion on multilingual use cases
|
||||
query_rephrases = rephrase_query(query.query, multilingual_query_expansion)
|
||||
# Just to be extra sure, add the original query.
|
||||
query_rephrases.append(query.query)
|
||||
for rephrase in set(query_rephrases):
|
||||
q_copy = query.copy(update={'query': rephrase}, deep=True)
|
||||
# Sometimes the model rephrases the query in the same language with minor changes
|
||||
# Avoid doing an extra search with the minor changes as this biases the results
|
||||
simplified_rephrase = _simplify_text(rephrase)
|
||||
if simplified_rephrase in simplified_queries:
|
||||
continue
|
||||
simplified_queries.add(simplified_rephrase)
|
||||
|
||||
q_copy = query.copy(update={"query": rephrase}, deep=True)
|
||||
run_queries.append(
|
||||
(doc_index_retrieval, (q_copy, document_index, hybrid_alpha))
|
||||
)
|
||||
|
@ -42,7 +42,11 @@ def rephrase_query(
|
||||
(llm_rephrase_query, (query, language)) for language in languages
|
||||
]
|
||||
|
||||
return run_functions_tuples_in_parallel(functions_with_args)
|
||||
query_rephrases = run_functions_tuples_in_parallel(functions_with_args)
|
||||
return query_rephrases
|
||||
|
||||
else:
|
||||
return [llm_rephrase_query(query, language) for language in languages]
|
||||
query_rephrases = [
|
||||
llm_rephrase_query(query, language) for language in languages
|
||||
]
|
||||
return query_rephrases
|
||||
|
Loading…
x
Reference in New Issue
Block a user