From 651de071f7018141acd47e95b8f19c6739ae2fad Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Sun, 3 Dec 2023 14:34:12 -0800 Subject: [PATCH] Improve English rephrasing for multilingual use case (#808) --- .../danswer/prompts/miscellaneous_prompts.py | 10 +++++----- backend/danswer/search/search_runner.py | 17 ++++++++++++++++- .../secondary_llm_flows/query_expansion.py | 8 ++++++-- 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/backend/danswer/prompts/miscellaneous_prompts.py b/backend/danswer/prompts/miscellaneous_prompts.py index c58cbb48c2..340908f11c 100644 --- a/backend/danswer/prompts/miscellaneous_prompts.py +++ b/backend/danswer/prompts/miscellaneous_prompts.py @@ -1,11 +1,11 @@ # Prompts that aren't part of a particular configurable feature LANGUAGE_REPHRASE_PROMPT = """ -Rephrase the query in {target_language}. -If the query is already in the correct language, \ -simply repeat the ORIGINAL query back to me, EXACTLY as is with no rephrasing. -NEVER change proper nouns, technical terms, acronyms, or terms you are not familiar with. -IMPORTANT, if the query is already in the target language, DO NOT REPHRASE OR EDIT the query! +Translate query to {target_language}. +If the query at the end is already in {target_language}, \ +simply repeat the ORIGINAL query back to me, EXACTLY as is with no edits. + +If the query below is not in {target_language}, translate it into {target_language}. Query: {query} diff --git a/backend/danswer/search/search_runner.py b/backend/danswer/search/search_runner.py index 371fbf4641..576c2ae201 100644 --- a/backend/danswer/search/search_runner.py +++ b/backend/danswer/search/search_runner.py @@ -332,6 +332,12 @@ def apply_boost( return final_chunks +def _simplify_text(text: str) -> str: + return "".join( + char for char in text if char not in string.punctuation and not char.isspace() + ).lower() + + def retrieve_chunks( query: SearchQuery, document_index: DocumentIndex, @@ -347,13 +353,22 @@ def retrieve_chunks( query=query, document_index=document_index, hybrid_alpha=hybrid_alpha ) else: + simplified_queries = set() run_queries: list[tuple[Callable, tuple]] = [] + # Currently only uses query expansion on multilingual use cases query_rephrases = rephrase_query(query.query, multilingual_query_expansion) # Just to be extra sure, add the original query. query_rephrases.append(query.query) for rephrase in set(query_rephrases): - q_copy = query.copy(update={'query': rephrase}, deep=True) + # Sometimes the model rephrases the query in the same language with minor changes + # Avoid doing an extra search with the minor changes as this biases the results + simplified_rephrase = _simplify_text(rephrase) + if simplified_rephrase in simplified_queries: + continue + simplified_queries.add(simplified_rephrase) + + q_copy = query.copy(update={"query": rephrase}, deep=True) run_queries.append( (doc_index_retrieval, (q_copy, document_index, hybrid_alpha)) ) diff --git a/backend/danswer/secondary_llm_flows/query_expansion.py b/backend/danswer/secondary_llm_flows/query_expansion.py index c80dbcbfc3..ff3d9ae146 100644 --- a/backend/danswer/secondary_llm_flows/query_expansion.py +++ b/backend/danswer/secondary_llm_flows/query_expansion.py @@ -42,7 +42,11 @@ def rephrase_query( (llm_rephrase_query, (query, language)) for language in languages ] - return run_functions_tuples_in_parallel(functions_with_args) + query_rephrases = run_functions_tuples_in_parallel(functions_with_args) + return query_rephrases else: - return [llm_rephrase_query(query, language) for language in languages] + query_rephrases = [ + llm_rephrase_query(query, language) for language in languages + ] + return query_rephrases