Improve English rephrasing for multilingual use case (#808)

This commit is contained in:
Yuhong Sun
2023-12-03 14:34:12 -08:00
committed by GitHub
parent 5629ca7d96
commit 651de071f7
3 changed files with 27 additions and 8 deletions

View File

@@ -1,11 +1,11 @@
# Prompts that aren't part of a particular configurable feature # Prompts that aren't part of a particular configurable feature
LANGUAGE_REPHRASE_PROMPT = """ LANGUAGE_REPHRASE_PROMPT = """
Rephrase the query in {target_language}. Translate query to {target_language}.
If the query is already in the correct language, \ If the query at the end is already in {target_language}, \
simply repeat the ORIGINAL query back to me, EXACTLY as is with no rephrasing. simply repeat the ORIGINAL query back to me, EXACTLY as is with no edits.
NEVER change proper nouns, technical terms, acronyms, or terms you are not familiar with.
IMPORTANT, if the query is already in the target language, DO NOT REPHRASE OR EDIT the query! If the query below is not in {target_language}, translate it into {target_language}.
Query: Query:
{query} {query}

View File

@@ -332,6 +332,12 @@ def apply_boost(
return final_chunks return final_chunks
def _simplify_text(text: str) -> str:
return "".join(
char for char in text if char not in string.punctuation and not char.isspace()
).lower()
def retrieve_chunks( def retrieve_chunks(
query: SearchQuery, query: SearchQuery,
document_index: DocumentIndex, document_index: DocumentIndex,
@@ -347,13 +353,22 @@ def retrieve_chunks(
query=query, document_index=document_index, hybrid_alpha=hybrid_alpha query=query, document_index=document_index, hybrid_alpha=hybrid_alpha
) )
else: else:
simplified_queries = set()
run_queries: list[tuple[Callable, tuple]] = [] run_queries: list[tuple[Callable, tuple]] = []
# Currently only uses query expansion on multilingual use cases # Currently only uses query expansion on multilingual use cases
query_rephrases = rephrase_query(query.query, multilingual_query_expansion) query_rephrases = rephrase_query(query.query, multilingual_query_expansion)
# Just to be extra sure, add the original query. # Just to be extra sure, add the original query.
query_rephrases.append(query.query) query_rephrases.append(query.query)
for rephrase in set(query_rephrases): for rephrase in set(query_rephrases):
q_copy = query.copy(update={'query': rephrase}, deep=True) # Sometimes the model rephrases the query in the same language with minor changes
# Avoid doing an extra search with the minor changes as this biases the results
simplified_rephrase = _simplify_text(rephrase)
if simplified_rephrase in simplified_queries:
continue
simplified_queries.add(simplified_rephrase)
q_copy = query.copy(update={"query": rephrase}, deep=True)
run_queries.append( run_queries.append(
(doc_index_retrieval, (q_copy, document_index, hybrid_alpha)) (doc_index_retrieval, (q_copy, document_index, hybrid_alpha))
) )

View File

@@ -42,7 +42,11 @@ def rephrase_query(
(llm_rephrase_query, (query, language)) for language in languages (llm_rephrase_query, (query, language)) for language in languages
] ]
return run_functions_tuples_in_parallel(functions_with_args) query_rephrases = run_functions_tuples_in_parallel(functions_with_args)
return query_rephrases
else: else:
return [llm_rephrase_query(query, language) for language in languages] query_rephrases = [
llm_rephrase_query(query, language) for language in languages
]
return query_rephrases