From 744c95e1e1435a940f848b7d3fb94a9fa8dfdc9b Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Mon, 9 Oct 2023 18:54:40 -0700
Subject: [PATCH] Remove Stopword Highlighting (#546)

---
 backend/danswer/configs/app_configs.py           |  7 +++++++
 backend/danswer/datastores/vespa/store.py        | 11 +++++++----
 backend/danswer/search/keyword_search.py         | 12 +++++++++---
 deployment/docker_compose/docker-compose.dev.yml |  1 +
 4 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py
index 380203c59..8b6b0e2c8 100644
--- a/backend/danswer/configs/app_configs.py
+++ b/backend/danswer/configs/app_configs.py
@@ -159,6 +159,13 @@ QA_TIMEOUT = int(os.environ.get("QA_TIMEOUT") or "60")  # 60 seconds
 # Include additional document/chunk metadata in prompt to GenerativeAI
 INCLUDE_METADATA = False
 HARD_DELETE_CHATS = os.environ.get("HARD_DELETE_CHATS", "True").lower() != "false"
+# Keyword Search Drop Stopwords
+# If the user has changed the default model, it was most likely to use a multilingual
+# model; the stopword list is NLTK's English stopwords, so in that case do not drop them
+if os.environ.get("EDIT_KEYWORD_QUERY"):
+    EDIT_KEYWORD_QUERY = os.environ.get("EDIT_KEYWORD_QUERY", "").lower() == "true"
+else:
+    EDIT_KEYWORD_QUERY = not os.environ.get("DOCUMENT_ENCODER_MODEL")
 
 
 #####
diff --git a/backend/danswer/datastores/vespa/store.py b/backend/danswer/datastores/vespa/store.py
index 2d258d40d..fa787a710 100644
--- a/backend/danswer/datastores/vespa/store.py
+++ b/backend/danswer/datastores/vespa/store.py
@@ -15,6 +15,7 @@ from requests import Response
 from danswer.chunking.models import DocMetadataAwareIndexChunk
 from danswer.chunking.models import InferenceChunk
 from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
+from danswer.configs.app_configs import EDIT_KEYWORD_QUERY
 from danswer.configs.app_configs import NUM_RETURNED_HITS
 from danswer.configs.app_configs import VESPA_DEPLOYMENT_ZIP
 from danswer.configs.app_configs import VESPA_HOST
@@ -44,6 +45,7 @@ from danswer.datastores.interfaces import DocumentInsertionRecord
 from danswer.datastores.interfaces import IndexFilter
 from danswer.datastores.interfaces import UpdateRequest
 from danswer.datastores.vespa.utils import remove_invalid_unicode_chars
+from danswer.search.keyword_search import remove_stop_words
 from danswer.search.semantic_search import embed_query
 from danswer.utils.batching import batch_generator
 from danswer.utils.logger import setup_logger
@@ -324,9 +326,7 @@ def _process_dynamic_summary(
 
 def _query_vespa(query_params: Mapping[str, str | int]) -> list[InferenceChunk]:
     if "query" in query_params and not cast(str, query_params["query"]).strip():
-        raise ValueError(
-            "Query only consisted of stopwords, should not use Keyword Search"
-        )
+        raise ValueError("No/empty query received")
 
     response = requests.get(SEARCH_ENDPOINT, params=query_params)
     response.raise_for_status()
@@ -540,10 +540,13 @@ class VespaIndex(DocumentIndex):
         )
 
         query_embedding = embed_query(query)
+        query_keywords = (
+            " ".join(remove_stop_words(query)) if EDIT_KEYWORD_QUERY else query
+        )
 
         params = {
             "yql": yql,
-            "query": query,
+            "query": query_keywords,
             "input.query(query_embedding)": str(query_embedding),
             "ranking.profile": "semantic_search",
         }
diff --git a/backend/danswer/search/keyword_search.py b/backend/danswer/search/keyword_search.py
index a0adf7f55..f681fe12c 100644
--- a/backend/danswer/search/keyword_search.py
+++ b/backend/danswer/search/keyword_search.py
@@ -7,6 +7,7 @@ from nltk.stem import WordNetLemmatizer  # type:ignore
 from nltk.tokenize import word_tokenize  # type:ignore
 
 from danswer.chunking.models import InferenceChunk
+from danswer.configs.app_configs import EDIT_KEYWORD_QUERY
 from danswer.configs.app_configs import NUM_RETURNED_HITS
 from danswer.datastores.interfaces import DocumentIndex
 from danswer.datastores.interfaces import IndexFilter
@@ -28,10 +29,13 @@ def lemmatize_text(text: str) -> list[str]:
 def remove_stop_words(text: str) -> list[str]:
     stop_words = set(stopwords.words("english"))
     word_tokens = word_tokenize(text)
-    return [word for word in word_tokens if word.casefold() not in stop_words]
+    text_trimmed = [word for word in word_tokens if word.casefold() not in stop_words]
+    return text_trimmed or word_tokens
 
 
-def query_processing(query: str) -> str:
+def query_processing(
+    query: str,
+) -> str:
     query = " ".join(remove_stop_words(query))
     query = " ".join(lemmatize_text(query))
     return query
@@ -44,10 +48,12 @@ def retrieve_keyword_documents(
     filters: list[IndexFilter] | None,
     datastore: DocumentIndex,
     num_hits: int = NUM_RETURNED_HITS,
+    edit_query: bool = EDIT_KEYWORD_QUERY,
     retrieval_metrics_callback: Callable[[RetrievalMetricsContainer], None]
     | None = None,
 ) -> list[InferenceChunk] | None:
-    edited_query = query_processing(query)
+    edited_query = query_processing(query) if edit_query else query
+
     top_chunks = datastore.keyword_retrieval(edited_query, user_id, filters, num_hits)
 
     if not top_chunks:
diff --git a/deployment/docker_compose/docker-compose.dev.yml b/deployment/docker_compose/docker-compose.dev.yml
index 388870818..4ebf0d4d4 100644
--- a/deployment/docker_compose/docker-compose.dev.yml
+++ b/deployment/docker_compose/docker-compose.dev.yml
@@ -89,6 +89,7 @@ services:
       - ASYM_QUERY_PREFIX=${ASYM_QUERY_PREFIX:-}
       - ASYM_PASSAGE_PREFIX=${ASYM_PASSAGE_PREFIX:-}
       - SKIP_RERANKING=${SKIP_RERANKING:-}
+      - EDIT_KEYWORD_QUERY=${EDIT_KEYWORD_QUERY:-}
       # Set to debug to get more fine-grained logs
      - LOG_LEVEL=${LOG_LEVEL:-info}
     volumes:
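
Note on the new EDIT_KEYWORD_QUERY flag (not part of the patch above): in
app_configs.py an explicit env value wins, otherwise the default is derived from
whether DOCUMENT_ENCODER_MODEL is set. Below is a minimal standalone sketch of the
resulting behavior; the case table in the comments is inferred from the diff, not
taken from upstream docs.

import os

# EDIT_KEYWORD_QUERY=true                  -> True  (explicit opt-in)
# EDIT_KEYWORD_QUERY=<anything but "true"> -> False (explicit opt-out)
# unset, DOCUMENT_ENCODER_MODEL unset      -> True  (default English model, drop stopwords)
# unset, DOCUMENT_ENCODER_MODEL set        -> False (likely multilingual model, keep stopwords)
if os.environ.get("EDIT_KEYWORD_QUERY"):
    EDIT_KEYWORD_QUERY = os.environ.get("EDIT_KEYWORD_QUERY", "").lower() == "true"
else:
    EDIT_KEYWORD_QUERY = not os.environ.get("DOCUMENT_ENCODER_MODEL")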
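
Why the stopword-specific ValueError in _query_vespa could become a generic
empty-query check: remove_stop_words now falls back to the raw tokens when trimming
would leave nothing. A runnable sketch follows, assuming the NLTK "punkt" and
"stopwords" corpora can be downloaded; the sample queries are illustrative only.

import nltk
from nltk.corpus import stopwords  # type:ignore
from nltk.tokenize import word_tokenize  # type:ignore


def remove_stop_words(text: str) -> list[str]:
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    text_trimmed = [word for word in word_tokens if word.casefold() not in stop_words]
    # Key change in #546: an all-stopword query falls back to the original
    # tokens instead of coming back empty
    return text_trimmed or word_tokens


if __name__ == "__main__":
    nltk.download("punkt", quiet=True)      # tokenizer model
    nltk.download("stopwords", quiet=True)  # English stopword list
    print(remove_stop_words("how do I configure the Slack connector"))  # -> ['configure', 'Slack', 'connector']
    print(remove_stop_words("to be or not to be"))  # all stopwords -> original tokens kept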