Remove Stopword Highlighting (#546)

Yuhong Sun 2023-10-09 18:54:40 -07:00 committed by GitHub
parent 0d505ffea1
commit 744c95e1e1
4 changed files with 24 additions and 7 deletions

backend/danswer/configs/app_configs.py

@@ -159,6 +159,13 @@ QA_TIMEOUT = int(os.environ.get("QA_TIMEOUT") or "60")  # 60 seconds
 # Include additional document/chunk metadata in prompt to GenerativeAI
 INCLUDE_METADATA = False
 HARD_DELETE_CHATS = os.environ.get("HARD_DELETE_CHATS", "True").lower() != "false"
+# Keyword Search Drop Stopwords
+# If the user has changed the default model, it is most likely a multilingual model;
+# the stopwords are NLTK English stopwords, so in that case we do not want to drop keywords.
+if os.environ.get("EDIT_KEYWORD_QUERY"):
+    EDIT_KEYWORD_QUERY = os.environ.get("EDIT_KEYWORD_QUERY", "").lower() == "true"
+else:
+    EDIT_KEYWORD_QUERY = not os.environ.get("DOCUMENT_ENCODER_MODEL")

 #####
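Reviewer note: in plain terms, an explicitly set EDIT_KEYWORD_QUERY always wins, and when it is unset, query editing defaults to on only if no custom DOCUMENT_ENCODER_MODEL is configured. A minimal sketch of the same resolution rule as a standalone helper (the function name is hypothetical, for illustration only):

import os

def resolve_edit_keyword_query() -> bool:
    # An explicit env var setting always takes precedence
    explicit = os.environ.get("EDIT_KEYWORD_QUERY")
    if explicit:
        return explicit.lower() == "true"
    # Otherwise, only edit queries when the stock (English) encoder is in use
    return not os.environ.get("DOCUMENT_ENCODER_MODEL")

# EDIT_KEYWORD_QUERY=true            -> True, even with a custom encoder model
# unset, DOCUMENT_ENCODER_MODEL set  -> False (likely multilingual; NLTK stopwords are English-only)
# unset, no custom encoder model     -> True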

backend/danswer/datastores/vespa/store.py

@@ -15,6 +15,7 @@ from requests import Response
 from danswer.chunking.models import DocMetadataAwareIndexChunk
 from danswer.chunking.models import InferenceChunk
 from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
+from danswer.configs.app_configs import EDIT_KEYWORD_QUERY
 from danswer.configs.app_configs import NUM_RETURNED_HITS
 from danswer.configs.app_configs import VESPA_DEPLOYMENT_ZIP
 from danswer.configs.app_configs import VESPA_HOST
@@ -44,6 +45,7 @@ from danswer.datastores.interfaces import DocumentInsertionRecord
 from danswer.datastores.interfaces import IndexFilter
 from danswer.datastores.interfaces import UpdateRequest
 from danswer.datastores.vespa.utils import remove_invalid_unicode_chars
+from danswer.search.keyword_search import remove_stop_words
 from danswer.search.semantic_search import embed_query
 from danswer.utils.batching import batch_generator
 from danswer.utils.logger import setup_logger
@@ -324,9 +326,7 @@ def _process_dynamic_summary(
 def _query_vespa(query_params: Mapping[str, str | int]) -> list[InferenceChunk]:
     if "query" in query_params and not cast(str, query_params["query"]).strip():
-        raise ValueError(
-            "Query only consisted of stopwords, should not use Keyword Search"
-        )
+        raise ValueError("No/empty query received")

     response = requests.get(SEARCH_ENDPOINT, params=query_params)
     response.raise_for_status()
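Reviewer note: the narrower message reflects a real behavior change. With the fallback added to remove_stop_words below, stopword removal can no longer empty the query on its own, so this guard is reduced to a generic check against blank queries.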
@@ -540,10 +540,13 @@ class VespaIndex(DocumentIndex):
         )

         query_embedding = embed_query(query)

+        query_keywords = (
+            " ".join(remove_stop_words(query)) if EDIT_KEYWORD_QUERY else query
+        )
         params = {
             "yql": yql,
-            "query": query,
+            "query": query_keywords,
             "input.query(query_embedding)": str(query_embedding),
             "ranking.profile": "semantic_search",
         }
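Reviewer note: the keyword side of the hybrid query now gets the stopword-trimmed text, while the embedding is still computed from the untouched query. A rough sketch of how the two fit together in the request, assuming a local Vespa search endpoint URL, a prebuilt YQL string, and remove_stop_words imported as in the diff above (function name and endpoint are simplified stand-ins):

import requests

VESPA_SEARCH_ENDPOINT = "http://localhost:8081/search/"  # assumed local deployment

def semantic_search(query: str, yql: str, query_embedding: list[float],
                    edit_keyword_query: bool) -> dict:
    # Trim stopwords only for the text match; the embedding sees the full query
    query_keywords = (
        " ".join(remove_stop_words(query)) if edit_keyword_query else query
    )
    params = {
        "yql": yql,
        "query": query_keywords,
        "input.query(query_embedding)": str(query_embedding),
        "ranking.profile": "semantic_search",
    }
    response = requests.get(VESPA_SEARCH_ENDPOINT, params=params)
    response.raise_for_status()
    return response.json()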

backend/danswer/search/keyword_search.py

@@ -7,6 +7,7 @@ from nltk.stem import WordNetLemmatizer  # type:ignore
 from nltk.tokenize import word_tokenize  # type:ignore

 from danswer.chunking.models import InferenceChunk
+from danswer.configs.app_configs import EDIT_KEYWORD_QUERY
 from danswer.configs.app_configs import NUM_RETURNED_HITS
 from danswer.datastores.interfaces import DocumentIndex
 from danswer.datastores.interfaces import IndexFilter
@@ -28,10 +29,13 @@ def lemmatize_text(text: str) -> list[str]:
 def remove_stop_words(text: str) -> list[str]:
     stop_words = set(stopwords.words("english"))
     word_tokens = word_tokenize(text)
-    return [word for word in word_tokens if word.casefold() not in stop_words]
+    text_trimmed = [word for word in word_tokens if word.casefold() not in stop_words]
+    return text_trimmed or word_tokens


-def query_processing(query: str) -> str:
+def query_processing(
+    query: str,
+) -> str:
     query = " ".join(remove_stop_words(query))
     query = " ".join(lemmatize_text(query))
     return query
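Reviewer note: the `or word_tokens` fallback is the key fix here. A query made up entirely of stopwords now passes through unchanged instead of collapsing to an empty string, which is what previously triggered the ValueError in _query_vespa. A self-contained check of that behavior; the expected outputs assume NLTK's standard English stopword list:

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download("stopwords", quiet=True)  # one-time corpus fetches
nltk.download("punkt", quiet=True)      # newer NLTK versions use "punkt_tab" instead

def remove_stop_words(text: str) -> list[str]:
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    text_trimmed = [w for w in word_tokens if w.casefold() not in stop_words]
    # Fall back to the raw tokens when every token was a stopword
    return text_trimmed or word_tokens

print(remove_stop_words("how to configure the index"))  # ['configure', 'index']
print(remove_stop_words("to be or not to be"))  # all stopwords -> original tokens kept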
@@ -44,10 +48,12 @@ def retrieve_keyword_documents(
     filters: list[IndexFilter] | None,
     datastore: DocumentIndex,
     num_hits: int = NUM_RETURNED_HITS,
+    edit_query: bool = EDIT_KEYWORD_QUERY,
     retrieval_metrics_callback: Callable[[RetrievalMetricsContainer], None]
     | None = None,
 ) -> list[InferenceChunk] | None:
-    edited_query = query_processing(query)
+    edited_query = query_processing(query) if edit_query else query
+
     top_chunks = datastore.keyword_retrieval(edited_query, user_id, filters, num_hits)

     if not top_chunks:
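Reviewer note: threading the flag through as a default argument (edit_query: bool = EDIT_KEYWORD_QUERY) keeps the module-level default while still letting an individual caller override the behavior per request.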

deployment/docker_compose/docker-compose.dev.yml

@@ -89,6 +89,7 @@ services:
       - ASYM_QUERY_PREFIX=${ASYM_QUERY_PREFIX:-}
       - ASYM_PASSAGE_PREFIX=${ASYM_PASSAGE_PREFIX:-}
       - SKIP_RERANKING=${SKIP_RERANKING:-}
+      - EDIT_KEYWORD_QUERY=${EDIT_KEYWORD_QUERY:-}
       # Set to debug to get more fine-grained logs
       - LOG_LEVEL=${LOG_LEVEL:-info}
     volumes:
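Reviewer note: with the variable forwarded into the container, setting EDIT_KEYWORD_QUERY=false (or true) in the deployment environment forces the behavior either way, while leaving it unset keeps the auto-detection based on DOCUMENT_ENCODER_MODEL from app_configs.py.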