Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-04-09 04:18:32 +02:00)
Remove Stopword Highlighting (#546)
parent 0d505ffea1
commit 744c95e1e1

Summary: when EDIT_KEYWORD_QUERY is enabled, stopwords are trimmed from the query string sent to Vespa for keyword matching, so dynamic summaries no longer highlight stopword hits; a query made up entirely of stopwords now falls back to its original tokens instead of raising an error.
@@ -159,6 +159,13 @@ QA_TIMEOUT = int(os.environ.get("QA_TIMEOUT") or "60")  # 60 seconds
 # Include additional document/chunk metadata in prompt to GenerativeAI
 INCLUDE_METADATA = False
 HARD_DELETE_CHATS = os.environ.get("HARD_DELETE_CHATS", "True").lower() != "false"
+# Keyword Search Drop Stopwords
+# If the user has changed the default model, it is most likely a multilingual one;
+# the stopword list is NLTK's English stopwords, so in that case we do not want to drop any keywords
+if os.environ.get("EDIT_KEYWORD_QUERY"):
+    EDIT_KEYWORD_QUERY = os.environ.get("EDIT_KEYWORD_QUERY", "").lower() == "true"
+else:
+    EDIT_KEYWORD_QUERY = not os.environ.get("DOCUMENT_ENCODER_MODEL")
 
 
 #####
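For reference, the resolution order of the new flag, as a minimal standalone sketch (the function wrapper and the example values are illustrative; in app_configs.py this logic runs at module import time):

```python
import os

def resolve_edit_keyword_query() -> bool:
    explicit = os.environ.get("EDIT_KEYWORD_QUERY")
    if explicit:
        # An explicit value wins; anything other than "true" (case-insensitive)
        # disables stopword editing.
        return explicit.lower() == "true"
    # Unset (or empty): enable editing only with the stock encoder model, since a
    # custom DOCUMENT_ENCODER_MODEL is likely multilingual and the stopword list
    # is English-only.
    return not os.environ.get("DOCUMENT_ENCODER_MODEL")

# EDIT_KEYWORD_QUERY=false                  -> False (explicit override)
# unset, DOCUMENT_ENCODER_MODEL=some-model  -> False (multilingual assumption)
# both unset                                -> True
```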
@@ -15,6 +15,7 @@ from requests import Response
 from danswer.chunking.models import DocMetadataAwareIndexChunk
 from danswer.chunking.models import InferenceChunk
 from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
+from danswer.configs.app_configs import EDIT_KEYWORD_QUERY
 from danswer.configs.app_configs import NUM_RETURNED_HITS
 from danswer.configs.app_configs import VESPA_DEPLOYMENT_ZIP
 from danswer.configs.app_configs import VESPA_HOST
@@ -44,6 +45,7 @@ from danswer.datastores.interfaces import DocumentInsertionRecord
 from danswer.datastores.interfaces import IndexFilter
 from danswer.datastores.interfaces import UpdateRequest
 from danswer.datastores.vespa.utils import remove_invalid_unicode_chars
+from danswer.search.keyword_search import remove_stop_words
 from danswer.search.semantic_search import embed_query
 from danswer.utils.batching import batch_generator
 from danswer.utils.logger import setup_logger
@@ -324,9 +326,7 @@ def _process_dynamic_summary(
 
 
 def _query_vespa(query_params: Mapping[str, str | int]) -> list[InferenceChunk]:
     if "query" in query_params and not cast(str, query_params["query"]).strip():
-        raise ValueError(
-            "Query only consisted of stopwords, should not use Keyword Search"
-        )
+        raise ValueError("No/empty query received")
     response = requests.get(SEARCH_ENDPOINT, params=query_params)
     response.raise_for_status()
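The guard itself is unchanged; only the error message is generalized, because with the fallback added to remove_stop_words (further down) a stopword-only query can no longer arrive here empty. A runnable sketch of the pattern, with a placeholder endpoint standing in for the module's SEARCH_ENDPOINT:

```python
import requests

# Placeholder URL for illustration; the module's real SEARCH_ENDPOINT is
# assembled elsewhere from the Vespa host settings.
SEARCH_ENDPOINT = "http://localhost:8081/search/"

def query_vespa(query_params: dict) -> dict:
    # Fail fast on a blank keyword query rather than issuing a pointless request.
    if "query" in query_params and not str(query_params["query"]).strip():
        raise ValueError("No/empty query received")
    response = requests.get(SEARCH_ENDPOINT, params=query_params)
    response.raise_for_status()
    return response.json()
```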
@@ -540,10 +540,13 @@ class VespaIndex(DocumentIndex):
         )
 
         query_embedding = embed_query(query)
+        query_keywords = (
+            " ".join(remove_stop_words(query)) if EDIT_KEYWORD_QUERY else query
+        )
 
         params = {
             "yql": yql,
-            "query": query,
+            "query": query_keywords,
             "input.query(query_embedding)": str(query_embedding),
             "ranking.profile": "semantic_search",
         }
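Spelled out with stand-in values (the YQL string and the 768-dim embedding below are placeholders, not Danswer's real ones), the hybrid request now sends the trimmed terms to the keyword side while the semantic side still embeds the full query:

```python
# All values below are illustrative stand-ins.
yql = "select * from sources * where userQuery()"
query = "what are the company holidays"
query_keywords = "company holidays"   # " ".join(remove_stop_words(query))
query_embedding = [0.0] * 768         # stand-in for embed_query(query)

params = {
    "yql": yql,
    # Keyword matching, and therefore hit highlighting in dynamic summaries,
    # only sees the trimmed terms, so stopwords are no longer highlighted.
    "query": query_keywords,
    # The embedding is still computed from the full, untrimmed query.
    "input.query(query_embedding)": str(query_embedding),
    "ranking.profile": "semantic_search",
}
```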
@@ -7,6 +7,7 @@ from nltk.stem import WordNetLemmatizer  # type:ignore
 from nltk.tokenize import word_tokenize  # type:ignore
 
 from danswer.chunking.models import InferenceChunk
+from danswer.configs.app_configs import EDIT_KEYWORD_QUERY
 from danswer.configs.app_configs import NUM_RETURNED_HITS
 from danswer.datastores.interfaces import DocumentIndex
 from danswer.datastores.interfaces import IndexFilter
@@ -28,10 +29,13 @@ def lemmatize_text(text: str) -> list[str]:
 def remove_stop_words(text: str) -> list[str]:
     stop_words = set(stopwords.words("english"))
     word_tokens = word_tokenize(text)
-    return [word for word in word_tokens if word.casefold() not in stop_words]
+    text_trimmed = [word for word in word_tokens if word.casefold() not in stop_words]
+    return text_trimmed or word_tokens
 
 
-def query_processing(query: str) -> str:
+def query_processing(
+    query: str,
+) -> str:
     query = " ".join(remove_stop_words(query))
     query = " ".join(lemmatize_text(query))
     return query
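The "or word_tokens" fallback is the behavioral core of this commit: a query consisting entirely of stopwords now returns its original tokens rather than an empty list, which is what makes the stopword-specific ValueError above unreachable. A runnable check (NLTK's stopwords and tokenizer data must be downloaded first):

```python
from nltk.corpus import stopwords        # first run: nltk.download("stopwords")
from nltk.tokenize import word_tokenize  # and nltk.download("punkt") / "punkt_tab"

def remove_stop_words(text: str) -> list[str]:
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    text_trimmed = [w for w in word_tokens if w.casefold() not in stop_words]
    return text_trimmed or word_tokens  # fall back if every token was a stopword

print(remove_stop_words("what are the company holidays"))  # ['company', 'holidays']
print(remove_stop_words("to be or not to be"))             # original tokens, not []
```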
@@ -44,10 +48,12 @@ def retrieve_keyword_documents(
     filters: list[IndexFilter] | None,
     datastore: DocumentIndex,
     num_hits: int = NUM_RETURNED_HITS,
+    edit_query: bool = EDIT_KEYWORD_QUERY,
     retrieval_metrics_callback: Callable[[RetrievalMetricsContainer], None]
     | None = None,
 ) -> list[InferenceChunk] | None:
-    edited_query = query_processing(query)
+    edited_query = query_processing(query) if edit_query else query
 
     top_chunks = datastore.keyword_retrieval(edited_query, user_id, filters, num_hits)
 
     if not top_chunks:
@@ -89,6 +89,7 @@ services:
       - ASYM_QUERY_PREFIX=${ASYM_QUERY_PREFIX:-}
       - ASYM_PASSAGE_PREFIX=${ASYM_PASSAGE_PREFIX:-}
       - SKIP_RERANKING=${SKIP_RERANKING:-}
+      - EDIT_KEYWORD_QUERY=${EDIT_KEYWORD_QUERY:-}
       # Set to debug to get more fine-grained logs
       - LOG_LEVEL=${LOG_LEVEL:-info}
     volumes:
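One deployment nuance: with the ${EDIT_KEYWORD_QUERY:-} default, an unset host variable reaches the container as an empty string, which the Python config check treats as falsy, so the model-based default in app_configs.py still applies. A quick illustration:

```python
import os

os.environ["EDIT_KEYWORD_QUERY"] = ""  # what compose passes when the host var is unset
# An empty string is falsy, so the `if os.environ.get("EDIT_KEYWORD_QUERY")` branch
# in app_configs.py is skipped and the DOCUMENT_ENCODER_MODEL-based default applies.
assert not os.environ.get("EDIT_KEYWORD_QUERY")
```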