Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-06-05 20:49:48 +02:00)
Remove Stopword Highlighting (#546)
parent 0d505ffea1 · commit 744c95e1e1
@@ -159,6 +159,13 @@ QA_TIMEOUT = int(os.environ.get("QA_TIMEOUT") or "60")  # 60 seconds
 # Include additional document/chunk metadata in prompt to GenerativeAI
 INCLUDE_METADATA = False
 HARD_DELETE_CHATS = os.environ.get("HARD_DELETE_CHATS", "True").lower() != "false"
+# Keyword Search Drop Stopwords
+# If the user has changed the default model, it would most likely be to use a multilingual
+# model; the stopwords are NLTK English stopwords, so in that case we would not want to drop words
+if os.environ.get("EDIT_KEYWORD_QUERY"):
+    EDIT_KEYWORD_QUERY = os.environ.get("EDIT_KEYWORD_QUERY", "").lower() == "true"
+else:
+    EDIT_KEYWORD_QUERY = not os.environ.get("DOCUMENT_ENCODER_MODEL")
 
 
 #####
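The new EDIT_KEYWORD_QUERY flag resolves in two steps: an explicit env var wins, otherwise query editing defaults to on only while the default (English) encoder model is in use. A minimal standalone sketch of that resolution order, using a plain dict in place of os.environ and hypothetical model names:

def resolve_edit_keyword_query(env: dict[str, str]) -> bool:
    # An explicit setting wins; only the literal "true" (any casing) enables editing.
    if env.get("EDIT_KEYWORD_QUERY"):
        return env["EDIT_KEYWORD_QUERY"].lower() == "true"
    # No explicit setting: edit only when the default English encoder model is in use.
    return not env.get("DOCUMENT_ENCODER_MODEL")

assert resolve_edit_keyword_query({}) is True
assert resolve_edit_keyword_query({"DOCUMENT_ENCODER_MODEL": "some-multilingual-model"}) is False
assert resolve_edit_keyword_query({"EDIT_KEYWORD_QUERY": "True", "DOCUMENT_ENCODER_MODEL": "x"}) is True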
@@ -15,6 +15,7 @@ from requests import Response
 from danswer.chunking.models import DocMetadataAwareIndexChunk
 from danswer.chunking.models import InferenceChunk
 from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
+from danswer.configs.app_configs import EDIT_KEYWORD_QUERY
 from danswer.configs.app_configs import NUM_RETURNED_HITS
 from danswer.configs.app_configs import VESPA_DEPLOYMENT_ZIP
 from danswer.configs.app_configs import VESPA_HOST
@@ -44,6 +45,7 @@ from danswer.datastores.interfaces import DocumentInsertionRecord
 from danswer.datastores.interfaces import IndexFilter
 from danswer.datastores.interfaces import UpdateRequest
 from danswer.datastores.vespa.utils import remove_invalid_unicode_chars
+from danswer.search.keyword_search import remove_stop_words
 from danswer.search.semantic_search import embed_query
 from danswer.utils.batching import batch_generator
 from danswer.utils.logger import setup_logger
@@ -324,9 +326,7 @@ def _process_dynamic_summary(
 
 def _query_vespa(query_params: Mapping[str, str | int]) -> list[InferenceChunk]:
     if "query" in query_params and not cast(str, query_params["query"]).strip():
-        raise ValueError(
-            "Query only consisted of stopwords, should not use Keyword Search"
-        )
+        raise ValueError("No/empty query received")
     response = requests.get(SEARCH_ENDPOINT, params=query_params)
     response.raise_for_status()
 
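Because remove_stop_words now falls back to the original tokens (see the keyword_search hunk below), a stopword-only query can no longer arrive here as an empty string, so the guard keeps only a generic empty-query check. A small illustration of the guard in isolation; validate_query_params is a hypothetical helper, not part of the commit:

from collections.abc import Mapping
from typing import cast

def validate_query_params(query_params: Mapping[str, str | int]) -> None:
    # Reject a "query" param that is present but blank.
    if "query" in query_params and not cast(str, query_params["query"]).strip():
        raise ValueError("No/empty query received")

validate_query_params({"query": "vespa ranking", "hits": 50})  # passes
# validate_query_params({"query": "   "})  # would raise ValueError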
@@ -540,10 +540,13 @@ class VespaIndex(DocumentIndex):
         )
 
         query_embedding = embed_query(query)
+        query_keywords = (
+            " ".join(remove_stop_words(query)) if EDIT_KEYWORD_QUERY else query
+        )
 
         params = {
             "yql": yql,
-            "query": query,
+            "query": query_keywords,
             "input.query(query_embedding)": str(query_embedding),
             "ranking.profile": "semantic_search",
         }
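In the hybrid retrieval path, the embedding is still computed from the full query, while the keyword "query" param now carries the stop-word-trimmed text. A sketch of how the two diverge, with a toy stop-word set standing in for NLTK's list and placeholder values for the YQL and embedding:

STOP_WORDS = {"how", "do", "i", "the", "a", "to"}  # toy stand-in for NLTK's list

def trim_keywords(query: str) -> str:
    kept = [w for w in query.split() if w.casefold() not in STOP_WORDS]
    return " ".join(kept or query.split())  # same fallback as remove_stop_words

query = "how do I tune the ranking profile"
params = {
    "yql": "select ...",                      # built earlier in hybrid_retrieval
    "query": trim_keywords(query),            # -> "tune ranking profile"
    "input.query(query_embedding)": "[...]",  # str(embed_query(query)): full query embedded
    "ranking.profile": "semantic_search",
}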
@@ -7,6 +7,7 @@ from nltk.stem import WordNetLemmatizer  # type:ignore
 from nltk.tokenize import word_tokenize  # type:ignore
 
 from danswer.chunking.models import InferenceChunk
+from danswer.configs.app_configs import EDIT_KEYWORD_QUERY
 from danswer.configs.app_configs import NUM_RETURNED_HITS
 from danswer.datastores.interfaces import DocumentIndex
 from danswer.datastores.interfaces import IndexFilter
@@ -28,10 +29,13 @@ def lemmatize_text(text: str) -> list[str]:
 def remove_stop_words(text: str) -> list[str]:
     stop_words = set(stopwords.words("english"))
     word_tokens = word_tokenize(text)
-    return [word for word in word_tokens if word.casefold() not in stop_words]
+    text_trimmed = [word for word in word_tokens if word.casefold() not in stop_words]
+    return text_trimmed or word_tokens
 
 
-def query_processing(query: str) -> str:
+def query_processing(
+    query: str,
+) -> str:
     query = " ".join(remove_stop_words(query))
     query = " ".join(lemmatize_text(query))
     return query
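This is the behavioral core of the change: previously a query made up entirely of stop words was trimmed to an empty list, and keyword search received an empty query; now remove_stop_words falls back to the untrimmed tokens. A runnable check of that fallback; the download calls are only there to make the snippet self-contained and are not part of the commit:

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download("stopwords", quiet=True)
nltk.download("punkt", quiet=True)  # newer NLTK releases may need "punkt_tab"

def remove_stop_words(text: str) -> list[str]:
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    text_trimmed = [word for word in word_tokens if word.casefold() not in stop_words]
    return text_trimmed or word_tokens  # fall back instead of returning []

print(remove_stop_words("to be or not to be"))   # all stop words -> original tokens
print(remove_stop_words("how to deploy Vespa"))  # -> ['deploy', 'Vespa']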
@@ -44,10 +48,12 @@ def retrieve_keyword_documents(
     filters: list[IndexFilter] | None,
     datastore: DocumentIndex,
     num_hits: int = NUM_RETURNED_HITS,
+    edit_query: bool = EDIT_KEYWORD_QUERY,
     retrieval_metrics_callback: Callable[[RetrievalMetricsContainer], None]
     | None = None,
 ) -> list[InferenceChunk] | None:
-    edited_query = query_processing(query)
+    edited_query = query_processing(query) if edit_query else query
 
     top_chunks = datastore.keyword_retrieval(edited_query, user_id, filters, num_hits)
 
     if not top_chunks:
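The new edit_query parameter (defaulting to the EDIT_KEYWORD_QUERY config) lets a caller skip stop-word removal and lemmatization per request. A hedged usage sketch; vespa_index stands in for an already-constructed VespaIndex:

# Hypothetical call site: keep a German query untouched despite the English defaults.
top_chunks = retrieve_keyword_documents(
    query="Wie richte ich den Vespa-Index ein?",
    user_id=None,
    filters=None,
    datastore=vespa_index,  # assumed VespaIndex instance
    edit_query=False,       # bypass English stop-word editing and lemmatization
)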
@@ -89,6 +89,7 @@ services:
       - ASYM_QUERY_PREFIX=${ASYM_QUERY_PREFIX:-}
       - ASYM_PASSAGE_PREFIX=${ASYM_PASSAGE_PREFIX:-}
       - SKIP_RERANKING=${SKIP_RERANKING:-}
+      - EDIT_KEYWORD_QUERY=${EDIT_KEYWORD_QUERY:-}
       # Set to debug to get more fine-grained logs
       - LOG_LEVEL=${LOG_LEVEL:-info}
     volumes:
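The ${EDIT_KEYWORD_QUERY:-} syntax defaults to an empty string, so the backend's own fallback logic still decides unless the variable is set explicitly, for example (a hypothetical override) in the .env file beside the compose file:

EDIT_KEYWORD_QUERY=false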