Mirror of https://github.com/danswer-ai/danswer.git, synced 2025-09-25 19:37:29 +02:00
Remove AI Thoughts by default (#783)
- Removes AI Thoughts by default - only shows when validation fails
- Removes punctuation "words" from queries in addition to stopwords (Vespa ignores punctuation anyways)
- Fixes Vespa deletion script for larger doc counts
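For context on the "punctuation words" wording: NLTK's word_tokenize splits punctuation marks into standalone tokens, so before this change a trailing "?" survived stopword removal and was passed along to Vespa as a keyword. A minimal illustration (assumes the nltk package with its punkt tokenizer data installed; not part of this commit):

from nltk.tokenize import word_tokenize

# "?" comes back as its own token -- the "punctuation word" this commit filters out
print(word_tokenize("How do I reset my password?"))
# ['How', 'do', 'I', 'reset', 'my', 'password', '?']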
@@ -57,7 +57,7 @@ from danswer.indexing.models import InferenceChunk
 from danswer.search.models import IndexFilters
 from danswer.search.search_runner import embed_query
 from danswer.search.search_runner import query_processing
-from danswer.search.search_runner import remove_stop_words
+from danswer.search.search_runner import remove_stop_words_and_punctuation
 from danswer.utils.batching import batch_generator
 from danswer.utils.logger import setup_logger
@@ -732,7 +732,9 @@ class VespaIndex(DocumentIndex):
         query_embedding = embed_query(query)
 
         query_keywords = (
-            " ".join(remove_stop_words(query)) if edit_keyword_query else query
+            " ".join(remove_stop_words_and_punctuation(query))
+            if edit_keyword_query
+            else query
         )
 
         params: dict[str, str | int] = {
@@ -773,7 +775,9 @@ class VespaIndex(DocumentIndex):
         query_embedding = embed_query(query)
 
         query_keywords = (
-            " ".join(remove_stop_words(query)) if edit_keyword_query else query
+            " ".join(remove_stop_words_and_punctuation(query))
+            if edit_keyword_query
+            else query
        )
 
         params: dict[str, str | int | float] = {
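Both retrieval paths above get the same change: the one-line conditional is expanded, and the new helper drops punctuation tokens along with stopwords. A hypothetical run of the expression (names mirror the diff; assumes NLTK stopword and punkt tokenizer data are available):

from danswer.search.search_runner import remove_stop_words_and_punctuation

query = "Where are the deployment docs?"
edit_keyword_query = True

query_keywords = (
    " ".join(remove_stop_words_and_punctuation(query))
    if edit_keyword_query
    else query
)
print(query_keywords)  # "deployment docs" -- stopwords and the trailing "?" are gone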
@@ -4,7 +4,7 @@ from danswer.search.models import QueryFlow
 from danswer.search.models import SearchType
 from danswer.search.search_nlp_models import get_default_tokenizer
 from danswer.search.search_nlp_models import IntentModel
-from danswer.search.search_runner import remove_stop_words
+from danswer.search.search_runner import remove_stop_words_and_punctuation
 from danswer.server.models import HelperResponse
 from danswer.utils.logger import setup_logger
 from danswer.utils.timing import log_function_time
@@ -67,7 +67,7 @@ def recommend_search_flow(
 
     # Heuristics based decisions
     words = query.split()
-    non_stopwords = remove_stop_words(query)
+    non_stopwords = remove_stop_words_and_punctuation(query)
     non_stopword_percent = len(non_stopwords) / len(words)
 
     # UNK tokens -> suggest Keyword (still may be valid QA)
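A side effect worth noting in this heuristic: punctuation tokens no longer count toward non_stopwords, so a short question is no longer credited for its trailing "?" when the ratio is computed. A worked example (assumes NLTK's English stopword list):

query = "How do I configure SSO?"
words = query.split()  # ['How', 'do', 'I', 'configure', 'SSO?'] -> 5 words

# old remove_stop_words:                 ['configure', 'SSO', '?'] -> 3 / 5 = 0.6
# new remove_stop_words_and_punctuation: ['configure', 'SSO']      -> 2 / 5 = 0.4
non_stopwords = remove_stop_words_and_punctuation(query)
non_stopword_percent = len(non_stopwords) / len(words)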
@@ -1,3 +1,4 @@
+import string
 from collections.abc import Callable
 from collections.abc import Iterator
 from copy import deepcopy
@@ -55,17 +56,21 @@ def lemmatize_text(text: str) -> list[str]:
     return [lemmatizer.lemmatize(word) for word in word_tokens]
 
 
-def remove_stop_words(text: str) -> list[str]:
+def remove_stop_words_and_punctuation(text: str) -> list[str]:
     stop_words = set(stopwords.words("english"))
     word_tokens = word_tokenize(text)
-    text_trimmed = [word for word in word_tokens if word.casefold() not in stop_words]
+    text_trimmed = [
+        word
+        for word in word_tokens
+        if (word.casefold() not in stop_words and word not in string.punctuation)
+    ]
     return text_trimmed or word_tokens
 
 
 def query_processing(
     query: str,
 ) -> str:
-    query = " ".join(remove_stop_words(query))
+    query = " ".join(remove_stop_words_and_punctuation(query))
     query = " ".join(lemmatize_text(query))
     return query
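Note the "text_trimmed or word_tokens" fallback: if every token is a stopword or punctuation, the original tokens are returned instead of an empty list, so a query is never reduced to nothing. An illustrative session (assumes NLTK's stopword and punkt data have been downloaded):

remove_stop_words_and_punctuation("What is the capital of France?")
# ['capital', 'France']

remove_stop_words_and_punctuation("What is it?")
# ['What', 'is', 'it', '?'] -- everything was filtered, so the fallback kicks in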
@@ -16,9 +16,20 @@ logger = setup_logger()
 
 
 def wipe_vespa_index() -> None:
-    params = {"selection": "true", "cluster": DOCUMENT_INDEX_NAME}
-    response = requests.delete(DOCUMENT_ID_ENDPOINT, params=params)
-    response.raise_for_status()
+    continuation = None
+    should_continue = True
+    while should_continue:
+        params = {"selection": "true", "cluster": DOCUMENT_INDEX_NAME}
+        if continuation:
+            params = {**params, "continuation": continuation}
+        response = requests.delete(DOCUMENT_ID_ENDPOINT, params=params)
+        response.raise_for_status()
+
+        response_json = response.json()
+        print(response_json)
+
+        continuation = response_json.get("continuation")
+        should_continue = bool(continuation)
 
 
 if __name__ == "__main__":
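Why the loop: Vespa's /document/v1 selection-based DELETE processes documents in chunks, and when it stops before finishing it returns a "continuation" token that must be echoed back on the next request. The old single request therefore only wiped part of a larger index. A self-contained sketch of the same drain pattern (hypothetical helper name, not part of the commit):

import requests

def delete_until_done(endpoint: str, cluster: str) -> None:
    # Re-issue the selection delete until Vespa stops returning a continuation token
    params: dict[str, str] = {"selection": "true", "cluster": cluster}
    while True:
        response = requests.delete(endpoint, params=params)
        response.raise_for_status()
        continuation = response.json().get("continuation")
        if not continuation:
            return
        params["continuation"] = continuation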