Remove AI Thoughts by default (#783)

- Removes AI Thoughts by default; they are shown only when validation fails
- Removes punctuation "words" from queries in addition to stopwords (Vespa ignores punctuation anyway)
- Fixes the Vespa deletion script for larger document counts
Chris Weaver committed on 2023-11-29 01:00:53 -08:00 (committed by GitHub)
parent fcb7f6fcc0, commit 37daf4f3e4
10 changed files with 92 additions and 106 deletions

View File

@@ -57,7 +57,7 @@ from danswer.indexing.models import InferenceChunk
 from danswer.search.models import IndexFilters
 from danswer.search.search_runner import embed_query
 from danswer.search.search_runner import query_processing
-from danswer.search.search_runner import remove_stop_words
+from danswer.search.search_runner import remove_stop_words_and_punctuation
 from danswer.utils.batching import batch_generator
 from danswer.utils.logger import setup_logger
 
@@ -732,7 +732,9 @@ class VespaIndex(DocumentIndex):
         query_embedding = embed_query(query)
 
         query_keywords = (
-            " ".join(remove_stop_words(query)) if edit_keyword_query else query
+            " ".join(remove_stop_words_and_punctuation(query))
+            if edit_keyword_query
+            else query
         )
 
         params: dict[str, str | int] = {
@@ -773,7 +775,9 @@ class VespaIndex(DocumentIndex):
         query_embedding = embed_query(query)
 
         query_keywords = (
-            " ".join(remove_stop_words(query)) if edit_keyword_query else query
+            " ".join(remove_stop_words_and_punctuation(query))
+            if edit_keyword_query
+            else query
         )
 
         params: dict[str, str | int | float] = {
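
For context on where the trimmed keyword string lands: it is sent to Vespa as a query parameter alongside the YQL. The sketch below is a hedged illustration, not the exact request this file builds; the endpoint URL, YQL string, and num_hits parameter are assumptions.

import requests

# Assumed local Vespa query endpoint; the real URL comes from Danswer's config.
SEARCH_ENDPOINT = "http://localhost:8080/search/"


def keyword_search(query_keywords: str, num_hits: int = 10) -> dict:
    # The stopword/punctuation-trimmed string is sent as the "query" param;
    # userInput(@query) in the YQL feeds it to Vespa's text matching.
    params: dict[str, str | int] = {
        "yql": "select * from sources * where userInput(@query)",
        "query": query_keywords,
        "hits": num_hits,
    }
    response = requests.get(SEARCH_ENDPOINT, params=params)
    response.raise_for_status()
    return response.json()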

View File

@@ -4,7 +4,7 @@ from danswer.search.models import QueryFlow
 from danswer.search.models import SearchType
 from danswer.search.search_nlp_models import get_default_tokenizer
 from danswer.search.search_nlp_models import IntentModel
-from danswer.search.search_runner import remove_stop_words
+from danswer.search.search_runner import remove_stop_words_and_punctuation
 from danswer.server.models import HelperResponse
 from danswer.utils.logger import setup_logger
 from danswer.utils.timing import log_function_time
@@ -67,7 +67,7 @@ def recommend_search_flow(
 
     # Heuristics based decisions
     words = query.split()
-    non_stopwords = remove_stop_words(query)
+    non_stopwords = remove_stop_words_and_punctuation(query)
     non_stopword_percent = len(non_stopwords) / len(words)
 
     # UNK tokens -> suggest Keyword (still may be valid QA)
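
A worked example of why punctuation skewed this heuristic: words comes from a plain whitespace split, while non_stopwords comes from NLTK's tokenizer, which emits punctuation as standalone tokens that then survive stopword removal. Minimal sketch, assuming NLTK's punkt and stopwords data are downloaded; the query is illustrative.

import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

query = "What is the deal with pricing?"
words = query.split()  # 6 whitespace-separated "words"

stop_words = set(stopwords.words("english"))
tokens = word_tokenize(query)  # ['What', 'is', 'the', 'deal', 'with', 'pricing', '?']

# Old behavior: '?' survives stopword removal and inflates the numerator.
old = [w for w in tokens if w.casefold() not in stop_words]  # ['deal', 'pricing', '?']
new = [w for w in old if w not in string.punctuation]        # ['deal', 'pricing']

print(len(old) / len(words))  # 0.5
print(len(new) / len(words))  # ~0.33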

View File

@@ -1,3 +1,4 @@
+import string
 from collections.abc import Callable
 from collections.abc import Iterator
 from copy import deepcopy
@@ -55,17 +56,21 @@ def lemmatize_text(text: str) -> list[str]:
     return [lemmatizer.lemmatize(word) for word in word_tokens]
 
 
-def remove_stop_words(text: str) -> list[str]:
+def remove_stop_words_and_punctuation(text: str) -> list[str]:
     stop_words = set(stopwords.words("english"))
     word_tokens = word_tokenize(text)
-    text_trimmed = [word for word in word_tokens if word.casefold() not in stop_words]
+    text_trimmed = [
+        word
+        for word in word_tokens
+        if (word.casefold() not in stop_words and word not in string.punctuation)
+    ]
     return text_trimmed or word_tokens
 
 
 def query_processing(
     query: str,
 ) -> str:
-    query = " ".join(remove_stop_words(query))
+    query = " ".join(remove_stop_words_and_punctuation(query))
     query = " ".join(lemmatize_text(query))
     return query
 
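
For reference, a self-contained restatement of the new helper that also demonstrates the "or word_tokens" fallback: a query consisting entirely of stopwords and punctuation returns the original tokens rather than an empty keyword query. Assumes NLTK's punkt and stopwords data; the example queries are illustrative.

import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


def remove_stop_words_and_punctuation(text: str) -> list[str]:
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    text_trimmed = [
        word
        for word in word_tokens
        if word.casefold() not in stop_words and word not in string.punctuation
    ]
    # Fallback: an all-stopword query keeps its original tokens instead of
    # collapsing to an empty keyword query.
    return text_trimmed or word_tokens


print(remove_stop_words_and_punctuation("Where is the Q3 roadmap?"))  # ['Q3', 'roadmap']
print(remove_stop_words_and_punctuation("What is it?"))               # ['What', 'is', 'it', '?']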

View File

@@ -16,9 +16,20 @@ logger = setup_logger()
 
 
 def wipe_vespa_index() -> None:
-    params = {"selection": "true", "cluster": DOCUMENT_INDEX_NAME}
-    response = requests.delete(DOCUMENT_ID_ENDPOINT, params=params)
-    response.raise_for_status()
+    continuation = None
+    should_continue = True
+    while should_continue:
+        params = {"selection": "true", "cluster": DOCUMENT_INDEX_NAME}
+        if continuation:
+            params = {**params, "continuation": continuation}
+
+        response = requests.delete(DOCUMENT_ID_ENDPOINT, params=params)
+        response.raise_for_status()
+        response_json = response.json()
+        print(response_json)
+
+        continuation = response_json.get("continuation")
+        should_continue = bool(continuation)
 
 
 if __name__ == "__main__":
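
Why the loop matters for larger document counts: a DELETE against Vespa's /document/v1 endpoint with a selection visits the index in slices, and once the request's visit window is exhausted the JSON response carries a continuation token that has to be echoed back to pick up where the previous request stopped. Below is a hedged, self-contained sketch of the same loop; DOCUMENT_INDEX_NAME and DOCUMENT_ID_ENDPOINT are illustrative stand-ins for the values the real script imports from Danswer's config.

import requests

# Assumed values; the real script imports these from Danswer's config.
DOCUMENT_INDEX_NAME = "danswer_index"
DOCUMENT_ID_ENDPOINT = "http://localhost:8080/document/v1/default/danswer_chunk/docid"


def wipe_vespa_index() -> None:
    continuation: str | None = None
    should_continue = True
    while should_continue:
        # selection=true matches every document in the cluster.
        params = {"selection": "true", "cluster": DOCUMENT_INDEX_NAME}
        if continuation:
            params = {**params, "continuation": continuation}

        response = requests.delete(DOCUMENT_ID_ENDPOINT, params=params)
        response.raise_for_status()

        # Vespa keeps returning a "continuation" token while unvisited
        # buckets remain; stop once it is absent.
        continuation = response.json().get("continuation")
        should_continue = bool(continuation)


if __name__ == "__main__":
    wipe_vespa_index()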