Remove AI Thoughts by default (#783)

- Removes AI Thoughts by default; they are shown only when validation fails
- Removes punctuation "words" from queries in addition to stopwords (Vespa ignores punctuation anyway)
- Fixes the Vespa deletion script for larger document counts
Chris Weaver committed on 2023-11-29 01:00:53 -08:00 (committed by GitHub)
parent fcb7f6fcc0, commit 37daf4f3e4
10 changed files with 92 additions and 106 deletions

View File

@@ -57,7 +57,7 @@ from danswer.indexing.models import InferenceChunk
 from danswer.search.models import IndexFilters
 from danswer.search.search_runner import embed_query
 from danswer.search.search_runner import query_processing
-from danswer.search.search_runner import remove_stop_words
+from danswer.search.search_runner import remove_stop_words_and_punctuation
 from danswer.utils.batching import batch_generator
 from danswer.utils.logger import setup_logger
 
@@ -732,7 +732,9 @@ class VespaIndex(DocumentIndex):
         query_embedding = embed_query(query)
 
         query_keywords = (
-            " ".join(remove_stop_words(query)) if edit_keyword_query else query
+            " ".join(remove_stop_words_and_punctuation(query))
+            if edit_keyword_query
+            else query
         )
 
         params: dict[str, str | int] = {
@@ -773,7 +775,9 @@ class VespaIndex(DocumentIndex):
         query_embedding = embed_query(query)
 
         query_keywords = (
-            " ".join(remove_stop_words(query)) if edit_keyword_query else query
+            " ".join(remove_stop_words_and_punctuation(query))
+            if edit_keyword_query
+            else query
         )
 
         params: dict[str, str | int | float] = {
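
For context on where the trimmed keyword string lands: it is sent to Vespa as a query parameter alongside the YQL. The sketch below is a hedged illustration, not the exact request this file builds; the endpoint URL, YQL string, and num_hits parameter are assumptions.

import requests

# Assumed local Vespa query endpoint; the real URL comes from Danswer's config.
SEARCH_ENDPOINT = "http://localhost:8080/search/"


def keyword_search(query_keywords: str, num_hits: int = 10) -> dict:
    # The stopword/punctuation-trimmed string is sent as the "query" param;
    # userInput(@query) in the YQL feeds it to Vespa's text matching.
    params: dict[str, str | int] = {
        "yql": "select * from sources * where userInput(@query)",
        "query": query_keywords,
        "hits": num_hits,
    }
    response = requests.get(SEARCH_ENDPOINT, params=params)
    response.raise_for_status()
    return response.json()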

View File

@@ -4,7 +4,7 @@ from danswer.search.models import QueryFlow
 from danswer.search.models import SearchType
 from danswer.search.search_nlp_models import get_default_tokenizer
 from danswer.search.search_nlp_models import IntentModel
-from danswer.search.search_runner import remove_stop_words
+from danswer.search.search_runner import remove_stop_words_and_punctuation
 from danswer.server.models import HelperResponse
 from danswer.utils.logger import setup_logger
 from danswer.utils.timing import log_function_time
@@ -67,7 +67,7 @@ def recommend_search_flow(
 
     # Heuristics based decisions
     words = query.split()
-    non_stopwords = remove_stop_words(query)
+    non_stopwords = remove_stop_words_and_punctuation(query)
     non_stopword_percent = len(non_stopwords) / len(words)
 
     # UNK tokens -> suggest Keyword (still may be valid QA)
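
A worked example of why punctuation skewed this heuristic: words comes from a plain whitespace split, while non_stopwords comes from NLTK's tokenizer, which emits punctuation as standalone tokens that then survive stopword removal. Minimal sketch, assuming NLTK's punkt and stopwords data are downloaded; the query is illustrative.

import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

query = "What is the deal with pricing?"
words = query.split()  # 6 whitespace-separated "words"

stop_words = set(stopwords.words("english"))
tokens = word_tokenize(query)  # ['What', 'is', 'the', 'deal', 'with', 'pricing', '?']

# Old behavior: '?' survives stopword removal and inflates the numerator.
old = [w for w in tokens if w.casefold() not in stop_words]  # ['deal', 'pricing', '?']
new = [w for w in old if w not in string.punctuation]        # ['deal', 'pricing']

print(len(old) / len(words))  # 0.5
print(len(new) / len(words))  # ~0.33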

View File

@@ -1,3 +1,4 @@
+import string
 from collections.abc import Callable
 from collections.abc import Iterator
 from copy import deepcopy
@@ -55,17 +56,21 @@ def lemmatize_text(text: str) -> list[str]:
     return [lemmatizer.lemmatize(word) for word in word_tokens]
 
 
-def remove_stop_words(text: str) -> list[str]:
+def remove_stop_words_and_punctuation(text: str) -> list[str]:
     stop_words = set(stopwords.words("english"))
     word_tokens = word_tokenize(text)
-    text_trimmed = [word for word in word_tokens if word.casefold() not in stop_words]
+    text_trimmed = [
+        word
+        for word in word_tokens
+        if (word.casefold() not in stop_words and word not in string.punctuation)
+    ]
     return text_trimmed or word_tokens
 
 
 def query_processing(
     query: str,
 ) -> str:
-    query = " ".join(remove_stop_words(query))
+    query = " ".join(remove_stop_words_and_punctuation(query))
     query = " ".join(lemmatize_text(query))
     return query
 
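
For reference, a self-contained restatement of the new helper that also demonstrates the "or word_tokens" fallback: a query consisting entirely of stopwords and punctuation returns the original tokens rather than an empty keyword query. Assumes NLTK's punkt and stopwords data; the example queries are illustrative.

import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


def remove_stop_words_and_punctuation(text: str) -> list[str]:
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    text_trimmed = [
        word
        for word in word_tokens
        if word.casefold() not in stop_words and word not in string.punctuation
    ]
    # Fallback: an all-stopword query keeps its original tokens instead of
    # collapsing to an empty keyword query.
    return text_trimmed or word_tokens


print(remove_stop_words_and_punctuation("Where is the Q3 roadmap?"))  # ['Q3', 'roadmap']
print(remove_stop_words_and_punctuation("What is it?"))               # ['What', 'is', 'it', '?']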

View File

@@ -16,9 +16,20 @@ logger = setup_logger()
 
 
 def wipe_vespa_index() -> None:
-    params = {"selection": "true", "cluster": DOCUMENT_INDEX_NAME}
-    response = requests.delete(DOCUMENT_ID_ENDPOINT, params=params)
-    response.raise_for_status()
+    continuation = None
+    should_continue = True
+    while should_continue:
+        params = {"selection": "true", "cluster": DOCUMENT_INDEX_NAME}
+        if continuation:
+            params = {**params, "continuation": continuation}
+
+        response = requests.delete(DOCUMENT_ID_ENDPOINT, params=params)
+        response.raise_for_status()
+        response_json = response.json()
+        print(response_json)
+
+        continuation = response_json.get("continuation")
+        should_continue = bool(continuation)
 
 
 if __name__ == "__main__":
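
Why the loop matters for larger document counts: a DELETE against Vespa's /document/v1 endpoint with a selection visits the index in slices, and once the request's visit window is exhausted the JSON response carries a continuation token that has to be echoed back to pick up where the previous request stopped. Below is a hedged, self-contained sketch of the same loop; DOCUMENT_INDEX_NAME and DOCUMENT_ID_ENDPOINT are illustrative stand-ins for the values the real script imports from Danswer's config.

import requests

# Assumed values; the real script imports these from Danswer's config.
DOCUMENT_INDEX_NAME = "danswer_index"
DOCUMENT_ID_ENDPOINT = "http://localhost:8080/document/v1/default/danswer_chunk/docid"


def wipe_vespa_index() -> None:
    continuation: str | None = None
    should_continue = True
    while should_continue:
        # selection=true matches every document in the cluster.
        params = {"selection": "true", "cluster": DOCUMENT_INDEX_NAME}
        if continuation:
            params = {**params, "continuation": continuation}

        response = requests.delete(DOCUMENT_ID_ENDPOINT, params=params)
        response.raise_for_status()

        # Vespa keeps returning a "continuation" token while unvisited
        # buckets remain; stop once it is absent.
        continuation = response.json().get("continuation")
        should_continue = bool(continuation)


if __name__ == "__main__":
    wipe_vespa_index()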