From 2f770264c8830fb85ffb492479093456215ddd37 Mon Sep 17 00:00:00 2001 From: Weves Date: Tue, 31 Oct 2023 17:51:42 -0700 Subject: [PATCH] Add randomly generated sentences --- backend/scripts/benchmark_search_isolated.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/backend/scripts/benchmark_search_isolated.py b/backend/scripts/benchmark_search_isolated.py index 9c12cc946..80cacff36 100644 --- a/backend/scripts/benchmark_search_isolated.py +++ b/backend/scripts/benchmark_search_isolated.py @@ -2,12 +2,18 @@ import os import random import time +import nltk + from danswer.configs.app_configs import DOC_TIME_DECAY from danswer.document_index.vespa.index import _query_vespa from danswer.document_index.vespa.index import CONTENT_SUMMARY from danswer.document_index.vespa.index import VespaIndex from danswer.search.search_runner import embed_query +# Download the wordlist +nltk.download("words") +from nltk.corpus import words # noqa: E402 + question_bank = [ "Who was the first president of the United States?", "What is photosynthesis?", @@ -115,6 +121,13 @@ additional_questions = [ ] +def generate_random_sentence(): + word_list = words.words() + sentence_length = random.randint(5, 10) + sentence = " ".join(random.choices(word_list, k=sentence_length)) + return sentence + + def _measure_vespa_latency(filters: dict = {}): yql = ( VespaIndex.yql_base @@ -122,7 +135,7 @@ def _measure_vespa_latency(filters: dict = {}): + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))' ) # yql = VespaIndex.yql_base + '({grammar: "weakAnd"}userInput(@query))' - query = random.choice(question_bank) + query = generate_random_sentence() query_embedding = embed_query(query) num_to_retrieve = 50 params: dict[str, str | int] = {