danswer/backend/scripts/query_time_check/test_query_times.py
rkuo-danswer 4fe99d05fd
add timings for syncing (#3798)
* add timings for syncing

* add more logging

* more debugging

* refactor multipass/db check out of VespaIndex

* circular imports?

* more debugging

* add logs

* various improvements

* additional logs to narrow down issue

* use global httpx pool for the main vespa flows in celery. Use in more places eventually.

* cleanup debug logging, etc

* remove debug logging

* this should use the secondary index

* mypy

* missed some logging

* review fixes

* refactor get_default_document_index to use search settings

* more missed logging

* fix circular refs

---------

Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>
Co-authored-by: pablodanswer <pablo@danswer.ai>
2025-01-29 23:24:44 +00:00


"""
RUN THIS AFTER SEED_DUMMY_DOCS.PY
"""
import random
import time

from onyx.configs.constants import DocumentSource
from onyx.configs.model_configs import DOC_EMBEDDING_DIM
from onyx.context.search.models import IndexFilters
from onyx.db.engine import get_session_context_manager
from onyx.db.search_settings import get_current_search_settings
from onyx.document_index.document_index_utils import get_multipass_config
from onyx.document_index.vespa.index import VespaIndex
from scripts.query_time_check.seed_dummy_docs import TOTAL_ACL_ENTRIES_PER_CATEGORY
from scripts.query_time_check.seed_dummy_docs import TOTAL_DOC_SETS
from shared_configs.model_server_models import Embedding

# make sure these are smaller than TOTAL_ACL_ENTRIES_PER_CATEGORY and TOTAL_DOC_SETS, respectively
NUMBER_OF_ACL_ENTRIES_PER_QUERY = 6
NUMBER_OF_DOC_SETS_PER_QUERY = 2
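
# Minimal guard sketch enforcing the comment above: random.sample raises
# ValueError if a sample is larger than its population, so fail fast here
# instead of partway through a run.
assert NUMBER_OF_ACL_ENTRIES_PER_QUERY <= TOTAL_ACL_ENTRIES_PER_CATEGORY
assert NUMBER_OF_DOC_SETS_PER_QUERY <= TOTAL_DOC_SETS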


def get_slowest_99th_percentile(results: list[float]) -> float:
    # sort ascending and index at int(0.99 * n): with 1000 queries this is the
    # 990th entry, i.e. the cutoff into the slowest ~1% of queries
    return sorted(results)[int(0.99 * len(results))]


# Generate random filters
def _random_filters() -> IndexFilters:
    """
    Generate random filters for the query containing:
    - 1 random user email
    - NUMBER_OF_ACL_ENTRIES_PER_QUERY groups
    - NUMBER_OF_ACL_ENTRIES_PER_QUERY external groups
    - NUMBER_OF_DOC_SETS_PER_QUERY document sets

    A sketch of one generated filter is shown after this function.
    """
    access_control_list = [
        f"user_email:user_{random.randint(0, TOTAL_ACL_ENTRIES_PER_CATEGORY - 1)}@example.com",
    ]

    # sampled values are the group indices themselves
    acl_indices = random.sample(
        range(TOTAL_ACL_ENTRIES_PER_CATEGORY), NUMBER_OF_ACL_ENTRIES_PER_QUERY
    )
    for i in acl_indices:
        access_control_list.append(f"group:group_{i}")
        access_control_list.append(f"external_group:external_group_{i}")

    doc_sets = []
    doc_set_indices = random.sample(
        range(TOTAL_DOC_SETS), NUMBER_OF_DOC_SETS_PER_QUERY
    )
    for i in doc_set_indices:
        doc_sets.append(f"document_set:Document Set {i}")

    return IndexFilters(
        source_type=[DocumentSource.GOOGLE_DRIVE],
        document_set=doc_sets,
        tags=[],
        access_control_list=access_control_list,
    )
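
# For illustration only, one draw from _random_filters() produces values shaped
# like the following (indices are random; the numbers below are made up):
#   access_control_list = [
#       "user_email:user_17@example.com",
#       "group:group_3", "external_group:external_group_3",
#       ... 5 more group/external_group pairs ...
#   ]
#   document_set = ["document_set:Document Set 0", "document_set:Document Set 4"]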


def test_hybrid_retrieval_times(
    number_of_queries: int,
) -> None:
    # the session is only needed long enough to read the current search settings
    with get_session_context_manager() as db_session:
        search_settings = get_current_search_settings(db_session)
        multipass_config = get_multipass_config(search_settings)
        index_name = search_settings.index_name

    vespa_index = VespaIndex(
        index_name=index_name,
        secondary_index_name=None,
        large_chunks_enabled=multipass_config.enable_large_chunks,
        secondary_large_chunks_enabled=None,
    )

    # Generate random queries
    queries = [f"Random Query {i}" for i in range(number_of_queries)]

    # Generate random embeddings; only the dimensionality matters for
    # measuring retrieval latency
    embeddings = [
        Embedding([random.random() for _ in range(DOC_EMBEDDING_DIM)])
        for _ in range(number_of_queries)
    ]

    total_time = 0.0
    results = []
    for i in range(number_of_queries):
        start_time = time.time()

        vespa_index.hybrid_retrieval(
            query=queries[i],
            query_embedding=embeddings[i],
            final_keywords=None,
            filters=_random_filters(),
            hybrid_alpha=0.5,
            time_decay_multiplier=1.0,
            num_to_retrieve=50,
            offset=0,
            title_content_ratio=0.5,
        )

        end_time = time.time()
        query_time = end_time - start_time
        total_time += query_time
        results.append(query_time)

        print(f"Query {i + 1}: {query_time:.4f} seconds")

    avg_time = total_time / number_of_queries
    fast_time = min(results)
    slow_time = max(results)
    ninety_ninth_percentile = get_slowest_99th_percentile(results)

    # Write results to a file
    _OUTPUT_PATH = "query_times_results_large_more.txt"
    with open(_OUTPUT_PATH, "w") as f:
        f.write(f"Average query time: {avg_time:.4f} seconds\n")
        f.write(f"Fastest query: {fast_time:.4f} seconds\n")
        f.write(f"Slowest query: {slow_time:.4f} seconds\n")
        f.write(f"99th percentile: {ninety_ninth_percentile:.4f} seconds\n")
    print(f"Results written to {_OUTPUT_PATH}")

    print(f"\nAverage query time: {avg_time:.4f} seconds")
    print(f"Fastest query: {fast_time:.4f} seconds")
    print(f"Slowest query: {slow_time:.4f} seconds")
    print(f"99th percentile: {ninety_ninth_percentile:.4f} seconds")


if __name__ == "__main__":
    test_hybrid_retrieval_times(number_of_queries=1000)
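
# Usage sketch (paths assume the backend/ repo layout above; run with the usual
# app environment so the onyx imports and DB connection resolve):
#   python scripts/query_time_check/seed_dummy_docs.py
#   python scripts/query_time_check/test_query_times.py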