"""
|
|
RUN THIS AFTER SEED_DUMMY_DOCS.PY
|
|
"""

import random
import time

from onyx.configs.constants import DocumentSource
from onyx.configs.model_configs import DOC_EMBEDDING_DIM
from onyx.context.search.models import IndexFilters
from onyx.db.engine import get_session_context_manager
from onyx.db.search_settings import get_current_search_settings
from onyx.document_index.document_index_utils import get_multipass_config
from onyx.document_index.vespa.index import VespaIndex
from scripts.query_time_check.seed_dummy_docs import TOTAL_ACL_ENTRIES_PER_CATEGORY
from scripts.query_time_check.seed_dummy_docs import TOTAL_DOC_SETS
from shared_configs.model_server_models import Embedding

# make sure these are smaller than TOTAL_ACL_ENTRIES_PER_CATEGORY and TOTAL_DOC_SETS, respectively
NUMBER_OF_ACL_ENTRIES_PER_QUERY = 6
NUMBER_OF_DOC_SETS_PER_QUERY = 2
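
# Optional sanity check: random.sample() below raises ValueError if a sample size
# exceeds its population, so verify the constraint from the comment above up front.
assert NUMBER_OF_ACL_ENTRIES_PER_QUERY <= TOTAL_ACL_ENTRIES_PER_CATEGORY
assert NUMBER_OF_DOC_SETS_PER_QUERY <= TOTAL_DOC_SETS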


def get_slowest_99th_percentile(results: list[float]) -> float:
    # 99th-percentile latency by rank: sort ascending and take the value at index
    # int(0.99 * len(results)), so it is only meaningful with a reasonable sample count.
    return sorted(results)[int(0.99 * len(results))]


# Generate random filters
def _random_filters() -> IndexFilters:
    """
    Generate random filters for the query containing:
    - 1 random user email
    - NUMBER_OF_ACL_ENTRIES_PER_QUERY groups
    - NUMBER_OF_ACL_ENTRIES_PER_QUERY external groups
    - NUMBER_OF_DOC_SETS_PER_QUERY document sets
    """
    access_control_list = [
        f"user_email:user_{random.randint(0, TOTAL_ACL_ENTRIES_PER_CATEGORY - 1)}@example.com",
    ]
    acl_indices = random.sample(
        range(TOTAL_ACL_ENTRIES_PER_CATEGORY), NUMBER_OF_ACL_ENTRIES_PER_QUERY
    )
    for i in acl_indices:
        access_control_list.append(f"group:group_{i}")
        access_control_list.append(f"external_group:external_group_{i}")

    doc_sets: list[str] = []
    doc_set_indices = random.sample(
        range(TOTAL_DOC_SETS), NUMBER_OF_DOC_SETS_PER_QUERY
    )
    for i in doc_set_indices:
        doc_sets.append(f"document_set:Document Set {i}")

    return IndexFilters(
        source_type=[DocumentSource.GOOGLE_DRIVE],
        document_set=doc_sets,
        tags=[],
        access_control_list=access_control_list,
    )
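
# Example of the kind of filters _random_filters() produces (illustrative values only):
#   access_control_list = ["user_email:user_12@example.com",
#                          "group:group_7", "external_group:external_group_7", ...]
#   document_set = ["document_set:Document Set 3", "document_set:Document Set 9"]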


def test_hybrid_retrieval_times(
    number_of_queries: int,
) -> None:
    with get_session_context_manager() as db_session:
        search_settings = get_current_search_settings(db_session)
        multipass_config = get_multipass_config(search_settings)
        index_name = search_settings.index_name

    vespa_index = VespaIndex(
        index_name=index_name,
        secondary_index_name=None,
        large_chunks_enabled=multipass_config.enable_large_chunks,
        secondary_large_chunks_enabled=None,
    )
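    # secondary_index_name and secondary_large_chunks_enabled are left unset here; the
    # assumption (from the argument names, not verified against VespaIndex) is that
    # this benchmark only needs to query the current primary index.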

    # Generate random queries
    queries = [f"Random Query {i}" for i in range(number_of_queries)]

    # Generate random embeddings
    embeddings = [
        Embedding([random.random() for _ in range(DOC_EMBEDDING_DIM)])
        for _ in range(number_of_queries)
    ]
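
    # Random query vectors are fine here: the benchmark measures retrieval latency,
    # not result quality, so the embeddings only need the right dimensionality
    # (DOC_EMBEDDING_DIM), not semantic meaning.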

    total_time = 0.0
    results: list[float] = []
    for i in range(number_of_queries):
        start_time = time.time()

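        # Time a single hybrid query. The 0.5 values for hybrid_alpha and
        # title_content_ratio presumably weight keyword vs. vector scores and title
        # vs. content evenly (assumed from the parameter names, not verified here).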
        vespa_index.hybrid_retrieval(
            query=queries[i],
            query_embedding=embeddings[i],
            final_keywords=None,
            filters=_random_filters(),
            hybrid_alpha=0.5,
            time_decay_multiplier=1.0,
            num_to_retrieve=50,
            offset=0,
            title_content_ratio=0.5,
        )

        end_time = time.time()
        query_time = end_time - start_time
        total_time += query_time
        results.append(query_time)

        print(f"Query {i + 1}: {query_time:.4f} seconds")

    avg_time = total_time / number_of_queries
    fast_time = min(results)
    slow_time = max(results)
    ninety_ninth_percentile = get_slowest_99th_percentile(results)

    # Write results to a file
    _OUTPUT_PATH = "query_times_results_large_more.txt"
    with open(_OUTPUT_PATH, "w") as f:
        f.write(f"Average query time: {avg_time:.4f} seconds\n")
        f.write(f"Fastest query: {fast_time:.4f} seconds\n")
        f.write(f"Slowest query: {slow_time:.4f} seconds\n")
        f.write(f"99th percentile: {ninety_ninth_percentile:.4f} seconds\n")
    print(f"Results written to {_OUTPUT_PATH}")

    print(f"\nAverage query time: {avg_time:.4f} seconds")
    print(f"Fastest query: {fast_time:.4f} seconds")
    print(f"Slowest query: {slow_time:.4f} seconds")
    print(f"99th percentile: {ninety_ninth_percentile:.4f} seconds")


if __name__ == "__main__":
    test_hybrid_retrieval_times(number_of_queries=1000)