Mirror of https://github.com/danswer-ai/danswer.git, synced 2025-06-30 01:30:45 +02:00
Added permission syncing (#2340)
* Added permission syncing on the backend
* Reworked to work with celery; alembic fix; fixed test
* Frontend changes
* Got groups working
* Added comments and fixed public docs
* Fixed merge issues
* Frontend complete!
* Frontend cleanup and mypy fixes
* Refactored connector access_type selection
* mypy fixes
* Minor refactor and frontend improvements
* get to fetch
* Renames and comments
* Minor change to var names
* Got curator stuff working
* Addressed Pablo's comments
* Refactored user_external_group to reference the users table
* Implemented polling
* Small refactor
* Fixed a whoopsie on the frontend
* Added scripts to seed dummy docs and test query times
* Fixed frontend build issue
* Alembic fix
* Handled is_public overlap
* Yuhong feedback
* Added more checks for sync
* black
* mypy
* Fixed circular import
* TODOs
* Alembic fix
* Alembic
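The two scripts added in this commit exercise the new permission model end to end: the seeder tags every dummy chunk with a DocumentAccess record (user emails, Danswer groups, external user emails, external group ids, and an is_public flag), and the query-time test then filters retrieval with ACL strings of matching shape. A minimal sketch of that pairing, using only constructs that appear in the diff below (the concrete values are illustrative):

# Sketch only: how the seeded document-side permissions line up with the
# query-side ACL filter strings built by _random_filters() in test_query_times.py.
from danswer.access.models import DocumentAccess

access = DocumentAccess(
    user_emails={"user_0@example.com"},
    user_groups={"group_0"},
    external_user_emails={"external_user_0@example.com"},
    external_user_group_ids={"external_group_0"},
    is_public=False,
)

# ACL filter strings of the shape used at query time:
acl_filters = [
    "user_email:user_0@example.com",
    "group:group_0",
    "external_group:external_group_0",
]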
backend/scripts/query_time_check/seed_dummy_docs.py (new file, 166 lines)
@@ -0,0 +1,166 @@
"""
launch:
- api server
- postgres
- vespa
- model server (this is only needed so the api server can startup, no embedding is done)

Run this script to seed the database with dummy documents.
Then run test_query_times.py to test query times.
"""
import random
from datetime import datetime

from danswer.access.models import DocumentAccess
from danswer.configs.constants import DocumentSource
from danswer.connectors.models import Document
from danswer.db.engine import get_session_context_manager
from danswer.db.search_settings import get_current_search_settings
from danswer.document_index.vespa.index import VespaIndex
from danswer.indexing.models import ChunkEmbedding
from danswer.indexing.models import DocMetadataAwareIndexChunk
from danswer.indexing.models import IndexChunk
from danswer.utils.timing import log_function_time
from shared_configs.model_server_models import Embedding


TOTAL_DOC_SETS = 8
TOTAL_ACL_ENTRIES_PER_CATEGORY = 80


def generate_random_embedding(dim: int) -> Embedding:
    return [random.uniform(-1, 1) for _ in range(dim)]


def generate_random_identifier() -> str:
    return f"dummy_doc_{random.randint(1, 1000)}"


def generate_dummy_chunk(
    doc_id: str,
    chunk_id: int,
    embedding_dim: int,
    number_of_acl_entries: int,
    number_of_document_sets: int,
) -> DocMetadataAwareIndexChunk:
    document = Document(
        id=doc_id,
        source=DocumentSource.GOOGLE_DRIVE,
        sections=[],
        metadata={},
        semantic_identifier=generate_random_identifier(),
    )

    chunk = IndexChunk(
        chunk_id=chunk_id,
        blurb=f"Blurb for chunk {chunk_id} of document {doc_id}.",
        content=f"Content for chunk {chunk_id} of document {doc_id}. This is dummy text for testing purposes.",
        source_links={},
        section_continuation=False,
        source_document=document,
        title_prefix=f"Title prefix for doc {doc_id}",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        mini_chunk_texts=None,
        embeddings=ChunkEmbedding(
            full_embedding=generate_random_embedding(embedding_dim),
            mini_chunk_embeddings=[],
        ),
        title_embedding=generate_random_embedding(embedding_dim),
    )

    document_set_names = []
    for i in range(number_of_document_sets):
        document_set_names.append(f"Document Set {i}")

    user_emails: set[str | None] = set()
    user_groups: set[str] = set()
    external_user_emails: set[str] = set()
    external_user_group_ids: set[str] = set()
    for i in range(number_of_acl_entries):
        user_emails.add(f"user_{i}@example.com")
        user_groups.add(f"group_{i}")
        external_user_emails.add(f"external_user_{i}@example.com")
        external_user_group_ids.add(f"external_group_{i}")

    return DocMetadataAwareIndexChunk.from_index_chunk(
        index_chunk=chunk,
        access=DocumentAccess(
            user_emails=user_emails,
            user_groups=user_groups,
            external_user_emails=external_user_emails,
            external_user_group_ids=external_user_group_ids,
            is_public=random.choice([True, False]),
        ),
        document_sets={document_set for document_set in document_set_names},
        boost=random.randint(-1, 1),
    )


@log_function_time()
def do_insertion(
    vespa_index: VespaIndex, all_chunks: list[DocMetadataAwareIndexChunk]
) -> None:
    insertion_records = vespa_index.index(all_chunks)
    print(f"Indexed {len(insertion_records)} documents.")
    print(
        f"New documents: {sum(1 for record in insertion_records if not record.already_existed)}"
    )
    print(
        f"Existing documents updated: {sum(1 for record in insertion_records if record.already_existed)}"
    )


@log_function_time()
def seed_dummy_docs(
    number_of_document_sets: int,
    number_of_acl_entries: int,
    num_docs: int = 1000,
    chunks_per_doc: int = 5,
    batch_size: int = 100,
) -> None:
    with get_session_context_manager() as db_session:
        search_settings = get_current_search_settings(db_session)
        index_name = search_settings.index_name
        embedding_dim = search_settings.model_dim

    vespa_index = VespaIndex(index_name=index_name, secondary_index_name=None)
    print(index_name)

    all_chunks = []
    chunk_count = 0
    for doc_num in range(num_docs):
        doc_id = f"dummy_doc_{doc_num}_{datetime.now().isoformat()}"
        for chunk_num in range(chunks_per_doc):
            chunk = generate_dummy_chunk(
                doc_id=doc_id,
                chunk_id=chunk_num,
                embedding_dim=embedding_dim,
                number_of_acl_entries=number_of_acl_entries,
                number_of_document_sets=number_of_document_sets,
            )
            all_chunks.append(chunk)
            chunk_count += 1

        if len(all_chunks) >= chunks_per_doc * batch_size:
            do_insertion(vespa_index, all_chunks)
            print(
                f"Indexed {chunk_count} chunks out of {num_docs * chunks_per_doc}."
            )
            print(
                f"percentage: {chunk_count / (num_docs * chunks_per_doc) * 100:.2f}% \n"
            )
            all_chunks = []

    if all_chunks:
        do_insertion(vespa_index, all_chunks)


if __name__ == "__main__":
    seed_dummy_docs(
        number_of_document_sets=TOTAL_DOC_SETS,
        number_of_acl_entries=TOTAL_ACL_ENTRIES_PER_CATEGORY,
        num_docs=100000,
        chunks_per_doc=5,
        batch_size=1000,
    )
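For a quicker local run than the 100,000-document __main__ invocation above, the seeding function can also be called directly with its smaller defaults; a hedged example follows (parameter values are illustrative, and the launch prerequisites from the module docstring still apply):

# Illustrative smaller seeding run; assumes the api server, postgres, vespa,
# and model server listed in the docstring are already up.
from scripts.query_time_check.seed_dummy_docs import (
    TOTAL_ACL_ENTRIES_PER_CATEGORY,
    TOTAL_DOC_SETS,
    seed_dummy_docs,
)

seed_dummy_docs(
    number_of_document_sets=TOTAL_DOC_SETS,
    number_of_acl_entries=TOTAL_ACL_ENTRIES_PER_CATEGORY,
    num_docs=1000,
    chunks_per_doc=5,
    batch_size=100,
)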
backend/scripts/query_time_check/test_query_times.py (new file, 122 lines)
@@ -0,0 +1,122 @@
"""
RUN THIS AFTER SEED_DUMMY_DOCS.PY
"""
import random
import time

from danswer.configs.constants import DocumentSource
from danswer.configs.model_configs import DOC_EMBEDDING_DIM
from danswer.db.engine import get_session_context_manager
from danswer.db.search_settings import get_current_search_settings
from danswer.document_index.vespa.index import VespaIndex
from danswer.search.models import IndexFilters
from scripts.query_time_check.seed_dummy_docs import TOTAL_ACL_ENTRIES_PER_CATEGORY
from scripts.query_time_check.seed_dummy_docs import TOTAL_DOC_SETS
from shared_configs.model_server_models import Embedding

# make sure these are smaller than TOTAL_ACL_ENTRIES_PER_CATEGORY and TOTAL_DOC_SETS, respectively
NUMBER_OF_ACL_ENTRIES_PER_QUERY = 6
NUMBER_OF_DOC_SETS_PER_QUERY = 2


def get_slowest_99th_percentile(results: list[float]) -> float:
    return sorted(results)[int(0.99 * len(results))]


# Generate random filters
def _random_filters() -> IndexFilters:
    """
    Generate random filters for the query containing:
    - one random user email
    - NUMBER_OF_ACL_ENTRIES_PER_QUERY groups
    - NUMBER_OF_ACL_ENTRIES_PER_QUERY external groups
    - NUMBER_OF_DOC_SETS_PER_QUERY document sets
    """
    access_control_list = [
        f"user_email:user_{random.randint(0, TOTAL_ACL_ENTRIES_PER_CATEGORY - 1)}@example.com",
    ]
    acl_indices = random.sample(
        range(TOTAL_ACL_ENTRIES_PER_CATEGORY), NUMBER_OF_ACL_ENTRIES_PER_QUERY
    )
    for acl_index in acl_indices:
        access_control_list.append(f"group:group_{acl_index}")
        access_control_list.append(f"external_group:external_group_{acl_index}")

    doc_sets = []
    doc_set_indices = random.sample(
        range(TOTAL_DOC_SETS), NUMBER_OF_DOC_SETS_PER_QUERY
    )
    for doc_set_index in doc_set_indices:
        doc_sets.append(f"document_set:Document Set {doc_set_index}")

    return IndexFilters(
        source_type=[DocumentSource.GOOGLE_DRIVE],
        document_set=doc_sets,
        tags=[],
        access_control_list=access_control_list,
    )


def test_hybrid_retrieval_times(
    number_of_queries: int,
) -> None:
    with get_session_context_manager() as db_session:
        search_settings = get_current_search_settings(db_session)
        index_name = search_settings.index_name

    vespa_index = VespaIndex(index_name=index_name, secondary_index_name=None)

    # Generate random queries
    queries = [f"Random Query {i}" for i in range(number_of_queries)]

    # Generate random embeddings
    embeddings = [
        Embedding([random.random() for _ in range(DOC_EMBEDDING_DIM)])
        for _ in range(number_of_queries)
    ]

    total_time = 0.0
    results = []
    for i in range(number_of_queries):
        start_time = time.time()

        vespa_index.hybrid_retrieval(
            query=queries[i],
            query_embedding=embeddings[i],
            final_keywords=None,
            filters=_random_filters(),
            hybrid_alpha=0.5,
            time_decay_multiplier=1.0,
            num_to_retrieve=50,
            offset=0,
            title_content_ratio=0.5,
        )

        end_time = time.time()
        query_time = end_time - start_time
        total_time += query_time
        results.append(query_time)

        print(f"Query {i+1}: {query_time:.4f} seconds")

    avg_time = total_time / number_of_queries
    fast_time = min(results)
    slow_time = max(results)
    ninety_ninth_percentile = get_slowest_99th_percentile(results)
    # Write results to a file
    _OUTPUT_PATH = "query_times_results_large_more.txt"
    with open(_OUTPUT_PATH, "w") as f:
        f.write(f"Average query time: {avg_time:.4f} seconds\n")
        f.write(f"Fastest query: {fast_time:.4f} seconds\n")
        f.write(f"Slowest query: {slow_time:.4f} seconds\n")
        f.write(f"99th percentile: {ninety_ninth_percentile:.4f} seconds\n")
    print(f"Results written to {_OUTPUT_PATH}")

    print(f"\nAverage query time: {avg_time:.4f} seconds")
    print(f"Fastest query: {fast_time:.4f} seconds")
    print(f"Slowest query: {slow_time:.4f} seconds")
    print(f"99th percentile: {ninety_ninth_percentile:.4f} seconds")


if __name__ == "__main__":
    test_hybrid_retrieval_times(number_of_queries=1000)
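get_slowest_99th_percentile simply indexes the sorted timings at int(0.99 * n), so the default 1000-query run reports the 991st-slowest sample. A smaller sanity run is just a matter of passing a different count; an illustrative invocation (the output file name is hard-coded above):

# Illustrative smaller benchmark run against the already-seeded index.
from scripts.query_time_check.test_query_times import test_hybrid_retrieval_times

test_hybrid_retrieval_times(number_of_queries=100)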