Mirror of https://github.com/danswer-ai/danswer.git, synced 2025-06-30 01:30:45 +02:00
Added permission syncing (#2340)
* Added permission syncing on the backend
* Reworked to work with celery; alembic fix; fixed test
* Frontend changes
* Got groups working
* Added comments and fixed public docs
* Fixed merge issues
* Frontend complete!
* Frontend cleanup and mypy fixes
* Refactored connector access_type selection
* mypy fixes
* Minor refactor and frontend improvements
* get to fetch
* Renames and comments
* Minor change to var names
* Got curator stuff working
* Addressed Pablo's comments
* Refactored user_external_group to reference the users table
* Implemented polling
* Small refactor
* Fixed a whoopsie on the frontend
* Added scripts to seed dummy docs and test query times
* Fixed frontend build issue
* Alembic fix
* Handled is_public overlap
* Yuhong feedback
* Added more checks for sync
* black
* mypy
* Fixed circular import
* TODOs
* Alembic fix
* Alembic
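The two scripts added in this commit exercise the new permission model end to end: the seeder tags every dummy chunk with a DocumentAccess record (user emails, Danswer groups, external user emails, external group ids, and an is_public flag), and the query-time test then filters retrieval with ACL strings of matching shape. A minimal sketch of that pairing, using only constructs that appear in the diff below (the concrete values are illustrative):

# Sketch only: how the seeded document-side permissions line up with the
# query-side ACL filter strings built by _random_filters() in test_query_times.py.
from danswer.access.models import DocumentAccess

access = DocumentAccess(
    user_emails={"user_0@example.com"},
    user_groups={"group_0"},
    external_user_emails={"external_user_0@example.com"},
    external_user_group_ids={"external_group_0"},
    is_public=False,
)

# ACL filter strings of the shape used at query time:
acl_filters = [
    "user_email:user_0@example.com",
    "group:group_0",
    "external_group:external_group_0",
]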
backend/scripts/query_time_check/seed_dummy_docs.py (new file, 166 lines)
@@ -0,0 +1,166 @@
"""
launch:
- api server
- postgres
- vespa
- model server (this is only needed so the api server can startup, no embedding is done)

Run this script to seed the database with dummy documents.
Then run test_query_times.py to test query times.
"""
import random
from datetime import datetime

from danswer.access.models import DocumentAccess
from danswer.configs.constants import DocumentSource
from danswer.connectors.models import Document
from danswer.db.engine import get_session_context_manager
from danswer.db.search_settings import get_current_search_settings
from danswer.document_index.vespa.index import VespaIndex
from danswer.indexing.models import ChunkEmbedding
from danswer.indexing.models import DocMetadataAwareIndexChunk
from danswer.indexing.models import IndexChunk
from danswer.utils.timing import log_function_time
from shared_configs.model_server_models import Embedding


TOTAL_DOC_SETS = 8
TOTAL_ACL_ENTRIES_PER_CATEGORY = 80


def generate_random_embedding(dim: int) -> Embedding:
    return [random.uniform(-1, 1) for _ in range(dim)]


def generate_random_identifier() -> str:
    return f"dummy_doc_{random.randint(1, 1000)}"


def generate_dummy_chunk(
    doc_id: str,
    chunk_id: int,
    embedding_dim: int,
    number_of_acl_entries: int,
    number_of_document_sets: int,
) -> DocMetadataAwareIndexChunk:
    document = Document(
        id=doc_id,
        source=DocumentSource.GOOGLE_DRIVE,
        sections=[],
        metadata={},
        semantic_identifier=generate_random_identifier(),
    )

    chunk = IndexChunk(
        chunk_id=chunk_id,
        blurb=f"Blurb for chunk {chunk_id} of document {doc_id}.",
        content=f"Content for chunk {chunk_id} of document {doc_id}. This is dummy text for testing purposes.",
        source_links={},
        section_continuation=False,
        source_document=document,
        title_prefix=f"Title prefix for doc {doc_id}",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        mini_chunk_texts=None,
        embeddings=ChunkEmbedding(
            full_embedding=generate_random_embedding(embedding_dim),
            mini_chunk_embeddings=[],
        ),
        title_embedding=generate_random_embedding(embedding_dim),
    )

    document_set_names = []
    for i in range(number_of_document_sets):
        document_set_names.append(f"Document Set {i}")

    user_emails: set[str | None] = set()
    user_groups: set[str] = set()
    external_user_emails: set[str] = set()
    external_user_group_ids: set[str] = set()
    for i in range(number_of_acl_entries):
        user_emails.add(f"user_{i}@example.com")
        user_groups.add(f"group_{i}")
        external_user_emails.add(f"external_user_{i}@example.com")
        external_user_group_ids.add(f"external_group_{i}")

    return DocMetadataAwareIndexChunk.from_index_chunk(
        index_chunk=chunk,
        access=DocumentAccess(
            user_emails=user_emails,
            user_groups=user_groups,
            external_user_emails=external_user_emails,
            external_user_group_ids=external_user_group_ids,
            is_public=random.choice([True, False]),
        ),
        document_sets={document_set for document_set in document_set_names},
        boost=random.randint(-1, 1),
    )


@log_function_time()
def do_insertion(
    vespa_index: VespaIndex, all_chunks: list[DocMetadataAwareIndexChunk]
) -> None:
    insertion_records = vespa_index.index(all_chunks)
    print(f"Indexed {len(insertion_records)} documents.")
    print(
        f"New documents: {sum(1 for record in insertion_records if not record.already_existed)}"
    )
    print(
        f"Existing documents updated: {sum(1 for record in insertion_records if record.already_existed)}"
    )


@log_function_time()
def seed_dummy_docs(
    number_of_document_sets: int,
    number_of_acl_entries: int,
    num_docs: int = 1000,
    chunks_per_doc: int = 5,
    batch_size: int = 100,
) -> None:
    with get_session_context_manager() as db_session:
        search_settings = get_current_search_settings(db_session)
        index_name = search_settings.index_name
        embedding_dim = search_settings.model_dim

    vespa_index = VespaIndex(index_name=index_name, secondary_index_name=None)
    print(index_name)

    all_chunks = []
    chunk_count = 0
    for doc_num in range(num_docs):
        doc_id = f"dummy_doc_{doc_num}_{datetime.now().isoformat()}"
        for chunk_num in range(chunks_per_doc):
            chunk = generate_dummy_chunk(
                doc_id=doc_id,
                chunk_id=chunk_num,
                embedding_dim=embedding_dim,
                number_of_acl_entries=number_of_acl_entries,
                number_of_document_sets=number_of_document_sets,
            )
            all_chunks.append(chunk)
            chunk_count += 1

        if len(all_chunks) >= chunks_per_doc * batch_size:
            do_insertion(vespa_index, all_chunks)
            print(
                f"Indexed {chunk_count} chunks out of {num_docs * chunks_per_doc}."
            )
            print(
                f"percentage: {chunk_count / (num_docs * chunks_per_doc) * 100:.2f}% \n"
            )
            all_chunks = []

    if all_chunks:
        do_insertion(vespa_index, all_chunks)


if __name__ == "__main__":
    seed_dummy_docs(
        number_of_document_sets=TOTAL_DOC_SETS,
        number_of_acl_entries=TOTAL_ACL_ENTRIES_PER_CATEGORY,
        num_docs=100000,
        chunks_per_doc=5,
        batch_size=1000,
    )
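For a quicker local run than the 100,000-document __main__ invocation above, the seeding function can also be called directly with its smaller defaults; a hedged example follows (parameter values are illustrative, and the launch prerequisites from the module docstring still apply):

# Illustrative smaller seeding run; assumes the api server, postgres, vespa,
# and model server listed in the docstring are already up.
from scripts.query_time_check.seed_dummy_docs import (
    TOTAL_ACL_ENTRIES_PER_CATEGORY,
    TOTAL_DOC_SETS,
    seed_dummy_docs,
)

seed_dummy_docs(
    number_of_document_sets=TOTAL_DOC_SETS,
    number_of_acl_entries=TOTAL_ACL_ENTRIES_PER_CATEGORY,
    num_docs=1000,
    chunks_per_doc=5,
    batch_size=100,
)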
backend/scripts/query_time_check/test_query_times.py (new file, 122 lines)
@@ -0,0 +1,122 @@
"""
RUN THIS AFTER SEED_DUMMY_DOCS.PY
"""
import random
import time

from danswer.configs.constants import DocumentSource
from danswer.configs.model_configs import DOC_EMBEDDING_DIM
from danswer.db.engine import get_session_context_manager
from danswer.db.search_settings import get_current_search_settings
from danswer.document_index.vespa.index import VespaIndex
from danswer.search.models import IndexFilters
from scripts.query_time_check.seed_dummy_docs import TOTAL_ACL_ENTRIES_PER_CATEGORY
from scripts.query_time_check.seed_dummy_docs import TOTAL_DOC_SETS
from shared_configs.model_server_models import Embedding

# make sure these are smaller than TOTAL_ACL_ENTRIES_PER_CATEGORY and TOTAL_DOC_SETS, respectively
NUMBER_OF_ACL_ENTRIES_PER_QUERY = 6
NUMBER_OF_DOC_SETS_PER_QUERY = 2


def get_slowest_99th_percentile(results: list[float]) -> float:
    return sorted(results)[int(0.99 * len(results))]


# Generate random filters
def _random_filters() -> IndexFilters:
    """
    Generate random filters for the query containing:
    - one random user email
    - NUMBER_OF_ACL_ENTRIES_PER_QUERY groups
    - NUMBER_OF_ACL_ENTRIES_PER_QUERY external groups
    - NUMBER_OF_DOC_SETS_PER_QUERY document sets
    """
    access_control_list = [
        f"user_email:user_{random.randint(0, TOTAL_ACL_ENTRIES_PER_CATEGORY - 1)}@example.com",
    ]
    acl_indices = random.sample(
        range(TOTAL_ACL_ENTRIES_PER_CATEGORY), NUMBER_OF_ACL_ENTRIES_PER_QUERY
    )
    for acl_index in acl_indices:
        access_control_list.append(f"group:group_{acl_index}")
        access_control_list.append(f"external_group:external_group_{acl_index}")

    doc_sets = []
    doc_set_indices = random.sample(
        range(TOTAL_DOC_SETS), NUMBER_OF_DOC_SETS_PER_QUERY
    )
    for doc_set_index in doc_set_indices:
        doc_sets.append(f"document_set:Document Set {doc_set_index}")

    return IndexFilters(
        source_type=[DocumentSource.GOOGLE_DRIVE],
        document_set=doc_sets,
        tags=[],
        access_control_list=access_control_list,
    )


def test_hybrid_retrieval_times(
    number_of_queries: int,
) -> None:
    with get_session_context_manager() as db_session:
        search_settings = get_current_search_settings(db_session)
        index_name = search_settings.index_name

    vespa_index = VespaIndex(index_name=index_name, secondary_index_name=None)

    # Generate random queries
    queries = [f"Random Query {i}" for i in range(number_of_queries)]

    # Generate random embeddings
    embeddings = [
        Embedding([random.random() for _ in range(DOC_EMBEDDING_DIM)])
        for _ in range(number_of_queries)
    ]

    total_time = 0.0
    results = []
    for i in range(number_of_queries):
        start_time = time.time()

        vespa_index.hybrid_retrieval(
            query=queries[i],
            query_embedding=embeddings[i],
            final_keywords=None,
            filters=_random_filters(),
            hybrid_alpha=0.5,
            time_decay_multiplier=1.0,
            num_to_retrieve=50,
            offset=0,
            title_content_ratio=0.5,
        )

        end_time = time.time()
        query_time = end_time - start_time
        total_time += query_time
        results.append(query_time)

        print(f"Query {i+1}: {query_time:.4f} seconds")

    avg_time = total_time / number_of_queries
    fast_time = min(results)
    slow_time = max(results)
    ninety_ninth_percentile = get_slowest_99th_percentile(results)
    # Write results to a file
    _OUTPUT_PATH = "query_times_results_large_more.txt"
    with open(_OUTPUT_PATH, "w") as f:
        f.write(f"Average query time: {avg_time:.4f} seconds\n")
        f.write(f"Fastest query: {fast_time:.4f} seconds\n")
        f.write(f"Slowest query: {slow_time:.4f} seconds\n")
        f.write(f"99th percentile: {ninety_ninth_percentile:.4f} seconds\n")
    print(f"Results written to {_OUTPUT_PATH}")

    print(f"\nAverage query time: {avg_time:.4f} seconds")
    print(f"Fastest query: {fast_time:.4f} seconds")
    print(f"Slowest query: {slow_time:.4f} seconds")
    print(f"99th percentile: {ninety_ninth_percentile:.4f} seconds")


if __name__ == "__main__":
    test_hybrid_retrieval_times(number_of_queries=1000)
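get_slowest_99th_percentile simply indexes the sorted timings at int(0.99 * n), so the default 1000-query run reports the 991st-slowest sample. A smaller sanity run is just a matter of passing a different count; an illustrative invocation (the output file name is hard-coded above):

# Illustrative smaller benchmark run against the already-seeded index.
from scripts.query_time_check.test_query_times import test_hybrid_retrieval_times

test_hybrid_retrieval_times(number_of_queries=100)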