Improved indexing (#3594)

* nit * k * add steps * main util functions * functioning fully * quick nit * k * typing fix * k * address comments
2025-07-08 21:50:12 +02:00 · 2025-01-05 15:31:53 -08:00
parent e83542f572
commit ddec239fef
18 changed files with 419 additions and 150 deletions
--- a/backend/scripts/query_time_check/seed_dummy_docs.py
+++ b/backend/scripts/query_time_check/seed_dummy_docs.py
@@ -17,6 +17,7 @@ from onyx.connectors.models import Document
 from onyx.db.engine import get_session_context_manager
 from onyx.db.search_settings import get_current_search_settings
 from onyx.document_index.vespa.index import VespaIndex
+from onyx.indexing.indexing_pipeline import IndexBatchParams
 from onyx.indexing.models import ChunkEmbedding
 from onyx.indexing.models import DocMetadataAwareIndexChunk
 from onyx.indexing.models import IndexChunk
@@ -24,7 +25,6 @@ from onyx.utils.timing import log_function_time
 from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
 from shared_configs.model_server_models import Embedding

-
 TOTAL_DOC_SETS = 8
 TOTAL_ACL_ENTRIES_PER_CATEGORY = 80

@@ -68,6 +68,8 @@ def generate_dummy_chunk(
            mini_chunk_embeddings=[],
        ),
        title_embedding=generate_random_embedding(embedding_dim),
+        large_chunk_id=None,
+        large_chunk_reference_ids=[],
    )

    document_set_names = []
@@ -103,7 +105,15 @@ def generate_dummy_chunk(
 def do_insertion(
    vespa_index: VespaIndex, all_chunks: list[DocMetadataAwareIndexChunk]
 ) -> None:
-    insertion_records = vespa_index.index(all_chunks)
+    insertion_records = vespa_index.index(
+        chunks=all_chunks,
+        index_batch_params=IndexBatchParams(
+            doc_id_to_previous_chunk_cnt={},
+            doc_id_to_new_chunk_cnt={},
+            tenant_id=POSTGRES_DEFAULT_SCHEMA,
+            large_chunks_enabled=False,
+        ),
+    )
    print(f"Indexed {len(insertion_records)} documents.")
    print(
        f"New documents: {sum(1 for record in insertion_records if not record.already_existed)}"