Improved indexing (#3594)

* nit

* k

* add steps

* main util functions

* functioning fully

* quick nit

* k

* typing fix

* k

* address comments
This commit is contained in:
pablonyx
2025-01-05 15:31:53 -08:00
committed by GitHub
parent e83542f572
commit ddec239fef
18 changed files with 419 additions and 150 deletions

View File

@@ -17,6 +17,7 @@ from onyx.connectors.models import Document
from onyx.db.engine import get_session_context_manager
from onyx.db.search_settings import get_current_search_settings
from onyx.document_index.vespa.index import VespaIndex
from onyx.indexing.indexing_pipeline import IndexBatchParams
from onyx.indexing.models import ChunkEmbedding
from onyx.indexing.models import DocMetadataAwareIndexChunk
from onyx.indexing.models import IndexChunk
@@ -24,7 +25,6 @@ from onyx.utils.timing import log_function_time
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
from shared_configs.model_server_models import Embedding
TOTAL_DOC_SETS = 8
TOTAL_ACL_ENTRIES_PER_CATEGORY = 80
@@ -68,6 +68,8 @@ def generate_dummy_chunk(
mini_chunk_embeddings=[],
),
title_embedding=generate_random_embedding(embedding_dim),
large_chunk_id=None,
large_chunk_reference_ids=[],
)
document_set_names = []
@@ -103,7 +105,15 @@ def generate_dummy_chunk(
def do_insertion(
vespa_index: VespaIndex, all_chunks: list[DocMetadataAwareIndexChunk]
) -> None:
insertion_records = vespa_index.index(all_chunks)
insertion_records = vespa_index.index(
chunks=all_chunks,
index_batch_params=IndexBatchParams(
doc_id_to_previous_chunk_cnt={},
doc_id_to_new_chunk_cnt={},
tenant_id=POSTGRES_DEFAULT_SCHEMA,
large_chunks_enabled=False,
),
)
print(f"Indexed {len(insertion_records)} documents.")
print(
f"New documents: {sum(1 for record in insertion_records if not record.already_existed)}"