Improved indexing (#3594)

* nit

* k

* add steps

* main util functions

* functioning fully

* quick nit

* k

* typing fix

* k

* address comments
This commit is contained in:
pablonyx
2025-01-05 15:31:53 -08:00
committed by GitHub
parent e83542f572
commit ddec239fef
18 changed files with 419 additions and 150 deletions

View File

@@ -25,6 +25,7 @@ from onyx.db.enums import ConnectorCredentialPairStatus
from onyx.db.index_attempt import mock_successful_index_attempt
from onyx.db.search_settings import get_current_search_settings
from onyx.document_index.factory import get_default_document_index
+from onyx.document_index.interfaces import IndexBatchParams
from onyx.indexing.indexing_pipeline import index_doc_batch_prepare
from onyx.indexing.models import ChunkEmbedding
from onyx.indexing.models import DocMetadataAwareIndexChunk
@@ -86,6 +87,7 @@ def _create_indexable_chunks(
access=default_public_access,
document_sets=set(),
boost=DEFAULT_BOOST,
+large_chunk_id=None,
)
chunks.append(chunk)
@@ -217,7 +219,15 @@ def seed_initial_documents(
# as we just sent over the Vespa schema and there is a slight delay
index_with_retries = retry_builder(tries=15)(document_index.index)
-index_with_retries(chunks=chunks, fresh_index=cohere_enabled)
+index_with_retries(
+chunks=chunks,
+index_batch_params=IndexBatchParams(
+doc_id_to_previous_chunk_cnt={},
+doc_id_to_new_chunk_cnt={},
+large_chunks_enabled=False,
+tenant_id=tenant_id,
+),
+)
# Mock a run for the UI even though it did not actually call out to anything
mock_successful_index_attempt(