mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-07-08 21:50:12 +02:00
Improved indexing (#3594)
* nit * k * add steps * main util functions * functioning fully * quick nit * k * typing fix * k * address comments
This commit is contained in:
@@ -17,6 +17,7 @@ from onyx.connectors.models import Document
|
||||
from onyx.db.engine import get_session_context_manager
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.document_index.vespa.index import VespaIndex
|
||||
from onyx.indexing.indexing_pipeline import IndexBatchParams
|
||||
from onyx.indexing.models import ChunkEmbedding
|
||||
from onyx.indexing.models import DocMetadataAwareIndexChunk
|
||||
from onyx.indexing.models import IndexChunk
|
||||
@@ -24,7 +25,6 @@ from onyx.utils.timing import log_function_time
|
||||
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
|
||||
from shared_configs.model_server_models import Embedding
|
||||
|
||||
|
||||
TOTAL_DOC_SETS = 8
|
||||
TOTAL_ACL_ENTRIES_PER_CATEGORY = 80
|
||||
|
||||
@@ -68,6 +68,8 @@ def generate_dummy_chunk(
|
||||
mini_chunk_embeddings=[],
|
||||
),
|
||||
title_embedding=generate_random_embedding(embedding_dim),
|
||||
large_chunk_id=None,
|
||||
large_chunk_reference_ids=[],
|
||||
)
|
||||
|
||||
document_set_names = []
|
||||
@@ -103,7 +105,15 @@ def generate_dummy_chunk(
|
||||
def do_insertion(
|
||||
vespa_index: VespaIndex, all_chunks: list[DocMetadataAwareIndexChunk]
|
||||
) -> None:
|
||||
insertion_records = vespa_index.index(all_chunks)
|
||||
insertion_records = vespa_index.index(
|
||||
chunks=all_chunks,
|
||||
index_batch_params=IndexBatchParams(
|
||||
doc_id_to_previous_chunk_cnt={},
|
||||
doc_id_to_new_chunk_cnt={},
|
||||
tenant_id=POSTGRES_DEFAULT_SCHEMA,
|
||||
large_chunks_enabled=False,
|
||||
),
|
||||
)
|
||||
print(f"Indexed {len(insertion_records)} documents.")
|
||||
print(
|
||||
f"New documents: {sum(1 for record in insertion_records if not record.already_existed)}"
|
||||
|
Reference in New Issue
Block a user