From 0aa9e8968af71cee8a81f725a9f87e6a05cdd817 Mon Sep 17 00:00:00 2001 From: joachim-danswer Date: Wed, 12 Mar 2025 16:14:09 -0700 Subject: [PATCH] RK comments --- backend/onyx/db/chunk.py | 18 ++++++++---------- backend/onyx/indexing/indexing_pipeline.py | 12 +++++++----- backend/onyx/indexing/models.py | 6 ++++++ 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/backend/onyx/db/chunk.py b/backend/onyx/db/chunk.py index e48c22415..4eed6430c 100644 --- a/backend/onyx/db/chunk.py +++ b/backend/onyx/db/chunk.py @@ -5,10 +5,11 @@ from sqlalchemy import delete from sqlalchemy.orm import Session from onyx.db.models import ChunkStats +from onyx.indexing.models import UpdatableChunkData def update_chunk_boost_components__no_commit( - chunk_data: list[dict], + chunk_data: list[UpdatableChunkData], db_session: Session, ) -> None: """Updates the chunk_boost_components for chunks in the database. @@ -21,23 +22,20 @@ def update_chunk_boost_components__no_commit( return for data in chunk_data: - chunk_in_doc_id = int(data.get("chunk_id", -1)) + chunk_in_doc_id = int(data.chunk_id) if chunk_in_doc_id < 0: raise ValueError(f"Chunk ID is empty for chunk {data}") + + chunk_document_id = f"{data.document_id}" f"__{chunk_in_doc_id}" chunk_stats = ( db_session.query(ChunkStats) .filter( - ChunkStats.document_id == data["document_id"], - ChunkStats.chunk_in_doc_id == chunk_in_doc_id, + ChunkStats.id == chunk_document_id, ) .first() ) - # skip chunks without boost score - if data.get("boost_score") is None: - continue - - score = data["boost_score"] + score = data.boost_score if chunk_stats: chunk_stats.information_content_boost = score @@ -49,7 +47,7 @@ def update_chunk_boost_components__no_commit( continue # Create new record chunk_stats = ChunkStats( - document_id=data["document_id"], + document_id=data.document_id, chunk_in_doc_id=chunk_in_doc_id, information_content_boost=score, ) diff --git a/backend/onyx/indexing/indexing_pipeline.py b/backend/onyx/indexing/indexing_pipeline.py index f6891b264..e166abf43 100644 --- a/backend/onyx/indexing/indexing_pipeline.py +++ b/backend/onyx/indexing/indexing_pipeline.py @@ -55,6 +55,7 @@ from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface from onyx.indexing.models import DocAwareChunk from onyx.indexing.models import DocMetadataAwareIndexChunk from onyx.indexing.models import IndexChunk +from onyx.indexing.models import UpdatableChunkData from onyx.indexing.vector_db_insertion import write_chunks_to_vector_db_with_backoff from onyx.natural_language_processing.search_nlp_models import ( InformationContentClassificationModel, @@ -66,6 +67,7 @@ from shared_configs.configs import ( INDEXING_INFORMATION_CONTENT_CLASSIFICATION_CUTOFF_LENGTH, ) + logger = setup_logger() @@ -614,11 +616,11 @@ def index_doc_batch( updatable_ids = [doc.id for doc in ctx.updatable_docs] updatable_chunk_data = [ - { - "chunk_id": chunk.chunk_id, - "document_id": chunk.source_document.id, - "boost_score": score, - } + UpdatableChunkData( + chunk_id=chunk.chunk_id, + document_id=chunk.source_document.id, + boost_score=score, + ) for chunk, score in zip(chunks_with_embeddings, chunk_content_scores) ] diff --git a/backend/onyx/indexing/models.py b/backend/onyx/indexing/models.py index feed2352e..686ac2942 100644 --- a/backend/onyx/indexing/models.py +++ b/backend/onyx/indexing/models.py @@ -184,3 +184,9 @@ class IndexingSetting(EmbeddingModelDetail): class MultipassConfig(BaseModel): multipass_indexing: bool enable_large_chunks: bool + + +class UpdatableChunkData(BaseModel): + chunk_id: int + document_id: str + boost_score: float