From 6b84332f1b191d1d5442a4473bfa6b296f6f0303 Mon Sep 17 00:00:00 2001 From: joachim-danswer Date: Mon, 10 Mar 2025 22:11:16 -0700 Subject: [PATCH] saving and updating chunk stats --- .../background/celery/tasks/shared/tasks.py | 6 ----- backend/onyx/db/chunk.py | 22 ++++++++++++------- backend/onyx/db/document.py | 5 +++++ backend/onyx/indexing/indexing_pipeline.py | 2 +- 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/backend/onyx/background/celery/tasks/shared/tasks.py b/backend/onyx/background/celery/tasks/shared/tasks.py index c8708e693..36cb88c3c 100644 --- a/backend/onyx/background/celery/tasks/shared/tasks.py +++ b/backend/onyx/background/celery/tasks/shared/tasks.py @@ -19,7 +19,6 @@ from onyx.configs.constants import ONYX_CLOUD_TENANT_ID from onyx.configs.constants import OnyxCeleryPriority from onyx.configs.constants import OnyxCeleryTask from onyx.configs.constants import OnyxRedisLocks -from onyx.db.chunk import delete_chunk_stats_by_connector_credential_pair__no_commit from onyx.db.document import delete_document_by_connector_credential_pair__no_commit from onyx.db.document import delete_documents_complete__no_commit from onyx.db.document import fetch_chunk_count_for_document @@ -128,11 +127,6 @@ def document_by_cc_pair_cleanup_task( chunk_count=chunk_count, ) - delete_chunk_stats_by_connector_credential_pair__no_commit( - db_session=db_session, - document_ids=[document_id], - ) - delete_documents_complete__no_commit( db_session=db_session, document_ids=[document_id], diff --git a/backend/onyx/db/chunk.py b/backend/onyx/db/chunk.py index 270138ca0..bc0afb252 100644 --- a/backend/onyx/db/chunk.py +++ b/backend/onyx/db/chunk.py @@ -21,8 +21,8 @@ def update_chunk_boost_components__no_commit( return for data in chunk_data: - chunk_in_doc_id = str(data.get("chunk_id", "")) - if len(chunk_in_doc_id) == 0: + chunk_in_doc_id = int(data.get("chunk_id", -1)) + if chunk_in_doc_id < 0: raise ValueError(f"Chunk ID is empty for chunk {data}") chunk_stats = ( db_session.query(ChunkStats) @@ -33,19 +33,25 @@ def update_chunk_boost_components__no_commit( .first() ) - boost_components = {"information_content_boost": data["boost_score"]} + # skip chunks without boost score + if data.get("boost_score") is None: + continue + + score = data["boost_score"] + # this will be the only boost component for now + boost_components = {"information_content_boost": score} if chunk_stats: # Update existing record - if chunk_stats.chunk_boost_components: - chunk_stats.chunk_boost_components.update(boost_components) - else: - chunk_stats.chunk_boost_components = boost_components + chunk_stats.chunk_boost_components = boost_components chunk_stats.last_modified = datetime.now(timezone.utc) + db_session.add(chunk_stats) else: + # do not save new chunks with a neutral boost score + if score == 1.0: + continue # Create new record chunk_stats = ChunkStats( - # id=data["chunk_id"], document_id=data["document_id"], chunk_in_doc_id=chunk_in_doc_id, chunk_boost_components=boost_components, diff --git a/backend/onyx/db/document.py b/backend/onyx/db/document.py index e17401e76..33d106380 100644 --- a/backend/onyx/db/document.py +++ b/backend/onyx/db/document.py @@ -570,6 +570,11 @@ def delete_documents_complete__no_commit( document_ids=document_ids, ) + delete_chunk_stats_by_connector_credential_pair__no_commit( + db_session=db_session, + document_ids=document_ids, + ) + delete_documents_by_connector_credential_pair__no_commit(db_session, document_ids) delete_document_feedback_for_documents__no_commit( document_ids=document_ids, db_session=db_session diff --git a/backend/onyx/indexing/indexing_pipeline.py b/backend/onyx/indexing/indexing_pipeline.py index 80d1cb45c..f6891b264 100644 --- a/backend/onyx/indexing/indexing_pipeline.py +++ b/backend/onyx/indexing/indexing_pipeline.py @@ -620,7 +620,6 @@ def index_doc_batch( "boost_score": score, } for chunk, score in zip(chunks_with_embeddings, chunk_content_scores) - if score != 1.0 ] # Acquires a lock on the documents so that no other process can modify them @@ -761,6 +760,7 @@ def index_doc_batch( db_session=db_session, ) + # save the chunk boost components to postgres update_chunk_boost_components__no_commit( chunk_data=updatable_chunk_data, db_session=db_session )