saving and updating chunk stats

This commit is contained in:
joachim-danswer 2025-03-10 22:11:16 -07:00
parent 4fe5561f44
commit 6b84332f1b
4 changed files with 20 additions and 15 deletions

View File

@@ -19,7 +19,6 @@ from onyx.configs.constants import ONYX_CLOUD_TENANT_ID
from onyx.configs.constants import OnyxCeleryPriority
from onyx.configs.constants import OnyxCeleryTask
from onyx.configs.constants import OnyxRedisLocks
from onyx.db.chunk import delete_chunk_stats_by_connector_credential_pair__no_commit
from onyx.db.document import delete_document_by_connector_credential_pair__no_commit
from onyx.db.document import delete_documents_complete__no_commit
from onyx.db.document import fetch_chunk_count_for_document
@@ -128,11 +127,6 @@ def document_by_cc_pair_cleanup_task(
chunk_count=chunk_count,
)
delete_chunk_stats_by_connector_credential_pair__no_commit(
db_session=db_session,
document_ids=[document_id],
)
delete_documents_complete__no_commit(
db_session=db_session,
document_ids=[document_id],

View File

@@ -21,8 +21,8 @@ def update_chunk_boost_components__no_commit(
return
for data in chunk_data:
chunk_in_doc_id = str(data.get("chunk_id", ""))
if len(chunk_in_doc_id) == 0:
chunk_in_doc_id = int(data.get("chunk_id", -1))
if chunk_in_doc_id < 0:
raise ValueError(f"Chunk ID is empty for chunk {data}")
chunk_stats = (
db_session.query(ChunkStats)
@@ -33,19 +33,25 @@ def update_chunk_boost_components__no_commit(
.first()
)
boost_components = {"information_content_boost": data["boost_score"]}
# skip chunks without boost score
if data.get("boost_score") is None:
continue
score = data["boost_score"]
# this will be the only boost component for now
boost_components = {"information_content_boost": score}
if chunk_stats:
# Update existing record
if chunk_stats.chunk_boost_components:
chunk_stats.chunk_boost_components.update(boost_components)
else:
chunk_stats.chunk_boost_components = boost_components
chunk_stats.chunk_boost_components = boost_components
chunk_stats.last_modified = datetime.now(timezone.utc)
db_session.add(chunk_stats)
else:
# do not save new chunks with a neutral boost score
if score == 1.0:
continue
# Create new record
chunk_stats = ChunkStats(
# id=data["chunk_id"],
document_id=data["document_id"],
chunk_in_doc_id=chunk_in_doc_id,
chunk_boost_components=boost_components,

View File

@@ -570,6 +570,11 @@ def delete_documents_complete__no_commit(
document_ids=document_ids,
)
delete_chunk_stats_by_connector_credential_pair__no_commit(
db_session=db_session,
document_ids=document_ids,
)
delete_documents_by_connector_credential_pair__no_commit(db_session, document_ids)
delete_document_feedback_for_documents__no_commit(
document_ids=document_ids, db_session=db_session

View File

@@ -620,7 +620,6 @@ def index_doc_batch(
"boost_score": score,
}
for chunk, score in zip(chunks_with_embeddings, chunk_content_scores)
if score != 1.0
]
# Acquires a lock on the documents so that no other process can modify them
@@ -761,6 +760,7 @@ def index_doc_batch(
db_session=db_session,
)
# save the chunk boost components to postgres
update_chunk_boost_components__no_commit(
chunk_data=updatable_chunk_data, db_session=db_session
)