Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-03-26 17:51:54 +01:00)
saving and updating chunk stats
This commit is contained in:
Parent: 4fe5561f44
Commit: 6b84332f1b
@@ -19,7 +19,6 @@ from onyx.configs.constants import ONYX_CLOUD_TENANT_ID
|
||||
from onyx.configs.constants import OnyxCeleryPriority
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.configs.constants import OnyxRedisLocks
|
||||
from onyx.db.chunk import delete_chunk_stats_by_connector_credential_pair__no_commit
|
||||
from onyx.db.document import delete_document_by_connector_credential_pair__no_commit
|
||||
from onyx.db.document import delete_documents_complete__no_commit
|
||||
from onyx.db.document import fetch_chunk_count_for_document
|
||||
@@ -128,11 +127,6 @@ def document_by_cc_pair_cleanup_task(
|
||||
chunk_count=chunk_count,
|
||||
)
|
||||
|
||||
delete_chunk_stats_by_connector_credential_pair__no_commit(
|
||||
db_session=db_session,
|
||||
document_ids=[document_id],
|
||||
)
|
||||
|
||||
delete_documents_complete__no_commit(
|
||||
db_session=db_session,
|
||||
document_ids=[document_id],
|
||||
|
@@ -21,8 +21,8 @@ def update_chunk_boost_components__no_commit(
|
||||
return
|
||||
|
||||
for data in chunk_data:
|
||||
chunk_in_doc_id = str(data.get("chunk_id", ""))
|
||||
if len(chunk_in_doc_id) == 0:
|
||||
chunk_in_doc_id = int(data.get("chunk_id", -1))
|
||||
if chunk_in_doc_id < 0:
|
||||
raise ValueError(f"Chunk ID is empty for chunk {data}")
|
||||
chunk_stats = (
|
||||
db_session.query(ChunkStats)
|
||||
@@ -33,19 +33,25 @@ def update_chunk_boost_components__no_commit(
|
||||
.first()
|
||||
)
|
||||
|
||||
boost_components = {"information_content_boost": data["boost_score"]}
|
||||
# skip chunks without boost score
|
||||
if data.get("boost_score") is None:
|
||||
continue
|
||||
|
||||
score = data["boost_score"]
|
||||
# this will be the only boost component for now
|
||||
boost_components = {"information_content_boost": score}
|
||||
|
||||
if chunk_stats:
|
||||
# Update existing record
|
||||
if chunk_stats.chunk_boost_components:
|
||||
chunk_stats.chunk_boost_components.update(boost_components)
|
||||
else:
|
||||
chunk_stats.chunk_boost_components = boost_components
|
||||
chunk_stats.chunk_boost_components = boost_components
|
||||
chunk_stats.last_modified = datetime.now(timezone.utc)
|
||||
db_session.add(chunk_stats)
|
||||
else:
|
||||
# do not save new chunks with a neutral boost score
|
||||
if score == 1.0:
|
||||
continue
|
||||
# Create new record
|
||||
chunk_stats = ChunkStats(
|
||||
# id=data["chunk_id"],
|
||||
document_id=data["document_id"],
|
||||
chunk_in_doc_id=chunk_in_doc_id,
|
||||
chunk_boost_components=boost_components,
|
||||
|
@@ -570,6 +570,11 @@ def delete_documents_complete__no_commit(
|
||||
document_ids=document_ids,
|
||||
)
|
||||
|
||||
delete_chunk_stats_by_connector_credential_pair__no_commit(
|
||||
db_session=db_session,
|
||||
document_ids=document_ids,
|
||||
)
|
||||
|
||||
delete_documents_by_connector_credential_pair__no_commit(db_session, document_ids)
|
||||
delete_document_feedback_for_documents__no_commit(
|
||||
document_ids=document_ids, db_session=db_session
|
||||
|
@@ -620,7 +620,6 @@ def index_doc_batch(
|
||||
"boost_score": score,
|
||||
}
|
||||
for chunk, score in zip(chunks_with_embeddings, chunk_content_scores)
|
||||
if score != 1.0
|
||||
]
|
||||
|
||||
# Acquires a lock on the documents so that no other process can modify them
|
||||
@@ -761,6 +760,7 @@ def index_doc_batch(
|
||||
db_session=db_session,
|
||||
)
|
||||
|
||||
# save the chunk boost components to postgres
|
||||
update_chunk_boost_components__no_commit(
|
||||
chunk_data=updatable_chunk_data, db_session=db_session
|
||||
)
|
||||
|
Loading…
x
Reference in New Issue
Block a user