mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-03-30 04:31:49 +02:00
saving and updating chunk stats
This commit is contained in:
parent
4fe5561f44
commit
6b84332f1b
@ -19,7 +19,6 @@ from onyx.configs.constants import ONYX_CLOUD_TENANT_ID
|
|||||||
from onyx.configs.constants import OnyxCeleryPriority
|
from onyx.configs.constants import OnyxCeleryPriority
|
||||||
from onyx.configs.constants import OnyxCeleryTask
|
from onyx.configs.constants import OnyxCeleryTask
|
||||||
from onyx.configs.constants import OnyxRedisLocks
|
from onyx.configs.constants import OnyxRedisLocks
|
||||||
from onyx.db.chunk import delete_chunk_stats_by_connector_credential_pair__no_commit
|
|
||||||
from onyx.db.document import delete_document_by_connector_credential_pair__no_commit
|
from onyx.db.document import delete_document_by_connector_credential_pair__no_commit
|
||||||
from onyx.db.document import delete_documents_complete__no_commit
|
from onyx.db.document import delete_documents_complete__no_commit
|
||||||
from onyx.db.document import fetch_chunk_count_for_document
|
from onyx.db.document import fetch_chunk_count_for_document
|
||||||
@ -128,11 +127,6 @@ def document_by_cc_pair_cleanup_task(
|
|||||||
chunk_count=chunk_count,
|
chunk_count=chunk_count,
|
||||||
)
|
)
|
||||||
|
|
||||||
delete_chunk_stats_by_connector_credential_pair__no_commit(
|
|
||||||
db_session=db_session,
|
|
||||||
document_ids=[document_id],
|
|
||||||
)
|
|
||||||
|
|
||||||
delete_documents_complete__no_commit(
|
delete_documents_complete__no_commit(
|
||||||
db_session=db_session,
|
db_session=db_session,
|
||||||
document_ids=[document_id],
|
document_ids=[document_id],
|
||||||
|
@ -21,8 +21,8 @@ def update_chunk_boost_components__no_commit(
|
|||||||
return
|
return
|
||||||
|
|
||||||
for data in chunk_data:
|
for data in chunk_data:
|
||||||
chunk_in_doc_id = str(data.get("chunk_id", ""))
|
chunk_in_doc_id = int(data.get("chunk_id", -1))
|
||||||
if len(chunk_in_doc_id) == 0:
|
if chunk_in_doc_id < 0:
|
||||||
raise ValueError(f"Chunk ID is empty for chunk {data}")
|
raise ValueError(f"Chunk ID is empty for chunk {data}")
|
||||||
chunk_stats = (
|
chunk_stats = (
|
||||||
db_session.query(ChunkStats)
|
db_session.query(ChunkStats)
|
||||||
@ -33,19 +33,25 @@ def update_chunk_boost_components__no_commit(
|
|||||||
.first()
|
.first()
|
||||||
)
|
)
|
||||||
|
|
||||||
boost_components = {"information_content_boost": data["boost_score"]}
|
# skip chunks without boost score
|
||||||
|
if data.get("boost_score") is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
score = data["boost_score"]
|
||||||
|
# this will be the only boost component for now
|
||||||
|
boost_components = {"information_content_boost": score}
|
||||||
|
|
||||||
if chunk_stats:
|
if chunk_stats:
|
||||||
# Update existing record
|
# Update existing record
|
||||||
if chunk_stats.chunk_boost_components:
|
chunk_stats.chunk_boost_components = boost_components
|
||||||
chunk_stats.chunk_boost_components.update(boost_components)
|
|
||||||
else:
|
|
||||||
chunk_stats.chunk_boost_components = boost_components
|
|
||||||
chunk_stats.last_modified = datetime.now(timezone.utc)
|
chunk_stats.last_modified = datetime.now(timezone.utc)
|
||||||
|
db_session.add(chunk_stats)
|
||||||
else:
|
else:
|
||||||
|
# do not save new chunks with a neutral boost score
|
||||||
|
if score == 1.0:
|
||||||
|
continue
|
||||||
# Create new record
|
# Create new record
|
||||||
chunk_stats = ChunkStats(
|
chunk_stats = ChunkStats(
|
||||||
# id=data["chunk_id"],
|
|
||||||
document_id=data["document_id"],
|
document_id=data["document_id"],
|
||||||
chunk_in_doc_id=chunk_in_doc_id,
|
chunk_in_doc_id=chunk_in_doc_id,
|
||||||
chunk_boost_components=boost_components,
|
chunk_boost_components=boost_components,
|
||||||
|
@ -570,6 +570,11 @@ def delete_documents_complete__no_commit(
|
|||||||
document_ids=document_ids,
|
document_ids=document_ids,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
delete_chunk_stats_by_connector_credential_pair__no_commit(
|
||||||
|
db_session=db_session,
|
||||||
|
document_ids=document_ids,
|
||||||
|
)
|
||||||
|
|
||||||
delete_documents_by_connector_credential_pair__no_commit(db_session, document_ids)
|
delete_documents_by_connector_credential_pair__no_commit(db_session, document_ids)
|
||||||
delete_document_feedback_for_documents__no_commit(
|
delete_document_feedback_for_documents__no_commit(
|
||||||
document_ids=document_ids, db_session=db_session
|
document_ids=document_ids, db_session=db_session
|
||||||
|
@ -620,7 +620,6 @@ def index_doc_batch(
|
|||||||
"boost_score": score,
|
"boost_score": score,
|
||||||
}
|
}
|
||||||
for chunk, score in zip(chunks_with_embeddings, chunk_content_scores)
|
for chunk, score in zip(chunks_with_embeddings, chunk_content_scores)
|
||||||
if score != 1.0
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# Acquires a lock on the documents so that no other process can modify them
|
# Acquires a lock on the documents so that no other process can modify them
|
||||||
@ -761,6 +760,7 @@ def index_doc_batch(
|
|||||||
db_session=db_session,
|
db_session=db_session,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# save the chunk boost components to postgres
|
||||||
update_chunk_boost_components__no_commit(
|
update_chunk_boost_components__no_commit(
|
||||||
chunk_data=updatable_chunk_data, db_session=db_session
|
chunk_data=updatable_chunk_data, db_session=db_session
|
||||||
)
|
)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user