danswer/backend/onyx/db/chunk.py
joachim-danswer 463340b8a1
Reduce ranking scores for short chunks without actual information (#4098)
* remove title for slack

* initial working code

* simplification

* improvements

* name change to information_content_model

* avoid boost_score > 1.0

* nit

* EL comments and improvements

Improvements:
  - proper import of information content model from cache or HF
  - warm up for information content model

Other:
  - EL PR review comments

* nit

* requirements version update

* fixed docker file

* new home for model_server configs

* default off

* small updates

* YS comments - pt 1

* renaming to chunk_boost & chunk table def

* saving and deleting chunk stats in new table

* saving and updating chunk stats

* improved dict score update

* create columns for individual boost factors

* RK comments

* Update migration

* manual import reordering
2025-03-13 17:35:45 +00:00

64 lines
1.9 KiB
Python

from datetime import datetime
from datetime import timezone
from sqlalchemy import delete
from sqlalchemy.orm import Session
from onyx.db.models import ChunkStats
from onyx.indexing.models import UpdatableChunkData
def update_chunk_boost_components__no_commit(
    chunk_data: list[UpdatableChunkData],
    db_session: Session,
) -> None:
    """Upsert the information-content boost of each chunk into ``ChunkStats``.

    Rows that already exist get their ``information_content_boost`` and
    ``last_modified`` refreshed. Rows that do not exist are created, except
    when the score is exactly 1.0 (a neutral boost is not stored). Nothing is
    committed here; the caller owns the transaction.

    Every chunk ID is validated before the database is touched, and existing
    rows are loaded with batched ``IN`` lookups rather than one query per
    chunk.

    Args:
        chunk_data: Items exposing ``chunk_id``, ``document_id`` and
            ``boost_score`` attributes.
        db_session: SQLAlchemy database session

    Raises:
        ValueError: If any item has a negative ``chunk_id``.
    """
    if not chunk_data:
        return

    # Upper bound on IDs per IN (...) lookup so the number of bind
    # parameters in a single statement stays bounded for large batches.
    lookup_batch_size = 1000

    # Resolve and validate every row key up front, so that a bad chunk ID
    # fails before any row has been modified or added to the session.
    keyed_chunks = []
    for data in chunk_data:
        chunk_in_doc_id = int(data.chunk_id)
        if chunk_in_doc_id < 0:
            raise ValueError(f"Chunk ID is empty for chunk {data}")
        stats_id = f"{data.document_id}__{chunk_in_doc_id}"
        keyed_chunks.append((stats_id, chunk_in_doc_id, data))

    # Load all already-stored rows, keyed by their ID (order-preserving dedupe
    # keeps the IN lists free of repeated values).
    unique_ids = list(dict.fromkeys(stats_id for stats_id, _, _ in keyed_chunks))
    stats_by_id = {}
    for start in range(0, len(unique_ids), lookup_batch_size):
        batch_ids = unique_ids[start : start + lookup_batch_size]
        rows = (
            db_session.query(ChunkStats)
            .filter(ChunkStats.id.in_(batch_ids))
            .all()
        )
        for row in rows:
            stats_by_id[row.id] = row

    for stats_id, chunk_in_doc_id, data in keyed_chunks:
        score = data.boost_score
        chunk_stats = stats_by_id.get(stats_id)

        if chunk_stats is not None:
            chunk_stats.information_content_boost = score
            chunk_stats.last_modified = datetime.now(timezone.utc)
            continue

        # do not save new chunks with a neutral boost score
        if score == 1.0:
            continue

        # Create new record
        chunk_stats = ChunkStats(
            document_id=data.document_id,
            chunk_in_doc_id=chunk_in_doc_id,
            information_content_boost=score,
        )
        db_session.add(chunk_stats)
        # Remember the pending row so that a repeated chunk later in this
        # call updates it instead of inserting a second row with the same ID.
        stats_by_id[stats_id] = chunk_stats
def delete_chunk_stats_by_connector_credential_pair__no_commit(
    db_session: Session, document_ids: list[str]
) -> None:
    """This deletes just chunk stats in postgres.

    Issues a single ``DELETE`` for every ``ChunkStats`` row whose
    ``document_id`` is one of ``document_ids``. Nothing is committed here;
    the caller owns the transaction.

    Args:
        db_session: SQLAlchemy database session
        document_ids: IDs of the documents whose chunk stats are removed.
            An empty list is a no-op.
    """
    if not document_ids:
        # An empty IN list can match nothing; skip the database round trip.
        return

    stmt = delete(ChunkStats).where(ChunkStats.document_id.in_(document_ids))
    db_session.execute(stmt)