RK comments

This commit is contained in:
joachim-danswer 2025-03-12 16:14:09 -07:00
parent 7d9e133e35
commit 0aa9e8968a
3 changed files with 21 additions and 15 deletions

View File

@@ -5,10 +5,11 @@ from sqlalchemy import delete
from sqlalchemy.orm import Session
from onyx.db.models import ChunkStats
from onyx.indexing.models import UpdatableChunkData
def update_chunk_boost_components__no_commit(
chunk_data: list[dict],
chunk_data: list[UpdatableChunkData],
db_session: Session,
) -> None:
"""Updates the chunk_boost_components for chunks in the database.
@@ -21,23 +22,20 @@ def update_chunk_boost_components__no_commit(
return
for data in chunk_data:
chunk_in_doc_id = int(data.get("chunk_id", -1))
chunk_in_doc_id = int(data.chunk_id)
if chunk_in_doc_id < 0:
raise ValueError(f"Chunk ID is empty for chunk {data}")
chunk_document_id = f"{data.document_id}" f"__{chunk_in_doc_id}"
chunk_stats = (
db_session.query(ChunkStats)
.filter(
ChunkStats.document_id == data["document_id"],
ChunkStats.chunk_in_doc_id == chunk_in_doc_id,
ChunkStats.id == chunk_document_id,
)
.first()
)
# skip chunks without boost score
if data.get("boost_score") is None:
continue
score = data["boost_score"]
score = data.boost_score
if chunk_stats:
chunk_stats.information_content_boost = score
@@ -49,7 +47,7 @@ def update_chunk_boost_components__no_commit(
continue
# Create new record
chunk_stats = ChunkStats(
document_id=data["document_id"],
document_id=data.document_id,
chunk_in_doc_id=chunk_in_doc_id,
information_content_boost=score,
)

View File

@@ -55,6 +55,7 @@ from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.indexing.models import DocAwareChunk
from onyx.indexing.models import DocMetadataAwareIndexChunk
from onyx.indexing.models import IndexChunk
from onyx.indexing.models import UpdatableChunkData
from onyx.indexing.vector_db_insertion import write_chunks_to_vector_db_with_backoff
from onyx.natural_language_processing.search_nlp_models import (
InformationContentClassificationModel,
@@ -66,6 +67,7 @@ from shared_configs.configs import (
INDEXING_INFORMATION_CONTENT_CLASSIFICATION_CUTOFF_LENGTH,
)
logger = setup_logger()
@@ -614,11 +616,11 @@ def index_doc_batch(
updatable_ids = [doc.id for doc in ctx.updatable_docs]
updatable_chunk_data = [
{
"chunk_id": chunk.chunk_id,
"document_id": chunk.source_document.id,
"boost_score": score,
}
UpdatableChunkData(
chunk_id=chunk.chunk_id,
document_id=chunk.source_document.id,
boost_score=score,
)
for chunk, score in zip(chunks_with_embeddings, chunk_content_scores)
]

View File

@@ -184,3 +184,9 @@ class IndexingSetting(EmbeddingModelDetail):
class MultipassConfig(BaseModel):
multipass_indexing: bool
enable_large_chunks: bool
class UpdatableChunkData(BaseModel):
chunk_id: int
document_id: str
boost_score: float