mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-03-26 17:51:54 +01:00
RK comments
This commit is contained in:
parent
7d9e133e35
commit
0aa9e8968a
@ -5,10 +5,11 @@ from sqlalchemy import delete
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.db.models import ChunkStats
|
||||
from onyx.indexing.models import UpdatableChunkData
|
||||
|
||||
|
||||
def update_chunk_boost_components__no_commit(
|
||||
chunk_data: list[dict],
|
||||
chunk_data: list[UpdatableChunkData],
|
||||
db_session: Session,
|
||||
) -> None:
|
||||
"""Updates the chunk_boost_components for chunks in the database.
|
||||
@ -21,23 +22,20 @@ def update_chunk_boost_components__no_commit(
|
||||
return
|
||||
|
||||
for data in chunk_data:
|
||||
chunk_in_doc_id = int(data.get("chunk_id", -1))
|
||||
chunk_in_doc_id = int(data.chunk_id)
|
||||
if chunk_in_doc_id < 0:
|
||||
raise ValueError(f"Chunk ID is empty for chunk {data}")
|
||||
|
||||
chunk_document_id = f"{data.document_id}" f"__{chunk_in_doc_id}"
|
||||
chunk_stats = (
|
||||
db_session.query(ChunkStats)
|
||||
.filter(
|
||||
ChunkStats.document_id == data["document_id"],
|
||||
ChunkStats.chunk_in_doc_id == chunk_in_doc_id,
|
||||
ChunkStats.id == chunk_document_id,
|
||||
)
|
||||
.first()
|
||||
)
|
||||
|
||||
# skip chunks without boost score
|
||||
if data.get("boost_score") is None:
|
||||
continue
|
||||
|
||||
score = data["boost_score"]
|
||||
score = data.boost_score
|
||||
|
||||
if chunk_stats:
|
||||
chunk_stats.information_content_boost = score
|
||||
@ -49,7 +47,7 @@ def update_chunk_boost_components__no_commit(
|
||||
continue
|
||||
# Create new record
|
||||
chunk_stats = ChunkStats(
|
||||
document_id=data["document_id"],
|
||||
document_id=data.document_id,
|
||||
chunk_in_doc_id=chunk_in_doc_id,
|
||||
information_content_boost=score,
|
||||
)
|
||||
|
@ -55,6 +55,7 @@ from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.indexing.models import DocAwareChunk
|
||||
from onyx.indexing.models import DocMetadataAwareIndexChunk
|
||||
from onyx.indexing.models import IndexChunk
|
||||
from onyx.indexing.models import UpdatableChunkData
|
||||
from onyx.indexing.vector_db_insertion import write_chunks_to_vector_db_with_backoff
|
||||
from onyx.natural_language_processing.search_nlp_models import (
|
||||
InformationContentClassificationModel,
|
||||
@ -66,6 +67,7 @@ from shared_configs.configs import (
|
||||
INDEXING_INFORMATION_CONTENT_CLASSIFICATION_CUTOFF_LENGTH,
|
||||
)
|
||||
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
@ -614,11 +616,11 @@ def index_doc_batch(
|
||||
|
||||
updatable_ids = [doc.id for doc in ctx.updatable_docs]
|
||||
updatable_chunk_data = [
|
||||
{
|
||||
"chunk_id": chunk.chunk_id,
|
||||
"document_id": chunk.source_document.id,
|
||||
"boost_score": score,
|
||||
}
|
||||
UpdatableChunkData(
|
||||
chunk_id=chunk.chunk_id,
|
||||
document_id=chunk.source_document.id,
|
||||
boost_score=score,
|
||||
)
|
||||
for chunk, score in zip(chunks_with_embeddings, chunk_content_scores)
|
||||
]
|
||||
|
||||
|
@ -184,3 +184,9 @@ class IndexingSetting(EmbeddingModelDetail):
|
||||
class MultipassConfig(BaseModel):
|
||||
multipass_indexing: bool
|
||||
enable_large_chunks: bool
|
||||
|
||||
|
||||
class UpdatableChunkData(BaseModel):
|
||||
chunk_id: int
|
||||
document_id: str
|
||||
boost_score: float
|
||||
|
Loading…
x
Reference in New Issue
Block a user