Add chunk-level logging when indexing

Weves 2023-08-15 18:38:24 -07:00 committed by Chris Weaver
parent 820f8b7b48
commit 0aa04ad616
2 changed files with 17 additions and 0 deletions

@@ -31,6 +31,12 @@ class IndexChunk(BaseChunk):
    # During inference we only have access to the document id and do not reconstruct the Document
    source_document: Document

    def to_short_descriptor(self) -> str:
        """Used when logging the identity of a chunk"""
        return (
            f"Chunk ID: '{self.chunk_id}'; {self.source_document.to_short_descriptor()}"
        )


@dataclass
class EmbeddedIndexChunk(IndexChunk):
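For context, here is a minimal, self-contained sketch of what the new descriptor produces. The Document stand-in and its descriptor format below are illustrative assumptions, not the project's actual definitions:

from dataclasses import dataclass

@dataclass
class Document:
    # Hypothetical stand-in; the real Document carries more fields.
    document_id: str

    def to_short_descriptor(self) -> str:
        return f"Document ID: '{self.document_id}'"

@dataclass
class IndexChunk:
    # Hypothetical stand-in for BaseChunk's chunk_id plus the new method.
    chunk_id: int
    source_document: Document

    def to_short_descriptor(self) -> str:
        """Used when logging the identity of a chunk"""
        return (
            f"Chunk ID: '{self.chunk_id}'; {self.source_document.to_short_descriptor()}"
        )

chunk = IndexChunk(chunk_id=0, source_document=Document(document_id="web-42"))
print(chunk.to_short_descriptor())
# Chunk ID: '0'; Document ID: 'web-42'

Keeping the descriptor short is what makes it cheap to log one per chunk in the pipeline's debug line below.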

@@ -79,7 +79,15 @@ def _indexing_pipeline(
    """Takes different pieces of the indexing pipeline and applies them to a batch of documents
    Note that the documents should already be batched at this point so that it does not inflate the
    memory requirements"""
    # Chunk the documents into reasonably-sized chunks so they can fit into the
    # context sizes of our embedding models
    chunks = list(chain(*[chunker.chunk(document=document) for document in documents]))
    logger.debug(
        f"Indexing the following chunks: {[chunk.to_short_descriptor() for chunk in chunks]}"
    )

    # Insert the chunks into our Keyword document store + store records of these
    # documents / chunks into our database
    # TODO keyword indexing can occur at same time as embedding
    keyword_store_insertion_records = keyword_index.index(
        chunks=chunks, index_attempt_metadata=index_attempt_metadata
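To see what that debug line emits end to end, here is a runnable sketch of the chunk-then-log pattern. The toy chunker, document shape, and chunk class are assumptions for illustration only:

import logging
from itertools import chain

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("indexing")

class Chunk:
    def __init__(self, chunk_id: int, document_id: str) -> None:
        self.chunk_id = chunk_id
        self.document_id = document_id

    def to_short_descriptor(self) -> str:
        return f"Chunk ID: '{self.chunk_id}'; Document ID: '{self.document_id}'"

def chunk(document: dict, size: int = 512) -> list[Chunk]:
    # Toy chunker using fixed-size character windows; the real chunker
    # sizes chunks to fit the embedding model's context window.
    n_chunks = max(1, -(-len(document["text"]) // size))  # ceiling division
    return [Chunk(chunk_id=i, document_id=document["id"]) for i in range(n_chunks)]

documents = [
    {"id": "web-42", "text": "x" * 1100},
    {"id": "drive-7", "text": "y" * 300},
]
chunks = list(chain(*[chunk(document=document) for document in documents]))
logger.debug(
    f"Indexing the following chunks: {[c.to_short_descriptor() for c in chunks]}"
)

At DEBUG level this prints one descriptor per chunk, so the line grows with batch size; that stays manageable because, as the docstring notes, documents arrive already batched.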
@@ -94,6 +102,8 @@
        insertion_records=keyword_store_insertion_records
    )

    # Embed the chunks and then insert them into our Vector document store
    # + store records of these documents / chunks into our database
    chunks_with_embeddings = embedder.embed(chunks=chunks)
    vector_store_insertion_records = vector_index.index(
        chunks=chunks_with_embeddings, index_attempt_metadata=index_attempt_metadata
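The embed-then-index hand-off above can be pictured with toy stand-ins. The Embedder and VectorIndex interfaces below are modeled on the call sites in this diff and are assumptions, not the project's real classes:

from dataclasses import dataclass

@dataclass
class Chunk:
    chunk_id: int
    content: str
    embedding: list[float] | None = None

class Embedder:
    def embed(self, chunks: list[Chunk]) -> list[Chunk]:
        # Toy embedding (a single length feature); the real embedder runs a model.
        for c in chunks:
            c.embedding = [float(len(c.content))]
        return chunks

class VectorIndex:
    def __init__(self) -> None:
        self._store: dict[int, Chunk] = {}

    def index(self, chunks: list[Chunk], index_attempt_metadata: dict) -> list[dict]:
        # Upsert each chunk and record whether it was new to this store.
        records = []
        for c in chunks:
            records.append({"chunk_id": c.chunk_id, "already_existed": c.chunk_id in self._store})
            self._store[c.chunk_id] = c
        return records

chunks_with_embeddings = Embedder().embed(chunks=[Chunk(0, "hello world")])
records = VectorIndex().index(chunks=chunks_with_embeddings, index_attempt_metadata={})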
@@ -107,6 +117,7 @@
    net_doc_count_vector = _get_net_new_documents(
        insertion_records=vector_store_insertion_records
    )
    if net_doc_count_vector != net_doc_count_keyword:
        logger.warning("Document count changes from keyword/vector indices don't align")
    net_new_docs = max(net_doc_count_keyword, net_doc_count_vector)
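Finally, a sketch of the count reconciliation at the end of the pipeline. The insertion-record shape and the _get_net_new_documents body are guesses for illustration, since the diff only shows the call sites:

import logging

logger = logging.getLogger("indexing")

def _get_net_new_documents(insertion_records: list[dict]) -> int:
    # Count records for documents the store had not seen before.
    return sum(1 for record in insertion_records if not record["already_existed"])

keyword_records = [{"already_existed": False}, {"already_existed": True}]
vector_records = [{"already_existed": False}, {"already_existed": False}]

net_doc_count_keyword = _get_net_new_documents(insertion_records=keyword_records)
net_doc_count_vector = _get_net_new_documents(insertion_records=vector_records)
if net_doc_count_vector != net_doc_count_keyword:
    logger.warning("Document count changes from keyword/vector indices don't align")
net_new_docs = max(net_doc_count_keyword, net_doc_count_vector)

Taking the max presumably keeps the reported count from undercounting when the two stores disagree.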