From ae72cd56f84130c60f248169b8a3eb73e542b6a9 Mon Sep 17 00:00:00 2001
From: Weves
Date: Thu, 16 Nov 2023 11:59:47 -0800
Subject: [PATCH] Add a bit more logging in indexing pipeline

---
 backend/danswer/indexing/indexing_pipeline.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/backend/danswer/indexing/indexing_pipeline.py b/backend/danswer/indexing/indexing_pipeline.py
index 8ba96c474..be0894ff7 100644
--- a/backend/danswer/indexing/indexing_pipeline.py
+++ b/backend/danswer/indexing/indexing_pipeline.py
@@ -85,7 +85,7 @@ def _indexing_pipeline(
         doc.id: doc.doc_updated_at for doc in db_docs if doc.doc_updated_at
     }
 
-    updatable_docs = []
+    updatable_docs: list[Document] = []
     for doc in documents:
         if (
             doc.id in id_update_time_map
@@ -107,12 +107,12 @@
         db_session=db_session,
     )
 
+    logger.debug("Starting chunking")
     chunks: list[DocAwareChunk] = list(
         chain(*[chunker.chunk(document=document) for document in updatable_docs])
     )
-    logger.debug(
-        f"Indexing the following chunks: {[chunk.to_short_descriptor() for chunk in chunks]}"
-    )
+
+    logger.debug("Starting embedding")
     chunks_with_embeddings = embedder.embed(chunks=chunks)
 
     # Attach the latest status from Postgres (source of truth for access) to each
@@ -138,6 +138,9 @@
         for chunk in chunks_with_embeddings
     ]
 
+    logger.debug(
+        f"Indexing the following chunks: {[chunk.to_short_descriptor() for chunk in chunks]}"
+    )
     # A document will not be spread across different batches, so all the
     # documents with chunks in this set, are fully represented by the chunks
     # in this set
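
The net effect of the patch is easier to see as straight-through code. Below is a minimal sketch of the pipeline section the hunks touch, reconstructed only from the context lines above; the function name index_documents, the chunker/embedder/index parameters, and the stub Document/DocAwareChunk classes are illustrative stand-ins, since the hunks show only the middle of _indexing_pipeline.

import logging
from itertools import chain

logger = logging.getLogger(__name__)

# Stand-ins for danswer's models; the real definitions are not in the hunks.
class Document: ...

class DocAwareChunk:
    def to_short_descriptor(self) -> str: ...

def index_documents(updatable_docs: list[Document], chunker, embedder, index) -> None:
    # Stage marker added by the patch: one cheap line per stage.
    logger.debug("Starting chunking")
    chunks: list[DocAwareChunk] = list(
        chain(*[chunker.chunk(document=document) for document in updatable_docs])
    )

    logger.debug("Starting embedding")
    chunks_with_embeddings = embedder.embed(chunks=chunks)

    # The verbose per-chunk descriptor log moved here, so it fires right
    # before the write to the index rather than before embedding.
    logger.debug(
        f"Indexing the following chunks: {[chunk.to_short_descriptor() for chunk in chunks]}"
    )
    index.index(chunks=chunks_with_embeddings)

The split buys two things: the coarse "Starting ..." markers make it obvious which stage a stuck run is in, and the expensive per-chunk descriptor dump now sits next to the operation it describes.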