From ae72cd56f84130c60f248169b8a3eb73e542b6a9 Mon Sep 17 00:00:00 2001
From: Weves
Date: Thu, 16 Nov 2023 11:59:47 -0800
Subject: [PATCH] Add a bit more logging in indexing pipeline

---
 backend/danswer/indexing/indexing_pipeline.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/backend/danswer/indexing/indexing_pipeline.py b/backend/danswer/indexing/indexing_pipeline.py
index 8ba96c474..be0894ff7 100644
--- a/backend/danswer/indexing/indexing_pipeline.py
+++ b/backend/danswer/indexing/indexing_pipeline.py
@@ -85,7 +85,7 @@ def _indexing_pipeline(
         doc.id: doc.doc_updated_at for doc in db_docs if doc.doc_updated_at
     }
 
-    updatable_docs = []
+    updatable_docs: list[Document] = []
     for doc in documents:
         if (
             doc.id in id_update_time_map
@@ -107,12 +107,12 @@
         db_session=db_session,
     )
 
+    logger.debug("Starting chunking")
     chunks: list[DocAwareChunk] = list(
         chain(*[chunker.chunk(document=document) for document in updatable_docs])
     )
-    logger.debug(
-        f"Indexing the following chunks: {[chunk.to_short_descriptor() for chunk in chunks]}"
-    )
+
+    logger.debug("Starting embedding")
     chunks_with_embeddings = embedder.embed(chunks=chunks)
 
     # Attach the latest status from Postgres (source of truth for access) to each
@@ -138,6 +138,9 @@
         for chunk in chunks_with_embeddings
     ]
 
+    logger.debug(
+        f"Indexing the following chunks: {[chunk.to_short_descriptor() for chunk in chunks]}"
+    )
     # A document will not be spread across different batches, so all the
     # documents with chunks in this set, are fully represented by the chunks
     # in this set
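
The net effect of the patch is easier to see as straight-through code. Below is a minimal sketch of the pipeline section the hunks touch, reconstructed only from the context lines above; the function name index_documents, the chunker/embedder/index parameters, and the stub Document/DocAwareChunk classes are illustrative stand-ins, since the hunks show only the middle of _indexing_pipeline.

import logging
from itertools import chain

logger = logging.getLogger(__name__)

# Stand-ins for danswer's models; the real definitions are not in the hunks.
class Document: ...

class DocAwareChunk:
    def to_short_descriptor(self) -> str: ...

def index_documents(updatable_docs: list[Document], chunker, embedder, index) -> None:
    # Stage marker added by the patch: one cheap line per stage.
    logger.debug("Starting chunking")
    chunks: list[DocAwareChunk] = list(
        chain(*[chunker.chunk(document=document) for document in updatable_docs])
    )

    logger.debug("Starting embedding")
    chunks_with_embeddings = embedder.embed(chunks=chunks)

    # The verbose per-chunk descriptor log moved here, so it fires right
    # before the write to the index rather than before embedding.
    logger.debug(
        f"Indexing the following chunks: {[chunk.to_short_descriptor() for chunk in chunks]}"
    )
    index.index(chunks=chunks_with_embeddings)

The split buys two things: the coarse "Starting ..." markers make it obvious which stage a stuck run is in, and the expensive per-chunk descriptor dump now sits next to the operation it describes.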