From d619602a6ff2b4e1302a7af4ac1d863bad6eb20a Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Wed, 31 Jul 2024 09:51:01 -0700 Subject: [PATCH] Skip shortcut docs (#1999) --- backend/danswer/indexing/indexing_pipeline.py | 13 +++++++++++-- backend/danswer/server/danswer_api/ingestion.py | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/backend/danswer/indexing/indexing_pipeline.py b/backend/danswer/indexing/indexing_pipeline.py index 9bd0e2548..9ab67b156 100644 --- a/backend/danswer/indexing/indexing_pipeline.py +++ b/backend/danswer/indexing/indexing_pipeline.py @@ -165,6 +165,11 @@ def index_doc_batch( if not ignore_time_skip else documents ) + + # No docs to update either because the batch is empty or every doc was already indexed + if not updatable_docs: + return 0, 0 + updatable_ids = [doc.id for doc in updatable_docs] # Create records in the source of truth about these documents, @@ -184,8 +189,12 @@ def index_doc_batch( ] logger.debug("Starting embedding") - chunks_with_embeddings = embedder.embed_chunks( - chunks=chunks, + chunks_with_embeddings = ( + embedder.embed_chunks( + chunks=chunks, + ) + if chunks + else [] ) # Acquires a lock on the documents so that no other process can modify them diff --git a/backend/danswer/server/danswer_api/ingestion.py b/backend/danswer/server/danswer_api/ingestion.py index 9d146e0fc..9127b260d 100644 --- a/backend/danswer/server/danswer_api/ingestion.py +++ b/backend/danswer/server/danswer_api/ingestion.py @@ -107,7 +107,7 @@ def upsert_ingestion_doc( db_session=db_session, ) - new_doc, chunks = indexing_pipeline( + new_doc, __chunk_count = indexing_pipeline( document_batch=[document], index_attempt_metadata=IndexAttemptMetadata( connector_id=cc_pair.connector_id,