From 73a92c046d3883532425dd74a341ce73c85fbf40 Mon Sep 17 00:00:00 2001
From: Yuhong Sun <yuhongsun96@gmail.com>
Date: Thu, 1 Aug 2024 10:18:02 -0700
Subject: [PATCH] Fix chunker (#2014)

---
 backend/danswer/indexing/chunker.py           | 5 ++++-
 backend/danswer/indexing/indexing_pipeline.py | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/backend/danswer/indexing/chunker.py b/backend/danswer/indexing/chunker.py
index f58bc8397680..132b422a4091 100644
--- a/backend/danswer/indexing/chunker.py
+++ b/backend/danswer/indexing/chunker.py
@@ -41,7 +41,10 @@ ChunkFunc = Callable[[Document], list[DocAwareChunk]]
 
 
 def extract_blurb(text: str, blurb_splitter: "SentenceSplitter") -> str:
-    return blurb_splitter.split_text(text)[0]
+    texts = blurb_splitter.split_text(text)
+    if not texts:
+        return ""
+    return texts[0]
 
 
 def chunk_large_section(
diff --git a/backend/danswer/indexing/indexing_pipeline.py b/backend/danswer/indexing/indexing_pipeline.py
index 9ab67b156638..3fc53fc39e54 100644
--- a/backend/danswer/indexing/indexing_pipeline.py
+++ b/backend/danswer/indexing/indexing_pipeline.py
@@ -181,7 +181,7 @@ def index_doc_batch(
     )
 
     logger.debug("Starting chunking")
-    # The first chunk additionally contains the Title of the Document
+    # The embedder is needed here to get the correct tokenizer
     chunks: list[DocAwareChunk] = [
         chunk
         for document in updatable_docs