Fix chunker: return an empty blurb when the sentence splitter yields no text (#2014)

2025-09-27 20:38:32 +02:00 · 2024-08-01 10:18:02 -07:00
parent 459bd46846
commit 73a92c046d
2 changed files with 5 additions and 2 deletions
--- a/backend/danswer/indexing/chunker.py
+++ b/backend/danswer/indexing/chunker.py
@@ -41,7 +41,10 @@ ChunkFunc = Callable[[Document], list[DocAwareChunk]]


 def extract_blurb(text: str, blurb_splitter: "SentenceSplitter") -> str:
-    return blurb_splitter.split_text(text)[0]
+    texts = blurb_splitter.split_text(text)
+    if not texts:
+        return ""
+    return texts[0]


 def chunk_large_section(
--- a/backend/danswer/indexing/indexing_pipeline.py
+++ b/backend/danswer/indexing/indexing_pipeline.py
@@ -181,7 +181,7 @@ def index_doc_batch(
    )

    logger.debug("Starting chunking")
-    # The first chunk additionally contains the Title of the Document
+    # The embedder is needed here to get the correct tokenizer
    chunks: list[DocAwareChunk] = [
        chunk
        for document in updatable_docs