From 73a92c046d3883532425dd74a341ce73c85fbf40 Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Thu, 1 Aug 2024 10:18:02 -0700 Subject: [PATCH] Fix chunker (#2014) --- backend/danswer/indexing/chunker.py | 5 ++++- backend/danswer/indexing/indexing_pipeline.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/backend/danswer/indexing/chunker.py b/backend/danswer/indexing/chunker.py index f58bc8397680..132b422a4091 100644 --- a/backend/danswer/indexing/chunker.py +++ b/backend/danswer/indexing/chunker.py @@ -41,7 +41,10 @@ ChunkFunc = Callable[[Document], list[DocAwareChunk]] def extract_blurb(text: str, blurb_splitter: "SentenceSplitter") -> str: - return blurb_splitter.split_text(text)[0] + texts = blurb_splitter.split_text(text) + if not texts: + return "" + return texts[0] def chunk_large_section( diff --git a/backend/danswer/indexing/indexing_pipeline.py b/backend/danswer/indexing/indexing_pipeline.py index 9ab67b156638..3fc53fc39e54 100644 --- a/backend/danswer/indexing/indexing_pipeline.py +++ b/backend/danswer/indexing/indexing_pipeline.py @@ -181,7 +181,7 @@ def index_doc_batch( ) logger.debug("Starting chunking") - # The first chunk additionally contains the Title of the Document + # The embedder is needed here to get the correct tokenizer chunks: list[DocAwareChunk] = [ chunk for document in updatable_docs