Fix chunker (#2014)

This commit is contained in:
Yuhong Sun
2024-08-01 10:18:02 -07:00
committed by GitHub
parent 459bd46846
commit 73a92c046d
2 changed files with 5 additions and 2 deletions

View File

@@ -41,7 +41,10 @@ ChunkFunc = Callable[[Document], list[DocAwareChunk]]
def extract_blurb(text: str, blurb_splitter: "SentenceSplitter") -> str:
    """Return the first piece produced by splitting ``text``.

    Args:
        text: The text to take the blurb from.
        blurb_splitter: Object exposing ``split_text(text)`` that returns a
            sequence of string pieces (presumably sized for blurbs; confirm
            against where the splitter is constructed).

    Returns:
        The first piece returned by ``blurb_splitter.split_text(text)``, or
        an empty string when the splitter returns no pieces.
    """
    texts = blurb_splitter.split_text(text)
    # The splitter may return nothing (e.g. for empty input); indexing [0]
    # unconditionally would raise IndexError, so fall back to an empty blurb.
    if not texts:
        return ""
    return texts[0]
def chunk_large_section(

View File

@@ -181,7 +181,7 @@ def index_doc_batch(
)
logger.debug("Starting chunking")
# The first chunk additionally contains the Title of the Document
# The embedder is needed here to get the correct tokenizer
chunks: list[DocAwareChunk] = [
chunk
for document in updatable_docs