mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-10-10 13:15:18 +02:00
Fix chunker (#2014)
This commit is contained in:
@@ -41,7 +41,10 @@ ChunkFunc = Callable[[Document], list[DocAwareChunk]]
|
|||||||
|
|
||||||
|
|
||||||
def extract_blurb(text: str, blurb_splitter: "SentenceSplitter") -> str:
|
def extract_blurb(text: str, blurb_splitter: "SentenceSplitter") -> str:
|
||||||
return blurb_splitter.split_text(text)[0]
|
texts = blurb_splitter.split_text(text)
|
||||||
|
if not texts:
|
||||||
|
return ""
|
||||||
|
return texts[0]
|
||||||
|
|
||||||
|
|
||||||
def chunk_large_section(
|
def chunk_large_section(
|
||||||
|
@@ -181,7 +181,7 @@ def index_doc_batch(
|
|||||||
)
|
)
|
||||||
|
|
||||||
logger.debug("Starting chunking")
|
logger.debug("Starting chunking")
|
||||||
# The first chunk additionally contains the Title of the Document
|
# The embedder is needed here to get the correct tokenizer
|
||||||
chunks: list[DocAwareChunk] = [
|
chunks: list[DocAwareChunk] = [
|
||||||
chunk
|
chunk
|
||||||
for document in updatable_docs
|
for document in updatable_docs
|
||||||
|
Reference in New Issue
Block a user