Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-04-03 09:28:25 +02:00)
Use Sentence Aware Splitter (#452)
This commit is contained in:
parent 63215e9c9a
commit 6b305c56b3
@@ -1,169 +1,90 @@
 import abc
-import re
 from collections.abc import Callable
 
+from llama_index.text_splitter import SentenceSplitter
+from transformers import AutoTokenizer  # type:ignore
+
 from danswer.chunking.models import DocAwareChunk
-from danswer.configs.app_configs import BLURB_LENGTH
-from danswer.configs.app_configs import CHUNK_MAX_CHAR_OVERLAP
+from danswer.configs.app_configs import BLURB_SIZE
+from danswer.configs.app_configs import CHUNK_OVERLAP
 from danswer.configs.app_configs import CHUNK_SIZE
-from danswer.configs.app_configs import CHUNK_WORD_OVERLAP
+from danswer.configs.app_configs import MINI_CHUNK_SIZE
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
+from danswer.search.search_utils import get_default_tokenizer
 from danswer.utils.text_processing import shared_precompare_cleanup
 
 
 SECTION_SEPARATOR = "\n\n"
 ChunkFunc = Callable[[Document], list[DocAwareChunk]]
 
 
-def extract_blurb(text: str, blurb_len: int) -> str:
-    if len(text) < blurb_len:
-        return text
-
-    match = re.search(r"[.!?:]", text[blurb_len:])
-    max_blub_len = min(2 * blurb_len, len(text))
-
-    end_index = (
-        max_blub_len
-        if match is None
-        else min(blurb_len + match.start() + 1, max_blub_len)
-    )
-
-    if text[end_index : end_index + 1] not in [" ", "", "\r", "\n"]:
-        last_space = text.rfind(" ", 0, end_index)
-        # If there's no space in the text (single word longer than blurb_len), return the whole text
-        end_index = last_space if last_space != -1 else len(text)
-
-    blurb = text[:end_index]
-
-    blurb = blurb.replace("\n", " ")
-    blurb = blurb.replace("\r", " ")
-    while "  " in blurb:
-        blurb = blurb.replace("  ", " ")
-
-    return blurb
+def extract_blurb(text: str, blurb_size: int) -> str:
+    token_count_func = get_default_tokenizer().tokenize
+    blurb_splitter = SentenceSplitter(
+        tokenizer=token_count_func, chunk_size=blurb_size, chunk_overlap=0
+    )
+
+    return blurb_splitter.split_text(text)[0]
 
 
 def chunk_large_section(
     section: Section,
     document: Document,
     start_chunk_id: int,
+    tokenizer: AutoTokenizer,
     chunk_size: int = CHUNK_SIZE,
-    word_overlap: int = CHUNK_WORD_OVERLAP,
-    blurb_len: int = BLURB_LENGTH,
-    chunk_overflow_max: int = CHUNK_MAX_CHAR_OVERLAP,
+    chunk_overlap: int = CHUNK_OVERLAP,
+    blurb_size: int = BLURB_SIZE,
 ) -> list[DocAwareChunk]:
-    """Split large sections into multiple chunks with the final chunk having as much previous overlap as possible.
-    Backtracks word_overlap words, delimited by whitespace, backtrack up to chunk_overflow_max characters max
-    When chunk is finished in forward direction, attempt to finish the word, but only up to chunk_overflow_max
-
-    Some details:
-        - Backtracking (overlap) => finish current word by backtracking + an additional (word_overlap - 1) words
-        - Continuation chunks start with a space generally unless overflow limit is hit
-        - Chunks end with a space generally unless overflow limit is hit
-    """
     section_text = section.text
-    blurb = extract_blurb(section_text, blurb_len)
-    char_count = len(section_text)
-    chunk_strs: list[str] = []
+    blurb = extract_blurb(section_text, blurb_size)
 
-    # start_pos is the actual start of the chunk not including the backtracking overlap
-    # segment_start_pos counts backwards to include overlap from previous chunk
-    start_pos = segment_start_pos = 0
-    while start_pos < char_count:
-        back_overflow_chars = 0
-        forward_overflow_chars = 0
-        back_count_words = 0
-        end_pos = segment_end_pos = min(start_pos + chunk_size, char_count)
+    sentence_aware_splitter = SentenceSplitter(
+        tokenizer=tokenizer.tokenize, chunk_size=chunk_size, chunk_overlap=chunk_overlap
+    )
 
-        # Forward overlap to attempt to finish the current word
-        while forward_overflow_chars < chunk_overflow_max:
-            if (
-                segment_end_pos >= char_count
-                or section_text[segment_end_pos - 1].isspace()
-            ):
-                break
-            segment_end_pos += 1
-            forward_overflow_chars += 1
+    split_texts = sentence_aware_splitter.split_text(section_text)
 
-        # Backwards overlap counting up to word_overlap words (whitespace delineated) or chunk_overflow_max chars
-        # Counts back by finishing current word by backtracking + an additional (word_overlap - 1) words
-        # If starts on a space, it considers finishing the current word as done
-        while back_overflow_chars < chunk_overflow_max:
-            if segment_start_pos == 0:
-                break
-            # no -1 offset here because we want to include prepended space to be clear it's a continuation
-            if section_text[segment_start_pos].isspace():
-                if back_count_words > word_overlap:
-                    break
-                back_count_words += 1
-            segment_start_pos -= 1
-            back_overflow_chars += 1
-
-        # Extract chunk from section text based on the pointers from above
-        chunk_str = section_text[segment_start_pos:segment_end_pos]
-        chunk_strs.append(chunk_str)
-
-        # Move pointers to next section, not counting overlaps forward or backward
-        start_pos = segment_start_pos = end_pos
-
-    # Last chunk should be as long as possible, overlap favored over tiny chunk with no context
-    if len(chunk_strs) > 1:
-        chunk_strs.pop()
-        back_count_words = 0
-        back_overflow_chars = 0
-        # Backcount chunk size number of characters then
-        # add in the backcounting overlap like with every other previous chunk
-        start_pos = char_count - chunk_size
-        while back_overflow_chars < chunk_overflow_max:
-            if start_pos == 0:
-                break
-            if section_text[start_pos].isspace():
-                if back_count_words > word_overlap:
-                    break
-                back_count_words += 1
-            start_pos -= 1
-            back_overflow_chars += 1
-        chunk_strs.append(section_text[start_pos:])
-
-    chunks = []
-    for chunk_ind, chunk_str in enumerate(chunk_strs):
-        chunks.append(
-            DocAwareChunk(
-                source_document=document,
-                chunk_id=start_chunk_id + chunk_ind,
-                blurb=blurb,
-                content=chunk_str,
-                source_links={0: section.link},
-                section_continuation=(chunk_ind != 0),
-            )
-        )
+    chunks = [
+        DocAwareChunk(
+            source_document=document,
+            chunk_id=start_chunk_id + chunk_ind,
+            blurb=blurb,
+            content=chunk_str,
+            source_links={0: section.link},
+            section_continuation=(chunk_ind != 0),
+        )
+        for chunk_ind, chunk_str in enumerate(split_texts)
+    ]
     return chunks
 
 
 def chunk_document(
     document: Document,
-    chunk_size: int = CHUNK_SIZE,
-    subsection_overlap: int = CHUNK_WORD_OVERLAP,
-    blurb_len: int = BLURB_LENGTH,
+    chunk_tok_size: int = CHUNK_SIZE,
+    subsection_overlap: int = CHUNK_OVERLAP,
+    blurb_size: int = BLURB_SIZE,
 ) -> list[DocAwareChunk]:
+    tokenizer = get_default_tokenizer()
+
     chunks: list[DocAwareChunk] = []
     link_offsets: dict[int, str] = {}
     chunk_text = ""
     for section in document.sections:
-        current_length = len(chunk_text)
+        section_tok_length = len(tokenizer.tokenize(section.text))
+        current_tok_length = len(tokenizer.tokenize(chunk_text))
         curr_offset_len = len(shared_precompare_cleanup(chunk_text))
-        section_length = len(section.text)
 
         # Large sections are considered self-contained/unique therefore they start a new chunk and are not concatenated
         # at the end by other sections
-        if section_length > chunk_size:
+        if section_tok_length > chunk_tok_size:
             if chunk_text:
                 chunks.append(
                     DocAwareChunk(
                         source_document=document,
                         chunk_id=len(chunks),
-                        blurb=extract_blurb(chunk_text, blurb_len),
+                        blurb=extract_blurb(chunk_text, blurb_size),
                         content=chunk_text,
                         source_links=link_offsets,
                         section_continuation=False,
@@ -176,15 +97,21 @@ def chunk_document(
                 section=section,
                 document=document,
                 start_chunk_id=len(chunks),
-                chunk_size=chunk_size,
-                word_overlap=subsection_overlap,
-                blurb_len=blurb_len,
+                tokenizer=tokenizer,
+                chunk_size=chunk_tok_size,
+                chunk_overlap=subsection_overlap,
+                blurb_size=blurb_size,
             )
             chunks.extend(large_section_chunks)
             continue
 
         # In the case where the whole section is shorter than a chunk, either adding to chunk or start a new one
-        if current_length + len(SECTION_SEPARATOR) + section_length <= chunk_size:
+        if (
+            current_tok_length
+            + len(tokenizer.tokenize(SECTION_SEPARATOR))
+            + section_tok_length
+            <= chunk_tok_size
+        ):
             chunk_text += (
                 SECTION_SEPARATOR + section.text if chunk_text else section.text
             )
@@ -194,7 +121,7 @@ def chunk_document(
                 DocAwareChunk(
                     source_document=document,
                     chunk_id=len(chunks),
-                    blurb=extract_blurb(chunk_text, blurb_len),
+                    blurb=extract_blurb(chunk_text, blurb_size),
                     content=chunk_text,
                     source_links=link_offsets,
                     section_continuation=False,
@@ -209,7 +136,7 @@ def chunk_document(
             DocAwareChunk(
                 source_document=document,
                 chunk_id=len(chunks),
-                blurb=extract_blurb(chunk_text, blurb_len),
+                blurb=extract_blurb(chunk_text, blurb_size),
                 content=chunk_text,
                 source_links=link_offsets,
                 section_continuation=False,
@@ -218,6 +145,17 @@ def chunk_document(
     return chunks
 
 
+def split_chunk_text_into_mini_chunks(
+    chunk_text: str, mini_chunk_size: int = MINI_CHUNK_SIZE
+) -> list[str]:
+    token_count_func = get_default_tokenizer().tokenize
+    sentence_aware_splitter = SentenceSplitter(
+        tokenizer=token_count_func, chunk_size=mini_chunk_size, chunk_overlap=0
+    )
+
+    return sentence_aware_splitter.split_text(chunk_text)
+
+
 class Chunker:
     @abc.abstractmethod
     def chunk(self, document: Document) -> list[DocAwareChunk]:
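The blurb, chunk, and mini-chunk paths above all share one mechanism: llama_index's SentenceSplitter, configured with the project tokenizer and differing only in chunk_size and chunk_overlap. A minimal sketch of that behavior, with a plain whitespace tokenizer and toy sizes standing in for danswer's get_default_tokenizer and the real token budgets (names and numbers here are illustrative, not from the commit):

from llama_index.text_splitter import SentenceSplitter

# Stand-in token counter; the real code passes get_default_tokenizer().tokenize,
# so chunk_size is measured in encoder tokens rather than whitespace words.
def whitespace_tokenize(text: str) -> list[str]:
    return text.split()

section_text = "This is one short sentence. " * 40  # pretend large section

# Large-section path: sentence-aware windows with overlap between neighbors
chunk_splitter = SentenceSplitter(
    tokenizer=whitespace_tokenize, chunk_size=16, chunk_overlap=2
)
split_texts = chunk_splitter.split_text(section_text)

# Blurb path: the same splitter with no overlap; the blurb is the first window
blurb_splitter = SentenceSplitter(
    tokenizer=whitespace_tokenize, chunk_size=16, chunk_overlap=0
)
blurb = blurb_splitter.split_text(section_text)[0]

# Each window ends on a sentence boundary where possible and stays at or
# under the chunk_size cap as counted by the supplied tokenizer.
print(len(split_texts), repr(blurb))

The trade-off is that exact chunk boundaries become data-dependent (wherever sentences happen to end) in exchange for never cutting a sentence mid-way, which is why the character-offset arithmetic disappears from the configs and tests below.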
@@ -12,7 +12,7 @@ APP_PORT = 8080
 #####
 # User Facing Features Configs
 #####
-BLURB_LENGTH = 200  # Characters. Blurbs will be truncated at the first punctuation after this many characters.
+BLURB_SIZE = 128  # Number Encoder Tokens included in the chunk blurb
 GENERATIVE_MODEL_ACCESS_CHECK_FREQ = 86400  # 1 day
 # DISABLE_GENERATIVE_AI will turn of the question answering part of Danswer. Use this
 # if you want to use Danswer as a search engine only and/or you are not comfortable sending
@@ -158,19 +158,14 @@ HARD_DELETE_CHATS = os.environ.get("HARD_DELETE_CHATS", "True").lower() != "fals
 #####
 # Text Processing Configs
 #####
-# Chunking docs to this number of characters not including finishing the last word and the overlap words below
-# Calculated by ~500 to 512 tokens max * average 4 chars per token
-CHUNK_SIZE = 2000
+CHUNK_SIZE = 512  # Tokens by embedding model
+CHUNK_OVERLAP = int(CHUNK_SIZE * 0.05)  # 5% overlap
 # More accurate results at the expense of indexing speed and index size (stores additional 4 MINI_CHUNK vectors)
-ENABLE_MINI_CHUNK = False
-# Mini chunks for fine-grained embedding, calculated as 128 tokens for 4 additional vectors for 512 chunk size above
-# Not rounded down to not lose any context in full chunk.
-MINI_CHUNK_SIZE = 512
-# Each chunk includes an additional CHUNK_WORD_OVERLAP words from previous chunk
-CHUNK_WORD_OVERLAP = 5
-# When trying to finish the last word in the chunk or counting back CHUNK_WORD_OVERLAP backwards,
-# This is the max number of characters allowed in either direction
-CHUNK_MAX_CHAR_OVERLAP = 50
+ENABLE_MINI_CHUNK = os.environ.get("ENABLE_MINI_CHUNK", "").lower() == "true"
+# Finer grained chunking for more detail retention
+# Slightly larger since the sentence aware split is a max cutoff so most minichunks will be under MINI_CHUNK_SIZE
+# tokens. But we need it to be at least as big as 1/4th chunk size to avoid having a tiny mini-chunk at the end
+MINI_CHUNK_SIZE = 150
 
 
 #####
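For reference, the arithmetic implied by the new token-denominated settings (a quick check, not part of the diff):

CHUNK_SIZE = 512                        # tokens, matching the embedding model's window
CHUNK_OVERLAP = int(CHUNK_SIZE * 0.05)  # int(25.6) == 25 tokens shared between neighbors
MINI_CHUNK_SIZE = 150                   # clears CHUNK_SIZE / 4 == 128, per the comment above

print(CHUNK_OVERLAP)                   # 25
print(MINI_CHUNK_SIZE >= CHUNK_SIZE / 4)  # True, so the trailing mini chunk is never tiny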
@@ -4,12 +4,12 @@ from uuid import UUID
 import numpy
 from sentence_transformers import SentenceTransformer  # type: ignore
 
+from danswer.chunking.chunk import split_chunk_text_into_mini_chunks
 from danswer.chunking.models import ChunkEmbedding
 from danswer.chunking.models import DocAwareChunk
 from danswer.chunking.models import IndexChunk
 from danswer.chunking.models import InferenceChunk
 from danswer.configs.app_configs import ENABLE_MINI_CHUNK
-from danswer.configs.app_configs import MINI_CHUNK_SIZE
 from danswer.configs.app_configs import NUM_RERANKED_RESULTS
 from danswer.configs.app_configs import NUM_RETURNED_HITS
 from danswer.configs.model_configs import ASYMMETRIC_PREFIX
@@ -124,36 +124,6 @@ def retrieve_ranked_documents(
     return ranked_chunks, top_chunks[num_rerank:]
 
 
-def split_chunk_text_into_mini_chunks(
-    chunk_text: str, mini_chunk_size: int = MINI_CHUNK_SIZE
-) -> list[str]:
-    chunks = []
-    start = 0
-    separators = [" ", "\n", "\r", "\t"]
-
-    while start < len(chunk_text):
-        if len(chunk_text) - start <= mini_chunk_size:
-            end = len(chunk_text)
-        else:
-            # Find the first separator character after min_chunk_length
-            end_positions = [
-                (chunk_text[start + mini_chunk_size :]).find(sep) for sep in separators
-            ]
-            # Filter out the not found cases (-1)
-            end_positions = [pos for pos in end_positions if pos != -1]
-            if not end_positions:
-                # If no more separators, the rest of the string becomes a chunk
-                end = len(chunk_text)
-            else:
-                # Add min_chunk_length and start to the end position
-                end = min(end_positions) + start + mini_chunk_size
-
-        chunks.append(chunk_text[start:end])
-        start = end + 1  # Move to the next character after the separator
-
-    return chunks
-
-
 @log_function_time()
 def encode_chunks(
     chunks: list[DocAwareChunk],
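With the character-and-separator loop above removed, callers here use the token-based, sentence-aware replacement now imported from danswer.chunking.chunk at the top of this file. A hypothetical call, with illustrative input text (mini_chunk_size is now a token budget, not a character count):

from danswer.chunking.chunk import split_chunk_text_into_mini_chunks

chunk_text = (
    "A full-size chunk that was already embedded as one vector. "
    "Mini chunks re-split it on sentence boundaries for finer-grained matching."
)
mini_chunks = split_chunk_text_into_mini_chunks(chunk_text, mini_chunk_size=150)
for mini in mini_chunks:
    print(repr(mini))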
@@ -22,6 +22,7 @@ httpx-oauth==0.11.2
 huggingface-hub==0.16.4
 jira==3.5.1
 langchain==0.0.273
+llama-index==0.8.27
 Mako==1.2.4
 nltk==3.8.1
 docx2txt==0.8
@@ -45,7 +46,7 @@ rfc3986==1.5.0
 safetensors==0.3.1
 sentence-transformers==2.2.2
 slack-sdk==3.20.2
-SQLAlchemy[mypy]==2.0.12
+SQLAlchemy[mypy]==2.0.15
 tensorflow==2.13.0
 tiktoken==0.4.0
 transformers==4.30.1
@@ -1,109 +0,0 @@
-import unittest
-
-from danswer.chunking.chunk import chunk_document
-from danswer.chunking.chunk import chunk_large_section
-from danswer.configs.constants import DocumentSource
-from danswer.connectors.models import Document
-from danswer.connectors.models import Section
-
-
-WAR_AND_PEACE = (
-    "Well, Prince, so Genoa and Lucca are now just family estates of the Buonapartes. But I warn you, "
-    "if you don’t tell me that this means war, if you still try to defend the infamies and horrors perpetrated by "
-    "that Antichrist—I really believe he is Antichrist—I will have nothing more to do with you and you are no longer "
-    "my friend, no longer my ‘faithful slave,’ as you call yourself! But how do you do? I see I have frightened "
-    "you—sit down and tell me all the news."
-)
-
-
-class TestDocumentChunking(unittest.TestCase):
-    def setUp(self) -> None:
-        self.large_section = Section(text=WAR_AND_PEACE, link="https://www.test.com/")
-        self.large_unbroken_section = Section(
-            text="0123456789" * 40, link="https://www.test.com/"
-        )
-        self.document = Document(
-            id="test_document",
-            sections=[
-                Section(
-                    text="Here is some testing text", link="https://www.test.com/0"
-                ),
-                Section(
-                    text="Some more text, still under 100 chars",
-                    link="https://www.test.com/1",
-                ),
-                Section(
-                    text="Now with this section it's longer than the chunk size",
-                    link="https://www.test.com/2",
-                ),
-                self.large_section,
-                Section(text="These last 2 sections", link="https://www.test.com/4"),
-                Section(
-                    text="should be combined into one", link="https://www.test.com/5"
-                ),
-            ],
-            source=DocumentSource.WEB,  # arbitrary picking web, doens't matter for this test
-            semantic_identifier="Whatever",
-            metadata={},
-        )
-
-    def test_chunk_large_section(self) -> None:
-        chunks = chunk_large_section(
-            section=self.large_section,
-            document=self.document,
-            start_chunk_id=5,
-            chunk_size=100,
-            word_overlap=3,
-        )
-        contents = [chunk.content for chunk in chunks]
-
-        self.assertEqual(len(contents), 5)
-        self.assertEqual(contents[0], WAR_AND_PEACE[:100])
-        self.assertEqual(
-            contents[-2], WAR_AND_PEACE[-172:-62]
-        )  # slightly longer than 100 due to overlap
-        self.assertEqual(
-            contents[-1], WAR_AND_PEACE[-125:]
-        )  # large overlap with second to last segment
-        self.assertFalse(chunks[0].section_continuation)
-        self.assertTrue(chunks[1].section_continuation)
-        self.assertTrue(chunks[-1].section_continuation)
-
-    def test_chunk_max_overflow(self) -> None:
-        chunks = chunk_large_section(
-            section=self.large_unbroken_section,
-            document=self.document,
-            start_chunk_id=5,
-            chunk_size=100,
-            word_overlap=3,
-        )
-        contents = [chunk.content for chunk in chunks]
-
-        self.assertEqual(len(contents), 4)
-        self.assertEqual(contents[0], self.large_unbroken_section.text[:150])
-        self.assertEqual(contents[1], self.large_unbroken_section.text[50:250])
-        self.assertEqual(contents[2], self.large_unbroken_section.text[150:350])
-        # Last chunk counts back from the end, full chunk size (100) + 50 overlap => 400 - 150 = 250
-        self.assertEqual(contents[3], self.large_unbroken_section.text[250:])
-
-    def test_chunk_document(self) -> None:
-        chunks = chunk_document(self.document, chunk_size=100, subsection_overlap=3)
-        self.assertEqual(len(chunks), 8)
-        self.assertEqual(
-            chunks[0].content,
-            self.document.sections[0].text + "\n\n" + self.document.sections[1].text,
-        )
-        self.assertEqual(
-            chunks[0].source_links,
-            {0: "https://www.test.com/0", 21: "https://www.test.com/1"},
-        )
-        self.assertEqual(
-            chunks[-1].source_links,
-            {0: "https://www.test.com/4", 18: "https://www.test.com/5"},
-        )
-        self.assertEqual(chunks[5].chunk_id, 5)
-        self.assertEqual(chunks[6].source_document, self.document)
-
-
-if __name__ == "__main__":
-    unittest.main()
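The deleted tests above pinned exact character offsets, such as WAR_AND_PEACE[:100] and text[50:250], which no longer hold once boundaries land on sentence ends, so the commit drops the file outright. A sketch of the kind of invariant-style test that would survive the change (not part of the commit; it assumes the default tokenizer is available and that no single sentence exceeds the token cap):

import unittest

from danswer.chunking.chunk import chunk_large_section
from danswer.configs.constants import DocumentSource
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.search.search_utils import get_default_tokenizer


class TestSentenceAwareChunking(unittest.TestCase):
    def test_token_cap_and_continuation(self) -> None:
        tokenizer = get_default_tokenizer()
        section = Section(text="This is a sentence. " * 300, link="https://www.test.com/")
        document = Document(
            id="test_document",
            sections=[section],
            source=DocumentSource.WEB,
            semantic_identifier="Whatever",
            metadata={},
        )
        chunks = chunk_large_section(
            section=section,
            document=document,
            start_chunk_id=0,
            tokenizer=tokenizer,
            chunk_size=64,
            chunk_overlap=3,
        )
        # Behavioral invariants instead of exact offsets:
        self.assertTrue(
            all(len(tokenizer.tokenize(c.content)) <= 64 for c in chunks)
        )
        self.assertFalse(chunks[0].section_continuation)
        self.assertTrue(all(c.section_continuation for c in chunks[1:]))


if __name__ == "__main__":
    unittest.main()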