Use Sentence Aware Splitter (#452)

Yuhong Sun 2023-09-16 16:28:16 -07:00 committed by GitHub
parent 63215e9c9a
commit 6b305c56b3
5 changed files with 74 additions and 279 deletions
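In short, this commit swaps Danswer's hand-rolled character-based chunker for llama_index's SentenceSplitter, so chunk, blurb, and mini-chunk sizes are now measured in encoder tokens and splits prefer sentence boundaries. The pattern the new code uses throughout looks roughly like the sketch below; the tokenizer model name is illustrative (Danswer obtains its tokenizer via get_default_tokenizer()), while the SentenceSplitter calls mirror the ones visible in this diff.

from llama_index.text_splitter import SentenceSplitter
from transformers import AutoTokenizer

# Illustrative tokenizer; in Danswer this comes from get_default_tokenizer()
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Token-based, sentence-aware splitting: 512-token chunks with 5% (25-token) overlap
splitter = SentenceSplitter(
    tokenizer=tokenizer.tokenize, chunk_size=512, chunk_overlap=25
)

text = "First sentence. Second sentence. " * 200
chunks = splitter.split_text(text)  # list[str], each chunk at most 512 tokens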

View File

@@ -1,169 +1,90 @@
import abc
-import re
from collections.abc import Callable
+from llama_index.text_splitter import SentenceSplitter
+from transformers import AutoTokenizer # type:ignore
from danswer.chunking.models import DocAwareChunk
-from danswer.configs.app_configs import BLURB_LENGTH
-from danswer.configs.app_configs import CHUNK_MAX_CHAR_OVERLAP
+from danswer.configs.app_configs import BLURB_SIZE
+from danswer.configs.app_configs import CHUNK_OVERLAP
from danswer.configs.app_configs import CHUNK_SIZE
-from danswer.configs.app_configs import CHUNK_WORD_OVERLAP
+from danswer.configs.app_configs import MINI_CHUNK_SIZE
from danswer.connectors.models import Document
from danswer.connectors.models import Section
+from danswer.search.search_utils import get_default_tokenizer
from danswer.utils.text_processing import shared_precompare_cleanup
SECTION_SEPARATOR = "\n\n"
ChunkFunc = Callable[[Document], list[DocAwareChunk]]
-def extract_blurb(text: str, blurb_len: int) -> str:
-if len(text) < blurb_len:
-return text
-match = re.search(r"[.!?:]", text[blurb_len:])
-max_blub_len = min(2 * blurb_len, len(text))
-end_index = (
-max_blub_len
-if match is None
-else min(blurb_len + match.start() + 1, max_blub_len)
+def extract_blurb(text: str, blurb_size: int) -> str:
+token_count_func = get_default_tokenizer().tokenize
+blurb_splitter = SentenceSplitter(
+tokenizer=token_count_func, chunk_size=blurb_size, chunk_overlap=0
)
-if text[end_index : end_index + 1] not in [" ", "", "\r", "\n"]:
-last_space = text.rfind(" ", 0, end_index)
-# If there's no space in the text (single word longer than blurb_len), return the whole text
-end_index = last_space if last_space != -1 else len(text)
-blurb = text[:end_index]
-blurb = blurb.replace("\n", " ")
-blurb = blurb.replace("\r", " ")
-while "  " in blurb:
-blurb = blurb.replace("  ", " ")
-return blurb
+return blurb_splitter.split_text(text)[0]
def chunk_large_section(
section: Section,
document: Document,
start_chunk_id: int,
+tokenizer: AutoTokenizer,
chunk_size: int = CHUNK_SIZE,
-word_overlap: int = CHUNK_WORD_OVERLAP,
-blurb_len: int = BLURB_LENGTH,
-chunk_overflow_max: int = CHUNK_MAX_CHAR_OVERLAP,
+chunk_overlap: int = CHUNK_OVERLAP,
+blurb_size: int = BLURB_SIZE,
) -> list[DocAwareChunk]:
-"""Split large sections into multiple chunks with the final chunk having as much previous overlap as possible.
-Backtracks word_overlap words, delimited by whitespace, backtrack up to chunk_overflow_max characters max
-When chunk is finished in forward direction, attempt to finish the word, but only up to chunk_overflow_max
-Some details:
-- Backtracking (overlap) => finish current word by backtracking + an additional (word_overlap - 1) words
-- Continuation chunks start with a space generally unless overflow limit is hit
-- Chunks end with a space generally unless overflow limit is hit
-"""
section_text = section.text
-blurb = extract_blurb(section_text, blurb_len)
-char_count = len(section_text)
-chunk_strs: list[str] = []
+blurb = extract_blurb(section_text, blurb_size)
-# start_pos is the actual start of the chunk not including the backtracking overlap
-# segment_start_pos counts backwards to include overlap from previous chunk
-start_pos = segment_start_pos = 0
-while start_pos < char_count:
-back_overflow_chars = 0
-forward_overflow_chars = 0
-back_count_words = 0
-end_pos = segment_end_pos = min(start_pos + chunk_size, char_count)
+sentence_aware_splitter = SentenceSplitter(
+tokenizer=tokenizer.tokenize, chunk_size=chunk_size, chunk_overlap=chunk_overlap
+)
-# Forward overlap to attempt to finish the current word
-while forward_overflow_chars < chunk_overflow_max:
-if (
-segment_end_pos >= char_count
-or section_text[segment_end_pos - 1].isspace()
-):
-break
-segment_end_pos += 1
-forward_overflow_chars += 1
+split_texts = sentence_aware_splitter.split_text(section_text)
-# Backwards overlap counting up to word_overlap words (whitespace delineated) or chunk_overflow_max chars
-# Counts back by finishing current word by backtracking + an additional (word_overlap - 1) words
-# If starts on a space, it considers finishing the current word as done
-while back_overflow_chars < chunk_overflow_max:
-if segment_start_pos == 0:
-break
-# no -1 offset here because we want to include prepended space to be clear it's a continuation
-if section_text[segment_start_pos].isspace():
-back_count_words += 1
-if back_count_words > word_overlap:
-break
-back_count_words += 1
-segment_start_pos -= 1
-back_overflow_chars += 1
-# Extract chunk from section text based on the pointers from above
-chunk_str = section_text[segment_start_pos:segment_end_pos]
-chunk_strs.append(chunk_str)
-# Move pointers to next section, not counting overlaps forward or backward
-start_pos = segment_start_pos = end_pos
-# Last chunk should be as long as possible, overlap favored over tiny chunk with no context
-if len(chunk_strs) > 1:
-chunk_strs.pop()
-back_count_words = 0
-back_overflow_chars = 0
-# Backcount chunk size number of characters then
-# add in the backcounting overlap like with every other previous chunk
-start_pos = char_count - chunk_size
-while back_overflow_chars < chunk_overflow_max:
-if start_pos == 0:
-break
-if section_text[start_pos].isspace():
-if back_count_words > word_overlap:
-break
-back_count_words += 1
-start_pos -= 1
-back_overflow_chars += 1
-chunk_strs.append(section_text[start_pos:])
-chunks = []
-for chunk_ind, chunk_str in enumerate(chunk_strs):
-chunks.append(
-DocAwareChunk(
-source_document=document,
-chunk_id=start_chunk_id + chunk_ind,
-blurb=blurb,
-content=chunk_str,
-source_links={0: section.link},
-section_continuation=(chunk_ind != 0),
-)
+chunks = [
+DocAwareChunk(
+source_document=document,
+chunk_id=start_chunk_id + chunk_ind,
+blurb=blurb,
+content=chunk_str,
+source_links={0: section.link},
+section_continuation=(chunk_ind != 0),
+)
)
+for chunk_ind, chunk_str in enumerate(split_texts)
+]
return chunks
def chunk_document(
document: Document,
-chunk_size: int = CHUNK_SIZE,
-subsection_overlap: int = CHUNK_WORD_OVERLAP,
-blurb_len: int = BLURB_LENGTH,
+chunk_tok_size: int = CHUNK_SIZE,
+subsection_overlap: int = CHUNK_OVERLAP,
+blurb_size: int = BLURB_SIZE,
) -> list[DocAwareChunk]:
+tokenizer = get_default_tokenizer()
chunks: list[DocAwareChunk] = []
link_offsets: dict[int, str] = {}
chunk_text = ""
for section in document.sections:
-current_length = len(chunk_text)
+section_tok_length = len(tokenizer.tokenize(section.text))
+current_tok_length = len(tokenizer.tokenize(chunk_text))
curr_offset_len = len(shared_precompare_cleanup(chunk_text))
-section_length = len(section.text)
# Large sections are considered self-contained/unique therefore they start a new chunk and are not concatenated
# at the end by other sections
-if section_length > chunk_size:
+if section_tok_length > chunk_tok_size:
if chunk_text:
chunks.append(
DocAwareChunk(
source_document=document,
chunk_id=len(chunks),
-blurb=extract_blurb(chunk_text, blurb_len),
+blurb=extract_blurb(chunk_text, blurb_size),
content=chunk_text,
source_links=link_offsets,
section_continuation=False,
@@ -176,15 +97,21 @@ def chunk_document(
section=section,
document=document,
start_chunk_id=len(chunks),
-chunk_size=chunk_size,
-word_overlap=subsection_overlap,
-blurb_len=blurb_len,
+tokenizer=tokenizer,
+chunk_size=chunk_tok_size,
+chunk_overlap=subsection_overlap,
+blurb_size=blurb_size,
)
chunks.extend(large_section_chunks)
continue
# In the case where the whole section is shorter than a chunk, either adding to chunk or start a new one
-if current_length + len(SECTION_SEPARATOR) + section_length <= chunk_size:
+if (
+current_tok_length
++ len(tokenizer.tokenize(SECTION_SEPARATOR))
++ section_tok_length
+<= chunk_tok_size
+):
chunk_text += (
SECTION_SEPARATOR + section.text if chunk_text else section.text
)
@@ -194,7 +121,7 @@ def chunk_document(
DocAwareChunk(
source_document=document,
chunk_id=len(chunks),
-blurb=extract_blurb(chunk_text, blurb_len),
+blurb=extract_blurb(chunk_text, blurb_size),
content=chunk_text,
source_links=link_offsets,
section_continuation=False,
@@ -209,7 +136,7 @@ def chunk_document(
DocAwareChunk(
source_document=document,
chunk_id=len(chunks),
-blurb=extract_blurb(chunk_text, blurb_len),
+blurb=extract_blurb(chunk_text, blurb_size),
content=chunk_text,
source_links=link_offsets,
section_continuation=False,
@@ -218,6 +145,17 @@ def chunk_document(
return chunks
+def split_chunk_text_into_mini_chunks(
+chunk_text: str, mini_chunk_size: int = MINI_CHUNK_SIZE
+) -> list[str]:
+token_count_func = get_default_tokenizer().tokenize
+sentence_aware_splitter = SentenceSplitter(
+tokenizer=token_count_func, chunk_size=mini_chunk_size, chunk_overlap=0
+)
+return sentence_aware_splitter.split_text(chunk_text)
class Chunker:
@abc.abstractmethod
def chunk(self, document: Document) -> list[DocAwareChunk]:
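For orientation, a rough usage sketch of the reworked chunking code above; the document contents and links are illustrative, not part of the commit. Small sections are concatenated until the token budget is reached, while a section longer than the chunk size is routed to chunk_large_section and sentence-split on its own.

from danswer.chunking.chunk import chunk_document
from danswer.configs.constants import DocumentSource
from danswer.connectors.models import Document
from danswer.connectors.models import Section

doc = Document(
    id="example_doc",
    sections=[
        Section(text="A short intro section.", link="https://example.com/0"),
        Section(text="A much longer body section. " * 500, link="https://example.com/1"),
    ],
    source=DocumentSource.WEB,
    semantic_identifier="Example",
    metadata={},
)

for chunk in chunk_document(doc):
    # Continuation chunks of the long section reuse its blurb and are flagged
    # via section_continuation; every chunk stays within the token budget.
    print(chunk.chunk_id, chunk.section_continuation, len(chunk.content))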

View File

@@ -12,7 +12,7 @@ APP_PORT = 8080
#####
# User Facing Features Configs
#####
-BLURB_LENGTH = 200 # Characters. Blurbs will be truncated at the first punctuation after this many characters.
+BLURB_SIZE = 128 # Number of encoder tokens included in the chunk blurb
GENERATIVE_MODEL_ACCESS_CHECK_FREQ = 86400 # 1 day
# DISABLE_GENERATIVE_AI will turn off the question answering part of Danswer. Use this
# if you want to use Danswer as a search engine only and/or you are not comfortable sending
@@ -158,19 +158,14 @@ HARD_DELETE_CHATS = os.environ.get("HARD_DELETE_CHATS", "True").lower() != "fals
#####
# Text Processing Configs
#####
-# Chunking docs to this number of characters not including finishing the last word and the overlap words below
-# Calculated by ~500 to 512 tokens max * average 4 chars per token
-CHUNK_SIZE = 2000
+CHUNK_SIZE = 512 # Tokens by embedding model
+CHUNK_OVERLAP = int(CHUNK_SIZE * 0.05) # 5% overlap
# More accurate results at the expense of indexing speed and index size (stores additional 4 MINI_CHUNK vectors)
-ENABLE_MINI_CHUNK = False
-# Mini chunks for fine-grained embedding, calculated as 128 tokens for 4 additional vectors for 512 chunk size above
-# Not rounded down to not lose any context in full chunk.
-MINI_CHUNK_SIZE = 512
-# Each chunk includes an additional CHUNK_WORD_OVERLAP words from previous chunk
-CHUNK_WORD_OVERLAP = 5
-# When trying to finish the last word in the chunk or counting back CHUNK_WORD_OVERLAP backwards,
-# This is the max number of characters allowed in either direction
-CHUNK_MAX_CHAR_OVERLAP = 50
+ENABLE_MINI_CHUNK = os.environ.get("ENABLE_MINI_CHUNK", "").lower() == "true"
+# Finer grained chunking for more detail retention
+# Slightly larger since the sentence aware split is a max cutoff so most minichunks will be under MINI_CHUNK_SIZE
+# tokens. But we need it to be at least as big as 1/4th chunk size to avoid having a tiny mini-chunk at the end
+MINI_CHUNK_SIZE = 150
#####
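The token budgets above imply a 25-token overlap between neighboring chunks and roughly four mini-chunks per full chunk; a short worked restatement (the derived numbers follow from the config lines, the comments are editorial):

import os

CHUNK_SIZE = 512                        # tokens per chunk, sized to the embedding model
CHUNK_OVERLAP = int(CHUNK_SIZE * 0.05)  # int(512 * 0.05) = 25 tokens shared between neighbors
MINI_CHUNK_SIZE = 150                   # above 512 / 4 = 128, so the trailing mini-chunk is never tiny

# Mini-chunking stays off unless opted into via the environment, e.g. ENABLE_MINI_CHUNK=true
ENABLE_MINI_CHUNK = os.environ.get("ENABLE_MINI_CHUNK", "").lower() == "true"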

View File

@@ -4,12 +4,12 @@ from uuid import UUID
import numpy
from sentence_transformers import SentenceTransformer # type: ignore
+from danswer.chunking.chunk import split_chunk_text_into_mini_chunks
from danswer.chunking.models import ChunkEmbedding
from danswer.chunking.models import DocAwareChunk
from danswer.chunking.models import IndexChunk
from danswer.chunking.models import InferenceChunk
from danswer.configs.app_configs import ENABLE_MINI_CHUNK
-from danswer.configs.app_configs import MINI_CHUNK_SIZE
from danswer.configs.app_configs import NUM_RERANKED_RESULTS
from danswer.configs.app_configs import NUM_RETURNED_HITS
from danswer.configs.model_configs import ASYMMETRIC_PREFIX
@@ -124,36 +124,6 @@ def retrieve_ranked_documents(
return ranked_chunks, top_chunks[num_rerank:]
-def split_chunk_text_into_mini_chunks(
-chunk_text: str, mini_chunk_size: int = MINI_CHUNK_SIZE
-) -> list[str]:
-chunks = []
-start = 0
-separators = [" ", "\n", "\r", "\t"]
-while start < len(chunk_text):
-if len(chunk_text) - start <= mini_chunk_size:
-end = len(chunk_text)
-else:
-# Find the first separator character after min_chunk_length
-end_positions = [
-(chunk_text[start + mini_chunk_size :]).find(sep) for sep in separators
-]
-# Filter out the not found cases (-1)
-end_positions = [pos for pos in end_positions if pos != -1]
-if not end_positions:
-# If no more separators, the rest of the string becomes a chunk
-end = len(chunk_text)
-else:
-# Add min_chunk_length and start to the end position
-end = min(end_positions) + start + mini_chunk_size
-chunks.append(chunk_text[start:end])
-start = end + 1 # Move to the next character after the separator
-return chunks
@log_function_time()
def encode_chunks(
chunks: list[DocAwareChunk],
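The mini-chunk helper now lives in danswer.chunking.chunk and is consumed at embedding time. The body of encode_chunks is not shown in this diff, so the following is only an assumed sketch of how the relocated helper is typically wired in; the wrapper function name and the model are illustrative.

from sentence_transformers import SentenceTransformer  # type: ignore

from danswer.chunking.chunk import split_chunk_text_into_mini_chunks
from danswer.configs.app_configs import ENABLE_MINI_CHUNK

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # illustrative model


def embed_chunk_text(chunk_text: str) -> list[list[float]]:
    # Embed the full chunk, plus its sentence-aware mini-chunks when enabled,
    # trading extra vectors in the index for finer-grained retrieval.
    texts = [chunk_text]
    if ENABLE_MINI_CHUNK:
        texts.extend(split_chunk_text_into_mini_chunks(chunk_text))
    return model.encode(texts).tolist()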

View File

@@ -22,6 +22,7 @@ httpx-oauth==0.11.2
huggingface-hub==0.16.4
jira==3.5.1
langchain==0.0.273
+llama-index==0.8.27
Mako==1.2.4
nltk==3.8.1
docx2txt==0.8
@@ -45,7 +46,7 @@ rfc3986==1.5.0
safetensors==0.3.1
sentence-transformers==2.2.2
slack-sdk==3.20.2
-SQLAlchemy[mypy]==2.0.12
+SQLAlchemy[mypy]==2.0.15
tensorflow==2.13.0
tiktoken==0.4.0
transformers==4.30.1

View File

@@ -1,109 +0,0 @@
import unittest
from danswer.chunking.chunk import chunk_document
from danswer.chunking.chunk import chunk_large_section
from danswer.configs.constants import DocumentSource
from danswer.connectors.models import Document
from danswer.connectors.models import Section
WAR_AND_PEACE = (
"Well, Prince, so Genoa and Lucca are now just family estates of the Buonapartes. But I warn you, "
"if you don't tell me that this means war, if you still try to defend the infamies and horrors perpetrated by "
"that Antichrist—I really believe he is Antichrist—I will have nothing more to do with you and you are no longer "
"my friend, no longer my faithful slave, as you call yourself! But how do you do? I see I have frightened "
"you—sit down and tell me all the news."
)
class TestDocumentChunking(unittest.TestCase):
def setUp(self) -> None:
self.large_section = Section(text=WAR_AND_PEACE, link="https://www.test.com/")
self.large_unbroken_section = Section(
text="0123456789" * 40, link="https://www.test.com/"
)
self.document = Document(
id="test_document",
sections=[
Section(
text="Here is some testing text", link="https://www.test.com/0"
),
Section(
text="Some more text, still under 100 chars",
link="https://www.test.com/1",
),
Section(
text="Now with this section it's longer than the chunk size",
link="https://www.test.com/2",
),
self.large_section,
Section(text="These last 2 sections", link="https://www.test.com/4"),
Section(
text="should be combined into one", link="https://www.test.com/5"
),
],
source=DocumentSource.WEB, # arbitrarily picking web, doesn't matter for this test
semantic_identifier="Whatever",
metadata={},
)
def test_chunk_large_section(self) -> None:
chunks = chunk_large_section(
section=self.large_section,
document=self.document,
start_chunk_id=5,
chunk_size=100,
word_overlap=3,
)
contents = [chunk.content for chunk in chunks]
self.assertEqual(len(contents), 5)
self.assertEqual(contents[0], WAR_AND_PEACE[:100])
self.assertEqual(
contents[-2], WAR_AND_PEACE[-172:-62]
) # slightly longer than 100 due to overlap
self.assertEqual(
contents[-1], WAR_AND_PEACE[-125:]
) # large overlap with second to last segment
self.assertFalse(chunks[0].section_continuation)
self.assertTrue(chunks[1].section_continuation)
self.assertTrue(chunks[-1].section_continuation)
def test_chunk_max_overflow(self) -> None:
chunks = chunk_large_section(
section=self.large_unbroken_section,
document=self.document,
start_chunk_id=5,
chunk_size=100,
word_overlap=3,
)
contents = [chunk.content for chunk in chunks]
self.assertEqual(len(contents), 4)
self.assertEqual(contents[0], self.large_unbroken_section.text[:150])
self.assertEqual(contents[1], self.large_unbroken_section.text[50:250])
self.assertEqual(contents[2], self.large_unbroken_section.text[150:350])
# Last chunk counts back from the end, full chunk size (100) + 50 overlap => 400 - 150 = 250
self.assertEqual(contents[3], self.large_unbroken_section.text[250:])
def test_chunk_document(self) -> None:
chunks = chunk_document(self.document, chunk_size=100, subsection_overlap=3)
self.assertEqual(len(chunks), 8)
self.assertEqual(
chunks[0].content,
self.document.sections[0].text + "\n\n" + self.document.sections[1].text,
)
self.assertEqual(
chunks[0].source_links,
{0: "https://www.test.com/0", 21: "https://www.test.com/1"},
)
self.assertEqual(
chunks[-1].source_links,
{0: "https://www.test.com/4", 18: "https://www.test.com/5"},
)
self.assertEqual(chunks[5].chunk_id, 5)
self.assertEqual(chunks[6].source_document, self.document)
if __name__ == "__main__":
unittest.main()
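The character-offset assertions above no longer make sense for sentence-aware, token-based chunks, which is why the whole test module is deleted rather than updated. This commit adds no replacement; if one were written, it would more likely assert token budgets than exact slices, roughly along these hypothetical lines:

import unittest

from danswer.chunking.chunk import chunk_document
from danswer.configs.app_configs import CHUNK_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.search.search_utils import get_default_tokenizer


class TestSentenceAwareChunking(unittest.TestCase):
    def test_chunks_respect_token_budget(self) -> None:
        document = Document(
            id="test_document",
            sections=[
                Section(text="Some sentence here. " * 400, link="https://www.test.com/")
            ],
            source=DocumentSource.WEB,
            semantic_identifier="Whatever",
            metadata={},
        )
        tokenizer = get_default_tokenizer()
        for chunk in chunk_document(document):
            # The sentence-aware splitter treats chunk size as a maximum cutoff
            self.assertLessEqual(len(tokenizer.tokenize(chunk.content)), CHUNK_SIZE)


if __name__ == "__main__":
    unittest.main()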