From 6b305c56b3e7e92ad6a431b198c83c57fbc58690 Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Sat, 16 Sep 2023 16:28:16 -0700 Subject: [PATCH] Use Sentence Aware Splitter (#452) --- backend/danswer/chunking/chunk.py | 188 ++++++------------ backend/danswer/configs/app_configs.py | 21 +- backend/danswer/search/semantic_search.py | 32 +-- backend/requirements/default.txt | 3 +- .../unit/qa_service/chunking/test_chunk.py | 109 ---------- 5 files changed, 74 insertions(+), 279 deletions(-) delete mode 100644 backend/tests/unit/qa_service/chunking/test_chunk.py diff --git a/backend/danswer/chunking/chunk.py b/backend/danswer/chunking/chunk.py index d9e8fea34..3dad17f83 100644 --- a/backend/danswer/chunking/chunk.py +++ b/backend/danswer/chunking/chunk.py @@ -1,169 +1,90 @@ import abc -import re from collections.abc import Callable +from llama_index.text_splitter import SentenceSplitter +from transformers import AutoTokenizer # type:ignore + from danswer.chunking.models import DocAwareChunk -from danswer.configs.app_configs import BLURB_LENGTH -from danswer.configs.app_configs import CHUNK_MAX_CHAR_OVERLAP +from danswer.configs.app_configs import BLURB_SIZE +from danswer.configs.app_configs import CHUNK_OVERLAP from danswer.configs.app_configs import CHUNK_SIZE -from danswer.configs.app_configs import CHUNK_WORD_OVERLAP +from danswer.configs.app_configs import MINI_CHUNK_SIZE from danswer.connectors.models import Document from danswer.connectors.models import Section +from danswer.search.search_utils import get_default_tokenizer from danswer.utils.text_processing import shared_precompare_cleanup + SECTION_SEPARATOR = "\n\n" ChunkFunc = Callable[[Document], list[DocAwareChunk]] -def extract_blurb(text: str, blurb_len: int) -> str: - if len(text) < blurb_len: - return text - - match = re.search(r"[.!?:]", text[blurb_len:]) - max_blub_len = min(2 * blurb_len, len(text)) - - end_index = ( - max_blub_len - if match is None - else min(blurb_len + match.start() + 1, max_blub_len) +def extract_blurb(text: str, blurb_size: int) -> str: + token_count_func = get_default_tokenizer().tokenize + blurb_splitter = SentenceSplitter( + tokenizer=token_count_func, chunk_size=blurb_size, chunk_overlap=0 ) - if text[end_index : end_index + 1] not in [" ", "", "\r", "\n"]: - last_space = text.rfind(" ", 0, end_index) - # If there's no space in the text (single word longer than blurb_len), return the whole text - end_index = last_space if last_space != -1 else len(text) - - blurb = text[:end_index] - - blurb = blurb.replace("\n", " ") - blurb = blurb.replace("\r", " ") - while " " in blurb: - blurb = blurb.replace(" ", " ") - - return blurb + return blurb_splitter.split_text(text)[0] def chunk_large_section( section: Section, document: Document, start_chunk_id: int, + tokenizer: AutoTokenizer, chunk_size: int = CHUNK_SIZE, - word_overlap: int = CHUNK_WORD_OVERLAP, - blurb_len: int = BLURB_LENGTH, - chunk_overflow_max: int = CHUNK_MAX_CHAR_OVERLAP, + chunk_overlap: int = CHUNK_OVERLAP, + blurb_size: int = BLURB_SIZE, ) -> list[DocAwareChunk]: - """Split large sections into multiple chunks with the final chunk having as much previous overlap as possible. 
- Backtracks word_overlap words, delimited by whitespace, backtrack up to chunk_overflow_max characters max - When chunk is finished in forward direction, attempt to finish the word, but only up to chunk_overflow_max - - Some details: - - Backtracking (overlap) => finish current word by backtracking + an additional (word_overlap - 1) words - - Continuation chunks start with a space generally unless overflow limit is hit - - Chunks end with a space generally unless overflow limit is hit - """ section_text = section.text - blurb = extract_blurb(section_text, blurb_len) - char_count = len(section_text) - chunk_strs: list[str] = [] + blurb = extract_blurb(section_text, blurb_size) - # start_pos is the actual start of the chunk not including the backtracking overlap - # segment_start_pos counts backwards to include overlap from previous chunk - start_pos = segment_start_pos = 0 - while start_pos < char_count: - back_overflow_chars = 0 - forward_overflow_chars = 0 - back_count_words = 0 - end_pos = segment_end_pos = min(start_pos + chunk_size, char_count) + sentence_aware_splitter = SentenceSplitter( + tokenizer=tokenizer.tokenize, chunk_size=chunk_size, chunk_overlap=chunk_overlap + ) - # Forward overlap to attempt to finish the current word - while forward_overflow_chars < chunk_overflow_max: - if ( - segment_end_pos >= char_count - or section_text[segment_end_pos - 1].isspace() - ): - break - segment_end_pos += 1 - forward_overflow_chars += 1 + split_texts = sentence_aware_splitter.split_text(section_text) - # Backwards overlap counting up to word_overlap words (whitespace delineated) or chunk_overflow_max chars - # Counts back by finishing current word by backtracking + an additional (word_overlap - 1) words - # If starts on a space, it considers finishing the current word as done - while back_overflow_chars < chunk_overflow_max: - if segment_start_pos == 0: - break - # no -1 offset here because we want to include prepended space to be clear it's a continuation - if section_text[segment_start_pos].isspace(): - back_count_words += 1 - if back_count_words > word_overlap: - break - back_count_words += 1 - segment_start_pos -= 1 - back_overflow_chars += 1 - - # Extract chunk from section text based on the pointers from above - chunk_str = section_text[segment_start_pos:segment_end_pos] - chunk_strs.append(chunk_str) - - # Move pointers to next section, not counting overlaps forward or backward - start_pos = segment_start_pos = end_pos - - # Last chunk should be as long as possible, overlap favored over tiny chunk with no context - if len(chunk_strs) > 1: - chunk_strs.pop() - back_count_words = 0 - back_overflow_chars = 0 - # Backcount chunk size number of characters then - # add in the backcounting overlap like with every other previous chunk - start_pos = char_count - chunk_size - while back_overflow_chars < chunk_overflow_max: - if start_pos == 0: - break - if section_text[start_pos].isspace(): - if back_count_words > word_overlap: - break - back_count_words += 1 - start_pos -= 1 - back_overflow_chars += 1 - chunk_strs.append(section_text[start_pos:]) - - chunks = [] - for chunk_ind, chunk_str in enumerate(chunk_strs): - chunks.append( - DocAwareChunk( - source_document=document, - chunk_id=start_chunk_id + chunk_ind, - blurb=blurb, - content=chunk_str, - source_links={0: section.link}, - section_continuation=(chunk_ind != 0), - ) + chunks = [ + DocAwareChunk( + source_document=document, + chunk_id=start_chunk_id + chunk_ind, + blurb=blurb, + content=chunk_str, + source_links={0: 
section.link}, + section_continuation=(chunk_ind != 0), ) + for chunk_ind, chunk_str in enumerate(split_texts) + ] return chunks def chunk_document( document: Document, - chunk_size: int = CHUNK_SIZE, - subsection_overlap: int = CHUNK_WORD_OVERLAP, - blurb_len: int = BLURB_LENGTH, + chunk_tok_size: int = CHUNK_SIZE, + subsection_overlap: int = CHUNK_OVERLAP, + blurb_size: int = BLURB_SIZE, ) -> list[DocAwareChunk]: + tokenizer = get_default_tokenizer() + chunks: list[DocAwareChunk] = [] link_offsets: dict[int, str] = {} chunk_text = "" for section in document.sections: - current_length = len(chunk_text) + section_tok_length = len(tokenizer.tokenize(section.text)) + current_tok_length = len(tokenizer.tokenize(chunk_text)) curr_offset_len = len(shared_precompare_cleanup(chunk_text)) - section_length = len(section.text) # Large sections are considered self-contained/unique therefore they start a new chunk and are not concatenated # at the end by other sections - if section_length > chunk_size: + if section_tok_length > chunk_tok_size: if chunk_text: chunks.append( DocAwareChunk( source_document=document, chunk_id=len(chunks), - blurb=extract_blurb(chunk_text, blurb_len), + blurb=extract_blurb(chunk_text, blurb_size), content=chunk_text, source_links=link_offsets, section_continuation=False, @@ -176,15 +97,21 @@ def chunk_document( section=section, document=document, start_chunk_id=len(chunks), - chunk_size=chunk_size, - word_overlap=subsection_overlap, - blurb_len=blurb_len, + tokenizer=tokenizer, + chunk_size=chunk_tok_size, + chunk_overlap=subsection_overlap, + blurb_size=blurb_size, ) chunks.extend(large_section_chunks) continue # In the case where the whole section is shorter than a chunk, either adding to chunk or start a new one - if current_length + len(SECTION_SEPARATOR) + section_length <= chunk_size: + if ( + current_tok_length + + len(tokenizer.tokenize(SECTION_SEPARATOR)) + + section_tok_length + <= chunk_tok_size + ): chunk_text += ( SECTION_SEPARATOR + section.text if chunk_text else section.text ) @@ -194,7 +121,7 @@ def chunk_document( DocAwareChunk( source_document=document, chunk_id=len(chunks), - blurb=extract_blurb(chunk_text, blurb_len), + blurb=extract_blurb(chunk_text, blurb_size), content=chunk_text, source_links=link_offsets, section_continuation=False, @@ -209,7 +136,7 @@ def chunk_document( DocAwareChunk( source_document=document, chunk_id=len(chunks), - blurb=extract_blurb(chunk_text, blurb_len), + blurb=extract_blurb(chunk_text, blurb_size), content=chunk_text, source_links=link_offsets, section_continuation=False, @@ -218,6 +145,17 @@ def chunk_document( return chunks +def split_chunk_text_into_mini_chunks( + chunk_text: str, mini_chunk_size: int = MINI_CHUNK_SIZE +) -> list[str]: + token_count_func = get_default_tokenizer().tokenize + sentence_aware_splitter = SentenceSplitter( + tokenizer=token_count_func, chunk_size=mini_chunk_size, chunk_overlap=0 + ) + + return sentence_aware_splitter.split_text(chunk_text) + + class Chunker: @abc.abstractmethod def chunk(self, document: Document) -> list[DocAwareChunk]: diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py index 889d823e5..18a8df088 100644 --- a/backend/danswer/configs/app_configs.py +++ b/backend/danswer/configs/app_configs.py @@ -12,7 +12,7 @@ APP_PORT = 8080 ##### # User Facing Features Configs ##### -BLURB_LENGTH = 200 # Characters. Blurbs will be truncated at the first punctuation after this many characters. 
+BLURB_SIZE = 128 # Number Encoder Tokens included in the chunk blurb GENERATIVE_MODEL_ACCESS_CHECK_FREQ = 86400 # 1 day # DISABLE_GENERATIVE_AI will turn of the question answering part of Danswer. Use this # if you want to use Danswer as a search engine only and/or you are not comfortable sending @@ -158,19 +158,14 @@ HARD_DELETE_CHATS = os.environ.get("HARD_DELETE_CHATS", "True").lower() != "fals ##### # Text Processing Configs ##### -# Chunking docs to this number of characters not including finishing the last word and the overlap words below -# Calculated by ~500 to 512 tokens max * average 4 chars per token -CHUNK_SIZE = 2000 +CHUNK_SIZE = 512 # Tokens by embedding model +CHUNK_OVERLAP = int(CHUNK_SIZE * 0.05) # 5% overlap # More accurate results at the expense of indexing speed and index size (stores additional 4 MINI_CHUNK vectors) -ENABLE_MINI_CHUNK = False -# Mini chunks for fine-grained embedding, calculated as 128 tokens for 4 additional vectors for 512 chunk size above -# Not rounded down to not lose any context in full chunk. -MINI_CHUNK_SIZE = 512 -# Each chunk includes an additional CHUNK_WORD_OVERLAP words from previous chunk -CHUNK_WORD_OVERLAP = 5 -# When trying to finish the last word in the chunk or counting back CHUNK_WORD_OVERLAP backwards, -# This is the max number of characters allowed in either direction -CHUNK_MAX_CHAR_OVERLAP = 50 +ENABLE_MINI_CHUNK = os.environ.get("ENABLE_MINI_CHUNK", "").lower() == "true" +# Finer grained chunking for more detail retention +# Slightly larger since the sentence aware split is a max cutoff so most minichunks will be under MINI_CHUNK_SIZE +# tokens. But we need it to be at least as big as 1/4th chunk size to avoid having a tiny mini-chunk at the end +MINI_CHUNK_SIZE = 150 ##### diff --git a/backend/danswer/search/semantic_search.py b/backend/danswer/search/semantic_search.py index 433a8268f..60aff8dea 100644 --- a/backend/danswer/search/semantic_search.py +++ b/backend/danswer/search/semantic_search.py @@ -4,12 +4,12 @@ from uuid import UUID import numpy from sentence_transformers import SentenceTransformer # type: ignore +from danswer.chunking.chunk import split_chunk_text_into_mini_chunks from danswer.chunking.models import ChunkEmbedding from danswer.chunking.models import DocAwareChunk from danswer.chunking.models import IndexChunk from danswer.chunking.models import InferenceChunk from danswer.configs.app_configs import ENABLE_MINI_CHUNK -from danswer.configs.app_configs import MINI_CHUNK_SIZE from danswer.configs.app_configs import NUM_RERANKED_RESULTS from danswer.configs.app_configs import NUM_RETURNED_HITS from danswer.configs.model_configs import ASYMMETRIC_PREFIX @@ -124,36 +124,6 @@ def retrieve_ranked_documents( return ranked_chunks, top_chunks[num_rerank:] -def split_chunk_text_into_mini_chunks( - chunk_text: str, mini_chunk_size: int = MINI_CHUNK_SIZE -) -> list[str]: - chunks = [] - start = 0 - separators = [" ", "\n", "\r", "\t"] - - while start < len(chunk_text): - if len(chunk_text) - start <= mini_chunk_size: - end = len(chunk_text) - else: - # Find the first separator character after min_chunk_length - end_positions = [ - (chunk_text[start + mini_chunk_size :]).find(sep) for sep in separators - ] - # Filter out the not found cases (-1) - end_positions = [pos for pos in end_positions if pos != -1] - if not end_positions: - # If no more separators, the rest of the string becomes a chunk - end = len(chunk_text) - else: - # Add min_chunk_length and start to the end position - end = min(end_positions) + start + 
mini_chunk_size - - chunks.append(chunk_text[start:end]) - start = end + 1 # Move to the next character after the separator - - return chunks - - @log_function_time() def encode_chunks( chunks: list[DocAwareChunk], diff --git a/backend/requirements/default.txt b/backend/requirements/default.txt index 5921b4888..0cc8effa9 100644 --- a/backend/requirements/default.txt +++ b/backend/requirements/default.txt @@ -22,6 +22,7 @@ httpx-oauth==0.11.2 huggingface-hub==0.16.4 jira==3.5.1 langchain==0.0.273 +llama-index==0.8.27 Mako==1.2.4 nltk==3.8.1 docx2txt==0.8 @@ -45,7 +46,7 @@ rfc3986==1.5.0 safetensors==0.3.1 sentence-transformers==2.2.2 slack-sdk==3.20.2 -SQLAlchemy[mypy]==2.0.12 +SQLAlchemy[mypy]==2.0.15 tensorflow==2.13.0 tiktoken==0.4.0 transformers==4.30.1 diff --git a/backend/tests/unit/qa_service/chunking/test_chunk.py b/backend/tests/unit/qa_service/chunking/test_chunk.py deleted file mode 100644 index 738ba0c1f..000000000 --- a/backend/tests/unit/qa_service/chunking/test_chunk.py +++ /dev/null @@ -1,109 +0,0 @@ -import unittest - -from danswer.chunking.chunk import chunk_document -from danswer.chunking.chunk import chunk_large_section -from danswer.configs.constants import DocumentSource -from danswer.connectors.models import Document -from danswer.connectors.models import Section - - -WAR_AND_PEACE = ( - "Well, Prince, so Genoa and Lucca are now just family estates of the Buonapartes. But I warn you, " - "if you don’t tell me that this means war, if you still try to defend the infamies and horrors perpetrated by " - "that Antichrist—I really believe he is Antichrist—I will have nothing more to do with you and you are no longer " - "my friend, no longer my ‘faithful slave,’ as you call yourself! But how do you do? I see I have frightened " - "you—sit down and tell me all the news." 
-) - - -class TestDocumentChunking(unittest.TestCase): - def setUp(self) -> None: - self.large_section = Section(text=WAR_AND_PEACE, link="https://www.test.com/") - self.large_unbroken_section = Section( - text="0123456789" * 40, link="https://www.test.com/" - ) - self.document = Document( - id="test_document", - sections=[ - Section( - text="Here is some testing text", link="https://www.test.com/0" - ), - Section( - text="Some more text, still under 100 chars", - link="https://www.test.com/1", - ), - Section( - text="Now with this section it's longer than the chunk size", - link="https://www.test.com/2", - ), - self.large_section, - Section(text="These last 2 sections", link="https://www.test.com/4"), - Section( - text="should be combined into one", link="https://www.test.com/5" - ), - ], - source=DocumentSource.WEB, # arbitrary picking web, doens't matter for this test - semantic_identifier="Whatever", - metadata={}, - ) - - def test_chunk_large_section(self) -> None: - chunks = chunk_large_section( - section=self.large_section, - document=self.document, - start_chunk_id=5, - chunk_size=100, - word_overlap=3, - ) - contents = [chunk.content for chunk in chunks] - - self.assertEqual(len(contents), 5) - self.assertEqual(contents[0], WAR_AND_PEACE[:100]) - self.assertEqual( - contents[-2], WAR_AND_PEACE[-172:-62] - ) # slightly longer than 100 due to overlap - self.assertEqual( - contents[-1], WAR_AND_PEACE[-125:] - ) # large overlap with second to last segment - self.assertFalse(chunks[0].section_continuation) - self.assertTrue(chunks[1].section_continuation) - self.assertTrue(chunks[-1].section_continuation) - - def test_chunk_max_overflow(self) -> None: - chunks = chunk_large_section( - section=self.large_unbroken_section, - document=self.document, - start_chunk_id=5, - chunk_size=100, - word_overlap=3, - ) - contents = [chunk.content for chunk in chunks] - - self.assertEqual(len(contents), 4) - self.assertEqual(contents[0], self.large_unbroken_section.text[:150]) - self.assertEqual(contents[1], self.large_unbroken_section.text[50:250]) - self.assertEqual(contents[2], self.large_unbroken_section.text[150:350]) - # Last chunk counts back from the end, full chunk size (100) + 50 overlap => 400 - 150 = 250 - self.assertEqual(contents[3], self.large_unbroken_section.text[250:]) - - def test_chunk_document(self) -> None: - chunks = chunk_document(self.document, chunk_size=100, subsection_overlap=3) - self.assertEqual(len(chunks), 8) - self.assertEqual( - chunks[0].content, - self.document.sections[0].text + "\n\n" + self.document.sections[1].text, - ) - self.assertEqual( - chunks[0].source_links, - {0: "https://www.test.com/0", 21: "https://www.test.com/1"}, - ) - self.assertEqual( - chunks[-1].source_links, - {0: "https://www.test.com/4", 18: "https://www.test.com/5"}, - ) - self.assertEqual(chunks[5].chunk_id, 5) - self.assertEqual(chunks[6].source_document, self.document) - - -if __name__ == "__main__": - unittest.main()
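
Note: the snippet below is an illustrative sketch, not part of the patch above. It shows how the sentence-aware splitting introduced in chunk.py can be exercised on its own, mirroring the patch's use of llama_index's SentenceSplitter (llama-index==0.8.27) driven by a HuggingFace tokenizer (transformers==4.30.1). The tokenizer model name is an assumed stand-in for illustration; Danswer itself resolves the tokenizer via get_default_tokenizer().

# --- Illustrative sketch only, not part of the patch ----------------------
# Mirrors the wiring added in backend/danswer/chunking/chunk.py. The tokenizer
# model below is an assumption; Danswer uses get_default_tokenizer() instead.
from llama_index.text_splitter import SentenceSplitter  # llama-index==0.8.27
from transformers import AutoTokenizer  # transformers==4.30.1

CHUNK_SIZE = 512                        # token budget per chunk (new app_configs.py value)
CHUNK_OVERLAP = int(CHUNK_SIZE * 0.05)  # 5% token overlap between consecutive chunks
MINI_CHUNK_SIZE = 150                   # finer-grained splits for extra embedding vectors

# Assumed tokenizer, chosen only so the sketch is runnable standalone
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

chunk_splitter = SentenceSplitter(
    tokenizer=tokenizer.tokenize, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
)
mini_chunk_splitter = SentenceSplitter(
    tokenizer=tokenizer.tokenize, chunk_size=MINI_CHUNK_SIZE, chunk_overlap=0
)

section_text = "Some long document section with several sentences. " * 200

# Token-bounded, sentence-aware chunks (replaces the old character-based splitting)
chunks = chunk_splitter.split_text(section_text)

# Optional finer-grained pass, analogous to split_chunk_text_into_mini_chunks
# when ENABLE_MINI_CHUNK is turned on
mini_chunks = [mini for chunk in chunks for mini in mini_chunk_splitter.split_text(chunk)]

print(len(chunks), len(mini_chunks))

Because chunk_size is now enforced as a token budget rather than a character count, chunks line up with the embedding model's 512-token window, which is what motivates replacing the old character-based CHUNK_SIZE = 2000 and the hand-rolled word-overlap logic; the blurb and mini-chunk splitters reuse the same mechanism with chunk_overlap=0.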