Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-04-03 09:28:25 +02:00)
Use Sentence Aware Splitter (#452)
This commit is contained in:
parent 63215e9c9a
commit 6b305c56b3
@@ -1,169 +1,90 @@
 import abc
-import re
 from collections.abc import Callable
 
+from llama_index.text_splitter import SentenceSplitter
+from transformers import AutoTokenizer  # type:ignore
+
 from danswer.chunking.models import DocAwareChunk
-from danswer.configs.app_configs import BLURB_LENGTH
-from danswer.configs.app_configs import CHUNK_MAX_CHAR_OVERLAP
+from danswer.configs.app_configs import BLURB_SIZE
+from danswer.configs.app_configs import CHUNK_OVERLAP
 from danswer.configs.app_configs import CHUNK_SIZE
-from danswer.configs.app_configs import CHUNK_WORD_OVERLAP
+from danswer.configs.app_configs import MINI_CHUNK_SIZE
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
+from danswer.search.search_utils import get_default_tokenizer
 from danswer.utils.text_processing import shared_precompare_cleanup
 
 
 SECTION_SEPARATOR = "\n\n"
 ChunkFunc = Callable[[Document], list[DocAwareChunk]]
 
 
-def extract_blurb(text: str, blurb_len: int) -> str:
-    if len(text) < blurb_len:
-        return text
-
-    match = re.search(r"[.!?:]", text[blurb_len:])
-    max_blub_len = min(2 * blurb_len, len(text))
-
-    end_index = (
-        max_blub_len
-        if match is None
-        else min(blurb_len + match.start() + 1, max_blub_len)
-    )
-
-    if text[end_index : end_index + 1] not in [" ", "", "\r", "\n"]:
-        last_space = text.rfind(" ", 0, end_index)
-        # If there's no space in the text (single word longer than blurb_len), return the whole text
-        end_index = last_space if last_space != -1 else len(text)
-
-    blurb = text[:end_index]
-
-    blurb = blurb.replace("\n", " ")
-    blurb = blurb.replace("\r", " ")
-    while "  " in blurb:
-        blurb = blurb.replace("  ", " ")
-
-    return blurb
+def extract_blurb(text: str, blurb_size: int) -> str:
+    token_count_func = get_default_tokenizer().tokenize
+    blurb_splitter = SentenceSplitter(
+        tokenizer=token_count_func, chunk_size=blurb_size, chunk_overlap=0
+    )
+
+    return blurb_splitter.split_text(text)[0]
 
 
 def chunk_large_section(
     section: Section,
     document: Document,
     start_chunk_id: int,
+    tokenizer: AutoTokenizer,
     chunk_size: int = CHUNK_SIZE,
-    word_overlap: int = CHUNK_WORD_OVERLAP,
-    blurb_len: int = BLURB_LENGTH,
-    chunk_overflow_max: int = CHUNK_MAX_CHAR_OVERLAP,
+    chunk_overlap: int = CHUNK_OVERLAP,
+    blurb_size: int = BLURB_SIZE,
 ) -> list[DocAwareChunk]:
-    """Split large sections into multiple chunks with the final chunk having as much previous overlap as possible.
-    Backtracks word_overlap words, delimited by whitespace, backtrack up to chunk_overflow_max characters max
-    When chunk is finished in forward direction, attempt to finish the word, but only up to chunk_overflow_max
-
-    Some details:
-        - Backtracking (overlap) => finish current word by backtracking + an additional (word_overlap - 1) words
-        - Continuation chunks start with a space generally unless overflow limit is hit
-        - Chunks end with a space generally unless overflow limit is hit
-    """
     section_text = section.text
-    blurb = extract_blurb(section_text, blurb_len)
-    char_count = len(section_text)
-    chunk_strs: list[str] = []
+    blurb = extract_blurb(section_text, blurb_size)
 
-    # start_pos is the actual start of the chunk not including the backtracking overlap
-    # segment_start_pos counts backwards to include overlap from previous chunk
-    start_pos = segment_start_pos = 0
-    while start_pos < char_count:
-        back_overflow_chars = 0
-        forward_overflow_chars = 0
-        back_count_words = 0
-        end_pos = segment_end_pos = min(start_pos + chunk_size, char_count)
+    sentence_aware_splitter = SentenceSplitter(
+        tokenizer=tokenizer.tokenize, chunk_size=chunk_size, chunk_overlap=chunk_overlap
+    )
 
-        # Forward overlap to attempt to finish the current word
-        while forward_overflow_chars < chunk_overflow_max:
-            if (
-                segment_end_pos >= char_count
-                or section_text[segment_end_pos - 1].isspace()
-            ):
-                break
-            segment_end_pos += 1
-            forward_overflow_chars += 1
+    split_texts = sentence_aware_splitter.split_text(section_text)
 
-        # Backwards overlap counting up to word_overlap words (whitespace delineated) or chunk_overflow_max chars
-        # Counts back by finishing current word by backtracking + an additional (word_overlap - 1) words
-        # If starts on a space, it considers finishing the current word as done
-        while back_overflow_chars < chunk_overflow_max:
-            if segment_start_pos == 0:
-                break
-            # no -1 offset here because we want to include prepended space to be clear it's a continuation
-            if section_text[segment_start_pos].isspace():
-                if back_count_words > word_overlap:
-                    break
-                back_count_words += 1
-            segment_start_pos -= 1
-            back_overflow_chars += 1
-
-        # Extract chunk from section text based on the pointers from above
-        chunk_str = section_text[segment_start_pos:segment_end_pos]
-        chunk_strs.append(chunk_str)
-
-        # Move pointers to next section, not counting overlaps forward or backward
-        start_pos = segment_start_pos = end_pos
-
-    # Last chunk should be as long as possible, overlap favored over tiny chunk with no context
-    if len(chunk_strs) > 1:
-        chunk_strs.pop()
-        back_count_words = 0
-        back_overflow_chars = 0
-        # Backcount chunk size number of characters then
-        # add in the backcounting overlap like with every other previous chunk
-        start_pos = char_count - chunk_size
-        while back_overflow_chars < chunk_overflow_max:
-            if start_pos == 0:
-                break
-            if section_text[start_pos].isspace():
-                if back_count_words > word_overlap:
-                    break
-                back_count_words += 1
-            start_pos -= 1
-            back_overflow_chars += 1
-        chunk_strs.append(section_text[start_pos:])
-
-    chunks = []
-    for chunk_ind, chunk_str in enumerate(chunk_strs):
-        chunks.append(
-            DocAwareChunk(
-                source_document=document,
-                chunk_id=start_chunk_id + chunk_ind,
-                blurb=blurb,
-                content=chunk_str,
-                source_links={0: section.link},
-                section_continuation=(chunk_ind != 0),
-            )
-        )
+    chunks = [
+        DocAwareChunk(
+            source_document=document,
+            chunk_id=start_chunk_id + chunk_ind,
+            blurb=blurb,
+            content=chunk_str,
+            source_links={0: section.link},
+            section_continuation=(chunk_ind != 0),
+        )
+        for chunk_ind, chunk_str in enumerate(split_texts)
+    ]
     return chunks
 
 
 def chunk_document(
     document: Document,
-    chunk_size: int = CHUNK_SIZE,
-    subsection_overlap: int = CHUNK_WORD_OVERLAP,
-    blurb_len: int = BLURB_LENGTH,
+    chunk_tok_size: int = CHUNK_SIZE,
+    subsection_overlap: int = CHUNK_OVERLAP,
+    blurb_size: int = BLURB_SIZE,
 ) -> list[DocAwareChunk]:
+    tokenizer = get_default_tokenizer()
+
     chunks: list[DocAwareChunk] = []
     link_offsets: dict[int, str] = {}
     chunk_text = ""
     for section in document.sections:
-        current_length = len(chunk_text)
+        section_tok_length = len(tokenizer.tokenize(section.text))
+        current_tok_length = len(tokenizer.tokenize(chunk_text))
         curr_offset_len = len(shared_precompare_cleanup(chunk_text))
-        section_length = len(section.text)
 
         # Large sections are considered self-contained/unique therefore they start a new chunk and are not concatenated
         # at the end by other sections
-        if section_length > chunk_size:
+        if section_tok_length > chunk_tok_size:
             if chunk_text:
                 chunks.append(
                     DocAwareChunk(
                         source_document=document,
                         chunk_id=len(chunks),
-                        blurb=extract_blurb(chunk_text, blurb_len),
+                        blurb=extract_blurb(chunk_text, blurb_size),
                         content=chunk_text,
                         source_links=link_offsets,
                         section_continuation=False,
@@ -176,15 +97,21 @@ def chunk_document(
                 section=section,
                 document=document,
                 start_chunk_id=len(chunks),
-                chunk_size=chunk_size,
-                word_overlap=subsection_overlap,
-                blurb_len=blurb_len,
+                tokenizer=tokenizer,
+                chunk_size=chunk_tok_size,
+                chunk_overlap=subsection_overlap,
+                blurb_size=blurb_size,
             )
             chunks.extend(large_section_chunks)
             continue
 
         # In the case where the whole section is shorter than a chunk, either adding to chunk or start a new one
-        if current_length + len(SECTION_SEPARATOR) + section_length <= chunk_size:
+        if (
+            current_tok_length
+            + len(tokenizer.tokenize(SECTION_SEPARATOR))
+            + section_tok_length
+            <= chunk_tok_size
+        ):
             chunk_text += (
                 SECTION_SEPARATOR + section.text if chunk_text else section.text
             )
@@ -194,7 +121,7 @@ def chunk_document(
                 DocAwareChunk(
                     source_document=document,
                     chunk_id=len(chunks),
-                    blurb=extract_blurb(chunk_text, blurb_len),
+                    blurb=extract_blurb(chunk_text, blurb_size),
                     content=chunk_text,
                     source_links=link_offsets,
                     section_continuation=False,
@@ -209,7 +136,7 @@ def chunk_document(
             DocAwareChunk(
                 source_document=document,
                 chunk_id=len(chunks),
-                blurb=extract_blurb(chunk_text, blurb_len),
+                blurb=extract_blurb(chunk_text, blurb_size),
                 content=chunk_text,
                 source_links=link_offsets,
                 section_continuation=False,
@@ -218,6 +145,17 @@ def chunk_document(
     return chunks
 
 
+def split_chunk_text_into_mini_chunks(
+    chunk_text: str, mini_chunk_size: int = MINI_CHUNK_SIZE
+) -> list[str]:
+    token_count_func = get_default_tokenizer().tokenize
+    sentence_aware_splitter = SentenceSplitter(
+        tokenizer=token_count_func, chunk_size=mini_chunk_size, chunk_overlap=0
+    )
+
+    return sentence_aware_splitter.split_text(chunk_text)
+
+
 class Chunker:
     @abc.abstractmethod
     def chunk(self, document: Document) -> list[DocAwareChunk]:
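The blurb, chunk, and mini-chunk paths above all share one mechanism: llama_index's SentenceSplitter, configured with the project tokenizer and differing only in chunk_size and chunk_overlap. A minimal sketch of that behavior, with a plain whitespace tokenizer and toy sizes standing in for danswer's get_default_tokenizer and the real token budgets (names and numbers here are illustrative, not from the commit):

from llama_index.text_splitter import SentenceSplitter

# Stand-in token counter; the real code passes get_default_tokenizer().tokenize,
# so chunk_size is measured in encoder tokens rather than whitespace words.
def whitespace_tokenize(text: str) -> list[str]:
    return text.split()

section_text = "This is one short sentence. " * 40  # pretend large section

# Large-section path: sentence-aware windows with overlap between neighbors
chunk_splitter = SentenceSplitter(
    tokenizer=whitespace_tokenize, chunk_size=16, chunk_overlap=2
)
split_texts = chunk_splitter.split_text(section_text)

# Blurb path: the same splitter with no overlap; the blurb is the first window
blurb_splitter = SentenceSplitter(
    tokenizer=whitespace_tokenize, chunk_size=16, chunk_overlap=0
)
blurb = blurb_splitter.split_text(section_text)[0]

# Each window ends on a sentence boundary where possible and stays at or
# under the chunk_size cap as counted by the supplied tokenizer.
print(len(split_texts), repr(blurb))

The trade-off is that exact chunk boundaries become data-dependent (wherever sentences happen to end) in exchange for never cutting a sentence mid-way, which is why the character-offset arithmetic disappears from the configs and tests below.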
@@ -12,7 +12,7 @@ APP_PORT = 8080
 #####
 # User Facing Features Configs
 #####
-BLURB_LENGTH = 200  # Characters. Blurbs will be truncated at the first punctuation after this many characters.
+BLURB_SIZE = 128  # Number Encoder Tokens included in the chunk blurb
 GENERATIVE_MODEL_ACCESS_CHECK_FREQ = 86400  # 1 day
 # DISABLE_GENERATIVE_AI will turn of the question answering part of Danswer. Use this
 # if you want to use Danswer as a search engine only and/or you are not comfortable sending
@@ -158,19 +158,14 @@ HARD_DELETE_CHATS = os.environ.get("HARD_DELETE_CHATS", "True").lower() != "fals
 #####
 # Text Processing Configs
 #####
-# Chunking docs to this number of characters not including finishing the last word and the overlap words below
-# Calculated by ~500 to 512 tokens max * average 4 chars per token
-CHUNK_SIZE = 2000
+CHUNK_SIZE = 512  # Tokens by embedding model
+CHUNK_OVERLAP = int(CHUNK_SIZE * 0.05)  # 5% overlap
 # More accurate results at the expense of indexing speed and index size (stores additional 4 MINI_CHUNK vectors)
-ENABLE_MINI_CHUNK = False
-# Mini chunks for fine-grained embedding, calculated as 128 tokens for 4 additional vectors for 512 chunk size above
-# Not rounded down to not lose any context in full chunk.
-MINI_CHUNK_SIZE = 512
-# Each chunk includes an additional CHUNK_WORD_OVERLAP words from previous chunk
-CHUNK_WORD_OVERLAP = 5
-# When trying to finish the last word in the chunk or counting back CHUNK_WORD_OVERLAP backwards,
-# This is the max number of characters allowed in either direction
-CHUNK_MAX_CHAR_OVERLAP = 50
+ENABLE_MINI_CHUNK = os.environ.get("ENABLE_MINI_CHUNK", "").lower() == "true"
+# Finer grained chunking for more detail retention
+# Slightly larger since the sentence aware split is a max cutoff so most minichunks will be under MINI_CHUNK_SIZE
+# tokens. But we need it to be at least as big as 1/4th chunk size to avoid having a tiny mini-chunk at the end
+MINI_CHUNK_SIZE = 150
 
 
 #####
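For reference, the arithmetic implied by the new token-denominated settings (a quick check, not part of the diff):

CHUNK_SIZE = 512                        # tokens, matching the embedding model's window
CHUNK_OVERLAP = int(CHUNK_SIZE * 0.05)  # int(25.6) == 25 tokens shared between neighbors
MINI_CHUNK_SIZE = 150                   # clears CHUNK_SIZE / 4 == 128, per the comment above

print(CHUNK_OVERLAP)                   # 25
print(MINI_CHUNK_SIZE >= CHUNK_SIZE / 4)  # True, so the trailing mini chunk is never tiny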
@@ -4,12 +4,12 @@ from uuid import UUID
 import numpy
 from sentence_transformers import SentenceTransformer  # type: ignore
 
+from danswer.chunking.chunk import split_chunk_text_into_mini_chunks
 from danswer.chunking.models import ChunkEmbedding
 from danswer.chunking.models import DocAwareChunk
 from danswer.chunking.models import IndexChunk
 from danswer.chunking.models import InferenceChunk
 from danswer.configs.app_configs import ENABLE_MINI_CHUNK
-from danswer.configs.app_configs import MINI_CHUNK_SIZE
 from danswer.configs.app_configs import NUM_RERANKED_RESULTS
 from danswer.configs.app_configs import NUM_RETURNED_HITS
 from danswer.configs.model_configs import ASYMMETRIC_PREFIX
@@ -124,36 +124,6 @@ def retrieve_ranked_documents(
     return ranked_chunks, top_chunks[num_rerank:]
 
 
-def split_chunk_text_into_mini_chunks(
-    chunk_text: str, mini_chunk_size: int = MINI_CHUNK_SIZE
-) -> list[str]:
-    chunks = []
-    start = 0
-    separators = [" ", "\n", "\r", "\t"]
-
-    while start < len(chunk_text):
-        if len(chunk_text) - start <= mini_chunk_size:
-            end = len(chunk_text)
-        else:
-            # Find the first separator character after min_chunk_length
-            end_positions = [
-                (chunk_text[start + mini_chunk_size :]).find(sep) for sep in separators
-            ]
-            # Filter out the not found cases (-1)
-            end_positions = [pos for pos in end_positions if pos != -1]
-            if not end_positions:
-                # If no more separators, the rest of the string becomes a chunk
-                end = len(chunk_text)
-            else:
-                # Add min_chunk_length and start to the end position
-                end = min(end_positions) + start + mini_chunk_size
-
-        chunks.append(chunk_text[start:end])
-        start = end + 1  # Move to the next character after the separator
-
-    return chunks
-
-
 @log_function_time()
 def encode_chunks(
     chunks: list[DocAwareChunk],
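With the character-and-separator loop above removed, callers here use the token-based, sentence-aware replacement now imported from danswer.chunking.chunk at the top of this file. A hypothetical call, with illustrative input text (mini_chunk_size is now a token budget, not a character count):

from danswer.chunking.chunk import split_chunk_text_into_mini_chunks

chunk_text = (
    "A full-size chunk that was already embedded as one vector. "
    "Mini chunks re-split it on sentence boundaries for finer-grained matching."
)
mini_chunks = split_chunk_text_into_mini_chunks(chunk_text, mini_chunk_size=150)
for mini in mini_chunks:
    print(repr(mini))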
@@ -22,6 +22,7 @@ httpx-oauth==0.11.2
 huggingface-hub==0.16.4
 jira==3.5.1
 langchain==0.0.273
+llama-index==0.8.27
 Mako==1.2.4
 nltk==3.8.1
 docx2txt==0.8
@@ -45,7 +46,7 @@ rfc3986==1.5.0
 safetensors==0.3.1
 sentence-transformers==2.2.2
 slack-sdk==3.20.2
-SQLAlchemy[mypy]==2.0.12
+SQLAlchemy[mypy]==2.0.15
 tensorflow==2.13.0
 tiktoken==0.4.0
 transformers==4.30.1
@@ -1,109 +0,0 @@
-import unittest
-
-from danswer.chunking.chunk import chunk_document
-from danswer.chunking.chunk import chunk_large_section
-from danswer.configs.constants import DocumentSource
-from danswer.connectors.models import Document
-from danswer.connectors.models import Section
-
-
-WAR_AND_PEACE = (
-    "Well, Prince, so Genoa and Lucca are now just family estates of the Buonapartes. But I warn you, "
-    "if you don’t tell me that this means war, if you still try to defend the infamies and horrors perpetrated by "
-    "that Antichrist—I really believe he is Antichrist—I will have nothing more to do with you and you are no longer "
-    "my friend, no longer my ‘faithful slave,’ as you call yourself! But how do you do? I see I have frightened "
-    "you—sit down and tell me all the news."
-)
-
-
-class TestDocumentChunking(unittest.TestCase):
-    def setUp(self) -> None:
-        self.large_section = Section(text=WAR_AND_PEACE, link="https://www.test.com/")
-        self.large_unbroken_section = Section(
-            text="0123456789" * 40, link="https://www.test.com/"
-        )
-        self.document = Document(
-            id="test_document",
-            sections=[
-                Section(
-                    text="Here is some testing text", link="https://www.test.com/0"
-                ),
-                Section(
-                    text="Some more text, still under 100 chars",
-                    link="https://www.test.com/1",
-                ),
-                Section(
-                    text="Now with this section it's longer than the chunk size",
-                    link="https://www.test.com/2",
-                ),
-                self.large_section,
-                Section(text="These last 2 sections", link="https://www.test.com/4"),
-                Section(
-                    text="should be combined into one", link="https://www.test.com/5"
-                ),
-            ],
-            source=DocumentSource.WEB,  # arbitrary picking web, doens't matter for this test
-            semantic_identifier="Whatever",
-            metadata={},
-        )
-
-    def test_chunk_large_section(self) -> None:
-        chunks = chunk_large_section(
-            section=self.large_section,
-            document=self.document,
-            start_chunk_id=5,
-            chunk_size=100,
-            word_overlap=3,
-        )
-        contents = [chunk.content for chunk in chunks]
-
-        self.assertEqual(len(contents), 5)
-        self.assertEqual(contents[0], WAR_AND_PEACE[:100])
-        self.assertEqual(
-            contents[-2], WAR_AND_PEACE[-172:-62]
-        )  # slightly longer than 100 due to overlap
-        self.assertEqual(
-            contents[-1], WAR_AND_PEACE[-125:]
-        )  # large overlap with second to last segment
-        self.assertFalse(chunks[0].section_continuation)
-        self.assertTrue(chunks[1].section_continuation)
-        self.assertTrue(chunks[-1].section_continuation)
-
-    def test_chunk_max_overflow(self) -> None:
-        chunks = chunk_large_section(
-            section=self.large_unbroken_section,
-            document=self.document,
-            start_chunk_id=5,
-            chunk_size=100,
-            word_overlap=3,
-        )
-        contents = [chunk.content for chunk in chunks]
-
-        self.assertEqual(len(contents), 4)
-        self.assertEqual(contents[0], self.large_unbroken_section.text[:150])
-        self.assertEqual(contents[1], self.large_unbroken_section.text[50:250])
-        self.assertEqual(contents[2], self.large_unbroken_section.text[150:350])
-        # Last chunk counts back from the end, full chunk size (100) + 50 overlap => 400 - 150 = 250
-        self.assertEqual(contents[3], self.large_unbroken_section.text[250:])
-
-    def test_chunk_document(self) -> None:
-        chunks = chunk_document(self.document, chunk_size=100, subsection_overlap=3)
-        self.assertEqual(len(chunks), 8)
-        self.assertEqual(
-            chunks[0].content,
-            self.document.sections[0].text + "\n\n" + self.document.sections[1].text,
-        )
-        self.assertEqual(
-            chunks[0].source_links,
-            {0: "https://www.test.com/0", 21: "https://www.test.com/1"},
-        )
-        self.assertEqual(
-            chunks[-1].source_links,
-            {0: "https://www.test.com/4", 18: "https://www.test.com/5"},
-        )
-        self.assertEqual(chunks[5].chunk_id, 5)
-        self.assertEqual(chunks[6].source_document, self.document)
-
-
-if __name__ == "__main__":
-    unittest.main()
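The deleted tests above pinned exact character offsets, such as WAR_AND_PEACE[:100] and text[50:250], which no longer hold once boundaries land on sentence ends, so the commit drops the file outright. A sketch of the kind of invariant-style test that would survive the change (not part of the commit; it assumes the default tokenizer is available and that no single sentence exceeds the token cap):

import unittest

from danswer.chunking.chunk import chunk_large_section
from danswer.configs.constants import DocumentSource
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.search.search_utils import get_default_tokenizer


class TestSentenceAwareChunking(unittest.TestCase):
    def test_token_cap_and_continuation(self) -> None:
        tokenizer = get_default_tokenizer()
        section = Section(text="This is a sentence. " * 300, link="https://www.test.com/")
        document = Document(
            id="test_document",
            sections=[section],
            source=DocumentSource.WEB,
            semantic_identifier="Whatever",
            metadata={},
        )
        chunks = chunk_large_section(
            section=section,
            document=document,
            start_chunk_id=0,
            tokenizer=tokenizer,
            chunk_size=64,
            chunk_overlap=3,
        )
        # Behavioral invariants instead of exact offsets:
        self.assertTrue(
            all(len(tokenizer.tokenize(c.content)) <= 64 for c in chunks)
        )
        self.assertFalse(chunks[0].section_continuation)
        self.assertTrue(all(c.section_continuation for c in chunks[1:]))


if __name__ == "__main__":
    unittest.main()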