Mirror of https://github.com/danswer-ai/danswer.git
Each section is now cleaned before being chunked (#3210)

* Each section is now cleaned before being chunked
* k

Co-authored-by: Yuhong Sun <yuhongsun96@gmail.com>
@@ -14,6 +14,7 @@ from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface
 from danswer.indexing.models import DocAwareChunk
 from danswer.natural_language_processing.utils import BaseTokenizer
 from danswer.utils.logger import setup_logger
+from danswer.utils.text_processing import clean_text
 from danswer.utils.text_processing import shared_precompare_cleanup
 from shared_configs.configs import STRICT_CHUNK_TOKEN_LIMIT
 
@@ -220,9 +221,20 @@ class Chunker:
                 mini_chunk_texts=self._get_mini_chunk_texts(text),
             )
 
-        for section in document.sections:
-            section_text = section.text
+        for section_idx, section in enumerate(document.sections):
+            section_text = clean_text(section.text)
             section_link_text = section.link or ""
+            # If there is no useful content, not even the title, just drop it
+            if not section_text and (not document.title or section_idx > 0):
+                # If a section is empty and the document has no title, we can just drop it. We return a list of
+                # DocAwareChunks where each one contains the necessary information needed down the line for indexing.
+                # There is no concern about dropping whole documents from this list, it should not cause any indexing failures.
+                logger.warning(
+                    f"Skipping section {section.text} from document "
+                    f"{document.semantic_identifier} due to empty text after cleaning "
+                    f" with link {section_link_text}"
+                )
+                continue
 
             section_token_count = len(self.tokenizer.tokenize(section_text))
 
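The new guard only drops a section when its cleaned text is empty and there is nothing to fall back on (no document title, or it is not the first section). Below is a minimal sketch of that condition using plain strings as hypothetical stand-ins for danswer's Document/Section objects, not the repo's own code:

# Illustrative sketch only; `sections` and `title` are stand-ins, not danswer types.
def sections_to_keep(sections: list[str], title: str | None) -> list[str]:
    kept = []
    for section_idx, section_text in enumerate(sections):
        # Mirrors the guard above: skip sections that are empty after cleaning,
        # unless this is the first section of a document that still has a title.
        if not section_text and (not title or section_idx > 0):
            continue
        kept.append(section_text)
    return kept

assert sections_to_keep(["", "body"], title="Doc title") == ["", "body"]
assert sections_to_keep(["", "body"], title=None) == ["body"]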
@@ -238,31 +250,26 @@ class Chunker:
             split_texts = self.chunk_splitter.split_text(section_text)
 
             for i, split_text in enumerate(split_texts):
-                split_token_count = len(self.tokenizer.tokenize(split_text))
-
-                if STRICT_CHUNK_TOKEN_LIMIT:
-                    split_token_count = len(self.tokenizer.tokenize(split_text))
-                    if split_token_count > content_token_limit:
-                        # Further split the oversized chunk
-                        smaller_chunks = self._split_oversized_chunk(
-                            split_text, content_token_limit
-                        )
-                        for i, small_chunk in enumerate(smaller_chunks):
-                            chunks.append(
-                                _create_chunk(
-                                    text=small_chunk,
-                                    links={0: section_link_text},
-                                    is_continuation=(i != 0),
-                                )
-                            )
-                    else:
-                        chunks.append(
-                            _create_chunk(
-                                text=split_text,
-                                links={0: section_link_text},
-                            )
-                        )
-
+                if (
+                    STRICT_CHUNK_TOKEN_LIMIT
+                    and
+                    # Tokenizer only runs if STRICT_CHUNK_TOKEN_LIMIT is true
+                    len(self.tokenizer.tokenize(split_text)) > content_token_limit
+                ):
+                    # If STRICT_CHUNK_TOKEN_LIMIT is true, manually check
+                    # the token count of each split text to ensure it is
+                    # not larger than the content_token_limit
+                    smaller_chunks = self._split_oversized_chunk(
+                        split_text, content_token_limit
+                    )
+                    for i, small_chunk in enumerate(smaller_chunks):
+                        chunks.append(
+                            _create_chunk(
+                                text=small_chunk,
+                                links={0: section_link_text},
+                                is_continuation=(i != 0),
+                            )
+                        )
                 else:
                     chunks.append(
                         _create_chunk(
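With the reordered condition above, the tokenizer only runs on a split when STRICT_CHUNK_TOKEN_LIMIT is enabled, and only oversized splits are broken down further. As a rough illustration of that kind of token-window splitting (this is not the repo's _split_oversized_chunk, which uses the real tokenizer; whitespace tokens stand in here):

# Illustrative sketch, not danswer's implementation: break text into pieces of at
# most `token_limit` tokens, using whitespace splitting as a stand-in tokenizer.
def split_oversized_text(text: str, token_limit: int) -> list[str]:
    tokens = text.split()
    return [
        " ".join(tokens[i : i + token_limit])
        for i in range(0, len(tokens), token_limit)
    ]

assert split_oversized_text("one two three four five", token_limit=2) == [
    "one two",
    "three four",
    "five",
]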
@@ -354,6 +361,10 @@ class Chunker:
         return normal_chunks
 
     def chunk(self, documents: list[Document]) -> list[DocAwareChunk]:
+        """
+        Takes in a list of documents and chunks them into smaller chunks for indexing
+        while persisting the document metadata.
+        """
        final_chunks: list[DocAwareChunk] = []
        for document in documents:
            if self.callback:
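A brief usage sketch of the documented entry point; `chunker` and `documents` are assumed to already exist in the surrounding indexing pipeline and are not constructed here:

# Assumes `chunker: Chunker` and `documents: list[Document]` come from upstream code.
doc_aware_chunks = chunker.chunk(documents)
# Each resulting DocAwareChunk carries the per-document metadata needed later in indexing.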
@@ -1,4 +1,3 @@
-import re
 import threading
 import time
 from collections.abc import Callable
@@ -50,28 +49,6 @@ def clean_model_name(model_str: str) -> str:
     return model_str.replace("/", "_").replace("-", "_").replace(".", "_")
 
 
-_INITIAL_FILTER = re.compile(
-    "["
-    "\U0000FFF0-\U0000FFFF"  # Specials
-    "\U0001F000-\U0001F9FF"  # Emoticons
-    "\U00002000-\U0000206F"  # General Punctuation
-    "\U00002190-\U000021FF"  # Arrows
-    "\U00002700-\U000027BF"  # Dingbats
-    "]+",
-    flags=re.UNICODE,
-)
-
-
-def clean_openai_text(text: str) -> str:
-    # Remove specific Unicode ranges that might cause issues
-    cleaned = _INITIAL_FILTER.sub("", text)
-
-    # Remove any control characters except for newline and tab
-    cleaned = "".join(ch for ch in cleaned if ch >= " " or ch in "\n\t")
-
-    return cleaned
-
-
 def build_model_server_url(
     model_server_host: str,
     model_server_port: int,
@@ -215,11 +192,6 @@ class EmbeddingModel:
             for text in texts
         ]
 
-        if self.provider_type == EmbeddingProvider.OPENAI:
-            # If the provider is openai, we need to clean the text
-            # as a temporary workaround for the openai API
-            texts = [clean_openai_text(text) for text in texts]
-
         batch_size = (
             api_embedding_batch_size
             if self.provider_type
@@ -126,6 +126,28 @@ def shared_precompare_cleanup(text: str) -> str:
     return text
 
 
+_INITIAL_FILTER = re.compile(
+    "["
+    "\U0000FFF0-\U0000FFFF"  # Specials
+    "\U0001F000-\U0001F9FF"  # Emoticons
+    "\U00002000-\U0000206F"  # General Punctuation
+    "\U00002190-\U000021FF"  # Arrows
+    "\U00002700-\U000027BF"  # Dingbats
+    "]+",
+    flags=re.UNICODE,
+)
+
+
+def clean_text(text: str) -> str:
+    # Remove specific Unicode ranges that might cause issues
+    cleaned = _INITIAL_FILTER.sub("", text)
+
+    # Remove any control characters except for newline and tab
+    cleaned = "".join(ch for ch in cleaned if ch >= " " or ch in "\n\t")
+
+    return cleaned
+
+
 def is_valid_email(text: str) -> bool:
     """Can use a library instead if more detailed checks are needed"""
     regex = r"^[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
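A quick, hedged illustration of what the relocated clean_text should do, based only on the Unicode ranges and the control-character filter added above (a hypothetical input, not a test from the repo):

from danswer.utils.text_processing import clean_text

# "\U0001F600" falls inside the filtered \U0001F000-\U0001F9FF (Emoticons) range and
# "\x07" (BEL) is a control character other than newline/tab, so both are stripped,
# while the newline and tab are preserved.
sample = "Hello \U0001F600 world\x07\n\tdone"
print(repr(clean_text(sample)))  # expected: 'Hello  world\n\tdone'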