Each section is now cleaned before being chunked (#3210)
* Each section is now cleaned before being chunked

* k

---------

Co-authored-by: Yuhong Sun <yuhongsun96@gmail.com>
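In effect, the chunker now runs each section's raw text through a cleaning pass before splitting it, and silently drops sections that end up empty, instead of cleaning per-provider at embedding time. A toy, self-contained sketch of that flow (ToySection, toy_clean and clean_then_chunk are invented stand-ins, not danswer APIs; the real cleaning regex is in the text_processing hunk further down):

import re
from dataclasses import dataclass

# Toy stand-in for the clean_text helper this commit adds: strip one emoji range.
def toy_clean(text: str) -> str:
    return re.sub("[\U0001F000-\U0001F9FF]+", "", text).strip()

@dataclass
class ToySection:
    text: str
    link: str | None = None

def clean_then_chunk(sections: list[ToySection]) -> list[str]:
    chunks: list[str] = []
    for section in sections:
        section_text = toy_clean(section.text)  # clean *before* chunking
        if not section_text:
            continue  # sections that are empty after cleaning are dropped
        chunks.append(section_text)  # the real code would split into token-limited chunks here
    return chunks

print(clean_then_chunk([ToySection("hello world"), ToySection("\U0001F600\U0001F600")]))
# ['hello world']  -- the emoji-only section is dropped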
@@ -14,6 +14,7 @@ from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface
 from danswer.indexing.models import DocAwareChunk
 from danswer.natural_language_processing.utils import BaseTokenizer
 from danswer.utils.logger import setup_logger
+from danswer.utils.text_processing import clean_text
 from danswer.utils.text_processing import shared_precompare_cleanup
 from shared_configs.configs import STRICT_CHUNK_TOKEN_LIMIT

@@ -220,9 +221,20 @@ class Chunker:
                 mini_chunk_texts=self._get_mini_chunk_texts(text),
             )

-        for section in document.sections:
-            section_text = section.text
+        for section_idx, section in enumerate(document.sections):
+            section_text = clean_text(section.text)
             section_link_text = section.link or ""
+            # If there is no useful content, not even the title, just drop it
+            if not section_text and (not document.title or section_idx > 0):
+                # If a section is empty and the document has no title, we can just drop it. We return a list of
+                # DocAwareChunks where each one contains the necessary information needed down the line for indexing.
+                # There is no concern about dropping whole documents from this list, it should not cause any indexing failures.
+                logger.warning(
+                    f"Skipping section {section.text} from document "
+                    f"{document.semantic_identifier} due to empty text after cleaning "
+                    f" with link {section_link_text}"
+                )
+                continue

             section_token_count = len(self.tokenizer.tokenize(section_text))
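The guard added above keeps an empty first section only when the document has a title (the title alone still gives the indexer something to work with); every other section that is blank after cleaning is skipped with a warning. A small self-contained sketch of just that predicate (has_useful_content is invented for illustration and is not part of the repo):

def has_useful_content(section_text: str, document_title: str | None, section_idx: int) -> bool:
    """Mirror of the guard in the diff: drop a section when its cleaned text is
    empty, unless it is the first section of a document that has a title."""
    if section_text:
        return True
    # Empty text: only the first section of a titled document survives,
    # because the title itself still provides indexable content.
    return bool(document_title) and section_idx == 0

assert has_useful_content("", "My Doc", 0) is True    # titled doc, empty first section kept
assert has_useful_content("", "My Doc", 3) is False   # later empty sections dropped
assert has_useful_content("", None, 0) is False       # untitled doc, empty section dropped
assert has_useful_content("some text", None, 5) is True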
@@ -238,31 +250,26 @@ class Chunker:
             split_texts = self.chunk_splitter.split_text(section_text)

             for i, split_text in enumerate(split_texts):
-                split_token_count = len(self.tokenizer.tokenize(split_text))
-
-                if STRICT_CHUNK_TOKEN_LIMIT:
-                    split_token_count = len(self.tokenizer.tokenize(split_text))
-                    if split_token_count > content_token_limit:
-                        # Further split the oversized chunk
-                        smaller_chunks = self._split_oversized_chunk(
-                            split_text, content_token_limit
-                        )
-                        for i, small_chunk in enumerate(smaller_chunks):
-                            chunks.append(
-                                _create_chunk(
-                                    text=small_chunk,
-                                    links={0: section_link_text},
-                                    is_continuation=(i != 0),
-                                )
-                            )
-                    else:
+                if (
+                    STRICT_CHUNK_TOKEN_LIMIT
+                    and
+                    # Tokenizer only runs if STRICT_CHUNK_TOKEN_LIMIT is true
+                    len(self.tokenizer.tokenize(split_text)) > content_token_limit
+                ):
+                    # If STRICT_CHUNK_TOKEN_LIMIT is true, manually check
+                    # the token count of each split text to ensure it is
+                    # not larger than the content_token_limit
+                    smaller_chunks = self._split_oversized_chunk(
+                        split_text, content_token_limit
+                    )
+                    for i, small_chunk in enumerate(smaller_chunks):
                         chunks.append(
                             _create_chunk(
-                                text=split_text,
+                                text=small_chunk,
                                 links={0: section_link_text},
+                                is_continuation=(i != 0),
                             )
                         )
                 else:
                     chunks.append(
                         _create_chunk(
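The refactored branch above folds the strict-limit check into a single condition, so the tokenizer only runs when STRICT_CHUNK_TOKEN_LIMIT is enabled, and only oversized splits get broken down further. A rough, self-contained sketch of that control flow (toy_tokenize and toy_split_oversized are simplified stand-ins for BaseTokenizer.tokenize and Chunker._split_oversized_chunk, not the real implementations):

STRICT_CHUNK_TOKEN_LIMIT = True  # mirrors the shared_configs flag

def toy_tokenize(text: str) -> list[str]:
    # Stand-in for BaseTokenizer.tokenize; whitespace split is enough here.
    return text.split()

def toy_split_oversized(text: str, limit: int) -> list[str]:
    # Stand-in for _split_oversized_chunk: greedy split on token count.
    tokens = toy_tokenize(text)
    return [" ".join(tokens[i : i + limit]) for i in range(0, len(tokens), limit)]

def chunk_split_texts(split_texts: list[str], content_token_limit: int) -> list[tuple[str, bool]]:
    chunks: list[tuple[str, bool]] = []  # (text, is_continuation)
    for split_text in split_texts:
        if (
            STRICT_CHUNK_TOKEN_LIMIT
            # the tokenizer only runs when the strict limit is enabled
            and len(toy_tokenize(split_text)) > content_token_limit
        ):
            for i, small_chunk in enumerate(toy_split_oversized(split_text, content_token_limit)):
                chunks.append((small_chunk, i != 0))
        else:
            chunks.append((split_text, False))
    return chunks

print(chunk_split_texts(["one two three four five", "six"], content_token_limit=2))
# [('one two', False), ('three four', True), ('five', True), ('six', False)]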
@@ -354,6 +361,10 @@ class Chunker:
         return normal_chunks

     def chunk(self, documents: list[Document]) -> list[DocAwareChunk]:
+        """
+        Takes in a list of documents and chunks them into smaller chunks for indexing
+        while persisting the document metadata.
+        """
         final_chunks: list[DocAwareChunk] = []
         for document in documents:
             if self.callback:
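The new docstring states the contract: a list of documents in, a flat list of document-aware chunks out, with per-document metadata preserved on every chunk. A toy illustration of that shape (ToyDoc, ToyChunk and toy_chunk are invented for the sketch and are not the real Document/DocAwareChunk models):

from dataclasses import dataclass

@dataclass
class ToyDoc:
    doc_id: str
    sections: list[str]

@dataclass
class ToyChunk:
    source_document_id: str  # document metadata carried along for indexing
    chunk_id: int
    content: str

def toy_chunk(documents: list[ToyDoc]) -> list[ToyChunk]:
    final_chunks: list[ToyChunk] = []
    for document in documents:
        for idx, section in enumerate(document.sections):
            final_chunks.append(ToyChunk(document.doc_id, idx, section))
    return final_chunks

chunks = toy_chunk([ToyDoc("a", ["s1", "s2"]), ToyDoc("b", ["s3"])])
assert [c.source_document_id for c in chunks] == ["a", "a", "b"]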
@@ -1,4 +1,3 @@
-import re
 import threading
 import time
 from collections.abc import Callable
@@ -50,28 +49,6 @@ def clean_model_name(model_str: str) -> str:
     return model_str.replace("/", "_").replace("-", "_").replace(".", "_")


-_INITIAL_FILTER = re.compile(
-    "["
-    "\U0000FFF0-\U0000FFFF"  # Specials
-    "\U0001F000-\U0001F9FF"  # Emoticons
-    "\U00002000-\U0000206F"  # General Punctuation
-    "\U00002190-\U000021FF"  # Arrows
-    "\U00002700-\U000027BF"  # Dingbats
-    "]+",
-    flags=re.UNICODE,
-)
-
-
-def clean_openai_text(text: str) -> str:
-    # Remove specific Unicode ranges that might cause issues
-    cleaned = _INITIAL_FILTER.sub("", text)
-
-    # Remove any control characters except for newline and tab
-    cleaned = "".join(ch for ch in cleaned if ch >= " " or ch in "\n\t")
-
-    return cleaned
-
-
 def build_model_server_url(
     model_server_host: str,
     model_server_port: int,
@@ -215,11 +192,6 @@ class EmbeddingModel:
             for text in texts
         ]

-        if self.provider_type == EmbeddingProvider.OPENAI:
-            # If the provider is openai, we need to clean the text
-            # as a temporary workaround for the openai API
-            texts = [clean_openai_text(text) for text in texts]
-
         batch_size = (
             api_embedding_batch_size
             if self.provider_type
@@ -126,6 +126,28 @@ def shared_precompare_cleanup(text: str) -> str:
     return text


+_INITIAL_FILTER = re.compile(
+    "["
+    "\U0000FFF0-\U0000FFFF"  # Specials
+    "\U0001F000-\U0001F9FF"  # Emoticons
+    "\U00002000-\U0000206F"  # General Punctuation
+    "\U00002190-\U000021FF"  # Arrows
+    "\U00002700-\U000027BF"  # Dingbats
+    "]+",
+    flags=re.UNICODE,
+)
+
+
+def clean_text(text: str) -> str:
+    # Remove specific Unicode ranges that might cause issues
+    cleaned = _INITIAL_FILTER.sub("", text)
+
+    # Remove any control characters except for newline and tab
+    cleaned = "".join(ch for ch in cleaned if ch >= " " or ch in "\n\t")
+
+    return cleaned
+
+
 def is_valid_email(text: str) -> bool:
     """Can use a library instead if more detailed checks are needed"""
     regex = r"^[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
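Given the function body added above, a quick usage check of what clean_text drops versus keeps (the sample strings are illustrative):

# Quick check of the new helper's behavior.
from danswer.utils.text_processing import clean_text

assert clean_text("plain ascii") == "plain ascii"
assert clean_text("tabs\tand\nnewlines survive") == "tabs\tand\nnewlines survive"
assert clean_text("emoji gone \U0001F600!") == "emoji gone !"    # Emoticons range stripped
assert clean_text("arrow \u2192 removed") == "arrow  removed"    # Arrows range stripped
assert clean_text("bell\x07char dropped") == "bellchar dropped"  # control chars other than \n and \t removed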