Mirror of https://github.com/danswer-ai/danswer.git, synced 2025-10-09 20:55:06 +02:00
Parsing (#2734)

* k
* update chunking limits
* nit
* nit
* clean up types
* nit
* validate
* k
@@ -404,6 +404,8 @@ VESPA_REQUEST_TIMEOUT = int(os.environ.get("VESPA_REQUEST_TIMEOUT") or "5")
 
 SYSTEM_RECURSION_LIMIT = int(os.environ.get("SYSTEM_RECURSION_LIMIT") or "1000")
 
+PARSE_WITH_TRAFILATURA = os.environ.get("PARSE_WITH_TRAFILATURA", "").lower() == "true"
+
 #####
 # Enterprise Edition Configs
 #####
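The hunk above introduces the PARSE_WITH_TRAFILATURA application config, which the HTML cleanup code in the next hunk reads. As a quick illustration of the env-var pattern it uses (a standalone sketch, not part of the diff; read_bool_flag is a hypothetical helper), only a literal "true"/"True" value enables the flag, while unset or any other value leaves it off:

import os

def read_bool_flag(name: str) -> bool:
    # Same pattern as the config line above: case-insensitive match against "true".
    return os.environ.get(name, "").lower() == "true"

os.environ["PARSE_WITH_TRAFILATURA"] = "True"
print(read_bool_flag("PARSE_WITH_TRAFILATURA"))  # True

os.environ["PARSE_WITH_TRAFILATURA"] = "1"
print(read_bool_flag("PARSE_WITH_TRAFILATURA"))  # False: only "true" counts, not "1"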
@@ -4,11 +4,17 @@ from dataclasses import dataclass
 from typing import IO
 
 import bs4
+import trafilatura  # type: ignore
+from trafilatura.settings import use_config  # type: ignore
 
 from danswer.configs.app_configs import HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY
+from danswer.configs.app_configs import PARSE_WITH_TRAFILATURA
 from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
 from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
 from danswer.file_processing.enums import HtmlBasedConnectorTransformLinksStrategy
+from danswer.utils.logger import setup_logger
+
+logger = setup_logger()
 
 
 MINTLIFY_UNWANTED = ["sticky", "hidden"]
@@ -47,6 +53,18 @@ def format_element_text(element_text: str, link_href: str | None) -> str:
     return f"[{element_text_no_newlines}]({link_href})"
 
 
+def parse_html_with_trafilatura(html_content: str) -> str:
+    """Parse HTML content using trafilatura."""
+    config = use_config()
+    config.set("DEFAULT", "include_links", "True")
+    config.set("DEFAULT", "include_tables", "True")
+    config.set("DEFAULT", "include_images", "True")
+    config.set("DEFAULT", "include_formatting", "True")
+
+    extracted_text = trafilatura.extract(html_content, config=config)
+    return strip_excessive_newlines_and_spaces(extracted_text) if extracted_text else ""
+
+
 def format_document_soup(
     document: bs4.BeautifulSoup, table_cell_separator: str = "\t"
 ) -> str:
@@ -183,7 +201,21 @@ def web_html_cleanup(
     for undesired_tag in additional_element_types_to_discard:
         [tag.extract() for tag in soup.find_all(undesired_tag)]
 
-    # 200B is ZeroWidthSpace which we don't care for
-    page_text = format_document_soup(soup).replace("\u200B", "")
+    soup_string = str(soup)
+    page_text = ""
 
-    return ParsedHTML(title=title, cleaned_text=page_text)
+    if PARSE_WITH_TRAFILATURA:
+        try:
+            page_text = parse_html_with_trafilatura(soup_string)
+            if not page_text:
+                raise ValueError("Empty content returned by trafilatura.")
+        except Exception as e:
+            logger.info(f"Trafilatura parsing failed: {e}. Falling back on bs4.")
+            page_text = format_document_soup(soup)
+    else:
+        page_text = format_document_soup(soup)
+
+    # 200B is ZeroWidthSpace which we don't care for
+    cleaned_text = page_text.replace("\u200B", "")
+
+    return ParsedHTML(title=title, cleaned_text=cleaned_text)
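To see what the new trafilatura path produces, here is a standalone sketch of the same extraction settings applied outside the repo (the sample HTML and variable names are illustrative assumptions; trafilatura and its use_config helper come from the trafilatura package added to the requirements below):

import trafilatura  # type: ignore
from trafilatura.settings import use_config  # type: ignore

sample_html = """
<html><body><article>
  <h1>Release notes</h1>
  <p>See the <a href="/docs">documentation</a> for details.</p>
</article></body></html>
"""

config = use_config()
config.set("DEFAULT", "include_links", "True")
config.set("DEFAULT", "include_tables", "True")
config.set("DEFAULT", "include_images", "True")
config.set("DEFAULT", "include_formatting", "True")

# trafilatura.extract returns None when it cannot find main content, which is
# exactly the case web_html_cleanup treats as "fall back to format_document_soup".
extracted = trafilatura.extract(sample_html, config=config)
print(extracted if extracted else "(empty: the bs4 fallback would run here)")

Note that the diff keeps format_document_soup as both the default path and the error fallback, so leaving the flag unset preserves the previous behavior.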
@@ -15,7 +15,7 @@ from danswer.indexing.models import DocAwareChunk
 from danswer.natural_language_processing.utils import BaseTokenizer
 from danswer.utils.logger import setup_logger
 from danswer.utils.text_processing import shared_precompare_cleanup
+from shared_configs.configs import STRICT_CHUNK_TOKEN_LIMIT
 
 # Not supporting overlaps, we need a clean combination of chunks and it is unclear if overlaps
 # actually help quality at all
@@ -158,6 +158,24 @@ class Chunker:
             else None
         )
 
+    def _split_oversized_chunk(self, text: str, content_token_limit: int) -> list[str]:
+        """
+        Splits the text into smaller chunks based on token count to ensure
+        no chunk exceeds the content_token_limit.
+        """
+        tokens = self.tokenizer.tokenize(text)
+        chunks = []
+        start = 0
+        total_tokens = len(tokens)
+        while start < total_tokens:
+            end = min(start + content_token_limit, total_tokens)
+            token_chunk = tokens[start:end]
+            # Join the tokens to reconstruct the text
+            chunk_text = " ".join(token_chunk)
+            chunks.append(chunk_text)
+            start = end
+        return chunks
+
     def _extract_blurb(self, text: str) -> str:
         texts = self.blurb_splitter.split_text(text)
         if not texts:
@@ -218,14 +236,42 @@ class Chunker:
                 chunk_text = ""
 
             split_texts = self.chunk_splitter.split_text(section_text)
 
             for i, split_text in enumerate(split_texts):
-                chunks.append(
-                    _create_chunk(
-                        text=split_text,
-                        links={0: section_link_text},
-                        is_continuation=(i != 0),
-                    )
-                )
+                split_token_count = len(self.tokenizer.tokenize(split_text))
+
+                if STRICT_CHUNK_TOKEN_LIMIT:
+                    split_token_count = len(self.tokenizer.tokenize(split_text))
+                    if split_token_count > content_token_limit:
+                        # Further split the oversized chunk
+                        smaller_chunks = self._split_oversized_chunk(
+                            split_text, content_token_limit
+                        )
+                        for i, small_chunk in enumerate(smaller_chunks):
+                            chunks.append(
+                                _create_chunk(
+                                    text=small_chunk,
+                                    links={0: section_link_text},
+                                    is_continuation=(i != 0),
+                                )
+                            )
+                    else:
+                        chunks.append(
+                            _create_chunk(
+                                text=split_text,
+                                links={0: section_link_text},
+                            )
+                        )
+
+                else:
+                    chunks.append(
+                        _create_chunk(
+                            text=split_text,
+                            links={0: section_link_text},
+                            is_continuation=(i != 0),
+                        )
+                    )
+
             continue
 
         current_token_count = len(self.tokenizer.tokenize(chunk_text))
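The windowing in _split_oversized_chunk is easiest to see in isolation. A minimal sketch, using a plain whitespace tokenizer as a stand-in for the repo's BaseTokenizer (an assumption for illustration only):

def split_oversized(text: str, content_token_limit: int) -> list[str]:
    # Stand-in for self.tokenizer.tokenize(text); real tokenizers may emit subword pieces.
    tokens = text.split()
    chunks: list[str] = []
    start = 0
    while start < len(tokens):
        end = min(start + content_token_limit, len(tokens))
        # Re-joining tokens with spaces mirrors the method above; it keeps every chunk
        # under the limit but can be lossy for tokenizers that split inside words.
        chunks.append(" ".join(tokens[start:end]))
        start = end
    return chunks

print(split_oversized("one two three four five six seven", content_token_limit=3))
# ['one two three', 'four five six', 'seven']

In the chunker itself this path only runs when STRICT_CHUNK_TOKEN_LIMIT is enabled and a piece from the sentence splitter still exceeds content_token_limit.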
@@ -25,10 +25,13 @@ httpx-oauth==0.15.1
 huggingface-hub==0.20.1
 jira==3.5.1
 jsonref==1.1.0
+trafilatura==1.12.2
 langchain==0.1.17
 langchain-core==0.1.50
 langchain-text-splitters==0.0.1
 litellm==1.48.7
+lxml==5.3.0
+lxml_html_clean==0.2.2
 llama-index==0.9.45
 Mako==1.2.4
 msal==1.28.0
@@ -21,4 +21,7 @@ types-regex==2023.3.23.1
 types-requests==2.28.11.17
 types-retry==0.9.9.3
 types-urllib3==1.26.25.11
+trafilatura==1.12.2
+lxml==5.3.0
+lxml_html_clean==0.2.2
 boto3-stubs[s3]==1.34.133
@@ -66,6 +66,10 @@ LOG_LEVEL = os.environ.get("LOG_LEVEL", "notice")
 # Only used for OpenAI
 OPENAI_EMBEDDING_TIMEOUT = int(os.environ.get("OPENAI_EMBEDDING_TIMEOUT", "600"))
 
+# Whether or not to strictly enforce token limit for chunking.
+STRICT_CHUNK_TOKEN_LIMIT = (
+    os.environ.get("STRICT_CHUNK_TOKEN_LIMIT", "").lower() == "true"
+)
 
 # Fields which should only be set on new search setting
 PRESERVED_SEARCH_FIELDS = [
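STRICT_CHUNK_TOKEN_LIMIT is the flag the chunker hunk imports from shared_configs. A compact sketch of how it gates the re-splitting step (the whitespace token count and limit value here are illustrative stand-ins, not repo values):

STRICT_CHUNK_TOKEN_LIMIT = True  # would normally come from the env var above

def needs_resplit(split_text: str, content_token_limit: int) -> bool:
    if not STRICT_CHUNK_TOKEN_LIMIT:
        # Non-strict mode keeps whatever the sentence splitter produced.
        return False
    # Strict mode re-splits any piece whose token count still exceeds the limit.
    return len(split_text.split()) > content_token_limit

print(needs_resplit("this split has five tokens", content_token_limit=3))  # True
print(needs_resplit("short split", content_token_limit=3))                 # False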