Mirror of https://github.com/danswer-ai/danswer.git, synced 2025-10-09 20:55:06 +02:00
Parsing (#2734)

* k
* update chunking limits
* nit
* nit
* clean up types
* nit
* validate
* k
@@ -404,6 +404,8 @@ VESPA_REQUEST_TIMEOUT = int(os.environ.get("VESPA_REQUEST_TIMEOUT") or "5")
 
 SYSTEM_RECURSION_LIMIT = int(os.environ.get("SYSTEM_RECURSION_LIMIT") or "1000")
 
+PARSE_WITH_TRAFILATURA = os.environ.get("PARSE_WITH_TRAFILATURA", "").lower() == "true"
+
 #####
 # Enterprise Edition Configs
 #####
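The hunk above introduces the PARSE_WITH_TRAFILATURA application config, which the HTML cleanup code in the next hunk reads. As a quick illustration of the env-var pattern it uses (a standalone sketch, not part of the diff; read_bool_flag is a hypothetical helper), only a literal "true"/"True" value enables the flag, while unset or any other value leaves it off:

import os

def read_bool_flag(name: str) -> bool:
    # Same pattern as the config line above: case-insensitive match against "true".
    return os.environ.get(name, "").lower() == "true"

os.environ["PARSE_WITH_TRAFILATURA"] = "True"
print(read_bool_flag("PARSE_WITH_TRAFILATURA"))  # True

os.environ["PARSE_WITH_TRAFILATURA"] = "1"
print(read_bool_flag("PARSE_WITH_TRAFILATURA"))  # False: only "true" counts, not "1"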
@@ -4,11 +4,17 @@ from dataclasses import dataclass
 from typing import IO
 
 import bs4
+import trafilatura  # type: ignore
+from trafilatura.settings import use_config  # type: ignore
 
 from danswer.configs.app_configs import HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY
+from danswer.configs.app_configs import PARSE_WITH_TRAFILATURA
 from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
 from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
 from danswer.file_processing.enums import HtmlBasedConnectorTransformLinksStrategy
+from danswer.utils.logger import setup_logger
+
+logger = setup_logger()
 
 
 MINTLIFY_UNWANTED = ["sticky", "hidden"]
@@ -47,6 +53,18 @@ def format_element_text(element_text: str, link_href: str | None) -> str:
     return f"[{element_text_no_newlines}]({link_href})"
 
 
+def parse_html_with_trafilatura(html_content: str) -> str:
+    """Parse HTML content using trafilatura."""
+    config = use_config()
+    config.set("DEFAULT", "include_links", "True")
+    config.set("DEFAULT", "include_tables", "True")
+    config.set("DEFAULT", "include_images", "True")
+    config.set("DEFAULT", "include_formatting", "True")
+
+    extracted_text = trafilatura.extract(html_content, config=config)
+    return strip_excessive_newlines_and_spaces(extracted_text) if extracted_text else ""
+
+
 def format_document_soup(
     document: bs4.BeautifulSoup, table_cell_separator: str = "\t"
 ) -> str:
@@ -183,7 +201,21 @@ def web_html_cleanup(
     for undesired_tag in additional_element_types_to_discard:
         [tag.extract() for tag in soup.find_all(undesired_tag)]
 
-    # 200B is ZeroWidthSpace which we don't care for
-    page_text = format_document_soup(soup).replace("\u200B", "")
+    soup_string = str(soup)
+    page_text = ""
 
-    return ParsedHTML(title=title, cleaned_text=page_text)
+    if PARSE_WITH_TRAFILATURA:
+        try:
+            page_text = parse_html_with_trafilatura(soup_string)
+            if not page_text:
+                raise ValueError("Empty content returned by trafilatura.")
+        except Exception as e:
+            logger.info(f"Trafilatura parsing failed: {e}. Falling back on bs4.")
+            page_text = format_document_soup(soup)
+    else:
+        page_text = format_document_soup(soup)
+
+    # 200B is ZeroWidthSpace which we don't care for
+    cleaned_text = page_text.replace("\u200B", "")
+
+    return ParsedHTML(title=title, cleaned_text=cleaned_text)
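To see what the new trafilatura path produces, here is a standalone sketch of the same extraction settings applied outside the repo (the sample HTML and variable names are illustrative assumptions; trafilatura and its use_config helper come from the trafilatura package added to the requirements below):

import trafilatura  # type: ignore
from trafilatura.settings import use_config  # type: ignore

sample_html = """
<html><body><article>
  <h1>Release notes</h1>
  <p>See the <a href="/docs">documentation</a> for details.</p>
</article></body></html>
"""

config = use_config()
config.set("DEFAULT", "include_links", "True")
config.set("DEFAULT", "include_tables", "True")
config.set("DEFAULT", "include_images", "True")
config.set("DEFAULT", "include_formatting", "True")

# trafilatura.extract returns None when it cannot find main content, which is
# exactly the case web_html_cleanup treats as "fall back to format_document_soup".
extracted = trafilatura.extract(sample_html, config=config)
print(extracted if extracted else "(empty: the bs4 fallback would run here)")

Note that the diff keeps format_document_soup as both the default path and the error fallback, so leaving the flag unset preserves the previous behavior.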
@@ -15,7 +15,7 @@ from danswer.indexing.models import DocAwareChunk
 from danswer.natural_language_processing.utils import BaseTokenizer
 from danswer.utils.logger import setup_logger
 from danswer.utils.text_processing import shared_precompare_cleanup
+from shared_configs.configs import STRICT_CHUNK_TOKEN_LIMIT
 
 # Not supporting overlaps, we need a clean combination of chunks and it is unclear if overlaps
 # actually help quality at all
@@ -158,6 +158,24 @@ class Chunker:
             else None
         )
 
+    def _split_oversized_chunk(self, text: str, content_token_limit: int) -> list[str]:
+        """
+        Splits the text into smaller chunks based on token count to ensure
+        no chunk exceeds the content_token_limit.
+        """
+        tokens = self.tokenizer.tokenize(text)
+        chunks = []
+        start = 0
+        total_tokens = len(tokens)
+        while start < total_tokens:
+            end = min(start + content_token_limit, total_tokens)
+            token_chunk = tokens[start:end]
+            # Join the tokens to reconstruct the text
+            chunk_text = " ".join(token_chunk)
+            chunks.append(chunk_text)
+            start = end
+        return chunks
+
     def _extract_blurb(self, text: str) -> str:
         texts = self.blurb_splitter.split_text(text)
         if not texts:
@@ -218,14 +236,42 @@ class Chunker:
                 chunk_text = ""
 
             split_texts = self.chunk_splitter.split_text(section_text)
 
             for i, split_text in enumerate(split_texts):
-                chunks.append(
-                    _create_chunk(
-                        text=split_text,
-                        links={0: section_link_text},
-                        is_continuation=(i != 0),
-                    )
-                )
+                split_token_count = len(self.tokenizer.tokenize(split_text))
+
+                if STRICT_CHUNK_TOKEN_LIMIT:
+                    split_token_count = len(self.tokenizer.tokenize(split_text))
+                    if split_token_count > content_token_limit:
+                        # Further split the oversized chunk
+                        smaller_chunks = self._split_oversized_chunk(
+                            split_text, content_token_limit
+                        )
+                        for i, small_chunk in enumerate(smaller_chunks):
+                            chunks.append(
+                                _create_chunk(
+                                    text=small_chunk,
+                                    links={0: section_link_text},
+                                    is_continuation=(i != 0),
+                                )
+                            )
+                    else:
+                        chunks.append(
+                            _create_chunk(
+                                text=split_text,
+                                links={0: section_link_text},
+                            )
+                        )
+
+                else:
+                    chunks.append(
+                        _create_chunk(
+                            text=split_text,
+                            links={0: section_link_text},
+                            is_continuation=(i != 0),
+                        )
+                    )
+
             continue
 
         current_token_count = len(self.tokenizer.tokenize(chunk_text))
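The windowing in _split_oversized_chunk is easiest to see in isolation. A minimal sketch, using a plain whitespace tokenizer as a stand-in for the repo's BaseTokenizer (an assumption for illustration only):

def split_oversized(text: str, content_token_limit: int) -> list[str]:
    # Stand-in for self.tokenizer.tokenize(text); real tokenizers may emit subword pieces.
    tokens = text.split()
    chunks: list[str] = []
    start = 0
    while start < len(tokens):
        end = min(start + content_token_limit, len(tokens))
        # Re-joining tokens with spaces mirrors the method above; it keeps every chunk
        # under the limit but can be lossy for tokenizers that split inside words.
        chunks.append(" ".join(tokens[start:end]))
        start = end
    return chunks

print(split_oversized("one two three four five six seven", content_token_limit=3))
# ['one two three', 'four five six', 'seven']

In the chunker itself this path only runs when STRICT_CHUNK_TOKEN_LIMIT is enabled and a piece from the sentence splitter still exceeds content_token_limit.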
@@ -25,10 +25,13 @@ httpx-oauth==0.15.1
 huggingface-hub==0.20.1
 jira==3.5.1
 jsonref==1.1.0
+trafilatura==1.12.2
 langchain==0.1.17
 langchain-core==0.1.50
 langchain-text-splitters==0.0.1
 litellm==1.48.7
+lxml==5.3.0
+lxml_html_clean==0.2.2
 llama-index==0.9.45
 Mako==1.2.4
 msal==1.28.0
@@ -21,4 +21,7 @@ types-regex==2023.3.23.1
 types-requests==2.28.11.17
 types-retry==0.9.9.3
 types-urllib3==1.26.25.11
+trafilatura==1.12.2
+lxml==5.3.0
+lxml_html_clean==0.2.2
 boto3-stubs[s3]==1.34.133
@@ -66,6 +66,10 @@ LOG_LEVEL = os.environ.get("LOG_LEVEL", "notice")
 # Only used for OpenAI
 OPENAI_EMBEDDING_TIMEOUT = int(os.environ.get("OPENAI_EMBEDDING_TIMEOUT", "600"))
 
+# Whether or not to strictly enforce token limit for chunking.
+STRICT_CHUNK_TOKEN_LIMIT = (
+    os.environ.get("STRICT_CHUNK_TOKEN_LIMIT", "").lower() == "true"
+)
 
 # Fields which should only be set on new search setting
 PRESERVED_SEARCH_FIELDS = [
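STRICT_CHUNK_TOKEN_LIMIT is the flag the chunker hunk imports from shared_configs. A compact sketch of how it gates the re-splitting step (the whitespace token count and limit value here are illustrative stand-ins, not repo values):

STRICT_CHUNK_TOKEN_LIMIT = True  # would normally come from the env var above

def needs_resplit(split_text: str, content_token_limit: int) -> bool:
    if not STRICT_CHUNK_TOKEN_LIMIT:
        # Non-strict mode keeps whatever the sentence splitter produced.
        return False
    # Strict mode re-splits any piece whose token count still exceeds the limit.
    return len(split_text.split()) > content_token_limit

print(needs_resplit("this split has five tokens", content_token_limit=3))  # True
print(needs_resplit("short split", content_token_limit=3))                 # False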