Mirror of https://github.com/danswer-ai/danswer.git
Parsing (#2734)
* k
* update chunking limits
* nit
* nit
* clean up types
* nit
* validate
* k
parent 65573210f1
commit a385234c0e
@@ -404,6 +404,8 @@ VESPA_REQUEST_TIMEOUT = int(os.environ.get("VESPA_REQUEST_TIMEOUT") or "5")
 SYSTEM_RECURSION_LIMIT = int(os.environ.get("SYSTEM_RECURSION_LIMIT") or "1000")
 
+PARSE_WITH_TRAFILATURA = os.environ.get("PARSE_WITH_TRAFILATURA", "").lower() == "true"
+
 #####
 # Enterprise Edition Configs
 #####
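For reference, the flag above follows the usual boolean env-var pattern: only the literal string "true" (in any casing) enables it; an unset, empty, or other value leaves it off. A minimal illustration, not part of the diff, using a made-up environment value:

import os

# Hypothetical value for illustration; any casing of "true" enables the flag.
os.environ["PARSE_WITH_TRAFILATURA"] = "True"

PARSE_WITH_TRAFILATURA = os.environ.get("PARSE_WITH_TRAFILATURA", "").lower() == "true"
print(PARSE_WITH_TRAFILATURA)  # True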
@@ -4,11 +4,17 @@ from dataclasses import dataclass
 from typing import IO
 
 import bs4
+import trafilatura  # type: ignore
+from trafilatura.settings import use_config  # type: ignore
 
 from danswer.configs.app_configs import HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY
+from danswer.configs.app_configs import PARSE_WITH_TRAFILATURA
 from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
 from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
 from danswer.file_processing.enums import HtmlBasedConnectorTransformLinksStrategy
+from danswer.utils.logger import setup_logger
+
+logger = setup_logger()
 
 MINTLIFY_UNWANTED = ["sticky", "hidden"]
@@ -47,6 +53,18 @@ def format_element_text(element_text: str, link_href: str | None) -> str:
     return f"[{element_text_no_newlines}]({link_href})"
 
 
+def parse_html_with_trafilatura(html_content: str) -> str:
+    """Parse HTML content using trafilatura."""
+    config = use_config()
+    config.set("DEFAULT", "include_links", "True")
+    config.set("DEFAULT", "include_tables", "True")
+    config.set("DEFAULT", "include_images", "True")
+    config.set("DEFAULT", "include_formatting", "True")
+
+    extracted_text = trafilatura.extract(html_content, config=config)
+    return strip_excessive_newlines_and_spaces(extracted_text) if extracted_text else ""
+
+
 def format_document_soup(
     document: bs4.BeautifulSoup, table_cell_separator: str = "\t"
 ) -> str:
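A minimal usage sketch of the new helper together with the bs4 fallback that web_html_cleanup uses further down. This is not part of the diff: the sample_html string is made up, and it assumes trafilatura and beautifulsoup4 are installed and that the two helpers above are in scope (e.g. imported from the module this hunk modifies).

import bs4

sample_html = (
    "<html><body><article>"
    "<h1>Release notes</h1>"
    "<p>See the <a href='https://example.com/docs'>docs</a> for details.</p>"
    "</article></body></html>"
)

text = parse_html_with_trafilatura(sample_html)
if not text:
    # Mirrors the fallback path: trafilatura can come back empty for very
    # small or boilerplate-only documents, so use the bs4-based formatter.
    text = format_document_soup(bs4.BeautifulSoup(sample_html, "html.parser"))
print(text)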
@@ -183,7 +201,21 @@ def web_html_cleanup(
     for undesired_tag in additional_element_types_to_discard:
         [tag.extract() for tag in soup.find_all(undesired_tag)]
 
-    # 200B is ZeroWidthSpace which we don't care for
-    page_text = format_document_soup(soup).replace("\u200B", "")
+    soup_string = str(soup)
+    page_text = ""
 
-    return ParsedHTML(title=title, cleaned_text=page_text)
+    if PARSE_WITH_TRAFILATURA:
+        try:
+            page_text = parse_html_with_trafilatura(soup_string)
+            if not page_text:
+                raise ValueError("Empty content returned by trafilatura.")
+        except Exception as e:
+            logger.info(f"Trafilatura parsing failed: {e}. Falling back on bs4.")
+            page_text = format_document_soup(soup)
+    else:
+        page_text = format_document_soup(soup)
+
+    # 200B is ZeroWidthSpace which we don't care for
+    cleaned_text = page_text.replace("\u200B", "")
+
+    return ParsedHTML(title=title, cleaned_text=cleaned_text)
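The explicit empty-content check above exists because trafilatura.extract returns None when it finds no extractable body text; raising inside the try block routes that case through the same bs4 fallback as a hard failure. A throwaway illustration, not part of the diff (the HTML is made up, and the None result is the expected outcome rather than a guarantee):

import trafilatura

nav_only_html = "<html><body><nav>Home | About | Contact</nav></body></html>"

result = trafilatura.extract(nav_only_html)
print(result)  # expected: None, which web_html_cleanup treats as "fall back to format_document_soup"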
@@ -15,7 +15,7 @@ from danswer.indexing.models import DocAwareChunk
 from danswer.natural_language_processing.utils import BaseTokenizer
 from danswer.utils.logger import setup_logger
 from danswer.utils.text_processing import shared_precompare_cleanup
-
+from shared_configs.configs import STRICT_CHUNK_TOKEN_LIMIT
 
 # Not supporting overlaps, we need a clean combination of chunks and it is unclear if overlaps
 # actually help quality at all
@@ -158,6 +158,24 @@ class Chunker:
             else None
         )
 
+    def _split_oversized_chunk(self, text: str, content_token_limit: int) -> list[str]:
+        """
+        Splits the text into smaller chunks based on token count to ensure
+        no chunk exceeds the content_token_limit.
+        """
+        tokens = self.tokenizer.tokenize(text)
+        chunks = []
+        start = 0
+        total_tokens = len(tokens)
+        while start < total_tokens:
+            end = min(start + content_token_limit, total_tokens)
+            token_chunk = tokens[start:end]
+            # Join the tokens to reconstruct the text
+            chunk_text = " ".join(token_chunk)
+            chunks.append(chunk_text)
+            start = end
+        return chunks
+
     def _extract_blurb(self, text: str) -> str:
         texts = self.blurb_splitter.split_text(text)
         if not texts:
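To make the splitting behaviour concrete, here is a self-contained sketch that swaps a hypothetical whitespace tokenizer in for Danswer's BaseTokenizer; it is not part of the diff. Note the trade-off the method accepts: re-joining tokens with spaces reconstructs the text exactly for whitespace tokenization, but is only an approximation for subword tokenizers.

def split_oversized(text: str, content_token_limit: int) -> list[str]:
    # Whitespace "tokenizer" standing in for self.tokenizer.tokenize(...)
    tokens = text.split()
    return [
        " ".join(tokens[start : start + content_token_limit])
        for start in range(0, len(tokens), content_token_limit)
    ]

print(split_oversized("one two three four five", content_token_limit=2))
# ['one two', 'three four', 'five']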
@@ -218,14 +236,42 @@ class Chunker:
                     chunk_text = ""
 
                 split_texts = self.chunk_splitter.split_text(section_text)
 
                 for i, split_text in enumerate(split_texts):
-                    chunks.append(
-                        _create_chunk(
-                            text=split_text,
-                            links={0: section_link_text},
-                            is_continuation=(i != 0),
-                        )
-                    )
+                    split_token_count = len(self.tokenizer.tokenize(split_text))
+
+                    if STRICT_CHUNK_TOKEN_LIMIT:
+                        split_token_count = len(self.tokenizer.tokenize(split_text))
+                        if split_token_count > content_token_limit:
+                            # Further split the oversized chunk
+                            smaller_chunks = self._split_oversized_chunk(
+                                split_text, content_token_limit
+                            )
+                            for i, small_chunk in enumerate(smaller_chunks):
+                                chunks.append(
+                                    _create_chunk(
+                                        text=small_chunk,
+                                        links={0: section_link_text},
+                                        is_continuation=(i != 0),
+                                    )
+                                )
+                        else:
+                            chunks.append(
+                                _create_chunk(
+                                    text=split_text,
+                                    links={0: section_link_text},
+                                )
+                            )
+
+                    else:
+                        chunks.append(
+                            _create_chunk(
+                                text=split_text,
+                                links={0: section_link_text},
+                                is_continuation=(i != 0),
+                            )
+                        )
 
                 continue
 
             current_token_count = len(self.tokenizer.tokenize(chunk_text))
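A compact sketch of the per-split decision introduced above: with strict enforcement on, any split that still exceeds the content token limit is re-split; otherwise it is emitted as a single chunk. This is illustrative only; the whitespace tokenizer and function name are stand-ins, not Danswer APIs.

def strict_chunk(split_text: str, limit: int, strict: bool) -> list[str]:
    tokens = split_text.split()  # stand-in for self.tokenizer.tokenize(split_text)
    if strict and len(tokens) > limit:
        return [" ".join(tokens[i : i + limit]) for i in range(0, len(tokens), limit)]
    return [split_text]

print(strict_chunk("alpha beta gamma delta", limit=3, strict=True))   # ['alpha beta gamma', 'delta']
print(strict_chunk("alpha beta gamma delta", limit=3, strict=False))  # ['alpha beta gamma delta']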
@@ -25,10 +25,13 @@ httpx-oauth==0.15.1
 huggingface-hub==0.20.1
 jira==3.5.1
 jsonref==1.1.0
+trafilatura==1.12.2
 langchain==0.1.17
 langchain-core==0.1.50
 langchain-text-splitters==0.0.1
 litellm==1.48.7
+lxml==5.3.0
+lxml_html_clean==0.2.2
 llama-index==0.9.45
 Mako==1.2.4
 msal==1.28.0
@@ -21,4 +21,7 @@ types-regex==2023.3.23.1
 types-requests==2.28.11.17
 types-retry==0.9.9.3
 types-urllib3==1.26.25.11
+trafilatura==1.12.2
+lxml==5.3.0
+lxml_html_clean==0.2.2
 boto3-stubs[s3]==1.34.133
@@ -66,6 +66,10 @@ LOG_LEVEL = os.environ.get("LOG_LEVEL", "notice")
 # Only used for OpenAI
 OPENAI_EMBEDDING_TIMEOUT = int(os.environ.get("OPENAI_EMBEDDING_TIMEOUT", "600"))
 
+# Whether or not to strictly enforce token limit for chunking.
+STRICT_CHUNK_TOKEN_LIMIT = (
+    os.environ.get("STRICT_CHUNK_TOKEN_LIMIT", "").lower() == "true"
+)
 
 # Fields which should only be set on new search setting
 PRESERVED_SEARCH_FIELDS = [