* k

* update chunking limits

* nit

* nit

* clean up types

* nit

* validate

* k
pablodanswer 2024-10-16 09:44:19 -07:00 committed by GitHub
parent 65573210f1
commit a385234c0e
6 changed files with 100 additions and 10 deletions


@@ -404,6 +404,8 @@ VESPA_REQUEST_TIMEOUT = int(os.environ.get("VESPA_REQUEST_TIMEOUT") or "5")
 SYSTEM_RECURSION_LIMIT = int(os.environ.get("SYSTEM_RECURSION_LIMIT") or "1000")
 
+PARSE_WITH_TRAFILATURA = os.environ.get("PARSE_WITH_TRAFILATURA", "").lower() == "true"
+
 #####
 # Enterprise Edition Configs
 #####


@@ -4,11 +4,17 @@ from dataclasses import dataclass
 from typing import IO
 
 import bs4
+import trafilatura  # type: ignore
+from trafilatura.settings import use_config  # type: ignore
 
 from danswer.configs.app_configs import HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY
+from danswer.configs.app_configs import PARSE_WITH_TRAFILATURA
 from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
 from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
 from danswer.file_processing.enums import HtmlBasedConnectorTransformLinksStrategy
+from danswer.utils.logger import setup_logger
 
+logger = setup_logger()
+
 MINTLIFY_UNWANTED = ["sticky", "hidden"]
@@ -47,6 +53,18 @@ def format_element_text(element_text: str, link_href: str | None) -> str:
     return f"[{element_text_no_newlines}]({link_href})"
 
 
+def parse_html_with_trafilatura(html_content: str) -> str:
+    """Parse HTML content using trafilatura."""
+    config = use_config()
+    config.set("DEFAULT", "include_links", "True")
+    config.set("DEFAULT", "include_tables", "True")
+    config.set("DEFAULT", "include_images", "True")
+    config.set("DEFAULT", "include_formatting", "True")
+
+    extracted_text = trafilatura.extract(html_content, config=config)
+    return strip_excessive_newlines_and_spaces(extracted_text) if extracted_text else ""
+
+
 def format_document_soup(
     document: bs4.BeautifulSoup, table_cell_separator: str = "\t"
 ) -> str:
@@ -183,7 +201,21 @@ def web_html_cleanup(
         for undesired_tag in additional_element_types_to_discard:
             [tag.extract() for tag in soup.find_all(undesired_tag)]
 
-    # 200B is ZeroWidthSpace which we don't care for
-    page_text = format_document_soup(soup).replace("\u200B", "")
+    soup_string = str(soup)
+    page_text = ""
 
-    return ParsedHTML(title=title, cleaned_text=page_text)
+    if PARSE_WITH_TRAFILATURA:
+        try:
+            page_text = parse_html_with_trafilatura(soup_string)
+            if not page_text:
+                raise ValueError("Empty content returned by trafilatura.")
+        except Exception as e:
+            logger.info(f"Trafilatura parsing failed: {e}. Falling back on bs4.")
+            page_text = format_document_soup(soup)
+    else:
+        page_text = format_document_soup(soup)
+
+    # 200B is ZeroWidthSpace which we don't care for
+    cleaned_text = page_text.replace("\u200B", "")
+
+    return ParsedHTML(title=title, cleaned_text=cleaned_text)
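
The change above replaces the unconditional bs4 pass with a try-trafilatura-first strategy and keeps bs4 as the fallback. Below is a minimal standalone sketch of the same pattern; it uses trafilatura's keyword arguments rather than the config object, and a plain get_text() call stands in for format_document_soup(), so the names and settings here are illustrative, not the connector's API:

from bs4 import BeautifulSoup
import trafilatura  # type: ignore

def extract_text(html: str) -> str:
    # trafilatura returns None when it cannot isolate a main content block;
    # treat that the same way as an exception and fall through to bs4.
    try:
        extracted = trafilatura.extract(html, include_links=True, include_tables=True)
        if extracted:
            return extracted
    except Exception:
        pass
    # Fallback: a crude bs4 text dump standing in for format_document_soup().
    return BeautifulSoup(html, "html.parser").get_text(separator="\n")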


@@ -15,7 +15,7 @@ from danswer.indexing.models import DocAwareChunk
 from danswer.natural_language_processing.utils import BaseTokenizer
 from danswer.utils.logger import setup_logger
 from danswer.utils.text_processing import shared_precompare_cleanup
+from shared_configs.configs import STRICT_CHUNK_TOKEN_LIMIT
 
 # Not supporting overlaps, we need a clean combination of chunks and it is unclear if overlaps
 # actually help quality at all
@@ -158,6 +158,24 @@ class Chunker:
             else None
         )
 
+    def _split_oversized_chunk(self, text: str, content_token_limit: int) -> list[str]:
+        """
+        Splits the text into smaller chunks based on token count to ensure
+        no chunk exceeds the content_token_limit.
+        """
+        tokens = self.tokenizer.tokenize(text)
+        chunks = []
+        start = 0
+        total_tokens = len(tokens)
+        while start < total_tokens:
+            end = min(start + content_token_limit, total_tokens)
+            token_chunk = tokens[start:end]
+            # Join the tokens to reconstruct the text
+            chunk_text = " ".join(token_chunk)
+            chunks.append(chunk_text)
+            start = end
+        return chunks
+
     def _extract_blurb(self, text: str) -> str:
         texts = self.blurb_splitter.split_text(text)
         if not texts:
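
The new helper walks the token list in fixed-size windows and rejoins each window with spaces; with sub-word tokenizers the rejoined text is an approximation of the original string. A minimal sketch of the same windowing, with a whitespace split standing in for the real tokenizer (the function name and tokenizer are illustrative):

def split_oversized_text(text: str, token_limit: int) -> list[str]:
    tokens = text.split()  # stand-in for BaseTokenizer.tokenize()
    # Fixed-size windows over the token list: every chunk holds exactly
    # token_limit tokens except possibly the last one.
    return [
        " ".join(tokens[start : start + token_limit])
        for start in range(0, len(tokens), token_limit)
    ]

print(split_oversized_text("one two three four five", 2))
# ['one two', 'three four', 'five']
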
@@ -218,14 +236,42 @@
                     chunk_text = ""
 
                 split_texts = self.chunk_splitter.split_text(section_text)
                 for i, split_text in enumerate(split_texts):
-                    chunks.append(
-                        _create_chunk(
-                            text=split_text,
-                            links={0: section_link_text},
-                            is_continuation=(i != 0),
-                        )
-                    )
+                    split_token_count = len(self.tokenizer.tokenize(split_text))
+                    if STRICT_CHUNK_TOKEN_LIMIT:
+                        split_token_count = len(self.tokenizer.tokenize(split_text))
+                        if split_token_count > content_token_limit:
+                            # Further split the oversized chunk
+                            smaller_chunks = self._split_oversized_chunk(
+                                split_text, content_token_limit
+                            )
+                            for i, small_chunk in enumerate(smaller_chunks):
+                                chunks.append(
+                                    _create_chunk(
+                                        text=small_chunk,
+                                        links={0: section_link_text},
+                                        is_continuation=(i != 0),
+                                    )
+                                )
+                        else:
+                            chunks.append(
+                                _create_chunk(
+                                    text=split_text,
+                                    links={0: section_link_text},
+                                )
+                            )
+                    else:
+                        chunks.append(
+                            _create_chunk(
+                                text=split_text,
+                                links={0: section_link_text},
+                                is_continuation=(i != 0),
+                            )
+                        )
                 continue
 
             current_token_count = len(self.tokenizer.tokenize(chunk_text))
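
With STRICT_CHUNK_TOKEN_LIMIT enabled, a split that still exceeds the token budget is re-split at the token level; otherwise the splitter's output is kept whole. A small self-contained sketch of that branch, with whitespace tokenization and hypothetical helper names standing in for the Chunker internals:

def count_tokens(text: str) -> int:
    return len(text.split())  # stand-in for self.tokenizer.tokenize()

def resplit(text: str, limit: int) -> list[str]:
    tokens = text.split()
    return [" ".join(tokens[i : i + limit]) for i in range(0, len(tokens), limit)]

def enforce_limit(split_text: str, limit: int, strict: bool) -> list[str]:
    # Mirror of the branch above: re-split only when strict mode is on and the
    # chunk splitter's output is still over the token limit.
    if strict and count_tokens(split_text) > limit:
        return resplit(split_text, limit)
    return [split_text]

print(enforce_limit("a b c d e", limit=2, strict=True))   # ['a b', 'c d', 'e']
print(enforce_limit("a b c d e", limit=2, strict=False))  # ['a b c d e']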


@@ -25,10 +25,13 @@ httpx-oauth==0.15.1
 huggingface-hub==0.20.1
 jira==3.5.1
 jsonref==1.1.0
+trafilatura==1.12.2
 langchain==0.1.17
 langchain-core==0.1.50
 langchain-text-splitters==0.0.1
 litellm==1.48.7
+lxml==5.3.0
+lxml_html_clean==0.2.2
 llama-index==0.9.45
 Mako==1.2.4
 msal==1.28.0


@@ -21,4 +21,7 @@ types-regex==2023.3.23.1
 types-requests==2.28.11.17
 types-retry==0.9.9.3
 types-urllib3==1.26.25.11
+trafilatura==1.12.2
+lxml==5.3.0
+lxml_html_clean==0.2.2
 boto3-stubs[s3]==1.34.133


@@ -66,6 +66,10 @@ LOG_LEVEL = os.environ.get("LOG_LEVEL", "notice")
 # Only used for OpenAI
 OPENAI_EMBEDDING_TIMEOUT = int(os.environ.get("OPENAI_EMBEDDING_TIMEOUT", "600"))
 
+# Whether or not to strictly enforce token limit for chunking.
+STRICT_CHUNK_TOKEN_LIMIT = (
+    os.environ.get("STRICT_CHUNK_TOKEN_LIMIT", "").lower() == "true"
+)
 
 # Fields which should only be set on new search setting
 PRESERVED_SEARCH_FIELDS = [
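
Both new settings in this commit (PARSE_WITH_TRAFILATURA above and STRICT_CHUNK_TOKEN_LIMIT here) follow the same environment-variable convention: the feature stays off unless the variable is a case-insensitive "true". A short sketch of that convention; the env_flag helper is illustrative, not part of the codebase:

import os

def env_flag(name: str) -> bool:
    # Off unless the variable is exactly "true", case-insensitively; "1", "yes",
    # or an unset variable all leave the feature disabled.
    return os.environ.get(name, "").lower() == "true"

os.environ["STRICT_CHUNK_TOKEN_LIMIT"] = "True"
print(env_flag("STRICT_CHUNK_TOKEN_LIMIT"))  # True
print(env_flag("PARSE_WITH_TRAFILATURA"))    # False when unset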