mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-10-09 20:55:06 +02:00
Transform HTML links to markdown behind config option (#1671)
This commit is contained in:
@@ -4,6 +4,7 @@ import urllib.parse
|
|||||||
|
|
||||||
from danswer.configs.constants import AuthType
|
from danswer.configs.constants import AuthType
|
||||||
from danswer.configs.constants import DocumentIndexType
|
from danswer.configs.constants import DocumentIndexType
|
||||||
|
from danswer.file_processing.enums import HtmlBasedConnectorTransformLinksStrategy
|
||||||
|
|
||||||
#####
|
#####
|
||||||
# App Configs
|
# App Configs
|
||||||
@@ -160,6 +161,11 @@ WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_S
|
|||||||
WEB_CONNECTOR_OAUTH_TOKEN_URL = os.environ.get("WEB_CONNECTOR_OAUTH_TOKEN_URL")
|
WEB_CONNECTOR_OAUTH_TOKEN_URL = os.environ.get("WEB_CONNECTOR_OAUTH_TOKEN_URL")
|
||||||
WEB_CONNECTOR_VALIDATE_URLS = os.environ.get("WEB_CONNECTOR_VALIDATE_URLS")
|
WEB_CONNECTOR_VALIDATE_URLS = os.environ.get("WEB_CONNECTOR_VALIDATE_URLS")
|
||||||
|
|
||||||
|
# Strategy for handling links found in HTML content. Read from the
# HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY environment variable and
# defaults to HtmlBasedConnectorTransformLinksStrategy.STRIP when unset.
# The enum's members are "strip" (remove links entirely) and "markdown"
# (turn HTML links into markdown links).
# NOTE(review): when the variable is set, the raw string is stored as-is with
# no validation or case normalization — confirm that consumers only compare
# it against the enum members.
HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY = os.environ.get(
    "HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY",
    HtmlBasedConnectorTransformLinksStrategy.STRIP,
)
|
||||||
|
|
||||||
NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP = (
|
NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP = (
|
||||||
os.environ.get("NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP", "").lower()
|
os.environ.get("NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP", "").lower()
|
||||||
== "true"
|
== "true"
|
||||||
|
8
backend/danswer/file_processing/enums.py
Normal file
8
backend/danswer/file_processing/enums.py
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
|
||||||
|
class HtmlBasedConnectorTransformLinksStrategy(str, Enum):
    """Available strategies for handling links found in HTML content.

    Members subclass ``str``, so each one compares equal to its plain string
    value (for example a value read from an environment variable).
    """

    STRIP = "strip"  # remove links entirely
    MARKDOWN = "markdown"  # turn HTML links into markdown links
|
@@ -5,8 +5,10 @@ from typing import IO
|
|||||||
|
|
||||||
import bs4
|
import bs4
|
||||||
|
|
||||||
|
from danswer.configs.app_configs import HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY
|
||||||
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
|
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
|
||||||
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
|
from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
|
||||||
|
from danswer.file_processing.enums import HtmlBasedConnectorTransformLinksStrategy
|
||||||
|
|
||||||
MINTLIFY_UNWANTED = ["sticky", "hidden"]
|
MINTLIFY_UNWANTED = ["sticky", "hidden"]
|
||||||
|
|
||||||
@@ -32,6 +34,19 @@ def strip_newlines(document: str) -> str:
|
|||||||
return re.sub(r"[\n\r]+", " ", document)
|
return re.sub(r"[\n\r]+", " ", document)
|
||||||
|
|
||||||
|
|
||||||
|
def format_element_text(element_text: str, link_href: str | None) -> str:
    """Return the text of an element, optionally rendered as a markdown link.

    Newlines are always removed from ``element_text`` via ``strip_newlines``.
    The flattened text is returned unchanged when there is no ``link_href``
    or when the configured link strategy is ``STRIP``; otherwise it is
    wrapped as ``[text](href)``.

    Args:
        element_text: Raw text content of the element.
        link_href: Link target associated with the text, or ``None`` (or an
            empty string) when there is none.

    Returns:
        The newline-free text, either plain or as a markdown link.
    """
    flattened = strip_newlines(element_text)

    # No link target: nothing to wrap.
    if not link_href:
        return flattened

    # Links are configured to be dropped: keep only the visible text.
    if (
        HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY
        == HtmlBasedConnectorTransformLinksStrategy.STRIP
    ):
        return flattened

    return f"[{flattened}]({link_href})"
|
||||||
|
|
||||||
|
|
||||||
def format_document_soup(
|
def format_document_soup(
|
||||||
document: bs4.BeautifulSoup, table_cell_separator: str = "\t"
|
document: bs4.BeautifulSoup, table_cell_separator: str = "\t"
|
||||||
) -> str:
|
) -> str:
|
||||||
@@ -49,6 +64,8 @@ def format_document_soup(
|
|||||||
verbatim_output = 0
|
verbatim_output = 0
|
||||||
in_table = False
|
in_table = False
|
||||||
last_added_newline = False
|
last_added_newline = False
|
||||||
|
link_href: str | None = None
|
||||||
|
|
||||||
for e in document.descendants:
|
for e in document.descendants:
|
||||||
verbatim_output -= 1
|
verbatim_output -= 1
|
||||||
if isinstance(e, bs4.element.NavigableString):
|
if isinstance(e, bs4.element.NavigableString):
|
||||||
@@ -71,7 +88,7 @@ def format_document_soup(
|
|||||||
content_to_add = (
|
content_to_add = (
|
||||||
element_text
|
element_text
|
||||||
if verbatim_output > 0
|
if verbatim_output > 0
|
||||||
else strip_newlines(element_text)
|
else format_element_text(element_text, link_href)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Don't join separate elements without any spacing
|
# Don't join separate elements without any spacing
|
||||||
@@ -98,7 +115,14 @@ def format_document_soup(
|
|||||||
elif in_table:
|
elif in_table:
|
||||||
# don't handle other cases while in table
|
# don't handle other cases while in table
|
||||||
pass
|
pass
|
||||||
|
elif e.name == "a":
|
||||||
|
href_value = e.get("href", None)
|
||||||
|
# mostly for typing, having multiple hrefs is not valid HTML
|
||||||
|
link_href = (
|
||||||
|
href_value[0] if isinstance(href_value, list) else href_value
|
||||||
|
)
|
||||||
|
elif e.name == "/a":
|
||||||
|
link_href = None
|
||||||
elif e.name in ["p", "div"]:
|
elif e.name in ["p", "div"]:
|
||||||
if not list_element_start:
|
if not list_element_start:
|
||||||
text += "\n"
|
text += "\n"
|
||||||
|
Reference in New Issue
Block a user