diff --git a/backend/danswer/connectors/bookstack/connector.py b/backend/danswer/connectors/bookstack/connector.py
index 1571a66c4..1bc8d3f9e 100644
--- a/backend/danswer/connectors/bookstack/connector.py
+++ b/backend/danswer/connectors/bookstack/connector.py
@@ -7,6 +7,7 @@ from typing import Any
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
 from danswer.connectors.bookstack.client import BookStackApiClient
+from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector
@@ -14,7 +15,6 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch
 from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
-from danswer.utils.text_processing import parse_html_page_basic


 class BookstackConnector(LoadConnector, PollConnector):
diff --git a/backend/danswer/connectors/confluence/connector.py b/backend/danswer/connectors/confluence/connector.py
index 78bafc299..c75e6099c 100644
--- a/backend/danswer/connectors/confluence/connector.py
+++ b/backend/danswer/connectors/confluence/connector.py
@@ -1,4 +1,3 @@
-import os
 from collections.abc import Callable
 from collections.abc import Collection
 from datetime import datetime
@@ -13,6 +12,7 @@ from requests import HTTPError
 from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector
@@ -21,7 +21,6 @@ from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger
-from danswer.utils.text_processing import parse_html_page_basic

 logger = setup_logger()

@@ -266,6 +265,8 @@ class ConfluenceConnector(LoadConnector, PollConnector):


 if __name__ == "__main__":
+    import os
+
     connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
     connector.load_credentials(
         {
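Both connectors now import parse_html_page_basic from cross_connector_utils instead of danswer.utils.text_processing; only the import location changes. A minimal sketch of the call pattern this import serves, assuming a hypothetical payload dict (the "card" keys below are illustrative, not any real API schema):

    from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
    from danswer.connectors.models import Section

    def card_to_section(card: dict) -> Section:
        # flatten the HTML body to plain text before building an indexable Section
        return Section(link=card["url"], text=parse_html_page_basic(card["html"]))
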
diff --git a/backend/danswer/connectors/cross_connector_utils/html_utils.py b/backend/danswer/connectors/cross_connector_utils/html_utils.py
index ef860fe1f..550658b47 100644
--- a/backend/danswer/connectors/cross_connector_utils/html_utils.py
+++ b/backend/danswer/connectors/cross_connector_utils/html_utils.py
@@ -1,11 +1,11 @@
+import re
 from copy import copy
 from dataclasses import dataclass

-from bs4 import BeautifulSoup
+import bs4

 from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
 from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
-from danswer.utils.text_processing import format_document_soup

 MINTLIFY_UNWANTED = ["sticky", "hidden"]

@@ -16,13 +16,96 @@ class ParsedHTML:
     cleaned_text: str


-def standard_html_cleanup(
-    page_content: str | BeautifulSoup,
+def strip_excessive_newlines_and_spaces(document: str) -> str:
+    # collapse repeated spaces into one
+    document = re.sub(r" +", " ", document)
+    # remove trailing spaces
+    document = re.sub(r" +[\n\r]", "\n", document)
+    # remove repeated newlines
+    document = re.sub(r"[\n\r]+", "\n", document)
+    return document.strip()
+
+
+def strip_newlines(document: str) -> str:
+    # HTML might contain newlines which are just whitespaces to a browser
+    return re.sub(r"[\n\r]+", " ", document)
+
+
+def format_document_soup(
+    document: bs4.BeautifulSoup, table_cell_separator: str = "\t"
+) -> str:
+    """Format html to a flat text document.
+
+    The following goals:
+    - Newlines from within the HTML are removed (as browser would ignore them as well).
+    - Repeated newlines/spaces are removed (as browsers would ignore them).
+    - Newlines only before and after headlines and paragraphs or when explicit (br or pre tag)
+    - Table columns/rows are separated by newline
+    - List elements are separated by newline and start with a hyphen
+    """
+    text = ""
+    list_element_start = False
+    verbatim_output = 0
+    in_table = False
+    for e in document.descendants:
+        verbatim_output -= 1
+        if isinstance(e, bs4.element.NavigableString):
+            if isinstance(e, (bs4.element.Comment, bs4.element.Doctype)):
+                continue
+            element_text = e.text
+            if in_table:
+                # Tables are represented in natural language with rows separated by newlines
+                # Can't have newlines then in the table elements
+                element_text = element_text.replace("\n", " ").strip()
+            if element_text:
+                if verbatim_output > 0:
+                    text += element_text
+                else:
+                    text += strip_newlines(element_text)
+                list_element_start = False
+        elif isinstance(e, bs4.element.Tag):
+            # table is standard HTML element
+            if e.name == "table":
+                in_table = True
+            # tr is for rows
+            elif e.name == "tr" and in_table:
+                text += "\n"
+            # td for data cell, th for header
+            elif e.name in ["td", "th"] and in_table:
+                text += table_cell_separator
+            elif e.name == "/table":
+                in_table = False
+            elif in_table:
+                # don't handle other cases while in table
+                pass
+
+            elif e.name in ["p", "div"]:
+                if not list_element_start:
+                    text += "\n"
+            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
+                text += "\n"
+                list_element_start = False
+            elif e.name == "li":
+                text += "\n- "
+                list_element_start = True
+            elif e.name == "pre":
+                if verbatim_output <= 0:
+                    verbatim_output = len(list(e.childGenerator()))
+    return strip_excessive_newlines_and_spaces(text)
+
+
+def parse_html_page_basic(text: str) -> str:
+    soup = bs4.BeautifulSoup(text, "html.parser")
+    return format_document_soup(soup)
+
+
+def web_html_cleanup(
+    page_content: str | bs4.BeautifulSoup,
     mintlify_cleanup_enabled: bool = True,
     additional_element_types_to_discard: list[str] | None = None,
 ) -> ParsedHTML:
     if isinstance(page_content, str):
-        soup = BeautifulSoup(page_content, "html.parser")
+        soup = bs4.BeautifulSoup(page_content, "html.parser")
     else:
         soup = page_content

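A worked example of the table handling that this hunk introduces: a newline is emitted before each row and the separator before each cell, which is why the unit test later in this patch expects a string beginning with "\n\t". This sketch is hand-traced against the code above, not taken from the PR itself:

    from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic

    html = (
        "<table>"
        "<tr><td>hello</td><td>there</td></tr>"
        "<tr><td>a</td><td>b</td></tr>"
        "</table>"
    )
    # the leading newline/tab get removed by strip_excessive_newlines_and_spaces,
    # which strip()s the final result
    print(repr(parse_html_page_basic(html)))  # 'hello\tthere\n\ta\tb'
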
diff --git a/backend/danswer/connectors/google_site/connector.py b/backend/danswer/connectors/google_site/connector.py
index f064db98f..d7fd8c99a 100644
--- a/backend/danswer/connectors/google_site/connector.py
+++ b/backend/danswer/connectors/google_site/connector.py
@@ -10,7 +10,7 @@ from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
 from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
 from danswer.connectors.cross_connector_utils.file_utils import read_file
-from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup
+from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
@@ -101,7 +101,7 @@ class GoogleSitesConnector(LoadConnector):
                 div.extract()

             # get the body of the page
-            parsed_html = standard_html_cleanup(
+            parsed_html = web_html_cleanup(
                 soup, additional_element_types_to_discard=["header", "nav"]
             )

diff --git a/backend/danswer/connectors/guru/connector.py b/backend/danswer/connectors/guru/connector.py
index 0b6e7040a..c9e9fb6d0 100644
--- a/backend/danswer/connectors/guru/connector.py
+++ b/backend/danswer/connectors/guru/connector.py
@@ -7,6 +7,7 @@ import requests

 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector
@@ -15,7 +16,6 @@ from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger
-from danswer.utils.text_processing import parse_html_page_basic

 # Potential Improvements
 # 1. Support fetching per collection via collection token (configured at connector creation)
diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py
index 3d56f5c43..ddb02490c 100644
--- a/backend/danswer/connectors/web/connector.py
+++ b/backend/danswer/connectors/web/connector.py
@@ -21,7 +21,7 @@ from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_ID
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
 from danswer.configs.constants import DocumentSource
-from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup
+from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
@@ -218,7 +218,7 @@ class WebConnector(LoadConnector):
                 if link not in visited_links:
                     to_visit.append(link)

-                parsed_html = standard_html_cleanup(soup, self.mintlify_cleanup)
+                parsed_html = web_html_cleanup(soup, self.mintlify_cleanup)

                 doc_batch.append(
                     Document(
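standard_html_cleanup is renamed to web_html_cleanup at every call site; the signature is unchanged. A sketch of the call shape, assuming the pre-existing discard logic behind additional_element_types_to_discard (not shown in these hunks) removes the named elements before formatting, as the Google Sites call above suggests:

    from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup

    raw = "<html><body><nav>site menu</nav><p>actual content</p></body></html>"
    # accepts either a raw HTML string or an already-built bs4.BeautifulSoup
    parsed = web_html_cleanup(raw, additional_element_types_to_discard=["nav"])
    print(parsed.cleaned_text)
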
diff --git a/backend/danswer/utils/text_processing.py b/backend/danswer/utils/text_processing.py
index 6289ab5af..b1780826e 100644
--- a/backend/danswer/utils/text_processing.py
+++ b/backend/danswer/utils/text_processing.py
@@ -1,9 +1,6 @@
 import json
 import re

-import bs4
-from bs4 import BeautifulSoup
-

 def has_unescaped_quote(s: str) -> bool:
     pattern = r'(?<!\\)"'
@@ … @@ def shared_precompare_cleanup(text: str) -> str:
     text = re.sub(r'\s|\*|\\"|[.,:`"#-]', "", text)

     return text
-
-
-def strip_excessive_newlines_and_spaces(document: str) -> str:
-    # collapse repeated spaces into one
-    document = re.sub(r" +", " ", document)
-    # remove trailing spaces
-    document = re.sub(r" +[\n\r]", "\n", document)
-    # remove repeated newlines
-    document = re.sub(r"[\n\r]+", "\n", document)
-    return document.strip()
-
-
-def strip_newlines(document: str) -> str:
-    # HTML might contain newlines which are just whitespaces to a browser
-    return re.sub(r"[\n\r]+", " ", document)
-
-
-def format_document_soup(document: BeautifulSoup) -> str:
-    """Format html to a flat text document.
-
-    The following goals:
-    - Newlines from within the HTML are removed (as browser would ignore them as well).
-    - Repeated newlines/spaces are removed (as browsers would ignore them).
-    - Newlines only before and after headlines and paragraphs or when explicit (br or pre tag)
-    - Table columns/rows are separated by newline
-    - List elements are separated by newline and start with a hyphen
-    """
-    text = ""
-    list_element_start = False
-    verbatim_output = 0
-    for e in document.descendants:
-        verbatim_output -= 1
-        if isinstance(e, bs4.element.NavigableString):
-            if isinstance(e, (bs4.element.Comment, bs4.element.Doctype)):
-                continue
-            element_text = e.text
-            if element_text:
-                if verbatim_output > 0:
-                    text += element_text
-                else:
-                    text += strip_newlines(element_text)
-                list_element_start = False
-        elif isinstance(e, bs4.element.Tag):
-            if e.name in ["p", "div"]:
-                if not list_element_start:
-                    text += "\n"
-            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
-                text += "\n"
-                list_element_start = False
-            elif e.name == "li":
-                text += "\n- "
-                list_element_start = True
-            elif e.name == "pre":
-                if verbatim_output <= 0:
-                    verbatim_output = len(list(e.childGenerator()))
-    return strip_excessive_newlines_and_spaces(text)
-
-
-def parse_html_page_basic(text: str) -> str:
-    soup = BeautifulSoup(text, "html.parser")
-    return format_document_soup(soup)
diff --git a/backend/tests/unit/danswer/connectors/cross_connector_utils/test_html_utils.py b/backend/tests/unit/danswer/connectors/cross_connector_utils/test_html_utils.py
new file mode 100644
index 000000000..571f818a2
--- /dev/null
+++ b/backend/tests/unit/danswer/connectors/cross_connector_utils/test_html_utils.py
@@ -0,0 +1,17 @@
+import unittest
+
+from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
+
+
+class TestQAPostprocessing(unittest.TestCase):
+    def test_parse_table(self) -> None:
+        with open("./test_table.html", "r") as file:
+            content = file.read()
+
+        parsed = parse_html_page_basic(content)
+        expected = "\n\thello\tthere\tgeneral\n\tkenobi\ta\tb\n\tc\td\te"
+        self.assertIn(expected, parsed)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backend/tests/unit/danswer/connectors/cross_connector_utils/test_table.html b/backend/tests/unit/danswer/connectors/cross_connector_utils/test_table.html
new file mode 100644
index 000000000..e1c3e07ba
--- /dev/null
+++ b/backend/tests/unit/danswer/connectors/cross_connector_utils/test_table.html
@@ -0,0 +1,39 @@
+<body>
+<p>This page is to ensure we’re able to parse a table into a tsv</p>
+
+<table>
+  <tr>
+    <td>
+      hello
+    </td>
+    <td>
+      there
+    </td>
+    <td>
+      general
+    </td>
+  </tr>
+  <tr>
+    <td>
+      kenobi
+    </td>
+    <td>
+      a
+    </td>
+    <td>
+      b
+    </td>
+  </tr>
+  <tr>
+    <td>
+      c
+    </td>
+    <td>
+      d
+    </td>
+    <td>
+      e
+    </td>
+  </tr>
+</table>
+</body>
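The new table_cell_separator keyword makes the cell delimiter configurable; the default "\t" is what the fixture above exercises. A hand-traced sketch with a custom separator (note the separator is emitted before each cell, mirroring the tab before "hello" in the test's expected string):

    import bs4
    from danswer.connectors.cross_connector_utils.html_utils import format_document_soup

    soup = bs4.BeautifulSoup(
        "<table><tr><td>a</td><td>b</td></tr></table>", "html.parser"
    )
    # strip() removes the leading newline but not the leading separator
    print(repr(format_document_soup(soup, table_cell_separator=", ")))  # ', a, b'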