Recreate Tables from HTML (#588)

2025-09-28 21:05:17 +02:00 · 2023-10-18 11:16:40 -07:00
parent 022f59e5b2
commit a5d2759fbc
9 changed files with 153 additions and 77 deletions
--- a/backend/danswer/connectors/bookstack/connector.py
+++ b/backend/danswer/connectors/bookstack/connector.py
@@ -7,6 +7,7 @@ from typing import Any
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
 from danswer.connectors.bookstack.client import BookStackApiClient
+from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector
@@ -14,7 +15,6 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch
 from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
-from danswer.utils.text_processing import parse_html_page_basic


 class BookstackConnector(LoadConnector, PollConnector):
--- a/backend/danswer/connectors/confluence/connector.py
+++ b/backend/danswer/connectors/confluence/connector.py
@@ -1,4 +1,3 @@
-import os
 from collections.abc import Callable
 from collections.abc import Collection
 from datetime import datetime
@@ -13,6 +12,7 @@ from requests import HTTPError
 from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector
@@ -21,7 +21,6 @@ from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger
-from danswer.utils.text_processing import parse_html_page_basic

 logger = setup_logger()

@@ -266,6 +265,8 @@ class ConfluenceConnector(LoadConnector, PollConnector):


 if __name__ == "__main__":
+    import os
+
    connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
    connector.load_credentials(
        {
--- a/backend/danswer/connectors/cross_connector_utils/html_utils.py
+++ b/backend/danswer/connectors/cross_connector_utils/html_utils.py
@@ -1,11 +1,11 @@
+import re
 from copy import copy
 from dataclasses import dataclass

-from bs4 import BeautifulSoup
+import bs4

 from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
 from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
-from danswer.utils.text_processing import format_document_soup

 MINTLIFY_UNWANTED = ["sticky", "hidden"]

@@ -16,13 +16,96 @@ class ParsedHTML:
    cleaned_text: str


-def standard_html_cleanup(
-    page_content: str | BeautifulSoup,
+def strip_excessive_newlines_and_spaces(document: str) -> str:
+    """Collapse redundant spaces and line breaks in a flattened text document.
+
+    The substitutions run in a fixed order: runs of spaces become one space,
+    spaces directly before a line break are dropped, and runs of line breaks
+    become a single newline. Leading/trailing whitespace is stripped last.
+    """
+    substitutions = (
+        (r" +", " "),  # repeated spaces -> one space
+        (r" +[\n\r]", "\n"),  # trailing spaces before a line break
+        (r"[\n\r]+", "\n"),  # repeated line breaks -> one newline
+    )
+    cleaned = document
+    for pattern, replacement in substitutions:
+        cleaned = re.sub(pattern, replacement, cleaned)
+    return cleaned.strip()
+
+
+def strip_newlines(document: str) -> str:
+    """Replace every run of line-break characters with a single space.
+
+    HTML source may contain newlines that a browser renders as plain whitespace.
+    """
+    line_break_run = re.compile(r"[\n\r]+")
+    return line_break_run.sub(" ", document)
+
+
+def format_document_soup(
+    document: bs4.BeautifulSoup, table_cell_separator: str = "\t"
+) -> str:
+    """Format html to a flat text document.
+
+    The following goals:
+    - Newlines from within the HTML are removed (as browser would ignore them as well).
+    - Repeated newlines/spaces are removed (as browsers would ignore them).
+    - Newlines only before and after headlines and paragraphs or when explicit (br or pre tag)
+    - Table rows start on a new line and every cell is prefixed with `table_cell_separator`
+    - List elements are separated by newline and start with a hyphen
+
+    Args:
+        document: parsed HTML tree to flatten.
+        table_cell_separator: text emitted before each td/th cell inside a table.
+
+    Returns:
+        The flattened text, passed through strip_excessive_newlines_and_spaces.
+    """
+    # collect fragments and join once instead of repeatedly growing a string
+    parts: list[str] = []
+    list_element_start = False
+    verbatim_output = 0
+    for e in document.descendants:
+        verbatim_output -= 1
+        # `descendants` only yields opening tags and strings, never a "/table"
+        # closing tag, so a flag set on <table> could not be reset afterwards.
+        # Table membership is therefore derived from the element's ancestors.
+        in_table = e.find_parent("table") is not None
+        if isinstance(e, bs4.element.NavigableString):
+            if isinstance(e, (bs4.element.Comment, bs4.element.Doctype)):
+                continue
+            element_text = e.text
+            if in_table:
+                # Tables are represented in natural language with rows separated by newlines
+                # Can't have newlines then in the table elements
+                element_text = element_text.replace("\n", " ").strip()
+            if element_text:
+                if verbatim_output > 0:
+                    parts.append(element_text)
+                else:
+                    parts.append(strip_newlines(element_text))
+                list_element_start = False
+        elif isinstance(e, bs4.element.Tag):
+            if e.name == "table":
+                # the table tag itself emits nothing; its rows and cells do
+                pass
+            elif in_table:
+                if e.name == "tr":
+                    # tr is for rows
+                    parts.append("\n")
+                elif e.name in ["td", "th"]:
+                    # td for data cell, th for header
+                    parts.append(table_cell_separator)
+                # every other tag inside a table is ignored so a row stays on one line
+            elif e.name in ["p", "div"]:
+                if not list_element_start:
+                    parts.append("\n")
+            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
+                # tr/th/td only reach this branch when they appear outside of a table
+                parts.append("\n")
+                list_element_start = False
+            elif e.name == "li":
+                parts.append("\n- ")
+                list_element_start = True
+            elif e.name == "pre":
+                # NOTE(review): the counter is decremented before each element is
+                # inspected and only direct children are counted, so a <pre> with a
+                # single text child appears to never reach the verbatim branch - confirm
+                if verbatim_output <= 0:
+                    verbatim_output = len(list(e.childGenerator()))
+    return strip_excessive_newlines_and_spaces("".join(parts))
+
+
+def parse_html_page_basic(text: str) -> str:
+    """Parse an HTML string with the "html.parser" backend and flatten it to text."""
+    return format_document_soup(bs4.BeautifulSoup(text, "html.parser"))
+
+
+def web_html_cleanup(
+    page_content: str | bs4.BeautifulSoup,
    mintlify_cleanup_enabled: bool = True,
    additional_element_types_to_discard: list[str] | None = None,
 ) -> ParsedHTML:
    if isinstance(page_content, str):
-        soup = BeautifulSoup(page_content, "html.parser")
+        soup = bs4.BeautifulSoup(page_content, "html.parser")
    else:
        soup = page_content

--- a/backend/danswer/connectors/google_site/connector.py
+++ b/backend/danswer/connectors/google_site/connector.py
@@ -10,7 +10,7 @@ from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
 from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
 from danswer.connectors.cross_connector_utils.file_utils import read_file
-from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup
+from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
@@ -101,7 +101,7 @@ class GoogleSitesConnector(LoadConnector):
                div.extract()

            # get the body of the page
-            parsed_html = standard_html_cleanup(
+            parsed_html = web_html_cleanup(
                soup, additional_element_types_to_discard=["header", "nav"]
            )

--- a/backend/danswer/connectors/guru/connector.py
+++ b/backend/danswer/connectors/guru/connector.py
@@ -7,6 +7,7 @@ import requests

 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector
@@ -15,7 +16,6 @@ from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger
-from danswer.utils.text_processing import parse_html_page_basic

 # Potential Improvements
 # 1. Support fetching per collection via collection token (configured at connector creation)
--- a/backend/danswer/connectors/web/connector.py
+++ b/backend/danswer/connectors/web/connector.py
@@ -21,7 +21,7 @@ from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_ID
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
 from danswer.configs.constants import DocumentSource
-from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup
+from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
@@ -218,7 +218,7 @@ class WebConnector(LoadConnector):
                        if link not in visited_links:
                            to_visit.append(link)

-                parsed_html = standard_html_cleanup(soup, self.mintlify_cleanup)
+                parsed_html = web_html_cleanup(soup, self.mintlify_cleanup)

                doc_batch.append(
                    Document(
--- a/backend/danswer/utils/text_processing.py
+++ b/backend/danswer/utils/text_processing.py
@@ -1,9 +1,6 @@
 import json
 import re

-import bs4
-from bs4 import BeautifulSoup
-

 def has_unescaped_quote(s: str) -> bool:
    pattern = r'(?<!\\)"'
@@ -57,64 +54,3 @@ def shared_precompare_cleanup(text: str) -> str:
    text = re.sub(r'\s|\*|\\"|[.,:`"#-]', "", text)

    return text
-
-
-def strip_excessive_newlines_and_spaces(document: str) -> str:
-    # collapse repeated spaces into one
-    document = re.sub(r" +", " ", document)
-    # remove trailing spaces
-    document = re.sub(r" +[\n\r]", "\n", document)
-    # remove repeated newlines
-    document = re.sub(r"[\n\r]+", "\n", document)
-    return document.strip()
-
-
-def strip_newlines(document: str) -> str:
-    # HTML might contain newlines which are just whitespaces to a browser
-    return re.sub(r"[\n\r]+", " ", document)
-
-
-def format_document_soup(document: BeautifulSoup) -> str:
-    """Format html to a flat text document.
-
-    The following goals:
-    - Newlines from within the HTML are removed (as browser would ignore them as well).
-    - Repeated newlines/spaces are removed (as browsers would ignore them).
-    - Newlines only before and after headlines and paragraphs or when explicit (br or pre tag)
-    - Table columns/rows are separated by newline
-    - List elements are separated by newline and start with a hyphen
-    """
-    text = ""
-    list_element_start = False
-    verbatim_output = 0
-    for e in document.descendants:
-        verbatim_output -= 1
-        if isinstance(e, bs4.element.NavigableString):
-            if isinstance(e, (bs4.element.Comment, bs4.element.Doctype)):
-                continue
-            element_text = e.text
-            if element_text:
-                if verbatim_output > 0:
-                    text += element_text
-                else:
-                    text += strip_newlines(element_text)
-                list_element_start = False
-        elif isinstance(e, bs4.element.Tag):
-            if e.name in ["p", "div"]:
-                if not list_element_start:
-                    text += "\n"
-            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
-                text += "\n"
-                list_element_start = False
-            elif e.name == "li":
-                text += "\n- "
-                list_element_start = True
-            elif e.name == "pre":
-                if verbatim_output <= 0:
-                    verbatim_output = len(list(e.childGenerator()))
-    return strip_excessive_newlines_and_spaces(text)
-
-
-def parse_html_page_basic(text: str) -> str:
-    soup = BeautifulSoup(text, "html.parser")
-    return format_document_soup(soup)
--- a/backend/tests/unit/danswer/connectors/cross_connector_utils/test_html_utils.py
+++ b/backend/tests/unit/danswer/connectors/cross_connector_utils/test_html_utils.py
@@ -0,0 +1,17 @@
+import unittest
+from pathlib import Path
+
+from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
+
+# Resolve the fixture relative to this module so the test does not depend on the
+# directory the test runner was started from
+TEST_TABLE_PATH = Path(__file__).parent / "test_table.html"
+
+
+class TestQAPostprocessing(unittest.TestCase):
+    def test_parse_table(self) -> None:
+        """An HTML table is flattened to one line per row with tab-prefixed cells."""
+        # The fixture contains non-ASCII characters, so decode it explicitly as
+        # UTF-8 instead of relying on the platform default encoding
+        content = TEST_TABLE_PATH.read_text(encoding="utf-8")
+
+        parsed = parse_html_page_basic(content)
+        expected = "\n\thello\tthere\tgeneral\n\tkenobi\ta\tb\n\tc\td\te"
+        self.assertIn(expected, parsed)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/backend/tests/unit/danswer/connectors/cross_connector_utils/test_table.html
+++ b/backend/tests/unit/danswer/connectors/cross_connector_utils/test_table.html
@@ -0,0 +1,39 @@
+<p>This page is to ensure we’re able to parse a table into a tsv</p>
+<table data-table-width="760" data-layout="default" ac:local-id="3ad64d9f-01f1-4f78-876e-0fdf84e826a6">
+   <tbody>
+      <tr>
+         <th>
+            <p><strong>hello</strong></p>
+         </th>
+         <th>
+            <p><strong>there</strong></p>
+         </th>
+         <th>
+            <p><strong>general</strong></p>
+         </th>
+      </tr>
+      <tr>
+         <td>
+            <p>kenobi</p>
+         </td>
+         <td>
+            <p>a</p>
+         </td>
+         <td>
+            <p>b</p>
+         </td>
+      </tr>
+      <tr>
+         <td>
+            <p>c</p>
+         </td>
+         <td>
+            <p>d</p>
+         </td>
+         <td>
+            <p>e</p>
+         </td>
+      </tr>
+   </tbody>
+</table>
+<p />