Recreate Tables from HTML (#588)
@@ -7,6 +7,7 @@ from typing import Any
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
 from danswer.connectors.bookstack.client import BookStackApiClient
+from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector
@@ -14,7 +15,6 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch
 from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
-from danswer.utils.text_processing import parse_html_page_basic
 
 
 class BookstackConnector(LoadConnector, PollConnector):
@@ -1,4 +1,3 @@
-import os
 from collections.abc import Callable
 from collections.abc import Collection
 from datetime import datetime
@@ -13,6 +12,7 @@ from requests import HTTPError
 from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector
@@ -21,7 +21,6 @@ from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger
-from danswer.utils.text_processing import parse_html_page_basic
 
 logger = setup_logger()
 
@@ -266,6 +265,8 @@ class ConfluenceConnector(LoadConnector, PollConnector):
 
 
 if __name__ == "__main__":
+    import os
+
     connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
     connector.load_credentials(
         {
@@ -1,11 +1,11 @@
+import re
 from copy import copy
 from dataclasses import dataclass
 
-from bs4 import BeautifulSoup
+import bs4
 
 from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES
 from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS
-from danswer.utils.text_processing import format_document_soup
 
 MINTLIFY_UNWANTED = ["sticky", "hidden"]
 
@@ -16,13 +16,96 @@ class ParsedHTML:
     cleaned_text: str
 
 
-def standard_html_cleanup(
-    page_content: str | BeautifulSoup,
+def strip_excessive_newlines_and_spaces(document: str) -> str:
+    # collapse repeated spaces into one
+    document = re.sub(r" +", " ", document)
+    # remove trailing spaces
+    document = re.sub(r" +[\n\r]", "\n", document)
+    # remove repeated newlines
+    document = re.sub(r"[\n\r]+", "\n", document)
+    return document.strip()
+
+
+def strip_newlines(document: str) -> str:
+    # HTML might contain newlines which are just whitespaces to a browser
+    return re.sub(r"[\n\r]+", " ", document)
+
+
+def format_document_soup(
+    document: bs4.BeautifulSoup, table_cell_separator: str = "\t"
+) -> str:
+    """Format html to a flat text document.
+
+    The following goals:
+    - Newlines from within the HTML are removed (as browser would ignore them as well).
+    - Repeated newlines/spaces are removed (as browsers would ignore them).
+    - Newlines only before and after headlines and paragraphs or when explicit (br or pre tag)
+    - Table columns/rows are separated by newline
+    - List elements are separated by newline and start with a hyphen
+    """
+    text = ""
+    list_element_start = False
+    verbatim_output = 0
+    in_table = False
+    for e in document.descendants:
+        verbatim_output -= 1
+        if isinstance(e, bs4.element.NavigableString):
+            if isinstance(e, (bs4.element.Comment, bs4.element.Doctype)):
+                continue
+            element_text = e.text
+            if in_table:
+                # Tables are represented in natural language with rows separated by newlines
+                # Can't have newlines then in the table elements
+                element_text = element_text.replace("\n", " ").strip()
+            if element_text:
+                if verbatim_output > 0:
+                    text += element_text
+                else:
+                    text += strip_newlines(element_text)
+                list_element_start = False
+        elif isinstance(e, bs4.element.Tag):
+            # table is standard HTML element
+            if e.name == "table":
+                in_table = True
+            # tr is for rows
+            elif e.name == "tr" and in_table:
+                text += "\n"
+            # td for data cell, th for header
+            elif e.name in ["td", "th"] and in_table:
+                text += table_cell_separator
+            elif e.name == "/table":
+                in_table = False
+            elif in_table:
+                # don't handle other cases while in table
+                pass
+
+            elif e.name in ["p", "div"]:
+                if not list_element_start:
+                    text += "\n"
+            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
+                text += "\n"
+                list_element_start = False
+            elif e.name == "li":
+                text += "\n- "
+                list_element_start = True
+            elif e.name == "pre":
+                if verbatim_output <= 0:
+                    verbatim_output = len(list(e.childGenerator()))
+    return strip_excessive_newlines_and_spaces(text)
+
+
+def parse_html_page_basic(text: str) -> str:
+    soup = bs4.BeautifulSoup(text, "html.parser")
+    return format_document_soup(soup)
+
+
+def web_html_cleanup(
+    page_content: str | bs4.BeautifulSoup,
     mintlify_cleanup_enabled: bool = True,
     additional_element_types_to_discard: list[str] | None = None,
 ) -> ParsedHTML:
     if isinstance(page_content, str):
-        soup = BeautifulSoup(page_content, "html.parser")
+        soup = bs4.BeautifulSoup(page_content, "html.parser")
     else:
         soup = page_content
 
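As a sketch of what the new table handling above produces (import path as in this diff; the one-line HTML sample is illustrative, not part of the commit): each tr emits a newline and each th/td emits table_cell_separator before its cell text, and the final strip() trims the leading newline/tab.

from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic

# A table flattens into TSV-like text: "\n" per row, "\t" before each cell.
html = "<table><tr><th>a</th><th>b</th></tr><tr><td>1</td><td>2</td></tr></table>"
print(repr(parse_html_page_basic(html)))  # 'a\tb\n\t1\t2'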
@@ -10,7 +10,7 @@ from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
 from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
 from danswer.connectors.cross_connector_utils.file_utils import read_file
-from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup
+from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
@@ -101,7 +101,7 @@ class GoogleSitesConnector(LoadConnector):
             div.extract()
 
         # get the body of the page
-        parsed_html = standard_html_cleanup(
+        parsed_html = web_html_cleanup(
             soup, additional_element_types_to_discard=["header", "nav"]
         )
@@ -7,6 +7,7 @@ import requests
 
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector
@@ -15,7 +16,6 @@ from danswer.connectors.models import ConnectorMissingCredentialError
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger
-from danswer.utils.text_processing import parse_html_page_basic
 
 # Potential Improvements
 # 1. Support fetching per collection via collection token (configured at connector creation)
@@ -21,7 +21,7 @@ from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_ID
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
 from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
 from danswer.configs.constants import DocumentSource
-from danswer.connectors.cross_connector_utils.html_utils import standard_html_cleanup
+from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
@@ -218,7 +218,7 @@ class WebConnector(LoadConnector):
                 if link not in visited_links:
                     to_visit.append(link)
 
-            parsed_html = standard_html_cleanup(soup, self.mintlify_cleanup)
+            parsed_html = web_html_cleanup(soup, self.mintlify_cleanup)
 
             doc_batch.append(
                 Document(
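The connector hunks above are the mechanical rename from standard_html_cleanup to web_html_cleanup. A minimal usage sketch of the renamed helper, based only on the signature and call sites visible in this diff (the HTML string and discard list are illustrative):

from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup

# Accepts raw HTML or an existing BeautifulSoup; returns a ParsedHTML whose
# cleaned_text field (declared in the dataclass above) holds the flattened text.
parsed = web_html_cleanup(
    "<html><body><header>site nav</header><p>Hello there</p></body></html>",
    additional_element_types_to_discard=["header"],
)
print(parsed.cleaned_text)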
@@ -1,9 +1,6 @@
 import json
 import re
 
-import bs4
-from bs4 import BeautifulSoup
-
 
 def has_unescaped_quote(s: str) -> bool:
     pattern = r'(?<!\\)"'
@@ -57,64 +54,3 @@ def shared_precompare_cleanup(text: str) -> str:
     text = re.sub(r'\s|\*|\\"|[.,:`"#-]', "", text)
 
     return text
-
-
-def strip_excessive_newlines_and_spaces(document: str) -> str:
-    # collapse repeated spaces into one
-    document = re.sub(r" +", " ", document)
-    # remove trailing spaces
-    document = re.sub(r" +[\n\r]", "\n", document)
-    # remove repeated newlines
-    document = re.sub(r"[\n\r]+", "\n", document)
-    return document.strip()
-
-
-def strip_newlines(document: str) -> str:
-    # HTML might contain newlines which are just whitespaces to a browser
-    return re.sub(r"[\n\r]+", " ", document)
-
-
-def format_document_soup(document: BeautifulSoup) -> str:
-    """Format html to a flat text document.
-
-    The following goals:
-    - Newlines from within the HTML are removed (as browser would ignore them as well).
-    - Repeated newlines/spaces are removed (as browsers would ignore them).
-    - Newlines only before and after headlines and paragraphs or when explicit (br or pre tag)
-    - Table columns/rows are separated by newline
-    - List elements are separated by newline and start with a hyphen
-    """
-    text = ""
-    list_element_start = False
-    verbatim_output = 0
-    for e in document.descendants:
-        verbatim_output -= 1
-        if isinstance(e, bs4.element.NavigableString):
-            if isinstance(e, (bs4.element.Comment, bs4.element.Doctype)):
-                continue
-            element_text = e.text
-            if element_text:
-                if verbatim_output > 0:
-                    text += element_text
-                else:
-                    text += strip_newlines(element_text)
-                list_element_start = False
-        elif isinstance(e, bs4.element.Tag):
-            if e.name in ["p", "div"]:
-                if not list_element_start:
-                    text += "\n"
-            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
-                text += "\n"
-                list_element_start = False
-            elif e.name == "li":
-                text += "\n- "
-                list_element_start = True
-            elif e.name == "pre":
-                if verbatim_output <= 0:
-                    verbatim_output = len(list(e.childGenerator()))
-    return strip_excessive_newlines_and_spaces(text)
-
-
-def parse_html_page_basic(text: str) -> str:
-    soup = BeautifulSoup(text, "html.parser")
-    return format_document_soup(soup)
@@ -0,0 +1,17 @@
+import unittest
+
+from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
+
+
+class TestQAPostprocessing(unittest.TestCase):
+    def test_parse_table(self) -> None:
+        with open("./test_table.html", "r") as file:
+            content = file.read()
+
+        parsed = parse_html_page_basic(content)
+        expected = "\n\thello\tthere\tgeneral\n\tkenobi\ta\tb\n\tc\td\te"
+        self.assertIn(expected, parsed)
+
+
+if __name__ == "__main__":
+    unittest.main()
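The test opens ./test_table.html with a relative path, so it has to be run from the directory containing that file. The test module's filename is not shown in this view; assuming it is saved as test_html_utils.py, the __main__ guard lets it run directly:

python test_html_utils.py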
@@ -0,0 +1,39 @@
+<p>This page is to ensure we’re able to parse a table into a tsv</p>
+<table data-table-width="760" data-layout="default" ac:local-id="3ad64d9f-01f1-4f78-876e-0fdf84e826a6">
+<tbody>
+<tr>
+<th>
+<p><strong>hello</strong></p>
+</th>
+<th>
+<p><strong>there</strong></p>
+</th>
+<th>
+<p><strong>general</strong></p>
+</th>
+</tr>
+<tr>
+<td>
+<p>kenobi</p>
+</td>
+<td>
+<p>a</p>
+</td>
+<td>
+<p>b</p>
+</td>
+</tr>
+<tr>
+<td>
+<p>c</p>
+</td>
+<td>
+<p>d</p>
+</td>
+<td>
+<p>e</p>
+</td>
+</tr>
+</tbody>
+</table>
+<p />
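Per the unit test's expected string, the table above flattens to one line per row with a tab before each cell:

	hello	there	general
	kenobi	a	b
	c	d	e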