Confluence: handle pages without body.storage.value (#347)

Workaround for: https://jira.atlassian.com/browse/CONFCLOUD-76433
This commit is contained in:
Yuhong Sun 2023-08-28 18:35:13 -07:00 committed by GitHub
parent b2a51283d1
commit 548f0a41cb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 120 additions and 70 deletions

View File

@ -14,7 +14,6 @@ ALLOWED_USERS = "allowed_users"
ALLOWED_GROUPS = "allowed_groups" ALLOWED_GROUPS = "allowed_groups"
METADATA = "metadata" METADATA = "metadata"
GEN_AI_API_KEY_STORAGE_KEY = "genai_api_key" GEN_AI_API_KEY_STORAGE_KEY = "genai_api_key"
HTML_SEPARATOR = "\n"
PUBLIC_DOC_PAT = "PUBLIC" PUBLIC_DOC_PAT = "PUBLIC"
QUOTE = "quote" QUOTE = "quote"
BOOST = "boost" BOOST = "boost"

View File

@ -1,3 +1,4 @@
import os
from collections.abc import Callable from collections.abc import Callable
from collections.abc import Collection from collections.abc import Collection
from datetime import datetime from datetime import datetime
@ -101,12 +102,43 @@ class ConfluenceConnector(LoadConnector, PollConnector):
start_ind: int, start_ind: int,
) -> Collection[dict[str, Any]]: ) -> Collection[dict[str, Any]]:
def _fetch(start_ind: int, batch_size: int) -> Collection[dict[str, Any]]:
    """Fetch one batch of pages from the Confluence space.

    First attempts a bulk fetch with ``body.storage.value`` expanded. Some
    pages make that call fail (see
    https://jira.atlassian.com/browse/CONFCLOUD-76433), so on failure the
    batch is re-fetched one page at a time, and only the individual pages
    that still fail degrade to the less complete ``body.view`` expansion.
    """
    try:
        return confluence_client.get_all_pages_from_space(
            self.space,
            start=start_ind,
            limit=batch_size,
            expand="body.storage.value,version",
        )
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt still propagate
    except Exception:
        logger.warning(
            f"Batch failed with space {self.space} at offset {start_ind}"
        )
        view_pages: list[dict[str, Any]] = []
        for i in range(self.batch_size):
            try:
                # Could be that one of the pages here failed due to this bug:
                # https://jira.atlassian.com/browse/CONFCLOUD-76433
                view_pages.extend(
                    confluence_client.get_all_pages_from_space(
                        self.space,
                        start=start_ind + i,
                        limit=1,
                        expand="body.storage.value,version",
                    )
                )
            except Exception:
                # Use view instead, which captures most info but is less complete
                view_pages.extend(
                    confluence_client.get_all_pages_from_space(
                        self.space,
                        start=start_ind + i,
                        limit=1,
                        expand="body.view.value,version",
                    )
                )
        return view_pages
try: try:
return _fetch(start_ind, self.batch_size) return _fetch(start_ind, self.batch_size)
@ -162,7 +194,10 @@ class ConfluenceConnector(LoadConnector, PollConnector):
last_modified = datetime.fromisoformat(last_modified_str) last_modified = datetime.fromisoformat(last_modified_str)
if time_filter is None or time_filter(last_modified): if time_filter is None or time_filter(last_modified):
page_html = page["body"]["storage"]["value"] page_html = (
page["body"].get("storage", {}).get("value")
or page["body"]["view"]["value"]
)
page_text = ( page_text = (
page.get("title", "") + "\n" + parse_html_page_basic(page_html) page.get("title", "") + "\n" + parse_html_page_basic(page_html)
) )
@ -219,3 +254,15 @@ class ConfluenceConnector(LoadConnector, PollConnector):
if num_pages < self.batch_size: if num_pages < self.batch_size:
break break
if __name__ == "__main__":
    # Manual smoke test: crawl the test space, credentials taken from env vars.
    credentials = {
        "confluence_username": os.environ["CONFLUENCE_USER_NAME"],
        "confluence_access_token": os.environ["CONFLUENCE_ACCESS_TOKEN"],
    }
    connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
    connector.load_credentials(credentials)
    print(next(connector.load_from_state()))

View File

@ -1,5 +1,4 @@
import io import io
import re
from datetime import datetime from datetime import datetime
from typing import Any from typing import Any
from typing import cast from typing import cast
@ -7,7 +6,6 @@ from typing import Tuple
from urllib.parse import urljoin from urllib.parse import urljoin
from urllib.parse import urlparse from urllib.parse import urlparse
import bs4
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from oauthlib.oauth2 import BackendApplicationClient from oauthlib.oauth2 import BackendApplicationClient
@ -29,6 +27,7 @@ from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import Document from danswer.connectors.models import Document
from danswer.connectors.models import Section from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger from danswer.utils.logger import setup_logger
from danswer.utils.text_processing import format_document_soup
logger = setup_logger() logger = setup_logger()
@ -62,62 +61,6 @@ def get_internal_links(
return internal_links return internal_links
def strip_excessive_newlines_and_spaces(document: str) -> str:
    """Flatten whitespace: single spaces, single newlines, trimmed ends."""
    for pattern, replacement in (
        (r" +", " "),          # collapse repeated spaces into one
        (r" +[\n\r]", "\n"),   # drop spaces trailing a line
        (r"[\n\r]+", "\n"),    # collapse repeated newlines
    ):
        document = re.sub(pattern, replacement, document)
    return document.strip()
def strip_newlines(document: str) -> str:
    """Replace each run of newline characters with a single space.

    HTML newlines render as plain whitespace in a browser, so they are
    normalized away here.
    """
    return " ".join(re.split(r"[\n\r]+", document))
def format_document(document: BeautifulSoup) -> str:
    """Format html to a flat text document.

    The following goals:
    - Newlines from within the HTML are removed (as browser would ignore them as well).
    - Repeated newlines/spaces are removed (as browsers would ignore them).
    - Newlines only before and after headlines and paragraphs or when explicit (br or pre tag)
    - Table columns/rows are separated by newline
    - List elements are separated by newline and start with a hyphen
    """
    parts: list[str] = []
    list_element_start = False
    verbatim_output = 0
    for node in document.descendants:
        verbatim_output -= 1
        if isinstance(node, bs4.element.NavigableString):
            if isinstance(node, (bs4.element.Comment, bs4.element.Doctype)):
                continue
            content = node.text
            if content:
                # Inside a <pre> countdown window, keep text exactly as-is.
                parts.append(content if verbatim_output > 0 else strip_newlines(content))
                list_element_start = False
        elif isinstance(node, bs4.element.Tag):
            tag = node.name
            if tag in ("p", "div"):
                if not list_element_start:
                    parts.append("\n")
            elif tag in ("br", "h1", "h2", "h3", "h4", "tr", "th", "td"):
                parts.append("\n")
                list_element_start = False
            elif tag == "li":
                parts.append("\n- ")
                list_element_start = True
            elif tag == "pre":
                if verbatim_output <= 0:
                    # Emit the <pre> subtree verbatim for as many descendants
                    # as it has direct children.
                    verbatim_output = len(list(node.childGenerator()))
    return strip_excessive_newlines_and_spaces("".join(parts))
def start_playwright() -> Tuple[Playwright, BrowserContext]: def start_playwright() -> Tuple[Playwright, BrowserContext]:
playwright = sync_playwright().start() playwright = sync_playwright().start()
browser = playwright.chromium.launch(headless=True) browser = playwright.chromium.launch(headless=True)
@ -239,7 +182,7 @@ class WebConnector(LoadConnector):
for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS: for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
[tag.extract() for tag in soup.find_all(undesired_tag)] [tag.extract() for tag in soup.find_all(undesired_tag)]
page_text = format_document(soup) page_text = format_document_soup(soup)
doc_batch.append( doc_batch.append(
Document( Document(
@ -267,3 +210,9 @@ class WebConnector(LoadConnector):
if doc_batch: if doc_batch:
playwright.stop() playwright.stop()
yield doc_batch yield doc_batch
if __name__ == "__main__":
    # Manual smoke test: crawl the public docs site and print the first batch.
    batches = WebConnector("https://docs.danswer.dev/").load_from_state()
    print(next(batches))

View File

@ -1,9 +1,8 @@
import re import re
import bs4
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from danswer.configs.constants import HTML_SEPARATOR
def clean_model_quote(quote: str, trim_length: int) -> str: def clean_model_quote(quote: str, trim_length: int) -> str:
quote_clean = quote.strip() quote_clean = quote.strip()
@ -32,6 +31,62 @@ def shared_precompare_cleanup(text: str) -> str:
return text return text
def strip_excessive_newlines_and_spaces(document: str) -> str:
    """Normalize whitespace: squeeze space/newline runs and trim the result."""
    cleaned = re.sub(r" +", " ", document)        # collapse repeated spaces
    cleaned = re.sub(r" +[\n\r]", "\n", cleaned)  # remove spaces before a break
    cleaned = re.sub(r"[\n\r]+", "\n", cleaned)   # collapse repeated newlines
    return cleaned.strip()
def strip_newlines(document: str) -> str:
    """Collapse newline runs to single spaces (a browser treats them as whitespace)."""
    return re.sub(r"[\n\r]+", " ", document)
def format_document_soup(document: BeautifulSoup) -> str:
    """Format html to a flat text document.

    The following goals:
    - Newlines from within the HTML are removed (as browser would ignore them as well).
    - Repeated newlines/spaces are removed (as browsers would ignore them).
    - Newlines only before and after headlines and paragraphs or when explicit (br or pre tag)
    - Table columns/rows are separated by newline
    - List elements are separated by newline and start with a hyphen
    """
    newline_tags = ("br", "h1", "h2", "h3", "h4", "tr", "th", "td")
    text = ""
    in_list_item = False
    verbatim_countdown = 0
    for element in document.descendants:
        verbatim_countdown -= 1
        if isinstance(element, bs4.element.NavigableString):
            if isinstance(element, (bs4.element.Comment, bs4.element.Doctype)):
                continue
            raw = element.text
            if raw:
                if verbatim_countdown > 0:
                    # Still within a <pre> subtree: keep the text verbatim.
                    text += raw
                else:
                    text += strip_newlines(raw)
                in_list_item = False
        elif isinstance(element, bs4.element.Tag):
            name = element.name
            if name in ("p", "div"):
                if not in_list_item:
                    text += "\n"
            elif name in newline_tags:
                text += "\n"
                in_list_item = False
            elif name == "li":
                text += "\n- "
                in_list_item = True
            elif name == "pre" and verbatim_countdown <= 0:
                # Hold verbatim mode for as many descendants as <pre> has children.
                verbatim_countdown = len(list(element.childGenerator()))
    return strip_excessive_newlines_and_spaces(text)
def parse_html_page_basic(text: str) -> str:
    """Parse raw HTML and flatten it to readable plain text."""
    return format_document_soup(BeautifulSoup(text, "html.parser"))