diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py
index 7ce83b49d..62a648a91 100644
--- a/backend/danswer/configs/constants.py
+++ b/backend/danswer/configs/constants.py
@@ -14,7 +14,6 @@ ALLOWED_USERS = "allowed_users"
 ALLOWED_GROUPS = "allowed_groups"
 METADATA = "metadata"
 GEN_AI_API_KEY_STORAGE_KEY = "genai_api_key"
-HTML_SEPARATOR = "\n"
 PUBLIC_DOC_PAT = "PUBLIC"
 QUOTE = "quote"
 BOOST = "boost"
diff --git a/backend/danswer/connectors/confluence/connector.py b/backend/danswer/connectors/confluence/connector.py
index 884e2da6d..b12fdf969 100644
--- a/backend/danswer/connectors/confluence/connector.py
+++ b/backend/danswer/connectors/confluence/connector.py
@@ -1,3 +1,4 @@
+import os
 from collections.abc import Callable
 from collections.abc import Collection
 from datetime import datetime
@@ -101,12 +102,43 @@ class ConfluenceConnector(LoadConnector, PollConnector):
         start_ind: int,
     ) -> Collection[dict[str, Any]]:
         def _fetch(start_ind: int, batch_size: int) -> Collection[dict[str, Any]]:
-            return confluence_client.get_all_pages_from_space(
-                self.space,
-                start=start_ind,
-                limit=batch_size,
-                expand="body.storage.value,version",
-            )
+            try:
+                return confluence_client.get_all_pages_from_space(
+                    self.space,
+                    start=start_ind,
+                    limit=batch_size,
+                    expand="body.storage.value,version",
+                )
+            except Exception:
+                logger.warning(
+                    f"Batch failed with space {self.space} at offset {start_ind}"
+                )
+
+            view_pages: list[dict[str, Any]] = []
+            for i in range(self.batch_size):
+                try:
+                    # Could be that one of the pages here failed due to this bug:
+                    # https://jira.atlassian.com/browse/CONFCLOUD-76433
+                    view_pages.extend(
+                        confluence_client.get_all_pages_from_space(
+                            self.space,
+                            start=start_ind + i,
+                            limit=1,
+                            expand="body.storage.value,version",
+                        )
+                    )
+                except Exception:
+                    # Use view instead, which captures most info but is less complete
+                    view_pages.extend(
+                        confluence_client.get_all_pages_from_space(
+                            self.space,
+                            start=start_ind + i,
+                            limit=1,
+                            expand="body.view.value,version",
+                        )
+                    )
+
+            return view_pages
 
         try:
             return _fetch(start_ind, self.batch_size)
@@ -162,7 +194,10 @@ class ConfluenceConnector(LoadConnector, PollConnector):
             last_modified = datetime.fromisoformat(last_modified_str)
 
             if time_filter is None or time_filter(last_modified):
-                page_html = page["body"]["storage"]["value"]
+                page_html = (
+                    page["body"].get("storage", {}).get("value")
+                    or page["body"]["view"]["value"]
+                )
                 page_text = (
                     page.get("title", "") + "\n" + parse_html_page_basic(page_html)
                 )
@@ -219,3 +254,15 @@ class ConfluenceConnector(LoadConnector, PollConnector):
 
             if num_pages < self.batch_size:
                 break
+
+
+if __name__ == "__main__":
+    connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
+    connector.load_credentials(
+        {
+            "confluence_username": os.environ["CONFLUENCE_USER_NAME"],
+            "confluence_access_token": os.environ["CONFLUENCE_ACCESS_TOKEN"],
+        }
+    )
+    document_batches = connector.load_from_state()
+    print(next(document_batches))
diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py
index 1780120f7..7979b5b8d 100644
--- a/backend/danswer/connectors/web/connector.py
+++ b/backend/danswer/connectors/web/connector.py
@@ -1,5 +1,4 @@
 import io
-import re
 from datetime import datetime
 from typing import Any
 from typing import cast
@@ -7,7 +6,6 @@ from typing import Tuple
 from urllib.parse import urljoin
 from urllib.parse import urlparse
 
-import bs4
 import requests
 from bs4 import BeautifulSoup
 from oauthlib.oauth2 import BackendApplicationClient
@@ -29,6 +27,7 @@ from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger
+from danswer.utils.text_processing import format_document_soup
 
 logger = setup_logger()
 
@@ -62,62 +61,6 @@ def get_internal_links(
     return internal_links
 
 
-def strip_excessive_newlines_and_spaces(document: str) -> str:
-    # collapse repeated spaces into one
-    document = re.sub(r" +", " ", document)
-    # remove trailing spaces
-    document = re.sub(r" +[\n\r]", "\n", document)
-    # remove repeated newlines
-    document = re.sub(r"[\n\r]+", "\n", document)
-    return document.strip()
-
-
-def strip_newlines(document: str) -> str:
-    # HTML might contain newlines which are just whitespaces to a browser
-    return re.sub(r"[\n\r]+", " ", document)
-
-
-def format_document(document: BeautifulSoup) -> str:
-    """Format html to a flat text document.
-
-    The following goals:
-    - Newlines from within the HTML are removed (as browser would ignore them as well).
-    - Repeated newlines/spaces are removed (as browsers would ignore them).
-    - Newlines only before and after headlines and paragraphs or when explicit (br or pre tag)
-    - Table columns/rows are separated by newline
-    - List elements are separated by newline and start with a hyphen
-    """
-    text = ""
-    list_element_start = False
-    verbatim_output = 0
-    for e in document.descendants:
-        verbatim_output -= 1
-        if isinstance(e, bs4.element.NavigableString):
-            if isinstance(e, (bs4.element.Comment, bs4.element.Doctype)):
-                continue
-            element_text = e.text
-            if element_text:
-                if verbatim_output > 0:
-                    text += element_text
-                else:
-                    text += strip_newlines(element_text)
-                list_element_start = False
-        elif isinstance(e, bs4.element.Tag):
-            if e.name in ["p", "div"]:
-                if not list_element_start:
-                    text += "\n"
-            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
-                text += "\n"
-                list_element_start = False
-            elif e.name == "li":
-                text += "\n- "
-                list_element_start = True
-            elif e.name == "pre":
-                if verbatim_output <= 0:
-                    verbatim_output = len(list(e.childGenerator()))
-    return strip_excessive_newlines_and_spaces(text)
-
-
 def start_playwright() -> Tuple[Playwright, BrowserContext]:
     playwright = sync_playwright().start()
     browser = playwright.chromium.launch(headless=True)
@@ -239,7 +182,7 @@ class WebConnector(LoadConnector):
                 for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
                     [tag.extract() for tag in soup.find_all(undesired_tag)]
 
-                page_text = format_document(soup)
+                page_text = format_document_soup(soup)
 
                 doc_batch.append(
                     Document(
@@ -267,3 +210,9 @@ class WebConnector(LoadConnector):
         if doc_batch:
             playwright.stop()
             yield doc_batch
+
+
+if __name__ == "__main__":
+    connector = WebConnector("https://docs.danswer.dev/")
+    document_batches = connector.load_from_state()
+    print(next(document_batches))
diff --git a/backend/danswer/utils/text_processing.py b/backend/danswer/utils/text_processing.py
index 718988953..c27c323f0 100644
--- a/backend/danswer/utils/text_processing.py
+++ b/backend/danswer/utils/text_processing.py
@@ -1,9 +1,8 @@
 import re
 
+import bs4
 from bs4 import BeautifulSoup
 
-from danswer.configs.constants import HTML_SEPARATOR
-
 
 def clean_model_quote(quote: str, trim_length: int) -> str:
     quote_clean = quote.strip()
@@ -32,6 +31,62 @@ def shared_precompare_cleanup(text: str) -> str:
     return text
 
 
+def strip_excessive_newlines_and_spaces(document: str) -> str:
+    # collapse repeated spaces into one
+    document = re.sub(r" +", " ", document)
+    # remove trailing spaces
+    document = re.sub(r" +[\n\r]", "\n", document)
+    # remove repeated newlines
+    document = re.sub(r"[\n\r]+", "\n", document)
+    return document.strip()
+
+
+def strip_newlines(document: str) -> str:
+    # HTML might contain newlines which are just whitespaces to a browser
+    return re.sub(r"[\n\r]+", " ", document)
+
+
+def format_document_soup(document: BeautifulSoup) -> str:
+    """Format html to a flat text document.
+
+    The following goals:
+    - Newlines from within the HTML are removed (as browser would ignore them as well).
+    - Repeated newlines/spaces are removed (as browsers would ignore them).
+    - Newlines only before and after headlines and paragraphs or when explicit (br or pre tag)
+    - Table columns/rows are separated by newline
+    - List elements are separated by newline and start with a hyphen
+    """
+    text = ""
+    list_element_start = False
+    verbatim_output = 0
+    for e in document.descendants:
+        verbatim_output -= 1
+        if isinstance(e, bs4.element.NavigableString):
+            if isinstance(e, (bs4.element.Comment, bs4.element.Doctype)):
+                continue
+            element_text = e.text
+            if element_text:
+                if verbatim_output > 0:
+                    text += element_text
+                else:
+                    text += strip_newlines(element_text)
+                list_element_start = False
+        elif isinstance(e, bs4.element.Tag):
+            if e.name in ["p", "div"]:
+                if not list_element_start:
+                    text += "\n"
+            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
+                text += "\n"
+                list_element_start = False
+            elif e.name == "li":
+                text += "\n- "
+                list_element_start = True
+            elif e.name == "pre":
+                if verbatim_output <= 0:
+                    verbatim_output = len(list(e.childGenerator()))
+    return strip_excessive_newlines_and_spaces(text)
+
+
 def parse_html_page_basic(text: str) -> str:
     soup = BeautifulSoup(text, "html.parser")
-    return soup.get_text(HTML_SEPARATOR)
+    return format_document_soup(soup)