Confluence: handle pages without body.storage.value (#347)

Workaround for: https://jira.atlassian.com/browse/CONFCLOUD-76433
2025-05-17 23:30:07 +02:00 · 2023-08-28 18:35:13 -07:00 · 2023-08-28 18:35:13 -07:00 · 548f0a41cb
commit 548f0a41cb
parent b2a51283d1
4 changed files with 120 additions and 70 deletions
--- a/backend/danswer/configs/constants.py
+++ b/backend/danswer/configs/constants.py
@@ -14,7 +14,6 @@ ALLOWED_USERS = "allowed_users"
 ALLOWED_GROUPS = "allowed_groups"
 METADATA = "metadata"
 GEN_AI_API_KEY_STORAGE_KEY = "genai_api_key"
-HTML_SEPARATOR = "\n"
 PUBLIC_DOC_PAT = "PUBLIC"
 QUOTE = "quote"
 BOOST = "boost"
--- a/backend/danswer/connectors/confluence/connector.py
+++ b/backend/danswer/connectors/confluence/connector.py
@@ -1,3 +1,4 @@
+import os
 from collections.abc import Callable
 from collections.abc import Collection
 from datetime import datetime
@@ -101,12 +102,43 @@ class ConfluenceConnector(LoadConnector, PollConnector):
        start_ind: int,
    ) -> Collection[dict[str, Any]]:
        def _fetch(start_ind: int, batch_size: int) -> Collection[dict[str, Any]]:
-            return confluence_client.get_all_pages_from_space(
-                self.space,
-                start=start_ind,
-                limit=batch_size,
-                expand="body.storage.value,version",
-            )
+            try:
+                return confluence_client.get_all_pages_from_space(
+                    self.space,
+                    start=start_ind,
+                    limit=batch_size,
+                    expand="body.storage.value,version",
+                )
+            except:
+                logger.warning(
+                    f"Batch failed with space {self.space} at offset {start_ind}"
+                )
+
+                view_pages: list[dict[str, Any]] = []
+                for i in range(self.batch_size):
+                    try:
+                        # Could be that one of the pages here failed due to this bug:
+                        # https://jira.atlassian.com/browse/CONFCLOUD-76433
+                        view_pages.extend(
+                            confluence_client.get_all_pages_from_space(
+                                self.space,
+                                start=start_ind + i,
+                                limit=1,
+                                expand="body.storage.value,version",
+                            )
+                        )
+                    except:
+                        # Use view instead, which captures most info but is less complete
+                        view_pages.extend(
+                            confluence_client.get_all_pages_from_space(
+                                self.space,
+                                start=start_ind + i,
+                                limit=1,
+                                expand="body.view.value,version",
+                            )
+                        )
+
+                return view_pages

        try:
            return _fetch(start_ind, self.batch_size)
@@ -162,7 +194,10 @@ class ConfluenceConnector(LoadConnector, PollConnector):
            last_modified = datetime.fromisoformat(last_modified_str)

            if time_filter is None or time_filter(last_modified):
-                page_html = page["body"]["storage"]["value"]
+                page_html = (
+                    page["body"].get("storage", {}).get("value")
+                    or page["body"]["view"]["value"]
+                )
                page_text = (
                    page.get("title", "") + "\n" + parse_html_page_basic(page_html)
                )
@@ -219,3 +254,15 @@ class ConfluenceConnector(LoadConnector, PollConnector):

            if num_pages < self.batch_size:
                break
+
+
+if __name__ == "__main__":
+    connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
+    connector.load_credentials(
+        {
+            "confluence_username": os.environ["CONFLUENCE_USER_NAME"],
+            "confluence_access_token": os.environ["CONFLUENCE_ACCESS_TOKEN"],
+        }
+    )
+    document_batches = connector.load_from_state()
+    print(next(document_batches))
--- a/backend/danswer/connectors/web/connector.py
+++ b/backend/danswer/connectors/web/connector.py
@@ -1,5 +1,4 @@
 import io
-import re
 from datetime import datetime
 from typing import Any
 from typing import cast
@@ -7,7 +6,6 @@ from typing import Tuple
 from urllib.parse import urljoin
 from urllib.parse import urlparse

-import bs4
 import requests
 from bs4 import BeautifulSoup
 from oauthlib.oauth2 import BackendApplicationClient
@@ -29,6 +27,7 @@ from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger
+from danswer.utils.text_processing import format_document_soup

 logger = setup_logger()

@@ -62,62 +61,6 @@ def get_internal_links(
    return internal_links


-def strip_excessive_newlines_and_spaces(document: str) -> str:
-    # collapse repeated spaces into one
-    document = re.sub(r" +", " ", document)
-    # remove trailing spaces
-    document = re.sub(r" +[\n\r]", "\n", document)
-    # remove repeated newlines
-    document = re.sub(r"[\n\r]+", "\n", document)
-    return document.strip()
-
-
-def strip_newlines(document: str) -> str:
-    # HTML might contain newlines which are just whitespaces to a browser
-    return re.sub(r"[\n\r]+", " ", document)
-
-
-def format_document(document: BeautifulSoup) -> str:
-    """Format html to a flat text document.
-
-    The following goals:
-    - Newlines from within the HTML are removed (as browser would ignore them as well).
-    - Repeated newlines/spaces are removed (as browsers would ignore them).
-    - Newlines only before and after headlines and paragraphs or when explicit (br or pre tag)
-    - Table columns/rows are separated by newline
-    - List elements are separated by newline and start with a hyphen
-    """
-    text = ""
-    list_element_start = False
-    verbatim_output = 0
-    for e in document.descendants:
-        verbatim_output -= 1
-        if isinstance(e, bs4.element.NavigableString):
-            if isinstance(e, (bs4.element.Comment, bs4.element.Doctype)):
-                continue
-            element_text = e.text
-            if element_text:
-                if verbatim_output > 0:
-                    text += element_text
-                else:
-                    text += strip_newlines(element_text)
-                list_element_start = False
-        elif isinstance(e, bs4.element.Tag):
-            if e.name in ["p", "div"]:
-                if not list_element_start:
-                    text += "\n"
-            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
-                text += "\n"
-                list_element_start = False
-            elif e.name == "li":
-                text += "\n- "
-                list_element_start = True
-            elif e.name == "pre":
-                if verbatim_output <= 0:
-                    verbatim_output = len(list(e.childGenerator()))
-    return strip_excessive_newlines_and_spaces(text)
-
-
 def start_playwright() -> Tuple[Playwright, BrowserContext]:
    playwright = sync_playwright().start()
    browser = playwright.chromium.launch(headless=True)
@@ -239,7 +182,7 @@ class WebConnector(LoadConnector):
                for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
                    [tag.extract() for tag in soup.find_all(undesired_tag)]

-                page_text = format_document(soup)
+                page_text = format_document_soup(soup)

                doc_batch.append(
                    Document(
@@ -267,3 +210,9 @@ class WebConnector(LoadConnector):
        if doc_batch:
            playwright.stop()
            yield doc_batch
+
+
+if __name__ == "__main__":
+    connector = WebConnector("https://docs.danswer.dev/")
+    document_batches = connector.load_from_state()
+    print(next(document_batches))
--- a/backend/danswer/utils/text_processing.py
+++ b/backend/danswer/utils/text_processing.py
@@ -1,9 +1,8 @@
 import re

+import bs4
 from bs4 import BeautifulSoup

-from danswer.configs.constants import HTML_SEPARATOR
-

 def clean_model_quote(quote: str, trim_length: int) -> str:
    quote_clean = quote.strip()
@@ -32,6 +31,62 @@ def shared_precompare_cleanup(text: str) -> str:
    return text


+def strip_excessive_newlines_and_spaces(document: str) -> str:
+    # collapse repeated spaces into one
+    document = re.sub(r" +", " ", document)
+    # remove trailing spaces
+    document = re.sub(r" +[\n\r]", "\n", document)
+    # remove repeated newlines
+    document = re.sub(r"[\n\r]+", "\n", document)
+    return document.strip()
+
+
+def strip_newlines(document: str) -> str:
+    # HTML might contain newlines which are just whitespaces to a browser
+    return re.sub(r"[\n\r]+", " ", document)
+
+
+def format_document_soup(document: BeautifulSoup) -> str:
+    """Format html to a flat text document.
+
+    The following goals:
+    - Newlines from within the HTML are removed (as browser would ignore them as well).
+    - Repeated newlines/spaces are removed (as browsers would ignore them).
+    - Newlines only before and after headlines and paragraphs or when explicit (br or pre tag)
+    - Table columns/rows are separated by newline
+    - List elements are separated by newline and start with a hyphen
+    """
+    text = ""
+    list_element_start = False
+    verbatim_output = 0
+    for e in document.descendants:
+        verbatim_output -= 1
+        if isinstance(e, bs4.element.NavigableString):
+            if isinstance(e, (bs4.element.Comment, bs4.element.Doctype)):
+                continue
+            element_text = e.text
+            if element_text:
+                if verbatim_output > 0:
+                    text += element_text
+                else:
+                    text += strip_newlines(element_text)
+                list_element_start = False
+        elif isinstance(e, bs4.element.Tag):
+            if e.name in ["p", "div"]:
+                if not list_element_start:
+                    text += "\n"
+            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
+                text += "\n"
+                list_element_start = False
+            elif e.name == "li":
+                text += "\n- "
+                list_element_start = True
+            elif e.name == "pre":
+                if verbatim_output <= 0:
+                    verbatim_output = len(list(e.childGenerator()))
+    return strip_excessive_newlines_and_spaces(text)
+
+
 def parse_html_page_basic(text: str) -> str:
    soup = BeautifulSoup(text, "html.parser")
-    return soup.get_text(HTML_SEPARATOR)
+    return format_document_soup(soup)