Confluence: handle pages without body.storage.value (#347)

Workaround for: https://jira.atlassian.com/browse/CONFCLOUD-76433
This commit is contained in:
Yuhong Sun 2023-08-28 18:35:13 -07:00 committed by GitHub
parent b2a51283d1
commit 548f0a41cb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 120 additions and 70 deletions

View File

@ -14,7 +14,6 @@ ALLOWED_USERS = "allowed_users"
ALLOWED_GROUPS = "allowed_groups" ALLOWED_GROUPS = "allowed_groups"
METADATA = "metadata" METADATA = "metadata"
GEN_AI_API_KEY_STORAGE_KEY = "genai_api_key" GEN_AI_API_KEY_STORAGE_KEY = "genai_api_key"
HTML_SEPARATOR = "\n"
PUBLIC_DOC_PAT = "PUBLIC" PUBLIC_DOC_PAT = "PUBLIC"
QUOTE = "quote" QUOTE = "quote"
BOOST = "boost" BOOST = "boost"

View File

@ -1,3 +1,4 @@
import os
from collections.abc import Callable from collections.abc import Callable
from collections.abc import Collection from collections.abc import Collection
from datetime import datetime from datetime import datetime
@ -101,12 +102,43 @@ class ConfluenceConnector(LoadConnector, PollConnector):
start_ind: int, start_ind: int,
) -> Collection[dict[str, Any]]: ) -> Collection[dict[str, Any]]:
def _fetch(start_ind: int, batch_size: int) -> Collection[dict[str, Any]]:
    """Fetch one batch of pages from the Confluence space.

    First attempts a bulk fetch with ``body.storage.value`` expanded. Some
    pages make that call fail (see
    https://jira.atlassian.com/browse/CONFCLOUD-76433), so on failure the
    batch is re-fetched one page at a time, and only the individual pages
    that still fail degrade to the less complete ``body.view`` expansion.
    """
    try:
        return confluence_client.get_all_pages_from_space(
            self.space,
            start=start_ind,
            limit=batch_size,
            expand="body.storage.value,version",
        )
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt still propagate
    except Exception:
        logger.warning(
            f"Batch failed with space {self.space} at offset {start_ind}"
        )
        view_pages: list[dict[str, Any]] = []
        for i in range(self.batch_size):
            try:
                # Could be that one of the pages here failed due to this bug:
                # https://jira.atlassian.com/browse/CONFCLOUD-76433
                view_pages.extend(
                    confluence_client.get_all_pages_from_space(
                        self.space,
                        start=start_ind + i,
                        limit=1,
                        expand="body.storage.value,version",
                    )
                )
            except Exception:
                # Use view instead, which captures most info but is less complete
                view_pages.extend(
                    confluence_client.get_all_pages_from_space(
                        self.space,
                        start=start_ind + i,
                        limit=1,
                        expand="body.view.value,version",
                    )
                )
        return view_pages
try: try:
return _fetch(start_ind, self.batch_size) return _fetch(start_ind, self.batch_size)
@ -162,7 +194,10 @@ class ConfluenceConnector(LoadConnector, PollConnector):
last_modified = datetime.fromisoformat(last_modified_str) last_modified = datetime.fromisoformat(last_modified_str)
if time_filter is None or time_filter(last_modified): if time_filter is None or time_filter(last_modified):
page_html = page["body"]["storage"]["value"] page_html = (
page["body"].get("storage", {}).get("value")
or page["body"]["view"]["value"]
)
page_text = ( page_text = (
page.get("title", "") + "\n" + parse_html_page_basic(page_html) page.get("title", "") + "\n" + parse_html_page_basic(page_html)
) )
@ -219,3 +254,15 @@ class ConfluenceConnector(LoadConnector, PollConnector):
if num_pages < self.batch_size: if num_pages < self.batch_size:
break break
if __name__ == "__main__":
    # Manual smoke test: crawl the test space, credentials taken from env vars.
    credentials = {
        "confluence_username": os.environ["CONFLUENCE_USER_NAME"],
        "confluence_access_token": os.environ["CONFLUENCE_ACCESS_TOKEN"],
    }
    connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
    connector.load_credentials(credentials)
    print(next(connector.load_from_state()))

View File

@ -1,5 +1,4 @@
import io import io
import re
from datetime import datetime from datetime import datetime
from typing import Any from typing import Any
from typing import cast from typing import cast
@ -7,7 +6,6 @@ from typing import Tuple
from urllib.parse import urljoin from urllib.parse import urljoin
from urllib.parse import urlparse from urllib.parse import urlparse
import bs4
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from oauthlib.oauth2 import BackendApplicationClient from oauthlib.oauth2 import BackendApplicationClient
@ -29,6 +27,7 @@ from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import Document from danswer.connectors.models import Document
from danswer.connectors.models import Section from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger from danswer.utils.logger import setup_logger
from danswer.utils.text_processing import format_document_soup
logger = setup_logger() logger = setup_logger()
@ -62,62 +61,6 @@ def get_internal_links(
return internal_links return internal_links
def strip_excessive_newlines_and_spaces(document: str) -> str:
    """Flatten whitespace: single spaces, single newlines, trimmed ends."""
    for pattern, replacement in (
        (r" +", " "),          # collapse repeated spaces into one
        (r" +[\n\r]", "\n"),   # drop spaces trailing a line
        (r"[\n\r]+", "\n"),    # collapse repeated newlines
    ):
        document = re.sub(pattern, replacement, document)
    return document.strip()
def strip_newlines(document: str) -> str:
    """Replace each run of newline characters with a single space.

    HTML newlines render as plain whitespace in a browser, so they are
    normalized away here.
    """
    return " ".join(re.split(r"[\n\r]+", document))
def format_document(document: BeautifulSoup) -> str:
    """Format html to a flat text document.

    The following goals:
    - Newlines from within the HTML are removed (as browser would ignore them as well).
    - Repeated newlines/spaces are removed (as browsers would ignore them).
    - Newlines only before and after headlines and paragraphs or when explicit (br or pre tag)
    - Table columns/rows are separated by newline
    - List elements are separated by newline and start with a hyphen
    """
    parts: list[str] = []
    list_element_start = False
    verbatim_output = 0
    for node in document.descendants:
        verbatim_output -= 1
        if isinstance(node, bs4.element.NavigableString):
            if isinstance(node, (bs4.element.Comment, bs4.element.Doctype)):
                continue
            content = node.text
            if content:
                # Inside a <pre> countdown window, keep text exactly as-is.
                parts.append(content if verbatim_output > 0 else strip_newlines(content))
                list_element_start = False
        elif isinstance(node, bs4.element.Tag):
            tag = node.name
            if tag in ("p", "div"):
                if not list_element_start:
                    parts.append("\n")
            elif tag in ("br", "h1", "h2", "h3", "h4", "tr", "th", "td"):
                parts.append("\n")
                list_element_start = False
            elif tag == "li":
                parts.append("\n- ")
                list_element_start = True
            elif tag == "pre":
                if verbatim_output <= 0:
                    # Emit the <pre> subtree verbatim for as many descendants
                    # as it has direct children.
                    verbatim_output = len(list(node.childGenerator()))
    return strip_excessive_newlines_and_spaces("".join(parts))
def start_playwright() -> Tuple[Playwright, BrowserContext]: def start_playwright() -> Tuple[Playwright, BrowserContext]:
playwright = sync_playwright().start() playwright = sync_playwright().start()
browser = playwright.chromium.launch(headless=True) browser = playwright.chromium.launch(headless=True)
@ -239,7 +182,7 @@ class WebConnector(LoadConnector):
for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS: for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
[tag.extract() for tag in soup.find_all(undesired_tag)] [tag.extract() for tag in soup.find_all(undesired_tag)]
page_text = format_document(soup) page_text = format_document_soup(soup)
doc_batch.append( doc_batch.append(
Document( Document(
@ -267,3 +210,9 @@ class WebConnector(LoadConnector):
if doc_batch: if doc_batch:
playwright.stop() playwright.stop()
yield doc_batch yield doc_batch
if __name__ == "__main__":
    # Manual smoke test: crawl the public docs site and print the first batch.
    batches = WebConnector("https://docs.danswer.dev/").load_from_state()
    print(next(batches))

View File

@ -1,9 +1,8 @@
import re import re
import bs4
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from danswer.configs.constants import HTML_SEPARATOR
def clean_model_quote(quote: str, trim_length: int) -> str: def clean_model_quote(quote: str, trim_length: int) -> str:
quote_clean = quote.strip() quote_clean = quote.strip()
@ -32,6 +31,62 @@ def shared_precompare_cleanup(text: str) -> str:
return text return text
def strip_excessive_newlines_and_spaces(document: str) -> str:
    """Normalize whitespace: squeeze space/newline runs and trim the result."""
    cleaned = re.sub(r" +", " ", document)        # collapse repeated spaces
    cleaned = re.sub(r" +[\n\r]", "\n", cleaned)  # remove spaces before a break
    cleaned = re.sub(r"[\n\r]+", "\n", cleaned)   # collapse repeated newlines
    return cleaned.strip()
def strip_newlines(document: str) -> str:
    """Collapse newline runs to single spaces (a browser treats them as whitespace)."""
    return re.sub(r"[\n\r]+", " ", document)
def format_document_soup(document: BeautifulSoup) -> str:
    """Format html to a flat text document.

    The following goals:
    - Newlines from within the HTML are removed (as browser would ignore them as well).
    - Repeated newlines/spaces are removed (as browsers would ignore them).
    - Newlines only before and after headlines and paragraphs or when explicit (br or pre tag)
    - Table columns/rows are separated by newline
    - List elements are separated by newline and start with a hyphen
    """
    newline_tags = ("br", "h1", "h2", "h3", "h4", "tr", "th", "td")
    text = ""
    in_list_item = False
    verbatim_countdown = 0
    for element in document.descendants:
        verbatim_countdown -= 1
        if isinstance(element, bs4.element.NavigableString):
            if isinstance(element, (bs4.element.Comment, bs4.element.Doctype)):
                continue
            raw = element.text
            if raw:
                if verbatim_countdown > 0:
                    # Still within a <pre> subtree: keep the text verbatim.
                    text += raw
                else:
                    text += strip_newlines(raw)
                in_list_item = False
        elif isinstance(element, bs4.element.Tag):
            name = element.name
            if name in ("p", "div"):
                if not in_list_item:
                    text += "\n"
            elif name in newline_tags:
                text += "\n"
                in_list_item = False
            elif name == "li":
                text += "\n- "
                in_list_item = True
            elif name == "pre" and verbatim_countdown <= 0:
                # Hold verbatim mode for as many descendants as <pre> has children.
                verbatim_countdown = len(list(element.childGenerator()))
    return strip_excessive_newlines_and_spaces(text)
def parse_html_page_basic(text: str) -> str:
    """Parse raw HTML and flatten it to readable plain text."""
    return format_document_soup(BeautifulSoup(text, "html.parser"))