diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py
index 7ce83b49d..62a648a91 100644
--- a/backend/danswer/configs/constants.py
+++ b/backend/danswer/configs/constants.py
@@ -14,7 +14,6 @@ ALLOWED_USERS = "allowed_users"
 ALLOWED_GROUPS = "allowed_groups"
 METADATA = "metadata"
 GEN_AI_API_KEY_STORAGE_KEY = "genai_api_key"
-HTML_SEPARATOR = "\n"
 PUBLIC_DOC_PAT = "PUBLIC"
 QUOTE = "quote"
 BOOST = "boost"
diff --git a/backend/danswer/connectors/confluence/connector.py b/backend/danswer/connectors/confluence/connector.py
index 884e2da6d..b12fdf969 100644
--- a/backend/danswer/connectors/confluence/connector.py
+++ b/backend/danswer/connectors/confluence/connector.py
@@ -1,3 +1,4 @@
+import os
 from collections.abc import Callable
 from collections.abc import Collection
 from datetime import datetime
@@ -101,12 +102,43 @@ class ConfluenceConnector(LoadConnector, PollConnector):
         start_ind: int,
     ) -> Collection[dict[str, Any]]:
         def _fetch(start_ind: int, batch_size: int) -> Collection[dict[str, Any]]:
-            return confluence_client.get_all_pages_from_space(
-                self.space,
-                start=start_ind,
-                limit=batch_size,
-                expand="body.storage.value,version",
-            )
+            try:
+                return confluence_client.get_all_pages_from_space(
+                    self.space,
+                    start=start_ind,
+                    limit=batch_size,
+                    expand="body.storage.value,version",
+                )
+            except Exception:
+                logger.warning(
+                    f"Batch failed with space {self.space} at offset {start_ind}"
+                )
+
+            view_pages: list[dict[str, Any]] = []
+            for i in range(self.batch_size):
+                try:
+                    # Could be that one of the pages here failed due to this bug:
+                    # https://jira.atlassian.com/browse/CONFCLOUD-76433
+                    view_pages.extend(
+                        confluence_client.get_all_pages_from_space(
+                            self.space,
+                            start=start_ind + i,
+                            limit=1,
+                            expand="body.storage.value,version",
+                        )
+                    )
+                except Exception:
+                    # Use view instead, which captures most info but is less complete
+                    view_pages.extend(
+                        confluence_client.get_all_pages_from_space(
+                            self.space,
+                            start=start_ind + i,
+                            limit=1,
+                            expand="body.view.value,version",
+                        )
+                    )
+
+            return view_pages
 
         try:
             return _fetch(start_ind, self.batch_size)
@@ -162,7 +194,10 @@ class ConfluenceConnector(LoadConnector, PollConnector):
             last_modified = datetime.fromisoformat(last_modified_str)
 
             if time_filter is None or time_filter(last_modified):
-                page_html = page["body"]["storage"]["value"]
+                page_html = (
+                    page["body"].get("storage", {}).get("value")
+                    or page["body"]["view"]["value"]
+                )
                 page_text = (
                     page.get("title", "") + "\n" + parse_html_page_basic(page_html)
                 )
@@ -219,3 +254,15 @@ class ConfluenceConnector(LoadConnector, PollConnector):
 
             if num_pages < self.batch_size:
                 break
+
+
+if __name__ == "__main__":
+    connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
+    connector.load_credentials(
+        {
+            "confluence_username": os.environ["CONFLUENCE_USER_NAME"],
+            "confluence_access_token": os.environ["CONFLUENCE_ACCESS_TOKEN"],
+        }
+    )
+    document_batches = connector.load_from_state()
+    print(next(document_batches))
diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py
index 1780120f7..7979b5b8d 100644
--- a/backend/danswer/connectors/web/connector.py
+++ b/backend/danswer/connectors/web/connector.py
@@ -1,5 +1,4 @@
 import io
-import re
 from datetime import datetime
 from typing import Any
 from typing import cast
@@ -7,7 +6,6 @@ from typing import Tuple
 from urllib.parse import urljoin
 from urllib.parse import urlparse
 
-import bs4
 import requests
 from bs4 import BeautifulSoup
 from oauthlib.oauth2 import BackendApplicationClient
@@ -29,6 +27,7 @@ from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.utils.logger import setup_logger
+from danswer.utils.text_processing import format_document_soup
 
 logger = setup_logger()
 
@@ -62,62 +61,6 @@ def get_internal_links(
     return internal_links
 
 
-def strip_excessive_newlines_and_spaces(document: str) -> str:
-    # collapse repeated spaces into one
-    document = re.sub(r" +", " ", document)
-    # remove trailing spaces
-    document = re.sub(r" +[\n\r]", "\n", document)
-    # remove repeated newlines
-    document = re.sub(r"[\n\r]+", "\n", document)
-    return document.strip()
-
-
-def strip_newlines(document: str) -> str:
-    # HTML might contain newlines which are just whitespaces to a browser
-    return re.sub(r"[\n\r]+", " ", document)
-
-
-def format_document(document: BeautifulSoup) -> str:
-    """Format html to a flat text document.
-
-    The following goals:
-    - Newlines from within the HTML are removed (as browser would ignore them as well).
-    - Repeated newlines/spaces are removed (as browsers would ignore them).
-    - Newlines only before and after headlines and paragraphs or when explicit (br or pre tag)
-    - Table columns/rows are separated by newline
-    - List elements are separated by newline and start with a hyphen
-    """
-    text = ""
-    list_element_start = False
-    verbatim_output = 0
-    for e in document.descendants:
-        verbatim_output -= 1
-        if isinstance(e, bs4.element.NavigableString):
-            if isinstance(e, (bs4.element.Comment, bs4.element.Doctype)):
-                continue
-            element_text = e.text
-            if element_text:
-                if verbatim_output > 0:
-                    text += element_text
-                else:
-                    text += strip_newlines(element_text)
-                list_element_start = False
-        elif isinstance(e, bs4.element.Tag):
-            if e.name in ["p", "div"]:
-                if not list_element_start:
-                    text += "\n"
-            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
-                text += "\n"
-                list_element_start = False
-            elif e.name == "li":
-                text += "\n- "
-                list_element_start = True
-            elif e.name == "pre":
-                if verbatim_output <= 0:
-                    verbatim_output = len(list(e.childGenerator()))
-    return strip_excessive_newlines_and_spaces(text)
-
-
 def start_playwright() -> Tuple[Playwright, BrowserContext]:
     playwright = sync_playwright().start()
     browser = playwright.chromium.launch(headless=True)
@@ -239,7 +182,7 @@ class WebConnector(LoadConnector):
                 for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
                     [tag.extract() for tag in soup.find_all(undesired_tag)]
 
-                page_text = format_document(soup)
+                page_text = format_document_soup(soup)
 
                 doc_batch.append(
                     Document(
@@ -267,3 +210,9 @@ class WebConnector(LoadConnector):
         if doc_batch:
             playwright.stop()
             yield doc_batch
+
+
+if __name__ == "__main__":
+    connector = WebConnector("https://docs.danswer.dev/")
+    document_batches = connector.load_from_state()
+    print(next(document_batches))
diff --git a/backend/danswer/utils/text_processing.py b/backend/danswer/utils/text_processing.py
index 718988953..c27c323f0 100644
--- a/backend/danswer/utils/text_processing.py
+++ b/backend/danswer/utils/text_processing.py
@@ -1,9 +1,8 @@
 import re
 
+import bs4
 from bs4 import BeautifulSoup
 
-from danswer.configs.constants import HTML_SEPARATOR
-
 
 def clean_model_quote(quote: str, trim_length: int) -> str:
     quote_clean = quote.strip()
@@ -32,6 +31,62 @@ def shared_precompare_cleanup(text: str) -> str:
     return text
 
 
+def strip_excessive_newlines_and_spaces(document: str) -> str:
+    # collapse repeated spaces into one
+    document = re.sub(r" +", " ", document)
+    # remove trailing spaces
+    document = re.sub(r" +[\n\r]", "\n", document)
+    # remove repeated newlines
+    document = re.sub(r"[\n\r]+", "\n", document)
+    return document.strip()
+
+
+def strip_newlines(document: str) -> str:
+    # HTML might contain newlines which are just whitespaces to a browser
+    return re.sub(r"[\n\r]+", " ", document)
+
+
+def format_document_soup(document: BeautifulSoup) -> str:
+    """Format html to a flat text document.
+
+    The following goals:
+    - Newlines from within the HTML are removed (as browser would ignore them as well).
+    - Repeated newlines/spaces are removed (as browsers would ignore them).
+    - Newlines only before and after headlines and paragraphs or when explicit (br or pre tag)
+    - Table columns/rows are separated by newline
+    - List elements are separated by newline and start with a hyphen
+    """
+    text = ""
+    list_element_start = False
+    verbatim_output = 0
+    for e in document.descendants:
+        verbatim_output -= 1
+        if isinstance(e, bs4.element.NavigableString):
+            if isinstance(e, (bs4.element.Comment, bs4.element.Doctype)):
+                continue
+            element_text = e.text
+            if element_text:
+                if verbatim_output > 0:
+                    text += element_text
+                else:
+                    text += strip_newlines(element_text)
+                list_element_start = False
+        elif isinstance(e, bs4.element.Tag):
+            if e.name in ["p", "div"]:
+                if not list_element_start:
+                    text += "\n"
+            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
+                text += "\n"
+                list_element_start = False
+            elif e.name == "li":
+                text += "\n- "
+                list_element_start = True
+            elif e.name == "pre":
+                if verbatim_output <= 0:
+                    verbatim_output = len(list(e.childGenerator()))
+    return strip_excessive_newlines_and_spaces(text)
+
+
 def parse_html_page_basic(text: str) -> str:
     soup = BeautifulSoup(text, "html.parser")
-    return soup.get_text(HTML_SEPARATOR)
+    return format_document_soup(soup)