Fix confluence permission syncing at scale (#4129)

* Fix confluence permission syncing at scale * Remove line * Better log message * Adjust log
2025-09-18 19:43:26 +02:00 · 2025-02-25 19:22:52 -08:00
parent 9ffc8cb2c4
commit f767b1f476
2 changed files with 79 additions and 11 deletions
--- a/backend/onyx/connectors/confluence/onyx_confluence.py
+++ b/backend/onyx/connectors/confluence/onyx_confluence.py
@@ -11,6 +11,8 @@ from atlassian import Confluence  # type:ignore
 from pydantic import BaseModel
 from requests import HTTPError

+from onyx.connectors.confluence.utils import get_start_param_from_url
+from onyx.connectors.confluence.utils import update_param_in_path
 from onyx.connectors.exceptions import ConnectorValidationError
 from onyx.utils.logger import setup_logger

@@ -161,7 +163,7 @@ class OnyxConfluence(Confluence):
                )

    def _paginate_url(
-        self, url_suffix: str, limit: int | None = None
+        self, url_suffix: str, limit: int | None = None, auto_paginate: bool = False
    ) -> Iterator[dict[str, Any]]:
        """
        This will paginate through the top level query.
@@ -236,9 +238,41 @@ class OnyxConfluence(Confluence):
                raise e

            # yield the results individually
-            yield from next_response.get("results", [])
+            results = cast(list[dict[str, Any]], next_response.get("results", []))
+            yield from results

-            url_suffix = next_response.get("_links", {}).get("next")
+            old_url_suffix = url_suffix
+            url_suffix = cast(str, next_response.get("_links", {}).get("next", ""))
+
+            # make sure we don't update the start by more than the amount
+            # of results we were able to retrieve. The Confluence API has a
+            # weird behavior where if you pass in a limit that is too large for
+            # the configured server, it will artificially limit the amount of
+            # results returned BUT will not apply this to the start parameter.
+            # This will cause us to miss results.
+            if url_suffix and "start" in url_suffix:
+                new_start = get_start_param_from_url(url_suffix)
+                previous_start = get_start_param_from_url(old_url_suffix)
+                if new_start - previous_start > len(results):
+                    logger.warning(
+                        f"Start was updated by more than the amount of results "
+                        f"retrieved. This is a bug with Confluence. Start: {new_start}, "
+                        f"Previous Start: {previous_start}, Len Results: {len(results)}."
+                    )
+
+                    # Update the url_suffix to use the adjusted start
+                    adjusted_start = previous_start + len(results)
+                    url_suffix = update_param_in_path(
+                        url_suffix, "start", str(adjusted_start)
+                    )
+
+            # some APIs don't properly paginate, so we need to manually update the `start` param
+            if auto_paginate and len(results) > 0:
+                previous_start = get_start_param_from_url(old_url_suffix)
+                updated_start = previous_start + len(results)
+                url_suffix = update_param_in_path(
+                    old_url_suffix, "start", str(updated_start)
+                )

    def paginated_cql_retrieval(
        self,
@@ -298,7 +332,9 @@ class OnyxConfluence(Confluence):
            url = "rest/api/search/user"
            expand_string = f"&expand={expand}" if expand else ""
            url += f"?cql={cql}{expand_string}"
-            for user_result in self._paginate_url(url, limit):
+            # endpoint doesn't properly paginate, so we need to manually update the `start` param
+            # thus the auto_paginate flag
+            for user_result in self._paginate_url(url, limit, auto_paginate=True):
                # Example response:
                # {
                #     'user': {
--- a/backend/onyx/connectors/confluence/utils.py
+++ b/backend/onyx/connectors/confluence/utils.py
@@ -2,7 +2,10 @@ import io
 from datetime import datetime
 from datetime import timezone
 from typing import Any
+from typing import TYPE_CHECKING
+from urllib.parse import parse_qs
 from urllib.parse import quote
+from urllib.parse import urlparse

 import bs4

@@ -10,13 +13,13 @@ from onyx.configs.app_configs import (
    CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD,
 )
 from onyx.configs.app_configs import CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD
-from onyx.connectors.confluence.onyx_confluence import (
-    OnyxConfluence,
-)
 from onyx.file_processing.extract_file_text import extract_file_text
 from onyx.file_processing.html_utils import format_document_soup
 from onyx.utils.logger import setup_logger

+if TYPE_CHECKING:
+    from onyx.connectors.confluence.onyx_confluence import OnyxConfluence
+
 logger = setup_logger()


@@ -24,7 +27,7 @@ _USER_EMAIL_CACHE: dict[str, str | None] = {}


 def get_user_email_from_username__server(
-    confluence_client: OnyxConfluence, user_name: str
+    confluence_client: "OnyxConfluence", user_name: str
 ) -> str | None:
    global _USER_EMAIL_CACHE
    if _USER_EMAIL_CACHE.get(user_name) is None:
@@ -47,7 +50,7 @@ _USER_NOT_FOUND = "Unknown Confluence User"
 _USER_ID_TO_DISPLAY_NAME_CACHE: dict[str, str | None] = {}


-def _get_user(confluence_client: OnyxConfluence, user_id: str) -> str:
+def _get_user(confluence_client: "OnyxConfluence", user_id: str) -> str:
    """Get Confluence Display Name based on the account-id or userkey value

    Args:
@@ -78,7 +81,7 @@ def _get_user(confluence_client: OnyxConfluence, user_id: str) -> str:


 def extract_text_from_confluence_html(
-    confluence_client: OnyxConfluence,
+    confluence_client: "OnyxConfluence",
    confluence_object: dict[str, Any],
    fetched_titles: set[str],
 ) -> str:
@@ -191,7 +194,7 @@ def validate_attachment_filetype(attachment: dict[str, Any]) -> bool:


 def attachment_to_content(
-    confluence_client: OnyxConfluence,
+    confluence_client: "OnyxConfluence",
    attachment: dict[str, Any],
 ) -> str | None:
    """If it returns None, assume that we should skip this attachment."""
@@ -279,3 +282,32 @@ def datetime_from_string(datetime_string: str) -> datetime:
        datetime_object = datetime_object.astimezone(timezone.utc)

    return datetime_object
+
+
+def get_single_param_from_url(url: str, param: str) -> str | None:
+    """Get a parameter from a url"""
+    parsed_url = urlparse(url)
+    return parse_qs(parsed_url.query).get(param, [None])[0]
+
+
+def get_start_param_from_url(url: str) -> int:
+    """Get the start parameter from a url"""
+    start_str = get_single_param_from_url(url, "start")
+    if start_str is None:
+        return 0
+    return int(start_str)
+
+
+def update_param_in_path(path: str, param: str, value: str) -> str:
+    """Update a parameter in a path. Path should look something like:
+
+    /api/rest/users?start=0&limit=10
+    """
+    parsed_url = urlparse(path)
+    query_params = parse_qs(parsed_url.query)
+    query_params[param] = [value]
+    return (
+        path.split("?")[0]
+        + "?"
+        + "&".join(f"{k}={quote(v[0])}" for k, v in query_params.items())
+    )