Add confluence rate limit handling (#1174)

This commit is contained in:
Chris Weaver 2024-03-04 01:02:57 -08:00 committed by GitHub
parent 563df1f952
commit a7c820147e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 60 additions and 9 deletions

View File

@ -15,6 +15,9 @@ from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.confluence.rate_limit_handler import (
make_confluence_call_handle_rate_limit,
)
from danswer.connectors.cross_connector_utils.html_utils import format_document_soup
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
@ -100,10 +103,11 @@ def _get_user(user_id: str, confluence_client: Confluence) -> str:
"""
user_not_found = "Unknown User"
get_user_details_by_accountid = make_confluence_call_handle_rate_limit(
confluence_client.get_user_details_by_accountid
)
try:
return confluence_client.get_user_details_by_accountid(user_id).get(
"displayName", user_not_found
)
return get_user_details_by_accountid(user_id).get("displayName", user_not_found)
except Exception as e:
logger.warning(
f"Unable to get the User Display Name with the id: '{user_id}' - {e}"
@ -144,12 +148,16 @@ def _comment_dfs(
comment_pages: Collection[dict[str, Any]],
confluence_client: Confluence,
) -> str:
get_page_child_by_type = make_confluence_call_handle_rate_limit(
confluence_client.get_page_child_by_type
)
for comment_page in comment_pages:
comment_html = comment_page["body"]["storage"]["value"]
comments_str += "\nComment:\n" + parse_html_page(
comment_html, confluence_client
)
child_comment_pages = confluence_client.get_page_child_by_type(
child_comment_pages = get_page_child_by_type(
comment_page["id"],
type="comment",
start=None,
@ -200,8 +208,11 @@ class ConfluenceConnector(LoadConnector, PollConnector):
start_ind: int,
) -> Collection[dict[str, Any]]:
def _fetch(start_ind: int, batch_size: int) -> Collection[dict[str, Any]]:
get_all_pages_from_space = make_confluence_call_handle_rate_limit(
confluence_client.get_all_pages_from_space
)
try:
return confluence_client.get_all_pages_from_space(
return get_all_pages_from_space(
self.space,
start=start_ind,
limit=batch_size,
@ -219,7 +230,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
# Could be that one of the pages here failed due to this bug:
# https://jira.atlassian.com/browse/CONFCLOUD-76433
view_pages.extend(
confluence_client.get_all_pages_from_space(
get_all_pages_from_space(
self.space,
start=start_ind + i,
limit=1,
@ -233,7 +244,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
)
# Use view instead, which captures most info but is less complete
view_pages.extend(
confluence_client.get_all_pages_from_space(
get_all_pages_from_space(
self.space,
start=start_ind + i,
limit=1,
@ -262,10 +273,13 @@ class ConfluenceConnector(LoadConnector, PollConnector):
return pages
def _fetch_comments(self, confluence_client: Confluence, page_id: str) -> str:
get_page_child_by_type = make_confluence_call_handle_rate_limit(
confluence_client.get_page_child_by_type
)
try:
comment_pages = cast(
Collection[dict[str, Any]],
confluence_client.get_page_child_by_type(
get_page_child_by_type(
page_id,
type="comment",
start=None,
@ -284,8 +298,11 @@ class ConfluenceConnector(LoadConnector, PollConnector):
return ""
def _fetch_labels(self, confluence_client: Confluence, page_id: str) -> list[str]:
get_page_labels = make_confluence_call_handle_rate_limit(
confluence_client.get_page_labels
)
try:
labels_response = confluence_client.get_page_labels(page_id)
labels_response = get_page_labels(page_id)
return [label["name"] for label in labels_response["results"]]
except Exception as e:
if not self.continue_on_failure:

View File

@ -0,0 +1,34 @@
from collections.abc import Callable
from typing import Any
from typing import cast
from typing import TypeVar
from requests import HTTPError
from retry import retry
F = TypeVar("F", bound=Callable[..., Any])
class ConfluenceRateLimitError(Exception):
pass
def make_confluence_call_handle_rate_limit(confluence_call: F) -> F:
@retry(
exceptions=ConfluenceRateLimitError,
tries=10,
delay=1,
max_delay=600, # 10 minutes
backoff=2,
jitter=1,
)
def wrapped_call(*args: list[Any], **kwargs: Any) -> Any:
try:
return confluence_call(*args, **kwargs)
except HTTPError as e:
if e.response.status_code == 429:
raise ConfluenceRateLimitError()
raise
return cast(F, wrapped_call)