Reduce errors in workers (#3962)

pablonyx authored on 2025-02-13 15:59:44 -08:00, committed by GitHub
parent 3260d793d1
commit eff433bdc5
5 changed files with 48 additions and 34 deletions

View File

@@ -478,14 +478,15 @@ def update_external_document_permissions_task(
)
doc_id = document_external_access.doc_id
external_access = document_external_access.external_access
try:
with get_session_with_tenant(tenant_id) as db_session:
# Add the users to the DB if they don't exist
batch_add_ext_perm_user_if_not_exists(
db_session=db_session,
emails=list(external_access.external_user_emails),
continue_on_error=True,
)
# Then we upsert the document's external permissions in postgres
# Then upsert the document's external permissions
created_new_doc = upsert_document_external_perms(
db_session=db_session,
doc_id=doc_id,
@@ -509,11 +510,11 @@ def update_external_document_permissions_task(
f"action=update_permissions "
f"elapsed={elapsed:.2f}"
)
except Exception:
task_logger.exception(
f"Exception in update_external_document_permissions_task: "
f"connector_id={connector_id} "
f"doc_id={doc_id}"
f"connector_id={connector_id} doc_id={doc_id}"
)
return False
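The net effect of these two hunks: user creation tolerates duplicates (continue_on_error=True), and any remaining failure is logged with its traceback and reported as an unsuccessful task instead of propagating out of the worker. A minimal, self-contained sketch of that error-handling shape, with illustrative names rather than the real task:

import logging
from typing import Callable

task_logger = logging.getLogger(__name__)

def run_permission_update(connector_id: int, doc_id: str, work: Callable[[], None]) -> bool:
    """Run the sync work; on any failure, log the traceback and report False."""
    try:
        work()
    except Exception:
        # Mirrors the hunk above: keep the worker alive, surface the failure as a return value.
        task_logger.exception(
            f"Exception in update_external_document_permissions_task: "
            f"connector_id={connector_id} doc_id={doc_id}"
        )
        return False
    return True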

View File

@@ -421,6 +421,7 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
- Throughput (docs/min) (only if success)
- Raw start/end times for each sync
"""
one_hour_ago = get_db_current_time(db_session) - timedelta(hours=1)
# Get all sync records that ended in the last hour
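For the "Throughput (docs/min)" bullet in the docstring above, the computation is presumably along these lines; the helper below is illustrative, with the sync record's values passed in explicitly rather than read off the real model:

from datetime import datetime

def docs_per_minute(num_docs: int, start: datetime, end: datetime) -> float | None:
    # Throughput over the sync window; None if the window is degenerate.
    minutes = (end - start).total_seconds() / 60
    return num_docs / minutes if minutes > 0 else None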
@@ -588,6 +589,10 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
entity = db_session.scalar(
select(UserGroup).where(UserGroup.id == sync_record.entity_id)
)
else:
# Only user groups and document set sync records have
# an associated entity we can use for latency metrics
continue
if entity is None:
task_logger.error(
@@ -778,7 +783,7 @@ def cloud_check_alembic() -> bool | None:
tenant_to_revision[tenant_id] = result_scalar
except Exception:
task_logger.warning(f"Tenant {tenant_id} has no revision!")
task_logger.error(f"Tenant {tenant_id} has no revision!")
tenant_to_revision[tenant_id] = ALEMBIC_NULL_REVISION
# get the total count of each revision
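In cloud_check_alembic, a tenant with no Alembic revision is now logged at error level rather than warning, and it is still bucketed under ALEMBIC_NULL_REVISION so the revision tally stays complete. That tally ("get the total count of each revision") is presumably just a Counter over the collected mapping:

from collections import Counter

# tenant_to_revision maps tenant_id -> revision string (or ALEMBIC_NULL_REVISION)
revision_counts = Counter(tenant_to_revision.values())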

View File

@@ -39,19 +39,6 @@ def get_message_link(
return permalink
def _make_slack_api_call_logged(
call: Callable[..., SlackResponse],
) -> Callable[..., SlackResponse]:
@wraps(call)
def logged_call(**kwargs: Any) -> SlackResponse:
logger.debug(f"Making call to Slack API '{call.__name__}' with args '{kwargs}'")
result = call(**kwargs)
logger.debug(f"Call to Slack API '{call.__name__}' returned '{result}'")
return result
return logged_call
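This hunk removes the _make_slack_api_call_logged wrapper entirely; the two wrapper functions further down stop referencing it. If per-call request/response visibility is still wanted while debugging, one option outside this change is to turn up slack_sdk's own logger via standard logging, no custom decorator needed:

import logging

# Assumption: slack_sdk emits its own request/response details at DEBUG under this namespace.
logging.getLogger("slack_sdk").setLevel(logging.DEBUG)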
def _make_slack_api_call_paginated(
call: Callable[..., SlackResponse],
) -> Callable[..., Generator[dict[str, Any], None, None]]:
@@ -127,18 +114,14 @@ def make_slack_api_rate_limited(
def make_slack_api_call_w_retries(
call: Callable[..., SlackResponse], **kwargs: Any
) -> SlackResponse:
return basic_retry_wrapper(
make_slack_api_rate_limited(_make_slack_api_call_logged(call))
)(**kwargs)
return basic_retry_wrapper(make_slack_api_rate_limited(call))(**kwargs)
def make_paginated_slack_api_call_w_retries(
call: Callable[..., SlackResponse], **kwargs: Any
) -> Generator[dict[str, Any], None, None]:
return _make_slack_api_call_paginated(
basic_retry_wrapper(
make_slack_api_rate_limited(_make_slack_api_call_logged(call))
)
basic_retry_wrapper(make_slack_api_rate_limited(call))
)(**kwargs)
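After the change, both helpers wrap the raw SDK call in just the rate limiter and the retry logic (plus pagination for the second one), with the debug-logging layer gone. An illustrative call site; the import path is assumed and the token and channel ID are placeholders:

from slack_sdk import WebClient

from onyx.connectors.slack.utils import make_slack_api_call_w_retries  # module path assumed

client = WebClient(token="xoxb-...")  # placeholder token
response = make_slack_api_call_w_retries(
    client.conversations_history,  # the raw SDK call; rate limiting and retries wrap it
    channel="C0123456789",
    limit=100,
)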

View File

@@ -6,6 +6,7 @@ from fastapi import HTTPException
from fastapi_users.password import PasswordHelper
from sqlalchemy import func
from sqlalchemy import select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session
from sqlalchemy.sql import expression
from sqlalchemy.sql.elements import ColumnElement
@@ -274,7 +275,7 @@ def _generate_ext_permissioned_user(email: str) -> User:
def batch_add_ext_perm_user_if_not_exists(
db_session: Session, emails: list[str]
db_session: Session, emails: list[str], continue_on_error: bool = False
) -> list[User]:
lower_emails = [email.lower() for email in emails]
found_users, missing_lower_emails = _get_users_by_emails(db_session, lower_emails)
@@ -283,10 +284,23 @@ def batch_add_ext_perm_user_if_not_exists(
for email in missing_lower_emails:
new_users.append(_generate_ext_permissioned_user(email=email))
db_session.add_all(new_users)
db_session.commit()
return found_users + new_users
try:
db_session.add_all(new_users)
db_session.commit()
except IntegrityError:
db_session.rollback()
if not continue_on_error:
raise
for user in new_users:
try:
db_session.add(user)
db_session.commit()
except IntegrityError:
db_session.rollback()
continue
# Fetch all users again to ensure we have the most up-to-date list
all_users, _ = _get_users_by_emails(db_session, lower_emails)
return all_users
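In short, batch_add_ext_perm_user_if_not_exists still attempts a single bulk insert, but with continue_on_error=True an IntegrityError (for example, two workers racing to create the same external-permission user) falls back to row-by-row inserts and a final re-fetch instead of failing the whole batch; with the default False it re-raises as before. An illustrative call, matching how the permission-sync task above uses it (import path assumed, emails are placeholders):

from onyx.db.users import batch_add_ext_perm_user_if_not_exists  # path assumed

# db_session: an already-open SQLAlchemy Session
users = batch_add_ext_perm_user_if_not_exists(
    db_session=db_session,
    emails=["alice@example.com", "bob@example.com"],
    continue_on_error=True,  # skip conflicting rows instead of raising
)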
def delete_user_from_db(

View File

@@ -17,6 +17,7 @@ from uuid import UUID
import httpx # type: ignore
import requests # type: ignore
from retry import retry
from onyx.configs.chat_configs import DOC_TIME_DECAY
from onyx.configs.chat_configs import NUM_RETURNED_HITS
@@ -549,6 +550,11 @@ class VespaIndex(DocumentIndex):
time.monotonic() - update_start,
)
@retry(
tries=3,
delay=1,
backoff=2,
)
def _update_single_chunk(
self,
doc_chunk_id: UUID,
@@ -559,6 +565,7 @@ class VespaIndex(DocumentIndex):
) -> None:
"""
Update a single "chunk" (document) in Vespa using its chunk ID.
Retries if we encounter transient HTTPStatusError (e.g., overload).
"""
update_dict: dict[str, dict] = {"fields": {}}
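The decorator added above comes from the retry package: tries=3, delay=1, backoff=2 allows up to two retries with waits of 1s and then 2s, and it only has an effect because the except block further down re-raises. A standalone sketch of the same pattern:

import httpx
from retry import retry

@retry(tries=3, delay=1, backoff=2)  # as written it retries on any exception, not just HTTP errors
def put_with_retries(client: httpx.Client, url: str, payload: dict) -> None:
    resp = client.put(url, json=payload)
    resp.raise_for_status()  # raising is what lets @retry catch the failure and try again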
@@ -567,13 +574,11 @@ class VespaIndex(DocumentIndex):
update_dict["fields"][BOOST] = {"assign": fields.boost}
if fields.document_sets is not None:
# WeightedSet<string> needs a map { item: weight, ... }
update_dict["fields"][DOCUMENT_SETS] = {
"assign": {document_set: 1 for document_set in fields.document_sets}
}
if fields.access is not None:
# Similar to above
update_dict["fields"][ACCESS_CONTROL_LIST] = {
"assign": {acl_entry: 1 for acl_entry in fields.access.to_acl()}
}
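For reference, the two assigns above build Vespa's partial-update body, in which a weightedset<string> field is assigned an {item: weight} map. A sketch of the resulting payload; the field names mirror the DOCUMENT_SETS and ACCESS_CONTROL_LIST constants, and the ACL entry string is invented for illustration:

update_payload = {
    "fields": {
        "document_sets": {"assign": {"Engineering Wiki": 1}},
        "access_control_list": {"assign": {"user_email:alice@example.com": 1}},
    }
}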
@@ -585,7 +590,10 @@
logger.error("Update request received but nothing to update.")
return
vespa_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}?create=true"
vespa_url = (
f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}"
"?create=true"
)
try:
resp = http_client.put(
@@ -595,8 +603,11 @@
)
resp.raise_for_status()
except httpx.HTTPStatusError as e:
error_message = f"Failed to update doc chunk {doc_chunk_id} (doc_id={doc_id}). Details: {e.response.text}"
logger.error(error_message)
logger.error(
f"Failed to update doc chunk {doc_chunk_id} (doc_id={doc_id}). "
f"Details: {e.response.text}"
)
# Re-raise so the @retry decorator will catch and retry
raise
def update_single(
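Finally, the re-raise above is what hands transient HTTP failures back to the @retry decorator, and the rebuilt URL keeps ?create=true, which tells Vespa's document/v1 API to create the document when an update targets a chunk that does not exist yet. A rough sketch of the PUT, assuming the usual document/v1 path shape (host, namespace, and document type are illustrative):

import httpx

url = (
    "http://localhost:8080/document/v1/default/danswer_chunk/docid/"
    "1234abcd-0000-0000-0000-000000000000?create=true"
)
with httpx.Client() as client:
    resp = client.put(url, json={"fields": {"boost": {"assign": 1.5}}})
    resp.raise_for_status()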