Reduce errors in workers (#3962)

Author: pablonyx
Date: 2025-02-13 15:59:44 -08:00
Committed by: GitHub
Parent: 3260d793d1
Commit: eff433bdc5
5 changed files with 48 additions and 34 deletions

View File

@@ -478,14 +478,15 @@ def update_external_document_permissions_task(
     )
     doc_id = document_external_access.doc_id
     external_access = document_external_access.external_access
     try:
         with get_session_with_tenant(tenant_id) as db_session:
+            # Add the users to the DB if they don't exist
             batch_add_ext_perm_user_if_not_exists(
                 db_session=db_session,
                 emails=list(external_access.external_user_emails),
+                continue_on_error=True,
             )
-            # Then we upsert the document's external permissions in postgres
+            # Then upsert the document's external permissions
             created_new_doc = upsert_document_external_perms(
                 db_session=db_session,
                 doc_id=doc_id,
@@ -509,11 +510,11 @@ def update_external_document_permissions_task(
                 f"action=update_permissions "
                 f"elapsed={elapsed:.2f}"
             )
     except Exception:
         task_logger.exception(
             f"Exception in update_external_document_permissions_task: "
-            f"connector_id={connector_id} "
-            f"doc_id={doc_id}"
+            f"connector_id={connector_id} doc_id={doc_id}"
         )
         return False
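The net effect of this change is that a single bad email or document no longer aborts the whole permission sync: user insertion tolerates conflicts (continue_on_error=True), and any remaining failure is logged with its connector/doc context and turned into a False return instead of an unhandled exception. A minimal, self-contained sketch of that containment pattern (function and logger names here are illustrative, not the project's):

import logging

logger = logging.getLogger(__name__)

def process_document_permissions(doc_id: str) -> None:
    """Stand-in for the real DB work; may raise on bad input or transient errors."""
    ...

def update_permissions_task(doc_id: str, connector_id: int) -> bool:
    try:
        process_document_permissions(doc_id)
        return True
    except Exception:
        # Log enough context to debug later, but swallow the exception so one
        # bad document does not crash the worker or the rest of the batch.
        logger.exception(
            f"update_permissions_task failed: connector_id={connector_id} doc_id={doc_id}"
        )
        return False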

View File

@@ -421,6 +421,7 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
     - Throughput (docs/min) (only if success)
     - Raw start/end times for each sync
     """
     one_hour_ago = get_db_current_time(db_session) - timedelta(hours=1)

     # Get all sync records that ended in the last hour
@@ -588,6 +589,10 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
             entity = db_session.scalar(
                 select(UserGroup).where(UserGroup.id == sync_record.entity_id)
             )
+        else:
+            # Only user groups and document set sync records have
+            # an associated entity we can use for latency metrics
+            continue

         if entity is None:
             task_logger.error(
@@ -778,7 +783,7 @@ def cloud_check_alembic() -> bool | None:
                 tenant_to_revision[tenant_id] = result_scalar
         except Exception:
-            task_logger.warning(f"Tenant {tenant_id} has no revision!")
+            task_logger.error(f"Tenant {tenant_id} has no revision!")
             tenant_to_revision[tenant_id] = ALEMBIC_NULL_REVISION

     # get the total count of each revision
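The new else/continue branch means sync records whose type has no backing entity (anything other than document sets or user groups) are skipped instead of falling through to the "entity is None" error below. A rough standalone sketch of that skip-instead-of-error shape; the sync-type strings here are assumptions, not the project's actual enum:

from dataclasses import dataclass

@dataclass
class SyncRecord:
    id: int
    sync_type: str  # e.g. "document_set", "user_group", "pruning"
    entity_id: int

def entities_for_latency_metrics(
    records: list[SyncRecord],
    document_sets: dict[int, object],
    user_groups: dict[int, object],
) -> list[object]:
    entities: list[object] = []
    for record in records:
        if record.sync_type == "document_set":
            entity = document_sets.get(record.entity_id)
        elif record.sync_type == "user_group":
            entity = user_groups.get(record.entity_id)
        else:
            # Other sync types have no associated entity, so skip them rather
            # than logging a spurious "entity is None" error for each one.
            continue
        if entity is None:
            # Still worth flagging: the record points at an entity that should exist.
            continue
        entities.append(entity)
    return entities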

View File

@@ -39,19 +39,6 @@ def get_message_link(
     return permalink


-def _make_slack_api_call_logged(
-    call: Callable[..., SlackResponse],
-) -> Callable[..., SlackResponse]:
-    @wraps(call)
-    def logged_call(**kwargs: Any) -> SlackResponse:
-        logger.debug(f"Making call to Slack API '{call.__name__}' with args '{kwargs}'")
-        result = call(**kwargs)
-        logger.debug(f"Call to Slack API '{call.__name__}' returned '{result}'")
-        return result
-
-    return logged_call
-
-
 def _make_slack_api_call_paginated(
     call: Callable[..., SlackResponse],
 ) -> Callable[..., Generator[dict[str, Any], None, None]]:
@@ -127,18 +114,14 @@ def make_slack_api_rate_limited(
 def make_slack_api_call_w_retries(
     call: Callable[..., SlackResponse], **kwargs: Any
 ) -> SlackResponse:
-    return basic_retry_wrapper(
-        make_slack_api_rate_limited(_make_slack_api_call_logged(call))
-    )(**kwargs)
+    return basic_retry_wrapper(make_slack_api_rate_limited(call))(**kwargs)


 def make_paginated_slack_api_call_w_retries(
     call: Callable[..., SlackResponse], **kwargs: Any
 ) -> Generator[dict[str, Any], None, None]:
     return _make_slack_api_call_paginated(
-        basic_retry_wrapper(
-            make_slack_api_rate_limited(_make_slack_api_call_logged(call))
-        )
+        basic_retry_wrapper(make_slack_api_rate_limited(call))
     )(**kwargs)
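With the debug-logging wrapper removed, the public helpers simply compose rate limiting and retries around a raw slack_sdk method. A hypothetical call site, assuming a configured slack_sdk WebClient, a module-level logger, and that the two wrappers above are in scope:

from slack_sdk import WebClient

def fetch_channel_messages(client: WebClient, channel_id: str) -> list[dict]:
    # One-shot call wrapped with rate limiting + retries.
    info = make_slack_api_call_w_retries(
        client.conversations_info,
        channel=channel_id,
    )
    logger.debug(f"Fetching history for channel '{info['channel'].get('name')}'")

    # Paginated call; each yielded page is a plain response dict.
    messages: list[dict] = []
    for page in make_paginated_slack_api_call_w_retries(
        client.conversations_history,
        channel=channel_id,
    ):
        messages.extend(page.get("messages", []))
    return messages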

View File

@@ -6,6 +6,7 @@ from fastapi import HTTPException
 from fastapi_users.password import PasswordHelper
 from sqlalchemy import func
 from sqlalchemy import select
+from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm import Session
 from sqlalchemy.sql import expression
 from sqlalchemy.sql.elements import ColumnElement
@@ -274,7 +275,7 @@ def _generate_ext_permissioned_user(email: str) -> User:
 def batch_add_ext_perm_user_if_not_exists(
-    db_session: Session, emails: list[str]
+    db_session: Session, emails: list[str], continue_on_error: bool = False
 ) -> list[User]:
     lower_emails = [email.lower() for email in emails]
     found_users, missing_lower_emails = _get_users_by_emails(db_session, lower_emails)
@@ -283,10 +284,23 @@ def batch_add_ext_perm_user_if_not_exists(
     for email in missing_lower_emails:
         new_users.append(_generate_ext_permissioned_user(email=email))

-    db_session.add_all(new_users)
-    db_session.commit()
-
-    return found_users + new_users
+    try:
+        db_session.add_all(new_users)
+        db_session.commit()
+    except IntegrityError:
+        db_session.rollback()
+        if not continue_on_error:
+            raise
+        for user in new_users:
+            try:
+                db_session.add(user)
+                db_session.commit()
+            except IntegrityError:
+                db_session.rollback()
+                continue
+    # Fetch all users again to ensure we have the most up-to-date list
+    all_users, _ = _get_users_by_emails(db_session, lower_emails)
+    return all_users


 def delete_user_from_db(
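The rewritten helper turns a race on the users table (two workers inserting the same external-permission user at once) into a recoverable event: the bulk insert is attempted first, and only on an IntegrityError with continue_on_error=True does it fall back to committing one row at a time before re-reading the final state. A generic sketch of the same pattern against a toy SQLAlchemy model (not the project's User model):

from sqlalchemy import Column, Integer, String, select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class Person(Base):
    __tablename__ = "person"
    id = Column(Integer, primary_key=True)
    email = Column(String, unique=True, nullable=False)

def add_people(session: Session, emails: list[str], continue_on_error: bool = False) -> list[Person]:
    try:
        # Fast path: one INSERT batch, one commit.
        session.add_all([Person(email=e) for e in emails])
        session.commit()
    except IntegrityError:
        session.rollback()
        if not continue_on_error:
            raise
        # Slow path: commit row by row so one duplicate does not discard
        # the whole batch (e.g. a concurrent worker already inserted it).
        for e in emails:
            try:
                session.add(Person(email=e))
                session.commit()
            except IntegrityError:
                session.rollback()
    # Re-read so callers always get the persisted rows, whichever path ran.
    return list(session.scalars(select(Person).where(Person.email.in_(emails))))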

View File

@@ -17,6 +17,7 @@ from uuid import UUID
 import httpx  # type: ignore
 import requests  # type: ignore
+from retry import retry

 from onyx.configs.chat_configs import DOC_TIME_DECAY
 from onyx.configs.chat_configs import NUM_RETURNED_HITS
@@ -549,6 +550,11 @@ class VespaIndex(DocumentIndex):
             time.monotonic() - update_start,
         )

+    @retry(
+        tries=3,
+        delay=1,
+        backoff=2,
+    )
     def _update_single_chunk(
         self,
         doc_chunk_id: UUID,
@@ -559,6 +565,7 @@ class VespaIndex(DocumentIndex):
     ) -> None:
         """
         Update a single "chunk" (document) in Vespa using its chunk ID.
+        Retries if we encounter transient HTTPStatusError (e.g., overload).
         """

         update_dict: dict[str, dict] = {"fields": {}}
@@ -567,13 +574,11 @@ class VespaIndex(DocumentIndex):
             update_dict["fields"][BOOST] = {"assign": fields.boost}

         if fields.document_sets is not None:
-            # WeightedSet<string> needs a map { item: weight, ... }
             update_dict["fields"][DOCUMENT_SETS] = {
                 "assign": {document_set: 1 for document_set in fields.document_sets}
             }

         if fields.access is not None:
-            # Similar to above
             update_dict["fields"][ACCESS_CONTROL_LIST] = {
                 "assign": {acl_entry: 1 for acl_entry in fields.access.to_acl()}
             }
@@ -585,7 +590,10 @@ class VespaIndex(DocumentIndex):
             logger.error("Update request received but nothing to update.")
             return

-        vespa_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}?create=true"
+        vespa_url = (
+            f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}"
+            "?create=true"
+        )

         try:
             resp = http_client.put(
@@ -595,8 +603,11 @@ class VespaIndex(DocumentIndex):
             )
             resp.raise_for_status()
         except httpx.HTTPStatusError as e:
-            error_message = f"Failed to update doc chunk {doc_chunk_id} (doc_id={doc_id}). Details: {e.response.text}"
-            logger.error(error_message)
+            logger.error(
+                f"Failed to update doc chunk {doc_chunk_id} (doc_id={doc_id}). "
+                f"Details: {e.response.text}"
+            )
+            # Re-raise so the @retry decorator will catch and retry
             raise

     def update_single(
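The new decorator comes from the PyPI retry package: with tries=3, delay=1, backoff=2, a failing call is attempted up to three times, sleeping roughly 1s and then 2s between attempts, and the final exception propagates if every attempt fails, which is why the except block re-raises after logging. A small sketch of the same behavior, assuming an httpx.Client pointed at some document API:

import httpx
from retry import retry

@retry(tries=3, delay=1, backoff=2)
def put_with_retry(client: httpx.Client, url: str, payload: dict) -> None:
    resp = client.put(url, json=payload)
    # raise_for_status() turns 429/5xx responses into HTTPStatusError,
    # which the decorator catches (it retries on any Exception by default).
    resp.raise_for_status()

# Example usage (hypothetical endpoint):
# with httpx.Client() as client:
#     put_with_retry(client, "http://localhost:8080/document/v1/example", {"fields": {}})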