mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-08 14:10:30 +02:00
196 lines
7.8 KiB
Python
196 lines
7.8 KiB
Python
from http import HTTPStatus
|
|
|
|
import httpx
|
|
from celery import shared_task
|
|
from celery import Task
|
|
from celery.exceptions import SoftTimeLimitExceeded
|
|
from tenacity import RetryError
|
|
|
|
from onyx.access.access import get_access_for_document
|
|
from onyx.background.celery.apps.app_base import task_logger
|
|
from onyx.background.celery.tasks.shared.RetryDocumentIndex import RetryDocumentIndex
|
|
from onyx.configs.constants import OnyxCeleryTask
|
|
from onyx.db.document import delete_document_by_connector_credential_pair__no_commit
|
|
from onyx.db.document import delete_documents_complete__no_commit
|
|
from onyx.db.document import get_document
|
|
from onyx.db.document import get_document_connector_count
|
|
from onyx.db.document import mark_document_as_modified
|
|
from onyx.db.document import mark_document_as_synced
|
|
from onyx.db.document_set import fetch_document_sets_for_document
|
|
from onyx.db.engine import get_session_with_tenant
|
|
from onyx.document_index.document_index_utils import get_both_index_names
|
|
from onyx.document_index.factory import get_default_document_index
|
|
from onyx.document_index.interfaces import VespaDocumentFields
|
|
from onyx.server.documents.models import ConnectorCredentialPairIdentifier
|
|
|
|
# Passed as celery's ``max_retries`` for the cc pair cleanup task and compared
# against ``self.request.retries`` to detect the final attempt.
DOCUMENT_BY_CC_PAIR_CLEANUP_MAX_RETRIES = 3

# Soft limit, in seconds, for lightweight tasks:
# 5 seconds more than RetryDocumentIndex STOP_AFTER+MAX_WAIT
LIGHT_SOFT_TIME_LIMIT = 105

# Hard limit: 15 seconds beyond the soft limit.
LIGHT_TIME_LIMIT = LIGHT_SOFT_TIME_LIMIT + 15
|
|
|
|
|
|
@shared_task(
    name=OnyxCeleryTask.DOCUMENT_BY_CC_PAIR_CLEANUP_TASK,
    soft_time_limit=LIGHT_SOFT_TIME_LIMIT,
    time_limit=LIGHT_TIME_LIMIT,
    max_retries=DOCUMENT_BY_CC_PAIR_CLEANUP_MAX_RETRIES,
    bind=True,
)
def document_by_cc_pair_cleanup_task(
    self: Task,
    document_id: str,
    connector_id: int,
    credential_id: int,
    tenant_id: str | None,
) -> bool:
    """A lightweight subtask used to clean up document to cc pair relationships.
    Created by connection deletion and connector pruning parent tasks.

    Background -- to delete a connector / credential pair:
    (1) find all documents associated with connector / credential pair where
        this is the only connector / credential pair that has indexed it
    (2) delete all documents from document stores
    (3) delete all entries from postgres
    (4) find all documents associated with connector / credential pair where there
        are multiple connector / credential pairs that have indexed it
    (5) update document store entries to remove access associated with the
        connector / credential pair from the access list
    (6) delete all relevant entries from postgres

    This task handles a single document, choosing its action from the number
    of cc pairs that still reference the document:
      * exactly 1  -> "delete": remove the document from the index and the db
      * more than 1 -> "update": refresh the indexed fields, then drop only this
        cc pair's relationship row and mark the document as synced
      * otherwise  -> "skip": nothing to do

    Args:
        self: The bound celery task (``bind=True``); used for retry bookkeeping.
        document_id: ID of the document to clean up.
        connector_id: Connector half of the cc pair being removed.
        credential_id: Credential half of the cc pair being removed.
        tenant_id: Tenant whose database session should be used.

    Returns:
        True when the cleanup completed. False when the document row could not
        be loaded in the "update" path, the soft time limit was hit, the index
        answered with a non-retryable 400, or the final retry failed (in which
        case the document is marked as modified for out of band reconciliation).

    Raises:
        Whatever ``self.retry`` raises to signal a retry while attempts remain.
    """
    task_logger.debug(f"Task start: tenant={tenant_id} doc={document_id}")

    try:
        with get_session_with_tenant(tenant_id) as db_session:
            action = "skip"
            chunks_affected = 0

            curr_ind_name, sec_ind_name = get_both_index_names(db_session)
            doc_index = get_default_document_index(
                primary_index_name=curr_ind_name, secondary_index_name=sec_ind_name
            )

            retry_index = RetryDocumentIndex(doc_index)

            count = get_document_connector_count(db_session, document_id)
            if count == 1:
                # count == 1 means this is the only remaining cc_pair reference to the doc
                # delete it from vespa and the db
                action = "delete"

                chunks_affected = retry_index.delete_single(document_id)
                delete_documents_complete__no_commit(
                    db_session=db_session,
                    document_ids=[document_id],
                )
            elif count > 1:
                # count > 1 means the document still has cc_pair references,
                # so just resync to Vespa and drop this cc_pair's relationship
                action = "update"

                doc = get_document(document_id, db_session)
                if not doc:
                    return False

                # the below functions do not include cc_pairs being deleted.
                # i.e. they will correctly omit access for the current cc_pair
                doc_access = get_access_for_document(
                    document_id=document_id, db_session=db_session
                )

                doc_sets = fetch_document_sets_for_document(document_id, db_session)
                update_doc_sets: set[str] = set(doc_sets)

                fields = VespaDocumentFields(
                    document_sets=update_doc_sets,
                    access=doc_access,
                    boost=doc.boost,
                    hidden=doc.hidden,
                )

                # update Vespa. OK if doc doesn't exist. Raises exception otherwise.
                chunks_affected = retry_index.update_single(document_id, fields=fields)

                delete_document_by_connector_credential_pair__no_commit(
                    db_session=db_session,
                    document_id=document_id,
                    connector_credential_pair_identifier=ConnectorCredentialPairIdentifier(
                        connector_id=connector_id,
                        credential_id=credential_id,
                    ),
                )

                mark_document_as_synced(document_id, db_session)
            # any other count: nothing references the doc, action stays "skip"

            # the *__no_commit helpers above rely on this commit
            db_session.commit()

            task_logger.info(
                f"tenant={tenant_id} "
                f"doc={document_id} "
                f"action={action} "
                f"refcount={count} "
                f"chunks={chunks_affected}"
            )
    except SoftTimeLimitExceeded:
        task_logger.info(
            f"SoftTimeLimitExceeded exception. tenant={tenant_id} doc={document_id}"
        )
        return False
    except Exception as ex:
        # Default to the caught exception so ``e`` is always bound; previously a
        # RetryError whose last attempt carried no Exception left ``e`` undefined
        # and the handler itself crashed with UnboundLocalError.
        e: Exception = ex
        if isinstance(ex, RetryError):
            task_logger.warning(
                f"Tenacity retry failed: num_attempts={ex.last_attempt.attempt_number}"
            )

            # only set the inner exception if it is of type Exception
            e_temp = ex.last_attempt.exception()
            if isinstance(e_temp, Exception):
                e = e_temp

        if isinstance(e, httpx.HTTPStatusError):
            if e.response.status_code == HTTPStatus.BAD_REQUEST:
                # a 400 will not succeed on retry, so give up immediately
                task_logger.exception(
                    f"Non-retryable HTTPStatusError: "
                    f"tenant={tenant_id} "
                    f"doc={document_id} "
                    f"status={e.response.status_code}"
                )
                return False

        task_logger.exception(
            f"Unexpected exception: tenant={tenant_id} doc={document_id}"
        )

        if self.request.retries < DOCUMENT_BY_CC_PAIR_CLEANUP_MAX_RETRIES:
            # Still retrying. Exponential backoff from 2^4 to 2^6 ... i.e. 16, 32, 64
            countdown = 2 ** (self.request.retries + 4)
            self.retry(exc=e, countdown=countdown)
        else:
            # This is the last attempt! mark the document as dirty in the db so that it
            # eventually gets fixed out of band via stale document reconciliation
            task_logger.warning(
                f"Max celery task retries reached. Marking doc as dirty for reconciliation: "
                f"tenant={tenant_id} doc={document_id}"
            )
            with get_session_with_tenant(tenant_id) as db_session:
                # delete the cc pair relationship now and let reconciliation clean it up
                # in vespa
                # NOTE(review): no explicit commit here; presumably
                # mark_document_as_modified commits the session -- confirm.
                delete_document_by_connector_credential_pair__no_commit(
                    db_session=db_session,
                    document_id=document_id,
                    connector_credential_pair_identifier=ConnectorCredentialPairIdentifier(
                        connector_id=connector_id,
                        credential_id=credential_id,
                    ),
                )
                mark_document_as_modified(document_id, db_session)
        return False

    return True
|