From 55e4465782f034f63a1adf97591913ca0249e9ea Mon Sep 17 00:00:00 2001 From: Evan Lohn Date: Fri, 2 May 2025 10:22:59 -0700 Subject: [PATCH] orphan tag cleanup optimization (#4651) * move orphan tag cleanup to final cleanup section of associated parent tasks * naming --- .../onyx/background/celery/tasks/connector_deletion/tasks.py | 4 ++++ backend/onyx/background/celery/tasks/pruning/tasks.py | 3 +++ backend/onyx/db/tag.py | 2 ++ backend/scripts/force_delete_connector_by_id.py | 2 ++ backend/scripts/orphan_doc_cleanup_script.py | 2 ++ 5 files changed, 13 insertions(+) diff --git a/backend/onyx/background/celery/tasks/connector_deletion/tasks.py b/backend/onyx/background/celery/tasks/connector_deletion/tasks.py index d86db37ad67a..9526b32709fa 100644 --- a/backend/onyx/background/celery/tasks/connector_deletion/tasks.py +++ b/backend/onyx/background/celery/tasks/connector_deletion/tasks.py @@ -44,6 +44,7 @@ from onyx.db.search_settings import get_all_search_settings from onyx.db.sync_record import cleanup_sync_records from onyx.db.sync_record import insert_sync_record from onyx.db.sync_record import update_sync_record_status +from onyx.db.tag import delete_orphan_tags__no_commit from onyx.redis.redis_connector import RedisConnector from onyx.redis.redis_connector_delete import RedisConnectorDelete from onyx.redis.redis_connector_delete import RedisConnectorDeletePayload @@ -445,6 +446,9 @@ def monitor_connector_deletion_taskset( db_session=db_session, ) + # delete orphan tags + delete_orphan_tags__no_commit(db_session) + # Store IDs before potentially expiring cc_pair connector_id_to_delete = cc_pair.connector_id credential_id_to_delete = cc_pair.credential_id diff --git a/backend/onyx/background/celery/tasks/pruning/tasks.py b/backend/onyx/background/celery/tasks/pruning/tasks.py index 739eba3b0aea..13cb6fd67ffc 100644 --- a/backend/onyx/background/celery/tasks/pruning/tasks.py +++ b/backend/onyx/background/celery/tasks/pruning/tasks.py @@ -49,6 +49,7 @@ from 
onyx.db.models import ConnectorCredentialPair from onyx.db.search_settings import get_current_search_settings from onyx.db.sync_record import insert_sync_record from onyx.db.sync_record import update_sync_record_status +from onyx.db.tag import delete_orphan_tags__no_commit from onyx.redis.redis_connector import RedisConnector from onyx.redis.redis_connector_prune import RedisConnectorPrune from onyx.redis.redis_connector_prune import RedisConnectorPrunePayload @@ -561,6 +562,8 @@ def monitor_ccpair_pruning_taskset( num_docs_synced=initial, ) + delete_orphan_tags__no_commit(db_session) + redis_connector.prune.taskset_clear() redis_connector.prune.generator_clear() redis_connector.prune.set_fence(None) diff --git a/backend/onyx/db/tag.py b/backend/onyx/db/tag.py index 0c6ebb42df92..643b79642104 100644 --- a/backend/onyx/db/tag.py +++ b/backend/onyx/db/tag.py @@ -148,6 +148,8 @@ def delete_document_tags_for_documents__no_commit( stmt = delete(Document__Tag).where(Document__Tag.document_id.in_(document_ids)) db_session.execute(stmt) + +def delete_orphan_tags__no_commit(db_session: Session) -> None: orphan_tags_query = select(Tag.id).where( ~db_session.query(Document__Tag.tag_id) .filter(Document__Tag.tag_id == Tag.id) diff --git a/backend/scripts/force_delete_connector_by_id.py b/backend/scripts/force_delete_connector_by_id.py index 8f3e120a8740..f8cc22ba0a4c 100755 --- a/backend/scripts/force_delete_connector_by_id.py +++ b/backend/scripts/force_delete_connector_by_id.py @@ -8,6 +8,7 @@ from sqlalchemy.orm import Session from onyx.db.document import delete_documents_complete__no_commit from onyx.db.enums import ConnectorCredentialPairStatus from onyx.db.search_settings import get_active_search_settings +from onyx.db.tag import delete_orphan_tags__no_commit from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA # Modify sys.path @@ -83,6 +84,7 @@ def _unsafe_deletion( db_session=db_session, document_ids=[document.id for document in documents], ) + 
delete_orphan_tags__no_commit(db_session=db_session) num_docs_deleted += len(documents) diff --git a/backend/scripts/orphan_doc_cleanup_script.py b/backend/scripts/orphan_doc_cleanup_script.py index 413039936611..b4f0a37cdc88 100644 --- a/backend/scripts/orphan_doc_cleanup_script.py +++ b/backend/scripts/orphan_doc_cleanup_script.py @@ -16,6 +16,7 @@ from onyx.context.search.models import IndexFilters # noqa: E402 from onyx.document_index.interfaces import VespaChunkRequest # noqa: E402 from onyx.db.engine import get_session_context_manager # noqa: E402 from onyx.db.document import delete_documents_complete__no_commit # noqa: E402 +from onyx.db.tag import delete_orphan_tags__no_commit # noqa: E402 from onyx.db.search_settings import get_current_search_settings # noqa: E402 from onyx.document_index.vespa.index import VespaIndex # noqa: E402 from onyx.db.document import get_document # noqa: E402 @@ -128,6 +129,7 @@ def main() -> None: delete_documents_complete__no_commit( db_session, successfully_vespa_deleted_doc_ids ) + delete_orphan_tags__no_commit(db_session) db_session.commit() except Exception as e: print(f"Error deleting documents from Postgres: {e}")