Related permission docs to cc_pair to prevent orphan docs (#3336)

* Related permission docs to cc_pair to prevent orphan docs * added script * group sync deduping * logging
2025-07-08 21:50:12 +02:00 · 2024-12-04 13:00:54 -08:00
parent 993acec5e9
commit ef9942b751
9 changed files with 152 additions and 9 deletions
--- a/backend/scripts/orphan_doc_cleanup_script.py
+++ b/backend/scripts/orphan_doc_cleanup_script.py
@ -0,0 +1,79 @@
+import os
+import sys
+
+from sqlalchemy import text
+from sqlalchemy.orm import Session
+
+# makes it so `PYTHONPATH=.` is not required when running this script
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+
+from danswer.db.engine import get_session_context_manager  # noqa: E402
+from danswer.db.document import delete_documents_complete__no_commit  # noqa: E402
+from danswer.db.search_settings import get_current_search_settings  # noqa: E402
+from danswer.document_index.vespa.index import VespaIndex  # noqa: E402
+from danswer.background.celery.tasks.shared.RetryDocumentIndex import (  # noqa: E402
+    RetryDocumentIndex,
+)
+
+
+def _get_orphaned_document_ids(db_session: Session) -> list[str]:
+    """Get document IDs that don't have any entries in document_by_connector_credential_pair"""
+    query = text(
+        """
+        SELECT d.id
+        FROM document d
+        LEFT JOIN document_by_connector_credential_pair dbcc ON d.id = dbcc.id
+        WHERE dbcc.id IS NULL
+    """
+    )
+    orphaned_ids = [doc_id[0] for doc_id in db_session.execute(query)]
+    print(f"Found {len(orphaned_ids)} orphaned documents")
+    return orphaned_ids
+
+
+def main() -> None:
+    with get_session_context_manager() as db_session:
+        # Get orphaned document IDs
+        orphaned_ids = _get_orphaned_document_ids(db_session)
+        if not orphaned_ids:
+            print("No orphaned documents found")
+            return
+
+        # Setup Vespa index
+        search_settings = get_current_search_settings(db_session)
+        index_name = search_settings.index_name
+        vespa_index = VespaIndex(index_name=index_name, secondary_index_name=None)
+        retry_index = RetryDocumentIndex(vespa_index)
+
+        # Delete chunks from Vespa first
+        print("Deleting orphaned document chunks from Vespa")
+        successfully_vespa_deleted_doc_ids = []
+        for doc_id in orphaned_ids:
+            try:
+                chunks_deleted = retry_index.delete_single(doc_id)
+                successfully_vespa_deleted_doc_ids.append(doc_id)
+                if chunks_deleted > 0:
+                    print(f"Deleted {chunks_deleted} chunks for document {doc_id}")
+            except Exception as e:
+                print(
+                    f"Error deleting document {doc_id} in Vespa and will not delete from Postgres: {e}"
+                )
+
+        # Delete documents from Postgres
+        print("Deleting orphaned documents from Postgres")
+        try:
+            delete_documents_complete__no_commit(
+                db_session, successfully_vespa_deleted_doc_ids
+            )
+            db_session.commit()
+        except Exception as e:
+            print(f"Error deleting documents from Postgres: {e}")
+
+    print(
+        f"Successfully cleaned up {len(successfully_vespa_deleted_doc_ids)} orphaned documents"
+    )
+
+
+if __name__ == "__main__":
+    main()