mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-07-08 21:50:12 +02:00
Related permission docs to cc_pair to prevent orphan docs (#3336)
* Related permission docs to cc_pair to prevent orphan docs
* added script
* group sync deduping
* logging
This commit is contained in:
79
backend/scripts/orphan_doc_cleanup_script.py
Normal file
79
backend/scripts/orphan_doc_cleanup_script.py
Normal file
@ -0,0 +1,79 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
# makes it so `PYTHONPATH=.` is not required when running this script
|
||||
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.append(parent_dir)
|
||||
|
||||
from danswer.db.engine import get_session_context_manager # noqa: E402
|
||||
from danswer.db.document import delete_documents_complete__no_commit # noqa: E402
|
||||
from danswer.db.search_settings import get_current_search_settings # noqa: E402
|
||||
from danswer.document_index.vespa.index import VespaIndex # noqa: E402
|
||||
from danswer.background.celery.tasks.shared.RetryDocumentIndex import ( # noqa: E402
|
||||
RetryDocumentIndex,
|
||||
)
|
||||
|
||||
|
||||
def _get_orphaned_document_ids(db_session: Session) -> list[str]:
    """Return the IDs of documents with no connector/credential-pair row.

    A document is treated as orphaned when the LEFT JOIN against
    ``document_by_connector_credential_pair`` produces no matching row for
    its ID. The number of IDs found is printed before returning.

    Args:
        db_session: Session used to execute the raw SQL query.

    Returns:
        The IDs selected by the query, in the order the database returns them.
    """
    orphan_query = text(
        """
        SELECT d.id
        FROM document d
        LEFT JOIN document_by_connector_credential_pair dbcc ON d.id = dbcc.id
        WHERE dbcc.id IS NULL
        """
    )
    result_rows = db_session.execute(orphan_query)
    # Only one column is selected, so each row unpacks to a single document ID.
    orphaned_ids = [document_id for (document_id,) in result_rows]
    print(f"Found {len(orphaned_ids)} orphaned documents")
    return orphaned_ids
|
||||
|
||||
|
||||
def main() -> None:
    """Remove orphaned documents from Vespa and then from Postgres.

    Steps:
      1. Collect the IDs of documents that have no row in
         ``document_by_connector_credential_pair``.
      2. Delete each document's chunks from the current Vespa index. A
         document whose Vespa deletion raises is reported and skipped.
      3. Delete from Postgres only the documents whose Vespa deletion
         succeeded, then commit.

    Errors are printed rather than raised. The final success message is
    printed only when the Postgres transaction was actually committed.
    """
    with get_session_context_manager() as db_session:
        # Get orphaned document IDs
        orphaned_ids = _get_orphaned_document_ids(db_session)
        if not orphaned_ids:
            print("No orphaned documents found")
            return

        # Setup Vespa index
        search_settings = get_current_search_settings(db_session)
        index_name = search_settings.index_name
        vespa_index = VespaIndex(index_name=index_name, secondary_index_name=None)
        retry_index = RetryDocumentIndex(vespa_index)

        # Delete chunks from Vespa first, so a Postgres row is never removed
        # while its Vespa deletion has failed.
        print("Deleting orphaned document chunks from Vespa")
        successfully_vespa_deleted_doc_ids: list[str] = []
        for doc_id in orphaned_ids:
            try:
                chunks_deleted = retry_index.delete_single(doc_id)
                successfully_vespa_deleted_doc_ids.append(doc_id)
                if chunks_deleted > 0:
                    print(f"Deleted {chunks_deleted} chunks for document {doc_id}")
            except Exception as e:
                # Best effort: report the failure and continue with the rest.
                print(
                    f"Error deleting document {doc_id} in Vespa and will not delete from Postgres: {e}"
                )

        # Delete documents from Postgres
        print("Deleting orphaned documents from Postgres")
        try:
            delete_documents_complete__no_commit(
                db_session, successfully_vespa_deleted_doc_ids
            )
            db_session.commit()
        except Exception as e:
            print(f"Error deleting documents from Postgres: {e}")
            # Discard the failed transaction and stop here so the success
            # message below is not printed for work that was never committed.
            db_session.rollback()
            return

        print(
            f"Successfully cleaned up {len(successfully_vespa_deleted_doc_ids)} orphaned documents"
        )
|
||||
|
||||
|
||||
# Run the cleanup only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
Reference in New Issue
Block a user