Speedup orphan doc cleanup script (#3596)

* Speedup orphan doc cleanup script

* Fix mypy
Author: Chris Weaver
Date: 2025-01-05 06:28:25 -08:00
Committed by: GitHub
Parent: 2fc58252f4
Commit: f895e5f7d0
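
In short, the speedup comes from two changes: orphaned document IDs are now fetched in batches of 100 (BATCH_SIZE) via a parameterized SQL LIMIT instead of all at once, and the per-document Vespa work runs on a thread pool of 100 workers instead of a sequential loop. A minimal sketch of the combined pattern; fetch_batch, delete_one, and remove_rows are hypothetical stand-ins for the SQL query, the Vespa calls, and the Postgres delete:

import concurrent.futures
from collections.abc import Callable

BATCH_SIZE = 100


def cleanup_in_batches(
    fetch_batch: Callable[[int], list[str]],   # hypothetical: SELECT ... LIMIT :limit
    delete_one: Callable[[str], str | None],   # hypothetical: Vespa check + delete
    remove_rows: Callable[[list[str]], None],  # hypothetical: Postgres delete + commit
) -> int:
    total = 0
    while True:
        ids = fetch_batch(BATCH_SIZE)  # small batches keep each query cheap
        if not ids:
            return total
        deleted: list[str] = []
        # The Vespa calls are I/O-bound HTTP round trips, so a large thread
        # pool overlaps the network latency even under the GIL
        with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
            futures = [executor.submit(delete_one, doc_id) for doc_id in ids]
            for future in concurrent.futures.as_completed(futures):
                doc_id = future.result()
                if doc_id is not None:
                    deleted.append(doc_id)
        remove_rows(deleted)  # shrink the orphan set so the loop terminates
        total += len(deleted)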


@@ -1,3 +1,4 @@
+import concurrent.futures
 import os
 import sys
 
@@ -8,16 +9,17 @@ from sqlalchemy.orm import Session
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(parent_dir)
+from onyx.context.search.models import IndexFilters  # noqa: E402
+from onyx.document_index.interfaces import VespaChunkRequest  # noqa: E402
 from onyx.db.engine import get_session_context_manager  # noqa: E402
 from onyx.db.document import delete_documents_complete__no_commit  # noqa: E402
 from onyx.db.search_settings import get_current_search_settings  # noqa: E402
 from onyx.document_index.vespa.index import VespaIndex  # noqa: E402
-from onyx.background.celery.tasks.shared.RetryDocumentIndex import (  # noqa: E402
-    RetryDocumentIndex,
-)
+
+BATCH_SIZE = 100
 
 
-def _get_orphaned_document_ids(db_session: Session) -> list[str]:
+def _get_orphaned_document_ids(db_session: Session, limit: int) -> list[str]:
     """Get document IDs that don't have any entries in document_by_connector_credential_pair"""
     query = text(
         """
         SELECT d.id
@@ -25,40 +27,83 @@ def _get_orphaned_document_ids(db_session: Session) -> list[str]:
         FROM document d
         LEFT JOIN document_by_connector_credential_pair dbcc ON d.id = dbcc.id
         WHERE dbcc.id IS NULL
+        LIMIT :limit
         """
     )
-    orphaned_ids = [doc_id[0] for doc_id in db_session.execute(query)]
-    print(f"Found {len(orphaned_ids)} orphaned documents")
+    orphaned_ids = [doc_id[0] for doc_id in db_session.execute(query, {"limit": limit})]
+    print(f"Found {len(orphaned_ids)} orphaned documents in this batch")
     return orphaned_ids
 
 
 def main() -> None:
     with get_session_context_manager() as db_session:
-        # Get orphaned document IDs
-        orphaned_ids = _get_orphaned_document_ids(db_session)
-        if not orphaned_ids:
-            print("No orphaned documents found")
-            return
+        total_processed = 0
+        while True:
+            # Get orphaned document IDs in batches
+            orphaned_ids = _get_orphaned_document_ids(db_session, BATCH_SIZE)
+            if not orphaned_ids:
+                if total_processed == 0:
+                    print("No orphaned documents found")
+                else:
+                    print(
+                        f"Finished processing all batches. Total documents "
+                        f"processed: {total_processed}"
+                    )
+                return
 
-        # Setup Vespa index
-        search_settings = get_current_search_settings(db_session)
-        index_name = search_settings.index_name
-        vespa_index = VespaIndex(index_name=index_name, secondary_index_name=None)
-        retry_index = RetryDocumentIndex(vespa_index)
+            # Setup Vespa index
+            search_settings = get_current_search_settings(db_session)
+            index_name = search_settings.index_name
+            vespa_index = VespaIndex(index_name=index_name, secondary_index_name=None)
 
-        # Delete chunks from Vespa first
-        print("Deleting orphaned document chunks from Vespa")
-        successfully_vespa_deleted_doc_ids = []
-        for doc_id in orphaned_ids:
-            try:
-                chunks_deleted = retry_index.delete_single(doc_id)
-                successfully_vespa_deleted_doc_ids.append(doc_id)
-                if chunks_deleted > 0:
-                    print(f"Deleted {chunks_deleted} chunks for document {doc_id}")
-            except Exception as e:
-                print(
-                    f"Error deleting document {doc_id} in Vespa and will not delete from Postgres: {e}"
-                )
+            # Delete chunks from Vespa first
+            print("Deleting orphaned document chunks from Vespa")
+            successfully_vespa_deleted_doc_ids: list[str] = []
+            # Process documents in parallel using ThreadPoolExecutor
+            with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
+
+                def process_doc(doc_id: str) -> str | None:
+                    # Check if document exists in Vespa first
+                    try:
+                        chunks = vespa_index.id_based_retrieval(
+                            chunk_requests=[
+                                VespaChunkRequest(document_id=doc_id, max_chunk_ind=2)
+                            ],
+                            filters=IndexFilters(access_control_list=None),
+                            batch_retrieval=True,
+                        )
+                        if not chunks:
+                            print(f"Document {doc_id} not found in Vespa")
+                            return doc_id
+                    except Exception as e:
+                        print(
+                            f"Error checking if document {doc_id} exists in Vespa: {e}"
+                        )
+                        return None
+
+                    try:
+                        print(f"Deleting document {doc_id} in Vespa")
+                        chunks_deleted = vespa_index.delete_single(doc_id)
+                        if chunks_deleted > 0:
+                            print(
+                                f"Deleted {chunks_deleted} chunks for document {doc_id}"
+                            )
+                        return doc_id
+                    except Exception as e:
+                        print(
+                            f"Error deleting document {doc_id} in Vespa and "
+                            f"will not delete from Postgres: {e}"
+                        )
+                        return None
+
+                # Submit all tasks and gather results
+                futures = [
+                    executor.submit(process_doc, doc_id) for doc_id in orphaned_ids
+                ]
+                for future in concurrent.futures.as_completed(futures):
+                    doc_id = future.result()
+                    if doc_id:
+                        successfully_vespa_deleted_doc_ids.append(doc_id)
 
-        # Delete documents from Postgres
-        print("Deleting orphaned documents from Postgres")
+            # Delete documents from Postgres
+            print("Deleting orphaned documents from Postgres")
@@ -69,10 +114,14 @@ def main() -> None:
-            db_session.commit()
-        except Exception as e:
-            print(f"Error deleting documents from Postgres: {e}")
+                db_session.commit()
+            except Exception as e:
+                print(f"Error deleting documents from Postgres: {e}")
+                break
 
-        print(
-            f"Successfully cleaned up {len(successfully_vespa_deleted_doc_ids)} orphaned documents"
-        )
+            total_processed += len(successfully_vespa_deleted_doc_ids)
+            print(
+                f"Successfully cleaned up {len(successfully_vespa_deleted_doc_ids)}"
+                f" orphaned documents in this batch"
+            )
+            print(f"Total documents processed so far: {total_processed}")
 
 
 if __name__ == "__main__":
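
A note on the concurrency choice, inferring intent from the diff: the Vespa existence checks and deletes are HTTP round trips, so the work is I/O-bound and a ThreadPoolExecutor with max_workers=100 spends its time overlapping network latency rather than competing for the GIL. The pre-check via id_based_retrieval with max_chunk_ind=2 appears intended to keep the probe cheap by fetching at most the first few chunks, and documents missing from Vespa are treated as handled so their Postgres rows can still be cleaned up. Documents whose Vespa delete fails are excluded from the Postgres delete and will be picked up again by a later batch or run.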