mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-30 09:40:50 +02:00
Speedup orphan doc cleanup script (#3596)
* Speedup orphan doc cleanup script * Fix mypy
This commit is contained in:
@ -1,3 +1,4 @@
|
|||||||
|
import concurrent.futures
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
@ -8,16 +9,17 @@ from sqlalchemy.orm import Session
|
|||||||
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
sys.path.append(parent_dir)
|
sys.path.append(parent_dir)
|
||||||
|
|
||||||
|
from onyx.context.search.models import IndexFilters # noqa: E402
|
||||||
|
from onyx.document_index.interfaces import VespaChunkRequest # noqa: E402
|
||||||
from onyx.db.engine import get_session_context_manager # noqa: E402
|
from onyx.db.engine import get_session_context_manager # noqa: E402
|
||||||
from onyx.db.document import delete_documents_complete__no_commit # noqa: E402
|
from onyx.db.document import delete_documents_complete__no_commit # noqa: E402
|
||||||
from onyx.db.search_settings import get_current_search_settings # noqa: E402
|
from onyx.db.search_settings import get_current_search_settings # noqa: E402
|
||||||
from onyx.document_index.vespa.index import VespaIndex # noqa: E402
|
from onyx.document_index.vespa.index import VespaIndex # noqa: E402
|
||||||
from onyx.background.celery.tasks.shared.RetryDocumentIndex import ( # noqa: E402
|
|
||||||
RetryDocumentIndex,
|
BATCH_SIZE = 100
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _get_orphaned_document_ids(db_session: Session) -> list[str]:
|
def _get_orphaned_document_ids(db_session: Session, limit: int) -> list[str]:
|
||||||
"""Get document IDs that don't have any entries in document_by_connector_credential_pair"""
|
"""Get document IDs that don't have any entries in document_by_connector_credential_pair"""
|
||||||
query = text(
|
query = text(
|
||||||
"""
|
"""
|
||||||
@ -25,40 +27,83 @@ def _get_orphaned_document_ids(db_session: Session) -> list[str]:
|
|||||||
FROM document d
|
FROM document d
|
||||||
LEFT JOIN document_by_connector_credential_pair dbcc ON d.id = dbcc.id
|
LEFT JOIN document_by_connector_credential_pair dbcc ON d.id = dbcc.id
|
||||||
WHERE dbcc.id IS NULL
|
WHERE dbcc.id IS NULL
|
||||||
|
LIMIT :limit
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
orphaned_ids = [doc_id[0] for doc_id in db_session.execute(query)]
|
orphaned_ids = [doc_id[0] for doc_id in db_session.execute(query, {"limit": limit})]
|
||||||
print(f"Found {len(orphaned_ids)} orphaned documents")
|
print(f"Found {len(orphaned_ids)} orphaned documents in this batch")
|
||||||
return orphaned_ids
|
return orphaned_ids
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
with get_session_context_manager() as db_session:
|
with get_session_context_manager() as db_session:
|
||||||
# Get orphaned document IDs
|
total_processed = 0
|
||||||
orphaned_ids = _get_orphaned_document_ids(db_session)
|
while True:
|
||||||
|
# Get orphaned document IDs in batches
|
||||||
|
orphaned_ids = _get_orphaned_document_ids(db_session, BATCH_SIZE)
|
||||||
if not orphaned_ids:
|
if not orphaned_ids:
|
||||||
|
if total_processed == 0:
|
||||||
print("No orphaned documents found")
|
print("No orphaned documents found")
|
||||||
|
else:
|
||||||
|
print(
|
||||||
|
f"Finished processing all batches. Total documents "
|
||||||
|
f"processed: {total_processed}"
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Setup Vespa index
|
# Setup Vespa index
|
||||||
search_settings = get_current_search_settings(db_session)
|
search_settings = get_current_search_settings(db_session)
|
||||||
index_name = search_settings.index_name
|
index_name = search_settings.index_name
|
||||||
vespa_index = VespaIndex(index_name=index_name, secondary_index_name=None)
|
vespa_index = VespaIndex(index_name=index_name, secondary_index_name=None)
|
||||||
retry_index = RetryDocumentIndex(vespa_index)
|
|
||||||
|
|
||||||
# Delete chunks from Vespa first
|
# Delete chunks from Vespa first
|
||||||
print("Deleting orphaned document chunks from Vespa")
|
print("Deleting orphaned document chunks from Vespa")
|
||||||
successfully_vespa_deleted_doc_ids = []
|
successfully_vespa_deleted_doc_ids: list[str] = []
|
||||||
for doc_id in orphaned_ids:
|
# Process documents in parallel using ThreadPoolExecutor
|
||||||
|
with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
|
||||||
|
|
||||||
|
def process_doc(doc_id: str) -> str | None:
|
||||||
|
# Check if document exists in Vespa first
|
||||||
try:
|
try:
|
||||||
chunks_deleted = retry_index.delete_single(doc_id)
|
chunks = vespa_index.id_based_retrieval(
|
||||||
successfully_vespa_deleted_doc_ids.append(doc_id)
|
chunk_requests=[
|
||||||
if chunks_deleted > 0:
|
VespaChunkRequest(document_id=doc_id, max_chunk_ind=2)
|
||||||
print(f"Deleted {chunks_deleted} chunks for document {doc_id}")
|
],
|
||||||
|
filters=IndexFilters(access_control_list=None),
|
||||||
|
batch_retrieval=True,
|
||||||
|
)
|
||||||
|
if not chunks:
|
||||||
|
print(f"Document {doc_id} not found in Vespa")
|
||||||
|
return doc_id
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(
|
print(
|
||||||
f"Error deleting document {doc_id} in Vespa and will not delete from Postgres: {e}"
|
f"Error checking if document {doc_id} exists in Vespa: {e}"
|
||||||
)
|
)
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
print(f"Deleting document {doc_id} in Vespa")
|
||||||
|
chunks_deleted = vespa_index.delete_single(doc_id)
|
||||||
|
if chunks_deleted > 0:
|
||||||
|
print(
|
||||||
|
f"Deleted {chunks_deleted} chunks for document {doc_id}"
|
||||||
|
)
|
||||||
|
return doc_id
|
||||||
|
except Exception as e:
|
||||||
|
print(
|
||||||
|
f"Error deleting document {doc_id} in Vespa and "
|
||||||
|
f"will not delete from Postgres: {e}"
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Submit all tasks and gather results
|
||||||
|
futures = [
|
||||||
|
executor.submit(process_doc, doc_id) for doc_id in orphaned_ids
|
||||||
|
]
|
||||||
|
for future in concurrent.futures.as_completed(futures):
|
||||||
|
doc_id = future.result()
|
||||||
|
if doc_id:
|
||||||
|
successfully_vespa_deleted_doc_ids.append(doc_id)
|
||||||
|
|
||||||
# Delete documents from Postgres
|
# Delete documents from Postgres
|
||||||
print("Deleting orphaned documents from Postgres")
|
print("Deleting orphaned documents from Postgres")
|
||||||
@ -69,10 +114,14 @@ def main() -> None:
|
|||||||
db_session.commit()
|
db_session.commit()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error deleting documents from Postgres: {e}")
|
print(f"Error deleting documents from Postgres: {e}")
|
||||||
|
break
|
||||||
|
|
||||||
|
total_processed += len(successfully_vespa_deleted_doc_ids)
|
||||||
print(
|
print(
|
||||||
f"Successfully cleaned up {len(successfully_vespa_deleted_doc_ids)} orphaned documents"
|
f"Successfully cleaned up {len(successfully_vespa_deleted_doc_ids)}"
|
||||||
|
f" orphaned documents in this batch"
|
||||||
)
|
)
|
||||||
|
print(f"Total documents processed so far: {total_processed}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
Reference in New Issue
Block a user