Fully remove visit API (#3621)

* v1

* update indexing logic

* update updates

* nit

* clean up args

* update for clarity + best practices

* nit + logs

* fix

* minor clean up

* remove logs

* quick nit
This commit is contained in:
pablonyx
2025-01-08 13:49:01 -08:00
committed by GitHub
parent eac73a1bf1
commit d7bc32c0ec
15 changed files with 397 additions and 254 deletions

View File

@@ -5,6 +5,7 @@ import sys
from sqlalchemy import delete
from sqlalchemy.orm import Session
from onyx.db.document import delete_documents_complete__no_commit
from onyx.db.enums import ConnectorCredentialPairStatus
# Modify sys.path
@@ -38,7 +39,6 @@ from onyx.db.engine import get_session_context_manager
from onyx.document_index.factory import get_default_document_index
from onyx.file_store.file_store import get_default_file_store
from onyx.document_index.document_index_utils import get_both_index_names
from onyx.db.document import delete_documents_complete__no_commit
# pylint: enable=E402
# flake8: noqa: E402
@@ -71,13 +71,16 @@ def _unsafe_deletion(
if not documents:
break
document_ids = [document.id for document in documents]
for doc_id in document_ids:
document_index.delete_single(doc_id)
for document in documents:
document_index.delete_single(
doc_id=document.id,
tenant_id=None,
chunk_count=document.chunk_count,
)
delete_documents_complete__no_commit(
db_session=db_session,
document_ids=document_ids,
document_ids=[document.id for document in documents],
)
num_docs_deleted += len(documents)
@@ -216,6 +219,7 @@ if __name__ == "__main__":
parser.add_argument(
"connector_id", type=int, help="The ID of the connector to delete"
)
args = parser.parse_args()
with get_session_context_manager() as db_session:
_delete_connector(args.connector_id, db_session)

View File

@@ -15,6 +15,7 @@ from onyx.db.engine import get_session_context_manager  # noqa: E402
from onyx.db.document import delete_documents_complete__no_commit # noqa: E402
from onyx.db.search_settings import get_current_search_settings # noqa: E402
from onyx.document_index.vespa.index import VespaIndex # noqa: E402
from onyx.db.document import get_document # noqa: E402
BATCH_SIZE = 100
@@ -63,6 +64,9 @@ def main() -> None:
with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
def process_doc(doc_id: str) -> str | None:
document = get_document(doc_id, db_session)
if not document:
return None
# Check if document exists in Vespa first
try:
chunks = vespa_index.id_based_retrieval(
@@ -83,7 +87,9 @@ def main() -> None:
try:
print(f"Deleting document {doc_id} in Vespa")
chunks_deleted = vespa_index.delete_single(doc_id)
chunks_deleted = vespa_index.delete_single(
doc_id, tenant_id=None, chunk_count=document.chunk_count
)
if chunks_deleted > 0:
print(
f"Deleted {chunks_deleted} chunks for document {doc_id}"