Fully remove visit API (#3621)

* v1

* update indexing logic

* update updates

* nit

* clean up args

* update for clarity + best practices

* nit + logs

* fix

* minor clean up

* remove logs

* quick nit
This commit is contained in:
pablonyx
2025-01-08 13:49:01 -08:00
committed by GitHub
parent eac73a1bf1
commit d7bc32c0ec
15 changed files with 397 additions and 254 deletions

View File

@@ -5,6 +5,7 @@ import sys
from sqlalchemy import delete
from sqlalchemy.orm import Session
from onyx.db.document import delete_documents_complete__no_commit
from onyx.db.enums import ConnectorCredentialPairStatus
# Modify sys.path
@@ -38,7 +39,6 @@ from onyx.db.engine import get_session_context_manager
from onyx.document_index.factory import get_default_document_index
from onyx.file_store.file_store import get_default_file_store
from onyx.document_index.document_index_utils import get_both_index_names
from onyx.db.document import delete_documents_complete__no_commit
# pylint: enable=E402
# flake8: noqa: E402
@@ -71,13 +71,16 @@ def _unsafe_deletion(
if not documents:
break
document_ids = [document.id for document in documents]
for doc_id in document_ids:
document_index.delete_single(doc_id)
for document in documents:
document_index.delete_single(
doc_id=document.id,
tenant_id=None,
chunk_count=document.chunk_count,
)
delete_documents_complete__no_commit(
db_session=db_session,
document_ids=document_ids,
document_ids=[document.id for document in documents],
)
num_docs_deleted += len(documents)
@@ -216,6 +219,7 @@ if __name__ == "__main__":
parser.add_argument(
"connector_id", type=int, help="The ID of the connector to delete"
)
args = parser.parse_args()
with get_session_context_manager() as db_session:
_delete_connector(args.connector_id, db_session)

View File

@@ -15,6 +15,7 @@ from onyx.db.engine import get_session_context_manager  # noqa: E402
from onyx.db.document import delete_documents_complete__no_commit # noqa: E402
from onyx.db.search_settings import get_current_search_settings # noqa: E402
from onyx.document_index.vespa.index import VespaIndex # noqa: E402
from onyx.db.document import get_document # noqa: E402
BATCH_SIZE = 100
@@ -63,6 +64,9 @@ def main() -> None:
with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
def process_doc(doc_id: str) -> str | None:
document = get_document(doc_id, db_session)
if not document:
return None
# Check if document exists in Vespa first
try:
chunks = vespa_index.id_based_retrieval(
@@ -83,7 +87,9 @@ def main() -> None:
try:
print(f"Deleting document {doc_id} in Vespa")
chunks_deleted = vespa_index.delete_single(doc_id)
chunks_deleted = vespa_index.delete_single(
doc_id, tenant_id=None, chunk_count=document.chunk_count
)
if chunks_deleted > 0:
print(
f"Deleted {chunks_deleted} chunks for document {doc_id}"