hotfix for vespa delete performance

2025-09-26 20:08:38 +02:00 · 2024-10-03 09:43:02 -07:00
parent 3e511497d2
commit f9a5b227a1
3 changed files with 72 additions and 1 deletions
--- a/backend/danswer/background/connector_deletion.py
+++ b/backend/danswer/background/connector_deletion.py
@@ -148,7 +148,7 @@ def document_by_cc_pair_cleanup_task(
            if count == 1:
                # count == 1 means this is the only remaining cc_pair reference to the doc
                # delete it from vespa and the db
-                document_index.delete(doc_ids=[document_id])
+                document_index.delete_single(doc_id=document_id)
                delete_documents_complete__no_commit(
                    db_session=db_session,
                    document_ids=[document_id],
--- a/backend/danswer/document_index/interfaces.py
+++ b/backend/danswer/document_index/interfaces.py
@@ -156,6 +156,16 @@ class Deletable(abc.ABC):
    Class must implement the ability to delete documents by their unique document ids.
    """
    @abc.abstractmethod
    def delete_single(self, doc_id: str) -> None:
        """
        Hard delete one document, identified by its document id, from the
        document index.

        Concrete subclasses must override this method; the base version only
        raises NotImplementedError.

        Parameters:
        - doc_id: document id as specified by the connector
        """
        raise NotImplementedError
    @abc.abstractmethod
    def delete(self, doc_ids: list[str]) -> None:
        """
--- a/backend/danswer/document_index/vespa/index.py
+++ b/backend/danswer/document_index/vespa/index.py
@@ -13,6 +13,7 @@ from typing import cast
 import httpx
 import requests
 from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
 from danswer.configs.chat_configs import DOC_TIME_DECAY
 from danswer.configs.chat_configs import NUM_RETURNED_HITS
 from danswer.configs.chat_configs import TITLE_CONTENT_RATIO
@@ -479,6 +480,66 @@ class VespaIndex(DocumentIndex):
                    document_ids=doc_ids, index_name=index_name, http_client=http_client
                )
    def delete_single(self, doc_id: str) -> None:
        """Delete a single document from the primary index and, when one is
        set, the secondary index, using a selection query per index.

        Possibly faster overall than the delete method, since each request
        removes everything matching the selection instead of addressing
        entries one by one.

        Parameters:
        - doc_id: document id as specified by the connector; it is passed
          through replace_invalid_doc_id_characters before being used

        Raises:
        - httpx.HTTPStatusError: re-raised after the response body is logged
          when raise_for_status() rejects a delete response
        """
        # Vespa deletion is poorly documented; the approach used here follows
        # https://docs.vespa.ai/en/operations/batch-delete.html#example
        doc_id = replace_invalid_doc_id_characters(doc_id)

        target_indices = [self.index_name]
        secondary_index = self.secondary_index_name
        if secondary_index:
            target_indices.append(secondary_index)

        # NOTE: `httpx` is used rather than `requests` because `requests` has
        # no HTTP2 support, which helps with the large volume of requests made
        # for indexing / updates / deletes.
        with httpx.Client(http2=True) as client:
            for index_name in target_indices:
                query_params = httpx.QueryParams(
                    {
                        "selection": f"{index_name}.document_id=='{doc_id}'",
                        "cluster": DOCUMENT_INDEX_NAME,
                    }
                )
                # The URL depends only on the index, so build it once per index.
                endpoint = DOCUMENT_ID_ENDPOINT.format(index_name=index_name)

                total_chunks_deleted = 0
                while True:
                    try:
                        response = client.delete(endpoint, params=query_params)
                        response.raise_for_status()
                    except httpx.HTTPStatusError as e:
                        logger.error(
                            f"Failed to delete chunk, details: {e.response.text}"
                        )
                        raise

                    payload = response.json()
                    if "documentCount" in payload:
                        total_chunks_deleted += payload["documentCount"]

                    # A missing or empty continuation token means there is
                    # nothing further to request for this index.
                    next_token = (
                        payload["continuation"] if "continuation" in payload else None
                    )
                    if not next_token:
                        break

                    # Repeat the same delete with the token attached.
                    query_params = query_params.set("continuation", next_token)

                logger.debug(
                    f"VespaIndex.delete_single: "
                    f"index={index_name} "
                    f"doc={doc_id} "
                    f"chunks_deleted={total_chunks_deleted}"
                )
    def id_based_retrieval(
        self,
        chunk_requests: list[VespaChunkRequest],