Mirror of https://github.com/danswer-ai/danswer.git
Add multi-threading to improve speed of updates / indexing
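Since Vespa only accepts one insert / update per HTTP request (as the new in-code comment notes), each chunk write costs a full round trip, and fanning the requests out over a thread pool hides most of that latency. A rough, illustrative calculation of the expected win; the 50 ms round-trip time is an assumption, not a measured number:

```python
import math

# Illustrative arithmetic only: 50 ms per request is an assumed round-trip time.
num_chunk_updates = 10_000
rtt_seconds = 0.05
num_threads = 16  # matches the new _NUM_THREADS constant below

sequential_seconds = num_chunk_updates * rtt_seconds
threaded_seconds = math.ceil(num_chunk_updates / num_threads) * rtt_seconds

print(f"sequential: ~{sequential_seconds:.0f}s")  # ~500s
print(f"16 threads: ~{threaded_seconds:.0f}s")    # ~31s
```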
@@ -1,6 +1,9 @@
+import concurrent.futures
 import json
 import string
+import time
 from collections.abc import Mapping
+from dataclasses import dataclass
 from typing import Any
 from typing import cast
 from uuid import UUID
@@ -42,6 +45,7 @@ from danswer.datastores.interfaces import IndexFilter
 from danswer.datastores.interfaces import UpdateRequest
 from danswer.datastores.vespa.utils import remove_invalid_unicode_chars
 from danswer.search.semantic_search import embed_query
+from danswer.utils.batching import batch_generator
 from danswer.utils.logger import setup_logger
 
 logger = setup_logger()
@@ -56,10 +60,20 @@ DOCUMENT_ID_ENDPOINT = (
 )
 SEARCH_ENDPOINT = f"{VESPA_APP_CONTAINER_URL}/search/"
 _BATCH_SIZE = 100  # Specific to Vespa
+_NUM_THREADS = (
+    16  # since Vespa doesn't allow batching of inserts / updates, we use threads
+)
 # Specific to Vespa, needed for highlighting matching keywords / section
 CONTENT_SUMMARY = "content_summary"
 
 
+@dataclass
+class _VespaUpdateRequest:
+    document_id: str
+    url: str
+    update_request: dict[str, dict]
+
+
 def _does_document_exist(
     doc_chunk_id: str,
 ) -> bool:
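The `_NUM_THREADS` constant sizes the thread pools introduced below, and `_VespaUpdateRequest` simply packages one prepared partial update (owning document id, target chunk URL, JSON body) so the HTTP call itself can be deferred to a worker thread. A hedged sketch of the single PUT each worker ends up performing in the later hunks; the document id, URL, and boost value are made-up examples:

```python
import json
from dataclasses import dataclass

import requests


@dataclass
class _VespaUpdateRequest:
    document_id: str
    url: str
    update_request: dict[str, dict]


# Illustrative values only: the real url is f"{DOCUMENT_ID_ENDPOINT}/{doc_chunk_id}"
# and the body follows Vespa's partial-update ("assign") format.
update = _VespaUpdateRequest(
    document_id="doc-123",
    url="http://vespa-host:8080/document/v1/default/danswer_chunk/docid/some-chunk-uuid",
    update_request={"fields": {"boost": {"assign": 1.5}}},
)

# Each worker thread performs exactly one such call.
response = requests.put(
    update.url,
    headers={"Content-Type": "application/json"},
    data=json.dumps(update.update_request),
)
response.raise_for_status()
```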
@@ -108,16 +122,12 @@ def _delete_vespa_doc_chunks(document_id: str) -> bool:
     return not any(failures)
 
 
-def _index_vespa_chunks(
-    chunks: list[DocMetadataAwareIndexChunk],
-) -> set[DocumentInsertionRecord]:
+def _index_vespa_chunk(
+    chunk: DocMetadataAwareIndexChunk, already_existing_documents: set[str]
+) -> bool:
     json_header = {
         "Content-Type": "application/json",
     }
-    insertion_records: set[DocumentInsertionRecord] = set()
-    # document ids of documents that existed BEFORE this indexing
-    already_existing_documents: set[str] = set()
-    for chunk in chunks:
     document = chunk.source_document
     # No minichunk documents in vespa, minichunk vectors are stored in the chunk itself
     vespa_chunk_id = str(get_uuid_from_chunk(chunk))
@@ -131,7 +141,6 @@ def _index_vespa_chunks(
             raise RuntimeError(
                 f"Failed to delete pre-existing chunks for with document with id: {document.id}"
             )
-            already_existing_documents.add(document.id)
 
     embeddings = chunk.embeddings
     embeddings_name_vector_map = {"full_chunk": embeddings.full_embedding}
@@ -199,10 +208,36 @@ def _index_vespa_chunks(
     )
     _index_chunk(vespa_url, json_header, vespa_document_fields)
 
+    return chunk_exists
+
+
+def _index_vespa_chunks(
+    chunks: list[DocMetadataAwareIndexChunk],
+) -> set[DocumentInsertionRecord]:
+    insertion_records: set[DocumentInsertionRecord] = set()
+    # document ids of documents that existed BEFORE this indexing
+    already_existing_documents: set[str] = set()
+
+    # use threads to parallelize since Vespa doesn't allow batching of updates
+    with concurrent.futures.ThreadPoolExecutor(max_workers=_NUM_THREADS) as executor:
+        for chunk_batch in batch_generator(chunks, _BATCH_SIZE):
+            future_to_chunk = {
+                executor.submit(
+                    _index_vespa_chunk, chunk, already_existing_documents
+                ): chunk
+                for chunk in chunk_batch
+            }
+            for future in concurrent.futures.as_completed(future_to_chunk):
+                chunk = future_to_chunk[future]
+                chunk_already_existed = future.result()
+                if chunk_already_existed:
+                    already_existing_documents.add(chunk.source_document.id)
+
                 insertion_records.add(
                     DocumentInsertionRecord(
-                        document_id=document.id,
-                        already_existed=document.id in already_existing_documents,
+                        document_id=chunk.source_document.id,
+                        already_existed=chunk.source_document.id
+                        in already_existing_documents,
                     )
                 )
 
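The indexing path keeps its public shape (`_index_vespa_chunks` still returns a set of `DocumentInsertionRecord`s) but now submits one future per chunk, `_BATCH_SIZE` chunks at a time, so only one batch of futures is outstanding per wave and the `future_to_chunk` mapping stays small. `batch_generator` is imported from `danswer.utils.batching` and is not shown in this diff; what follows is a plausible sketch of such a helper, stated as an assumption rather than the actual implementation:

```python
from collections.abc import Generator, Iterable
from typing import TypeVar

T = TypeVar("T")


def batch_generator(
    items: Iterable[T], batch_size: int
) -> Generator[list[T], None, None]:
    """Yield successive lists of at most batch_size items from any iterable."""
    batch: list[T] = []
    for item in items:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:  # trailing partial batch
        yield batch
```

Note also that, as far as this diff shows, `already_existing_documents` is only mutated in the main thread inside the `as_completed` loop; the worker threads receive it read-only.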
@@ -368,11 +403,38 @@ class VespaIndex(DocumentIndex):
     ) -> set[DocumentInsertionRecord]:
         return _index_vespa_chunks(chunks=chunks)
 
+    @staticmethod
+    def _apply_updates_batched(
+        updates: list[_VespaUpdateRequest],
+        batch_size: int = _BATCH_SIZE,
+    ) -> None:
+        """Runs a batch of updates in parallel via the ThreadPoolExecutor."""
+        with concurrent.futures.ThreadPoolExecutor(
+            max_workers=_NUM_THREADS
+        ) as executor:
+            for update_batch in batch_generator(updates, batch_size):
+                future_to_document_id = {
+                    executor.submit(
+                        requests.put,
+                        update.url,
+                        headers={"Content-Type": "application/json"},
+                        data=json.dumps(update.update_request),
+                    ): update.document_id
+                    for update in update_batch
+                }
+                for future in concurrent.futures.as_completed(future_to_document_id):
+                    res = future.result()
+                    try:
+                        res.raise_for_status()
+                    except requests.HTTPError as e:
+                        failure_msg = f"Failed to update document: {future_to_document_id[future]}"
+                        raise requests.HTTPError(failure_msg) from e
+
     def update(self, update_requests: list[UpdateRequest]) -> None:
         logger.info(f"Updating {len(update_requests)} documents in Vespa")
+        start = time.time()
 
-        json_header = {"Content-Type": "application/json"}
-
+        processed_updates_requests: list[_VespaUpdateRequest] = []
         for update_request in update_requests:
             if (
                 update_request.boost is None
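`requests.put` returns a `Response` without raising on 4xx/5xx statuses, so `_apply_updates_batched` checks each completed future with `raise_for_status()` and re-raises the first failure tagged with the owning document id. A hedged sketch of driving the new helper directly; it assumes the surrounding module's `DOCUMENT_ID_ENDPOINT`, `_VespaUpdateRequest`, and `VespaIndex` are in scope, and the document ids, chunk ids, and `boost` body are made up for illustration:

```python
updates = [
    _VespaUpdateRequest(
        document_id=doc_id,
        url=f"{DOCUMENT_ID_ENDPOINT}/{chunk_id}",
        update_request={"fields": {"boost": {"assign": 2.0}}},
    )
    for doc_id, chunk_id in [("doc-1", "chunk-uuid-1"), ("doc-2", "chunk-uuid-2")]
]

# Issues the PUTs on up to _NUM_THREADS worker threads, _BATCH_SIZE at a time;
# the first failing response aborts with requests.HTTPError naming its document.
VespaIndex._apply_updates_batched(updates)
```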
@@ -400,16 +462,18 @@ class VespaIndex(DocumentIndex):
 
             for document_id in update_request.document_ids:
                 for doc_chunk_id in _get_vespa_chunk_ids_by_document_id(document_id):
-                    url = f"{DOCUMENT_ID_ENDPOINT}/{doc_chunk_id}"
-                    res = requests.put(
-                        url, headers=json_header, data=json.dumps(update_dict)
+                    processed_updates_requests.append(
+                        _VespaUpdateRequest(
+                            document_id=document_id,
+                            url=f"{DOCUMENT_ID_ENDPOINT}/{doc_chunk_id}",
+                            update_request=update_dict,
+                        )
                     )
 
-                    try:
-                        res.raise_for_status()
-                    except requests.HTTPError as e:
-                        failure_msg = f"Failed to update document: {document_id}"
-                        raise requests.HTTPError(failure_msg) from e
+        self._apply_updates_batched(processed_updates_requests)
+        logger.info(
+            "Finished updating Vespa documents in %s seconds", time.time() - start
+        )
 
     def delete(self, doc_ids: list[str]) -> None:
         logger.info(f"Deleting {len(doc_ids)} documents from Vespa")
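Putting it together, `update()` now does the cheap work serially (expanding each document id into chunk URLs and building `_VespaUpdateRequest` objects), hands the slow HTTP work to the thread pool, and logs the elapsed wall-clock time. A hedged end-to-end sketch; `VespaIndex` construction and any `UpdateRequest` fields beyond `document_ids` and `boost` are assumptions not shown in this diff:

```python
from danswer.datastores.interfaces import UpdateRequest

# Assumption: a VespaIndex instance is already available as `vespa_index`
# (its constructor is outside this diff).
vespa_index.update([UpdateRequest(document_ids=["doc-1", "doc-2"], boost=1.2)])
# Logs "Updating 1 documents in Vespa" and then
# "Finished updating Vespa documents in <elapsed> seconds".
```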