Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-07-07 21:20:39 +02:00)
Improved indexing (#3594)
* nit
* k
* add steps
* main util functions
* functioning fully
* quick nit
* k
* typing fix
* k
* address comments
onyx/document_index/document_index_utils.py

@@ -1,12 +1,13 @@
 import math
 import uuid
+from uuid import UUID

 from sqlalchemy.orm import Session

-from onyx.context.search.models import InferenceChunk
 from onyx.db.search_settings import get_current_search_settings
 from onyx.db.search_settings import get_secondary_search_settings
-from onyx.indexing.models import IndexChunk
+from onyx.document_index.interfaces import EnrichedDocumentIndexingInfo
+from onyx.indexing.models import DocMetadataAwareIndexChunk


 DEFAULT_BATCH_SIZE = 30
@@ -36,25 +37,118 @@ def translate_boost_count_to_multiplier(boost: int) -> float:
     return 2 / (1 + math.exp(-1 * boost / 3))


-def get_uuid_from_chunk(
-    chunk: IndexChunk | InferenceChunk, mini_chunk_ind: int = 0
-) -> uuid.UUID:
-    doc_str = (
-        chunk.document_id
-        if isinstance(chunk, InferenceChunk)
-        else chunk.source_document.id
-    )
-    # Web parsing URL duplicate catching
-    if doc_str and doc_str[-1] == "/":
-        doc_str = doc_str[:-1]
-    unique_identifier_string = "_".join(
-        [doc_str, str(chunk.chunk_id), str(mini_chunk_ind)]
-    )
-    if chunk.large_chunk_reference_ids:
-        unique_identifier_string += "_large" + "_".join(
-            [
-                str(referenced_chunk_id)
-                for referenced_chunk_id in chunk.large_chunk_reference_ids
-            ]
-        )
-    return uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string)
+def assemble_document_chunk_info(
+    enriched_document_info_list: list[EnrichedDocumentIndexingInfo],
+    tenant_id: str | None,
+    large_chunks_enabled: bool,
+) -> list[UUID]:
+    doc_chunk_ids = []
+
+    for enriched_document_info in enriched_document_info_list:
+        for chunk_index in range(
+            enriched_document_info.chunk_start_index,
+            enriched_document_info.chunk_end_index,
+        ):
+            if not enriched_document_info.old_version:
+                doc_chunk_ids.append(
+                    get_uuid_from_chunk_info(
+                        document_id=enriched_document_info.doc_id,
+                        chunk_id=chunk_index,
+                        tenant_id=tenant_id,
+                    )
+                )
+            else:
+                doc_chunk_ids.append(
+                    get_uuid_from_chunk_info_old(
+                        document_id=enriched_document_info.doc_id,
+                        chunk_id=chunk_index,
+                    )
+                )
+
+            if large_chunks_enabled and chunk_index % 4 == 0:
+                large_chunk_id = int(chunk_index / 4)
+                large_chunk_reference_ids = [
+                    large_chunk_id + i
+                    for i in range(4)
+                    if large_chunk_id + i < enriched_document_info.chunk_end_index
+                ]
+                if enriched_document_info.old_version:
+                    doc_chunk_ids.append(
+                        get_uuid_from_chunk_info_old(
+                            document_id=enriched_document_info.doc_id,
+                            chunk_id=large_chunk_id,
+                            large_chunk_reference_ids=large_chunk_reference_ids,
+                        )
+                    )
+                else:
+                    doc_chunk_ids.append(
+                        get_uuid_from_chunk_info(
+                            document_id=enriched_document_info.doc_id,
+                            chunk_id=large_chunk_id,
+                            tenant_id=tenant_id,
+                            large_chunk_id=large_chunk_id,
+                        )
+                    )
+
+    return doc_chunk_ids
+
+
+def get_uuid_from_chunk_info(
+    *,
+    document_id: str,
+    chunk_id: int,
+    tenant_id: str | None,
+    large_chunk_id: int | None = None,
+) -> UUID:
+    doc_str = document_id
+
+    # Web parsing URL duplicate catching
+    if doc_str and doc_str[-1] == "/":
+        doc_str = doc_str[:-1]
+
+    chunk_index = (
+        "large_" + str(large_chunk_id) if large_chunk_id is not None else str(chunk_id)
+    )
+    unique_identifier_string = "_".join([doc_str, chunk_index])
+    if tenant_id:
+        unique_identifier_string += "_" + tenant_id
+
+    return uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string)
+
+
+def get_uuid_from_chunk_info_old(
+    *, document_id: str, chunk_id: int, large_chunk_reference_ids: list[int] = []
+) -> UUID:
+    doc_str = document_id
+
+    # Web parsing URL duplicate catching
+    if doc_str and doc_str[-1] == "/":
+        doc_str = doc_str[:-1]
+    unique_identifier_string = "_".join([doc_str, str(chunk_id), "0"])
+    if large_chunk_reference_ids:
+        unique_identifier_string += "_large" + "_".join(
+            [
+                str(referenced_chunk_id)
+                for referenced_chunk_id in large_chunk_reference_ids
+            ]
+        )
+    return uuid.uuid5(uuid.NAMESPACE_X500, unique_identifier_string)
+
+
+def get_uuid_from_chunk(chunk: DocMetadataAwareIndexChunk) -> uuid.UUID:
+    return get_uuid_from_chunk_info(
+        document_id=chunk.source_document.id,
+        chunk_id=chunk.chunk_id,
+        tenant_id=chunk.tenant_id,
+        large_chunk_id=chunk.large_chunk_id,
+    )
+
+
+def get_uuid_from_chunk_old(
+    chunk: DocMetadataAwareIndexChunk, large_chunk_reference_ids: list[int] = []
+) -> UUID:
+    return get_uuid_from_chunk_info_old(
+        document_id=chunk.source_document.id,
+        chunk_id=chunk.chunk_id,
+        large_chunk_reference_ids=large_chunk_reference_ids,
+    )
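Because the new chunk UUIDs are derived only from the document id, chunk index, and tenant, they can be recomputed later (for example at deletion time) without querying Vespa. A minimal sketch of the two ID schemes, using made-up document and tenant values:

import uuid

# Hypothetical inputs, for illustration only.
document_id = "https://example.com/some/page/"
tenant_id = "tenant_123"
chunk_id = 2

# Mirror of the trailing-slash handling above: strip a single trailing "/".
doc_str = document_id[:-1] if document_id and document_id[-1] == "/" else document_id

# New-style ID (get_uuid_from_chunk_info): "<doc>_<chunk>" plus an optional "_<tenant>".
new_style = uuid.uuid5(uuid.NAMESPACE_X500, f"{doc_str}_{chunk_id}_{tenant_id}")

# Old-style ID (get_uuid_from_chunk_info_old): "<doc>_<chunk>_0", no tenant component.
old_style = uuid.uuid5(uuid.NAMESPACE_X500, f"{doc_str}_{chunk_id}_0")

print(new_style, old_style)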
onyx/document_index/interfaces.py

@@ -35,6 +35,38 @@ class VespaChunkRequest:
         return None


+@dataclass
+class IndexBatchParams:
+    """
+    Information necessary for efficiently indexing a batch of documents
+    """
+
+    doc_id_to_previous_chunk_cnt: dict[str, int | None]
+    doc_id_to_new_chunk_cnt: dict[str, int]
+    tenant_id: str | None
+    large_chunks_enabled: bool
+
+
+@dataclass
+class MinimalDocumentIndexingInfo:
+    """
+    Minimal information necessary for indexing a document
+    """
+
+    doc_id: str
+    chunk_start_index: int
+
+
+@dataclass
+class EnrichedDocumentIndexingInfo(MinimalDocumentIndexingInfo):
+    """
+    Enriched information necessary for indexing a document, including version and chunk range.
+    """
+
+    old_version: bool
+    chunk_end_index: int
+
+
 @dataclass
 class DocumentMetadata:
     """
@@ -148,7 +180,7 @@ class Indexable(abc.ABC):
     def index(
         self,
         chunks: list[DocMetadataAwareIndexChunk],
-        fresh_index: bool = False,
+        index_batch_params: IndexBatchParams,
     ) -> set[DocumentInsertionRecord]:
         """
         Takes a list of document chunks and indexes them in the document index
@@ -166,14 +198,11 @@ class Indexable(abc.ABC):
         only needs to index chunks into the PRIMARY index. Do not update the secondary index here,
         it is done automatically outside of this code.

-        NOTE: The fresh_index parameter, when set to True, assumes no documents have been previously
-        indexed for the given index/tenant. This can be used to optimize the indexing process for
-        new or empty indices.
-
         Parameters:
         - chunks: Document chunks with all of the information needed for indexing to the document
           index.
-        - fresh_index: Boolean indicating whether this is a fresh index with no existing documents.
+        - tenant_id: The tenant id of the user whose chunks are being indexed
+        - large_chunks_enabled: Whether large chunks are enabled

         Returns:
             List of document ids which map to unique documents and are used for deduping chunks
@@ -185,7 +214,7 @@ class Indexable(abc.ABC):

 class Deletable(abc.ABC):
     """
-    Class must implement the ability to delete document by their unique document ids.
+    Class must implement the ability to delete document by a given unique document id.
     """

     @abc.abstractmethod
@@ -198,16 +227,6 @@ class Deletable(abc.ABC):
         """
         raise NotImplementedError

-    @abc.abstractmethod
-    def delete(self, doc_ids: list[str]) -> None:
-        """
-        Given a list of document ids, hard delete them from the document index
-
-        Parameters:
-        - doc_ids: list of document ids as specified by the connector
-        """
-        raise NotImplementedError
-

 class Updatable(abc.ABC):
     """
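For reference, a sketch of how a caller might populate the new IndexBatchParams dataclass; the document ids and chunk counts here are invented:

from onyx.document_index.interfaces import IndexBatchParams

# Hypothetical batch: "doc_a" previously had 8 chunks and now produces 5;
# "doc_b" has no recorded chunk count (old chunk-ID scheme), hence None.
index_batch_params = IndexBatchParams(
    doc_id_to_previous_chunk_cnt={"doc_a": 8, "doc_b": None},
    doc_id_to_new_chunk_cnt={"doc_a": 5, "doc_b": 3},
    tenant_id=None,
    large_chunks_enabled=False,
)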
onyx/document_index/vespa/deletion.py

@@ -1,11 +1,9 @@
 import concurrent.futures
+from uuid import UUID

 import httpx
 from retry import retry

-from onyx.document_index.vespa.chunk_retrieval import (
-    get_all_vespa_ids_for_document_id,
-)
 from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
 from onyx.document_index.vespa_constants import NUM_THREADS
 from onyx.utils.logger import setup_logger
@@ -22,29 +20,21 @@ def _retryable_http_delete(http_client: httpx.Client, url: str) -> None:
     res.raise_for_status()


 @retry(tries=3, delay=1, backoff=2)
-def _delete_vespa_doc_chunks(
-    document_id: str, index_name: str, http_client: httpx.Client
-) -> None:
-    doc_chunk_ids = get_all_vespa_ids_for_document_id(
-        document_id=document_id,
-        index_name=index_name,
-        get_large_chunks=True,
-    )
-
-    for chunk_id in doc_chunk_ids:
-        try:
-            _retryable_http_delete(
-                http_client,
-                f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{chunk_id}",
-            )
-        except httpx.HTTPStatusError as e:
-            logger.error(f"Failed to delete chunk, details: {e.response.text}")
-            raise
+def _delete_vespa_chunk(
+    doc_chunk_id: UUID, index_name: str, http_client: httpx.Client
+) -> None:
+    try:
+        _retryable_http_delete(
+            http_client,
+            f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}",
+        )
+    except httpx.HTTPStatusError as e:
+        logger.error(f"Failed to delete chunk, details: {e.response.text}")
+        raise


-def delete_vespa_docs(
-    document_ids: list[str],
+def delete_vespa_chunks(
+    doc_chunk_ids: list[UUID],
     index_name: str,
     http_client: httpx.Client,
     executor: concurrent.futures.ThreadPoolExecutor | None = None,
@@ -56,13 +46,13 @@ def delete_vespa_docs(
         executor = concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS)

     try:
-        doc_deletion_future = {
+        chunk_deletion_future = {
             executor.submit(
-                _delete_vespa_doc_chunks, doc_id, index_name, http_client
-            ): doc_id
-            for doc_id in document_ids
+                _delete_vespa_chunk, doc_chunk_id, index_name, http_client
+            ): doc_chunk_id
+            for doc_chunk_id in doc_chunk_ids
         }
-        for future in concurrent.futures.as_completed(doc_deletion_future):
+        for future in concurrent.futures.as_completed(chunk_deletion_future):
             # Will raise exception if the deletion raised an exception
             future.result()
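A usage sketch of the reworked deletion helper: callers now pass precomputed chunk UUIDs rather than document ids. The index name and chunk range below are placeholders:

import httpx

from onyx.document_index.document_index_utils import get_uuid_from_chunk_info
from onyx.document_index.vespa.deletion import delete_vespa_chunks

# Placeholder values: stale chunks 5..7 of a document that shrank to 5 chunks.
doc_chunk_ids = [
    get_uuid_from_chunk_info(document_id="doc_a", chunk_id=i, tenant_id=None)
    for i in range(5, 8)
]

with httpx.Client() as http_client:
    delete_vespa_chunks(
        doc_chunk_ids=doc_chunk_ids,
        index_name="danswer_chunk",  # assumed default index name
        http_client=http_client,
    )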
onyx/document_index/vespa/index.py

@@ -25,8 +25,12 @@ from onyx.configs.chat_configs import VESPA_SEARCHER_THREADS
 from onyx.configs.constants import KV_REINDEX_KEY
 from onyx.context.search.models import IndexFilters
 from onyx.context.search.models import InferenceChunkUncleaned
+from onyx.document_index.document_index_utils import assemble_document_chunk_info
 from onyx.document_index.interfaces import DocumentIndex
 from onyx.document_index.interfaces import DocumentInsertionRecord
+from onyx.document_index.interfaces import EnrichedDocumentIndexingInfo
+from onyx.document_index.interfaces import IndexBatchParams
+from onyx.document_index.interfaces import MinimalDocumentIndexingInfo
 from onyx.document_index.interfaces import UpdateRequest
 from onyx.document_index.interfaces import VespaChunkRequest
 from onyx.document_index.interfaces import VespaDocumentFields
@@ -38,12 +42,10 @@ from onyx.document_index.vespa.chunk_retrieval import (
     parallel_visit_api_retrieval,
 )
 from onyx.document_index.vespa.chunk_retrieval import query_vespa
-from onyx.document_index.vespa.deletion import delete_vespa_docs
+from onyx.document_index.vespa.deletion import delete_vespa_chunks
 from onyx.document_index.vespa.indexing_utils import batch_index_vespa_chunks
+from onyx.document_index.vespa.indexing_utils import check_for_final_chunk_existence
 from onyx.document_index.vespa.indexing_utils import clean_chunk_id_copy
-from onyx.document_index.vespa.indexing_utils import (
-    get_existing_documents_from_chunks,
-)
 from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
 from onyx.document_index.vespa.shared_utils.utils import (
     replace_invalid_doc_id_characters,
@@ -307,12 +309,18 @@ class VespaIndex(DocumentIndex):
     def index(
         self,
         chunks: list[DocMetadataAwareIndexChunk],
-        fresh_index: bool = False,
+        index_batch_params: IndexBatchParams,
     ) -> set[DocumentInsertionRecord]:
         """Receive a list of chunks from a batch of documents and index the chunks into Vespa along
         with updating the associated permissions. Assumes that a document will not be split into
         multiple chunk batches calling this function multiple times, otherwise only the last set of
         chunks will be kept"""
+
+        doc_id_to_previous_chunk_cnt = index_batch_params.doc_id_to_previous_chunk_cnt
+        doc_id_to_new_chunk_cnt = index_batch_params.doc_id_to_new_chunk_cnt
+        tenant_id = index_batch_params.tenant_id
+        large_chunks_enabled = index_batch_params.large_chunks_enabled
+
         # IMPORTANT: This must be done one index at a time, do not use secondary index here
         cleaned_chunks = [clean_chunk_id_copy(chunk) for chunk in chunks]

@@ -324,30 +332,55 @@ class VespaIndex(DocumentIndex):
             concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor,
             get_vespa_http_client() as http_client,
         ):
-            if not fresh_index:
-                # Check for existing documents, existing documents need to have all of their chunks deleted
-                # prior to indexing as the document size (num chunks) may have shrunk
-                first_chunks = [
-                    chunk for chunk in cleaned_chunks if chunk.chunk_id == 0
-                ]
-                for chunk_batch in batch_generator(first_chunks, BATCH_SIZE):
-                    existing_docs.update(
-                        get_existing_documents_from_chunks(
-                            chunks=chunk_batch,
-                            index_name=self.index_name,
-                            http_client=http_client,
-                            executor=executor,
-                        )
-                    )
-
-                for doc_id_batch in batch_generator(existing_docs, BATCH_SIZE):
-                    delete_vespa_docs(
-                        document_ids=doc_id_batch,
-                        index_name=self.index_name,
-                        http_client=http_client,
-                        executor=executor,
-                    )
-
+            # We require the start and end index for each document in order to
+            # know precisely which chunks to delete. This information exists for
+            # documents that have `chunk_count` in the database, but not for
+            # `old_version` documents.
+
+            enriched_doc_infos: list[EnrichedDocumentIndexingInfo] = []
+            for document_id, _ in doc_id_to_previous_chunk_cnt.items():
+                last_indexed_chunk = doc_id_to_previous_chunk_cnt.get(document_id, None)
+                # If the document has no `chunk_count` in the database, we know that it
+                # has the old chunk ID system and we must check for the final chunk index
+                is_old_version = False
+                if last_indexed_chunk is None:
+                    is_old_version = True
+                    minimal_doc_info = MinimalDocumentIndexingInfo(
+                        doc_id=document_id,
+                        chunk_start_index=doc_id_to_new_chunk_cnt.get(document_id, 0),
+                    )
+                    last_indexed_chunk = check_for_final_chunk_existence(
+                        minimal_doc_info=minimal_doc_info,
+                        start_index=doc_id_to_new_chunk_cnt[document_id],
+                        index_name=self.index_name,
+                        http_client=http_client,
+                    )
+
+                enriched_doc_info = EnrichedDocumentIndexingInfo(
+                    doc_id=document_id,
+                    chunk_start_index=doc_id_to_new_chunk_cnt.get(document_id, 0),
+                    chunk_end_index=last_indexed_chunk,
+                    old_version=is_old_version,
+                )
+                enriched_doc_infos.append(enriched_doc_info)
+
+            # Now, for each doc, we know exactly where to start and end our deletion
+            # So let's generate the chunk IDs for each chunk to delete
+            chunks_to_delete = assemble_document_chunk_info(
+                enriched_document_info_list=enriched_doc_infos,
+                tenant_id=tenant_id,
+                large_chunks_enabled=large_chunks_enabled,
+            )
+
+            # Delete old Vespa documents
+            for doc_chunk_ids_batch in batch_generator(chunks_to_delete, BATCH_SIZE):
+                delete_vespa_chunks(
+                    doc_chunk_ids=doc_chunk_ids_batch,
+                    index_name=self.index_name,
+                    http_client=http_client,
+                    executor=executor,
+                )
+
             for chunk_batch in batch_generator(cleaned_chunks, BATCH_SIZE):
                 batch_index_vespa_chunks(
                     chunks=chunk_batch,
@@ -588,24 +621,6 @@ class VespaIndex(DocumentIndex):

         return total_chunks_updated

-    def delete(self, doc_ids: list[str]) -> None:
-        logger.info(f"Deleting {len(doc_ids)} documents from Vespa")
-
-        doc_ids = [replace_invalid_doc_id_characters(doc_id) for doc_id in doc_ids]
-
-        # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
-        # indexing / updates / deletes since we have to make a large volume of requests.
-        with get_vespa_http_client() as http_client:
-            index_names = [self.index_name]
-            if self.secondary_index_name:
-                index_names.append(self.secondary_index_name)
-
-            for index_name in index_names:
-                delete_vespa_docs(
-                    document_ids=doc_ids, index_name=index_name, http_client=http_client
-                )
-        return
-
     def delete_single(self, doc_id: str) -> int:
         """Possibly faster overall than the delete method due to using a single
         delete call with a selection query."""
onyx/document_index/vespa/indexing_utils.py

@@ -1,5 +1,6 @@
 import concurrent.futures
 import json
+import uuid
 from datetime import datetime
 from datetime import timezone
 from http import HTTPStatus
@@ -11,6 +12,8 @@ from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
     get_experts_stores_representations,
 )
 from onyx.document_index.document_index_utils import get_uuid_from_chunk
+from onyx.document_index.document_index_utils import get_uuid_from_chunk_info_old
+from onyx.document_index.interfaces import MinimalDocumentIndexingInfo
 from onyx.document_index.vespa.shared_utils.utils import remove_invalid_unicode_chars
 from onyx.document_index.vespa.shared_utils.utils import (
     replace_invalid_doc_id_characters,
@@ -48,14 +51,9 @@ logger = setup_logger()


 @retry(tries=3, delay=1, backoff=2)
-def _does_document_exist(
-    doc_chunk_id: str,
-    index_name: str,
-    http_client: httpx.Client,
+def _does_doc_chunk_exist(
+    doc_chunk_id: uuid.UUID, index_name: str, http_client: httpx.Client
 ) -> bool:
-    """Returns whether the document already exists and the users/group whitelists
-    Specifically in this case, document refers to a vespa document which is equivalent to a Onyx
-    chunk. This checks for whether the chunk exists already in the index"""
     doc_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}"
     doc_fetch_response = http_client.get(doc_url)
     if doc_fetch_response.status_code == 404:
@@ -98,8 +96,8 @@ def get_existing_documents_from_chunks(
     try:
         chunk_existence_future = {
             executor.submit(
-                _does_document_exist,
-                str(get_uuid_from_chunk(chunk)),
+                _does_doc_chunk_exist,
+                get_uuid_from_chunk(chunk),
                 index_name,
                 http_client,
             ): chunk
@@ -248,3 +246,22 @@ def clean_chunk_id_copy(
         }
     )
     return clean_chunk
+
+
+def check_for_final_chunk_existence(
+    minimal_doc_info: MinimalDocumentIndexingInfo,
+    start_index: int,
+    index_name: str,
+    http_client: httpx.Client,
+) -> int:
+    index = start_index
+    while True:
+        doc_chunk_id = get_uuid_from_chunk_info_old(
+            document_id=minimal_doc_info.doc_id,
+            chunk_id=index,
+            large_chunk_reference_ids=[],
+        )
+        if not _does_doc_chunk_exist(doc_chunk_id, index_name, http_client):
+            return index
+
+        index += 1
onyx/document_index/vespa_constants.py

@@ -35,6 +35,8 @@ DOCUMENT_ID_ENDPOINT = (
     f"{VESPA_APP_CONTAINER_URL}/document/v1/default/{{index_name}}/docid"
 )

+# the default document id endpoint is http://localhost:8080/document/v1/default/danswer_chunk/docid
+
 SEARCH_ENDPOINT = f"{VESPA_APP_CONTAINER_URL}/search/"

 NUM_THREADS = (
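For orientation, the endpoint template expands into the per-chunk URLs used by the delete and existence checks above. The base URL matches the default noted in the new comment; the chunk UUID is a placeholder:

# Assumed default; in the codebase this value comes from VESPA_APP_CONTAINER_URL.
VESPA_APP_CONTAINER_URL = "http://localhost:8080"
DOCUMENT_ID_ENDPOINT = (
    f"{VESPA_APP_CONTAINER_URL}/document/v1/default/{{index_name}}/docid"
)

doc_chunk_id = "11111111-2222-5333-8444-555555555555"  # placeholder chunk UUID
url = f"{DOCUMENT_ID_ENDPOINT.format(index_name='danswer_chunk')}/{doc_chunk_id}"
print(url)  # http://localhost:8080/document/v1/default/danswer_chunk/docid/11111111-...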