Fix issue causing file connector to fail

2025-07-12 14:12:53 +02:00 · 2025-02-04 19:38:57 -08:00
parent 4affc259a6
commit 6889152d81
5 changed files with 47 additions and 29 deletions
--- a/backend/onyx/document_index/document_index_utils.py
+++ b/backend/onyx/document_index/document_index_utils.py
@ -142,6 +142,8 @@ def get_uuid_from_chunk_info(
    tenant_id: str | None,
    large_chunk_id: int | None = None,
 ) -> UUID:
+    """NOTE: be VERY carefuly about changing this function. If changed without a migration,
+    this can cause deletion/update/insertion to function incorrectly."""
    doc_str = document_id

    # Web parsing URL duplicate catching
--- a/backend/onyx/document_index/vespa/index.py
+++ b/backend/onyx/document_index/vespa/index.py
@ -346,6 +346,14 @@ class VespaIndex(DocumentIndex):
        # IMPORTANT: This must be done one index at a time, do not use secondary index here
        cleaned_chunks = [clean_chunk_id_copy(chunk) for chunk in chunks]

+        # needed so the final DocumentInsertionRecord returned can have the original document ID
+        new_document_id_to_original_document_id: dict[str, str] = {}
+        for ind, chunk in enumerate(cleaned_chunks):
+            old_chunk = chunks[ind]
+            new_document_id_to_original_document_id[
+                chunk.source_document.id
+            ] = old_chunk.source_document.id
+
        existing_docs: set[str] = set()

        # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
@ -401,14 +409,14 @@ class VespaIndex(DocumentIndex):
                    executor=executor,
                )

-        all_doc_ids = {chunk.source_document.id for chunk in cleaned_chunks}
+        all_cleaned_doc_ids = {chunk.source_document.id for chunk in cleaned_chunks}

        return {
            DocumentInsertionRecord(
-                document_id=doc_id,
-                already_existed=doc_id in existing_docs,
+                document_id=new_document_id_to_original_document_id[cleaned_doc_id],
+                already_existed=cleaned_doc_id in existing_docs,
            )
-            for doc_id in all_doc_ids
+            for cleaned_doc_id in all_cleaned_doc_ids
        }

    @classmethod
@ -541,7 +549,7 @@ class VespaIndex(DocumentIndex):
            time.monotonic() - update_start,
        )

-    def update_single_chunk(
+    def _update_single_chunk(
        self,
        doc_chunk_id: UUID,
        index_name: str,
@ -605,6 +613,8 @@ class VespaIndex(DocumentIndex):
        """
        doc_chunk_count = 0

+        doc_id = replace_invalid_doc_id_characters(doc_id)
+
        with self.httpx_client_context as httpx_client:
            for (
                index_name,
@ -627,7 +637,7 @@ class VespaIndex(DocumentIndex):
                doc_chunk_count += len(doc_chunk_ids)

                for doc_chunk_id in doc_chunk_ids:
-                    self.update_single_chunk(
+                    self._update_single_chunk(
                        doc_chunk_id, index_name, fields, doc_id, httpx_client
                    )

@ -689,6 +699,18 @@ class VespaIndex(DocumentIndex):
        batch_retrieval: bool = False,
        get_large_chunks: bool = False,
    ) -> list[InferenceChunkUncleaned]:
+        # make sure to use the vespa-afied document IDs
+        chunk_requests = [
+            VespaChunkRequest(
+                document_id=replace_invalid_doc_id_characters(
+                    chunk_request.document_id
+                ),
+                min_chunk_ind=chunk_request.min_chunk_ind,
+                max_chunk_ind=chunk_request.max_chunk_ind,
+            )
+            for chunk_request in chunk_requests
+        ]
+
        if batch_retrieval:
            return batch_search_api_retrieval(
                index_name=self.index_name,
--- a/backend/onyx/document_index/vespa/indexing_utils.py
+++ b/backend/onyx/document_index/vespa/indexing_utils.py
@ -242,9 +242,9 @@ def batch_index_vespa_chunks(
 def clean_chunk_id_copy(
    chunk: DocMetadataAwareIndexChunk,
 ) -> DocMetadataAwareIndexChunk:
-    clean_chunk = chunk.copy(
+    clean_chunk = chunk.model_copy(
        update={
-            "source_document": chunk.source_document.copy(
+            "source_document": chunk.source_document.model_copy(
                update={
                    "id": replace_invalid_doc_id_characters(chunk.source_document.id)
                }
--- a/backend/onyx/document_index/vespa/shared_utils/utils.py
+++ b/backend/onyx/document_index/vespa/shared_utils/utils.py
@ -45,7 +45,9 @@ def is_text_character(codepoint: int) -> bool:


 def replace_invalid_doc_id_characters(text: str) -> str:
-    """Replaces invalid document ID characters in text."""
+    """Replaces invalid document ID characters in text.
+    NOTE: this must be called at the start of every vespa-related operation or else we
+    risk discrepancies -> silent failures on deletion/update/insertion."""
    # There may be a more complete set of replacements that need to be made but Vespa docs are unclear
    # and users only seem to be running into this error with single quotes
    return text.replace("'", "_")