Fix issue causing file connector to fail

Weves authored on 2025-02-04 19:38:57 -08:00; committed by Chris Weaver
parent 4affc259a6 · commit 6889152d81
5 changed files with 47 additions and 29 deletions
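
What the diffs below suggest as the root cause: document IDs are sanitized (single quotes replaced) before chunks are written to Vespa, but several update/delete/retrieval paths were still using the raw IDs, so documents whose IDs contain a quote, such as uploaded files with apostrophes in their names, could never be found again after insertion. A minimal sketch of the pre-fix failure mode, using a made-up document ID:

    # hedged sketch of the pre-fix mismatch; the ID below is hypothetical
    def replace_invalid_doc_id_characters(text: str) -> str:
        return text.replace("'", "_")

    original_id = "Bob's notes.txt"  # ID as the rest of the system knows it
    stored_id = replace_invalid_doc_id_characters(original_id)  # ID as Vespa stores it

    # an update/delete that skips sanitization looks up the raw ID and finds nothing
    assert stored_id != original_id  # -> silent no-op on update/delete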

View File

@@ -142,6 +142,8 @@ def get_uuid_from_chunk_info(
     tenant_id: str | None,
     large_chunk_id: int | None = None,
 ) -> UUID:
+    """NOTE: be VERY careful about changing this function. If changed without a migration,
+    this can cause deletion/update/insertion to function incorrectly."""
     doc_str = document_id
     # Web parsing URL duplicate catching
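
For context on why the docstring warning matters: the chunk UUID is derived deterministically from the chunk's identifying fields, so the same inputs must always hash to the same UUID or later deletes and updates will target IDs that no longer exist. A rough sketch of such a derivation (the uuid5 namespace and join format here are assumptions, not the repo's exact code):

    import uuid

    def chunk_uuid_sketch(
        document_id: str, chunk_id: int, tenant_id: str | None
    ) -> uuid.UUID:
        # deterministic: identical inputs always yield the identical UUID
        parts = [document_id, str(chunk_id)]
        if tenant_id is not None:
            parts.append(tenant_id)
        return uuid.uuid5(uuid.NAMESPACE_X500, "_".join(parts))

    # changing the derivation without migrating existing rows would orphan them
    assert chunk_uuid_sketch("doc-1", 0, None) == chunk_uuid_sketch("doc-1", 0, None)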

View File

@@ -346,6 +346,14 @@ class VespaIndex(DocumentIndex):
         # IMPORTANT: This must be done one index at a time, do not use secondary index here
         cleaned_chunks = [clean_chunk_id_copy(chunk) for chunk in chunks]

+        # needed so the final DocumentInsertionRecord returned can have the original document ID
+        new_document_id_to_original_document_id: dict[str, str] = {}
+        for ind, chunk in enumerate(cleaned_chunks):
+            old_chunk = chunks[ind]
+            new_document_id_to_original_document_id[
+                chunk.source_document.id
+            ] = old_chunk.source_document.id
+
         existing_docs: set[str] = set()

         # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
@@ -401,14 +409,14 @@ class VespaIndex(DocumentIndex):
                 executor=executor,
             )

-        all_doc_ids = {chunk.source_document.id for chunk in cleaned_chunks}
+        all_cleaned_doc_ids = {chunk.source_document.id for chunk in cleaned_chunks}

         return {
             DocumentInsertionRecord(
-                document_id=doc_id,
-                already_existed=doc_id in existing_docs,
+                document_id=new_document_id_to_original_document_id[cleaned_doc_id],
+                already_existed=cleaned_doc_id in existing_docs,
             )
-            for doc_id in all_doc_ids
+            for cleaned_doc_id in all_cleaned_doc_ids
         }

     @classmethod
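
The reverse mapping built in the first hunk is consumed here: clean_chunk_id_copy rewrites document IDs before indexing, so the IDs flowing out of the indexing loop are the cleaned ones, while DocumentInsertionRecord must report the IDs the rest of the system knows. A toy illustration with made-up IDs:

    # made-up IDs showing why the cleaned -> original mapping is needed
    original_ids = ["O'Brien report.pdf", "plain.txt"]
    cleaned_ids = [i.replace("'", "_") for i in original_ids]  # what Vespa sees

    new_to_original = dict(zip(cleaned_ids, original_ids))

    # records must carry the original IDs so callers can match them to the DB
    records = [new_to_original[c] for c in cleaned_ids]
    assert records[0] == "O'Brien report.pdf"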
@@ -541,7 +549,7 @@ class VespaIndex(DocumentIndex):
             time.monotonic() - update_start,
         )

-    def update_single_chunk(
+    def _update_single_chunk(
         self,
         doc_chunk_id: UUID,
         index_name: str,
@@ -605,6 +613,8 @@ class VespaIndex(DocumentIndex):
         """
         doc_chunk_count = 0
+        doc_id = replace_invalid_doc_id_characters(doc_id)
+
         with self.httpx_client_context as httpx_client:
             for (
                 index_name,
@@ -627,7 +637,7 @@ class VespaIndex(DocumentIndex):
                 doc_chunk_count += len(doc_chunk_ids)

                 for doc_chunk_id in doc_chunk_ids:
-                    self.update_single_chunk(
+                    self._update_single_chunk(
                         doc_chunk_id, index_name, fields, doc_id, httpx_client
                     )
@@ -689,6 +699,18 @@ class VespaIndex(DocumentIndex):
         batch_retrieval: bool = False,
         get_large_chunks: bool = False,
     ) -> list[InferenceChunkUncleaned]:
+        # make sure to use the vespa-fied document IDs
+        chunk_requests = [
+            VespaChunkRequest(
+                document_id=replace_invalid_doc_id_characters(
+                    chunk_request.document_id
+                ),
+                min_chunk_ind=chunk_request.min_chunk_ind,
+                max_chunk_ind=chunk_request.max_chunk_ind,
+            )
+            for chunk_request in chunk_requests
+        ]
+
         if batch_retrieval:
             return batch_search_api_retrieval(
                 index_name=self.index_name,

View File

@@ -242,9 +242,9 @@ def batch_index_vespa_chunks(
 def clean_chunk_id_copy(
     chunk: DocMetadataAwareIndexChunk,
 ) -> DocMetadataAwareIndexChunk:
-    clean_chunk = chunk.copy(
+    clean_chunk = chunk.model_copy(
         update={
-            "source_document": chunk.source_document.copy(
+            "source_document": chunk.source_document.model_copy(
                 update={
                     "id": replace_invalid_doc_id_characters(chunk.source_document.id)
                 }
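
The copy-to-model_copy rename tracks Pydantic v2, where BaseModel.copy() is deprecated in favor of model_copy() with the same update= semantics. A minimal standalone example:

    from pydantic import BaseModel

    class Doc(BaseModel):
        id: str

    doc = Doc(id="Bob's notes.txt")
    # Pydantic v2: model_copy() replaces the deprecated copy()
    clean = doc.model_copy(update={"id": doc.id.replace("'", "_")})
    assert clean.id == "Bob_s notes.txt" and doc.id == "Bob's notes.txt"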

View File

@@ -45,7 +45,9 @@ def is_text_character(codepoint: int) -> bool:
 def replace_invalid_doc_id_characters(text: str) -> str:
-    """Replaces invalid document ID characters in text."""
+    """Replaces invalid document ID characters in text.
+    NOTE: this must be called at the start of every vespa-related operation or else we
+    risk discrepancies -> silent failures on deletion/update/insertion."""
     # There may be a more complete set of replacements that need to be made but Vespa docs are unclear
     # and users only seem to be running into this error with single quotes
     return text.replace("'", "_")
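
Worth noting: the replacement is one-way and lossy, so two distinct raw IDs can clean to the same Vespa ID, presumably an accepted trade-off given how narrow the replaced character set is. Applying the function on every path keeps reads and writes agreeing on the stored ID:

    def replace_invalid_doc_id_characters(text: str) -> str:
        return text.replace("'", "_")

    # sanitizing everywhere means write and read paths agree on the stored ID
    write_id = replace_invalid_doc_id_characters("Bob's notes.txt")
    read_id = replace_invalid_doc_id_characters("Bob's notes.txt")
    assert write_id == read_id == "Bob_s notes.txt"

    # lossy: a raw ID that already contains "_" in that spot would collide
    assert replace_invalid_doc_id_characters("Bob_s notes.txt") == write_id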

View File

@@ -33,11 +33,6 @@ import EditPropertyModal from "@/components/modals/EditPropertyModal";
 import * as Yup from "yup";

-// since the uploaded files are cleaned up after some period of time
-// re-indexing will not work for the file connector. Also, it would not
-// make sense to re-index, since the files will not have changed.
-const CONNECTOR_TYPES_THAT_CANT_REINDEX: ValidSources[] = [ValidSources.File];
-
 // synchronize these validations with the SQLAlchemy connector class until we have a
 // centralized schema for both frontend and backend
 const RefreshFrequencySchema = Yup.object().shape({
@@ -268,21 +263,18 @@ function Main({ ccPairId }: { ccPairId: number }) {
         {ccPair.is_editable_for_current_user && (
           <div className="ml-auto flex gap-x-2">
-            {!CONNECTOR_TYPES_THAT_CANT_REINDEX.includes(
-              ccPair.connector.source
-            ) && (
-              <ReIndexButton
-                ccPairId={ccPair.id}
-                connectorId={ccPair.connector.id}
-                credentialId={ccPair.credential.id}
-                isDisabled={
-                  ccPair.indexing ||
-                  ccPair.status === ConnectorCredentialPairStatus.PAUSED
-                }
-                isIndexing={ccPair.indexing}
-                isDeleting={isDeleting}
-              />
-            )}
+            <ReIndexButton
+              ccPairId={ccPair.id}
+              connectorId={ccPair.connector.id}
+              credentialId={ccPair.credential.id}
+              isDisabled={
+                ccPair.indexing ||
+                ccPair.status === ConnectorCredentialPairStatus.PAUSED
+              }
+              isIndexing={ccPair.indexing}
+              isDeleting={isDeleting}
+            />
             {!isDeleting && <ModifyStatusButtonCluster ccPair={ccPair} />}
           </div>
         )}