Vespa remove apostrophe in URLs (#1618)

2025-09-27 04:18:35 +02:00 · 2024-06-10 17:19:47 -07:00
parent 7c9d037b7c
commit 36afa9370f
2 changed files with 71 additions and 4 deletions
--- a/backend/danswer/document_index/vespa/index.py
+++ b/backend/danswer/document_index/vespa/index.py
@@ -61,6 +61,7 @@ from danswer.document_index.interfaces import DocumentIndex
 from danswer.document_index.interfaces import DocumentInsertionRecord
 from danswer.document_index.interfaces import UpdateRequest
 from danswer.document_index.vespa.utils import remove_invalid_unicode_chars
+from danswer.document_index.vespa.utils import replace_invalid_doc_id_characters
 from danswer.indexing.models import DocMetadataAwareIndexChunk
 from danswer.search.models import IndexFilters
 from danswer.search.models import InferenceChunk
@@ -708,6 +709,21 @@ def _create_document_xml_lines(doc_names: list[str | None]) -> str:
    return "\n".join(doc_lines)


+def _clean_chunk_id_copy(
+    chunk: DocMetadataAwareIndexChunk,
+) -> DocMetadataAwareIndexChunk:
+    """Return a copy of `chunk` whose source document ID has been sanitized.
+
+    The source document is copied with its `id` passed through
+    `replace_invalid_doc_id_characters`, and the chunk is then copied with
+    that document swapped in. Both copies are made with `.copy(update=...)`
+    (presumably pydantic's model copy, which leaves the input untouched --
+    confirm against the model definitions).
+    """
+    source_doc = chunk.source_document
+    sanitized_doc = source_doc.copy(
+        update={"id": replace_invalid_doc_id_characters(source_doc.id)}
+    )
+    return chunk.copy(update={"source_document": sanitized_doc})
+
+
 class VespaIndex(DocumentIndex):
    yql_base = (
        f"select "
@@ -801,7 +817,10 @@ class VespaIndex(DocumentIndex):
        chunks: list[DocMetadataAwareIndexChunk],
    ) -> set[DocumentInsertionRecord]:
        # IMPORTANT: This must be done one index at a time, do not use secondary index here
-        return _clear_and_index_vespa_chunks(chunks=chunks, index_name=self.index_name)
+        cleaned_chunks = [_clean_chunk_id_copy(chunk) for chunk in chunks]
+        return _clear_and_index_vespa_chunks(
+            chunks=cleaned_chunks, index_name=self.index_name
+        )

    @staticmethod
    def _apply_updates_batched(
@@ -847,6 +866,15 @@ class VespaIndex(DocumentIndex):

    def update(self, update_requests: list[UpdateRequest]) -> None:
        logger.info(f"Updating {len(update_requests)} documents in Vespa")
+
+        # Handle Vespa character limitations
+        # Mutating update_requests but it's not used later anyway
+        for update_request in update_requests:
+            update_request.document_ids = [
+                replace_invalid_doc_id_characters(doc_id)
+                for doc_id in update_request.document_ids
+            ]
+
        update_start = time.monotonic()

        processed_updates_requests: list[_VespaUpdateRequest] = []
@@ -929,6 +957,8 @@ class VespaIndex(DocumentIndex):
    def delete(self, doc_ids: list[str]) -> None:
        logger.info(f"Deleting {len(doc_ids)} documents from Vespa")

+        doc_ids = [replace_invalid_doc_id_characters(doc_id) for doc_id in doc_ids]
+
        # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
        # indexing / updates / deletes since we have to make a large volume of requests.
        with httpx.Client(http2=True) as http_client:
@@ -948,6 +978,8 @@ class VespaIndex(DocumentIndex):
        max_chunk_ind: int | None,
        user_access_control_list: list[str] | None = None,
    ) -> list[InferenceChunk]:
+        document_id = replace_invalid_doc_id_characters(document_id)
+
        vespa_chunks = _get_vespa_chunks_by_document_id(
            document_id=document_id,
            index_name=self.index_name,
--- a/backend/danswer/document_index/vespa/utils.py
+++ b/backend/danswer/document_index/vespa/utils.py
@@ -1,12 +1,47 @@
 import re

+# NOTE: This does not seem to be used in reality despite the Vespa Docs pointing to this code
+# See here for reference: https://docs.vespa.ai/en/documents.html
+# https://github.com/vespa-engine/vespa/blob/master/vespajlib/src/main/java/com/yahoo/text/Text.java

-_illegal_xml_chars_RE = re.compile(
-    "[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]"
-)
+# Lookup table, indexed by code point, of which ASCII characters (0x00-0x7F)
+# count as allowed text. Built with a comprehension so that no loop variable
+# is left behind in the module namespace.
+ALLOWED_ASCII_CHARS: list[bool] = [
+    # tab, newline, carriage return, then every code point from space (0x20)
+    # through del (0x7F) - del is discouraged, but allowed
+    codepoint in (0x9, 0xA, 0xD) or codepoint >= 0x20
+    for codepoint in range(0x80)
+]
+
+
+def is_text_character(codepoint: int) -> bool:
+    """Returns whether the given codepoint is a valid text character.
+
+    Args:
+        codepoint: Unicode code point to classify (e.g. the result of `ord`).
+
+    Returns:
+        True for ASCII code points marked in `ALLOWED_ASCII_CHARS` and for
+        code points from 0x80 up that are not surrogates (0xD800-0xDFFF), not
+        in 0xFDD0-0xFDEF, not one of the last two code points of a plane
+        (0x..FFFE / 0x..FFFF), and below 0x10FFFE. False otherwise, including
+        for negative values.
+    """
+    # Reject negatives up front: they would otherwise index the ASCII table
+    # from its end (-1 would read the entry for 0x7F) or raise IndexError.
+    if codepoint < 0:
+        return False
+    if codepoint < 0x80:
+        return ALLOWED_ASCII_CHARS[codepoint]
+    if codepoint < 0xD800:
+        return True
+    if codepoint <= 0xDFFF:
+        return False
+    if codepoint < 0xFDD0:
+        return True
+    if codepoint <= 0xFDEF:
+        return False
+    if codepoint >= 0x10FFFE:
+        return False
+    # Within each plane, only the final two code points are excluded.
+    return (codepoint & 0xFFFF) < 0xFFFE
+
+
+def replace_invalid_doc_id_characters(text: str) -> str:
+    """Return `text` with invalid document ID characters substituted.
+
+    Only the single quote is handled: each one becomes an underscore. The
+    Vespa docs are unclear about the complete set of characters that need
+    replacing, so more substitutions may be required; single quotes are the
+    only case users seem to be running into.
+    """
+    substitutions = str.maketrans("'", "_")
+    return text.translate(substitutions)


 def remove_invalid_unicode_chars(text: str) -> str:
    """Vespa does not take in unicode chars that aren't valid for XML.
    This removes them."""
+    _illegal_xml_chars_RE: re.Pattern = re.compile(
+        "[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]"
+    )
    return _illegal_xml_chars_RE.sub("", text)