From 36afa9370f7a428210a4a3207fd061c2613d6a51 Mon Sep 17 00:00:00 2001
From: Yuhong Sun <yuhongsun96@gmail.com>
Date: Mon, 10 Jun 2024 17:19:47 -0700
Subject: [PATCH] Vespa remove apostrophe in URLs (#1618)

---
 backend/danswer/document_index/vespa/index.py | 34 ++++++++++++++-
 backend/danswer/document_index/vespa/utils.py | 41 +++++++++++++++++--
 2 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/backend/danswer/document_index/vespa/index.py b/backend/danswer/document_index/vespa/index.py
index 1ec5fd787059..d96350199e99 100644
--- a/backend/danswer/document_index/vespa/index.py
+++ b/backend/danswer/document_index/vespa/index.py
@@ -61,6 +61,7 @@ from danswer.document_index.interfaces import DocumentIndex
 from danswer.document_index.interfaces import DocumentInsertionRecord
 from danswer.document_index.interfaces import UpdateRequest
 from danswer.document_index.vespa.utils import remove_invalid_unicode_chars
+from danswer.document_index.vespa.utils import replace_invalid_doc_id_characters
 from danswer.indexing.models import DocMetadataAwareIndexChunk
 from danswer.search.models import IndexFilters
 from danswer.search.models import InferenceChunk
@@ -708,6 +709,21 @@ def _create_document_xml_lines(doc_names: list[str | None]) -> str:
     return "\n".join(doc_lines)
 
 
+def _clean_chunk_id_copy(
+    chunk: DocMetadataAwareIndexChunk,
+) -> DocMetadataAwareIndexChunk:
+    clean_chunk = chunk.copy(
+        update={
+            "source_document": chunk.source_document.copy(
+                update={
+                    "id": replace_invalid_doc_id_characters(chunk.source_document.id)
+                }
+            )
+        }
+    )
+    return clean_chunk
+
+
 class VespaIndex(DocumentIndex):
     yql_base = (
         f"select "
@@ -801,7 +817,10 @@ class VespaIndex(DocumentIndex):
         chunks: list[DocMetadataAwareIndexChunk],
     ) -> set[DocumentInsertionRecord]:
         # IMPORTANT: This must be done one index at a time, do not use secondary index here
-        return _clear_and_index_vespa_chunks(chunks=chunks, index_name=self.index_name)
+        cleaned_chunks = [_clean_chunk_id_copy(chunk) for chunk in chunks]
+        return _clear_and_index_vespa_chunks(
+            chunks=cleaned_chunks, index_name=self.index_name
+        )
 
     @staticmethod
     def _apply_updates_batched(
@@ -847,6 +866,15 @@ class VespaIndex(DocumentIndex):
 
     def update(self, update_requests: list[UpdateRequest]) -> None:
         logger.info(f"Updating {len(update_requests)} documents in Vespa")
+
+        # Handle Vespa character limitations
+        # Mutating update_requests but it's not used later anyway
+        for update_request in update_requests:
+            update_request.document_ids = [
+                replace_invalid_doc_id_characters(doc_id)
+                for doc_id in update_request.document_ids
+            ]
+
         update_start = time.monotonic()
 
         processed_updates_requests: list[_VespaUpdateRequest] = []
@@ -929,6 +957,8 @@ class VespaIndex(DocumentIndex):
     def delete(self, doc_ids: list[str]) -> None:
         logger.info(f"Deleting {len(doc_ids)} documents from Vespa")
 
+        doc_ids = [replace_invalid_doc_id_characters(doc_id) for doc_id in doc_ids]
+
         # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
         # indexing / updates / deletes since we have to make a large volume of requests.
         with httpx.Client(http2=True) as http_client:
@@ -948,6 +978,8 @@ class VespaIndex(DocumentIndex):
         max_chunk_ind: int | None,
         user_access_control_list: list[str] | None = None,
     ) -> list[InferenceChunk]:
+        document_id = replace_invalid_doc_id_characters(document_id)
+
         vespa_chunks = _get_vespa_chunks_by_document_id(
             document_id=document_id,
             index_name=self.index_name,
diff --git a/backend/danswer/document_index/vespa/utils.py b/backend/danswer/document_index/vespa/utils.py
index 228b98dc95f0..c74afc9a6294 100644
--- a/backend/danswer/document_index/vespa/utils.py
+++ b/backend/danswer/document_index/vespa/utils.py
@@ -1,12 +1,47 @@
 import re
 
+# NOTE: This does not seem to be used in reality despite the Vespa Docs pointing to this code
+# See here for reference: https://docs.vespa.ai/en/documents.html
+# https://github.com/vespa-engine/vespa/blob/master/vespajlib/src/main/java/com/yahoo/text/Text.java
 
-_illegal_xml_chars_RE = re.compile(
-    "[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]"
-)
+# Define allowed ASCII characters
+ALLOWED_ASCII_CHARS: list[bool] = [False] * 0x80
+ALLOWED_ASCII_CHARS[0x9] = True  # tab
+ALLOWED_ASCII_CHARS[0xA] = True  # newline
+ALLOWED_ASCII_CHARS[0xD] = True  # carriage return
+for i in range(0x20, 0x7F):
+    ALLOWED_ASCII_CHARS[i] = True  # printable ASCII chars
+ALLOWED_ASCII_CHARS[0x7F] = True  # del - discouraged, but allowed
+
+
+def is_text_character(codepoint: int) -> bool:
+    """Returns whether the given codepoint is a valid text character."""
+    if codepoint < 0x80:
+        return ALLOWED_ASCII_CHARS[codepoint]
+    if codepoint < 0xD800:
+        return True
+    if codepoint <= 0xDFFF:
+        return False
+    if codepoint < 0xFDD0:
+        return True
+    if codepoint <= 0xFDEF:
+        return False
+    if codepoint >= 0x10FFFE:
+        return False
+    return (codepoint & 0xFFFF) < 0xFFFE
+
+
+def replace_invalid_doc_id_characters(text: str) -> str:
+    """Replaces invalid document ID characters in text."""
+    # There may be a more complete set of replacements that need to be made but Vespa docs are unclear
+    # and users only seem to be running into this error with single quotes
+    return text.replace("'", "_")
 
 
 def remove_invalid_unicode_chars(text: str) -> str:
     """Vespa does not take in unicode chars that aren't valid for XML.
     This removes them."""
+    _illegal_xml_chars_RE: re.Pattern = re.compile(
+        "[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]"
+    )
     return _illegal_xml_chars_RE.sub("", text)