From 36afa9370f7a428210a4a3207fd061c2613d6a51 Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Mon, 10 Jun 2024 17:19:47 -0700 Subject: [PATCH] Vespa remove apostrophe in URLs (#1618) --- backend/danswer/document_index/vespa/index.py | 34 ++++++++++++++- backend/danswer/document_index/vespa/utils.py | 41 +++++++++++++++++-- 2 files changed, 71 insertions(+), 4 deletions(-) diff --git a/backend/danswer/document_index/vespa/index.py b/backend/danswer/document_index/vespa/index.py index 1ec5fd787059..d96350199e99 100644 --- a/backend/danswer/document_index/vespa/index.py +++ b/backend/danswer/document_index/vespa/index.py @@ -61,6 +61,7 @@ from danswer.document_index.interfaces import DocumentIndex from danswer.document_index.interfaces import DocumentInsertionRecord from danswer.document_index.interfaces import UpdateRequest from danswer.document_index.vespa.utils import remove_invalid_unicode_chars +from danswer.document_index.vespa.utils import replace_invalid_doc_id_characters from danswer.indexing.models import DocMetadataAwareIndexChunk from danswer.search.models import IndexFilters from danswer.search.models import InferenceChunk @@ -708,6 +709,21 @@ def _create_document_xml_lines(doc_names: list[str | None]) -> str: return "\n".join(doc_lines) +def _clean_chunk_id_copy( + chunk: DocMetadataAwareIndexChunk, +) -> DocMetadataAwareIndexChunk: + clean_chunk = chunk.copy( + update={ + "source_document": chunk.source_document.copy( + update={ + "id": replace_invalid_doc_id_characters(chunk.source_document.id) + } + ) + } + ) + return clean_chunk + + class VespaIndex(DocumentIndex): yql_base = ( f"select " @@ -801,7 +817,10 @@ class VespaIndex(DocumentIndex): chunks: list[DocMetadataAwareIndexChunk], ) -> set[DocumentInsertionRecord]: # IMPORTANT: This must be done one index at a time, do not use secondary index here - return _clear_and_index_vespa_chunks(chunks=chunks, index_name=self.index_name) + cleaned_chunks = [_clean_chunk_id_copy(chunk) for chunk in chunks] + return _clear_and_index_vespa_chunks( + chunks=cleaned_chunks, index_name=self.index_name + ) @staticmethod def _apply_updates_batched( @@ -847,6 +866,15 @@ class VespaIndex(DocumentIndex): def update(self, update_requests: list[UpdateRequest]) -> None: logger.info(f"Updating {len(update_requests)} documents in Vespa") + + # Handle Vespa character limitations + # Mutating update_requests but it's not used later anyway + for update_request in update_requests: + update_request.document_ids = [ + replace_invalid_doc_id_characters(doc_id) + for doc_id in update_request.document_ids + ] + update_start = time.monotonic() processed_updates_requests: list[_VespaUpdateRequest] = [] @@ -929,6 +957,8 @@ class VespaIndex(DocumentIndex): def delete(self, doc_ids: list[str]) -> None: logger.info(f"Deleting {len(doc_ids)} documents from Vespa") + doc_ids = [replace_invalid_doc_id_characters(doc_id) for doc_id in doc_ids] + # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for # indexing / updates / deletes since we have to make a large volume of requests. with httpx.Client(http2=True) as http_client: @@ -948,6 +978,8 @@ class VespaIndex(DocumentIndex): max_chunk_ind: int | None, user_access_control_list: list[str] | None = None, ) -> list[InferenceChunk]: + document_id = replace_invalid_doc_id_characters(document_id) + vespa_chunks = _get_vespa_chunks_by_document_id( document_id=document_id, index_name=self.index_name, diff --git a/backend/danswer/document_index/vespa/utils.py b/backend/danswer/document_index/vespa/utils.py index 228b98dc95f0..c74afc9a6294 100644 --- a/backend/danswer/document_index/vespa/utils.py +++ b/backend/danswer/document_index/vespa/utils.py @@ -1,12 +1,47 @@ import re +# NOTE: This does not seem to be used in reality despite the Vespa Docs pointing to this code +# See here for reference: https://docs.vespa.ai/en/documents.html +# https://github.com/vespa-engine/vespa/blob/master/vespajlib/src/main/java/com/yahoo/text/Text.java -_illegal_xml_chars_RE = re.compile( - "[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]" -) +# Define allowed ASCII characters +ALLOWED_ASCII_CHARS: list[bool] = [False] * 0x80 +ALLOWED_ASCII_CHARS[0x9] = True # tab +ALLOWED_ASCII_CHARS[0xA] = True # newline +ALLOWED_ASCII_CHARS[0xD] = True # carriage return +for i in range(0x20, 0x7F): + ALLOWED_ASCII_CHARS[i] = True # printable ASCII chars +ALLOWED_ASCII_CHARS[0x7F] = True # del - discouraged, but allowed + + +def is_text_character(codepoint: int) -> bool: + """Returns whether the given codepoint is a valid text character.""" + if codepoint < 0x80: + return ALLOWED_ASCII_CHARS[codepoint] + if codepoint < 0xD800: + return True + if codepoint <= 0xDFFF: + return False + if codepoint < 0xFDD0: + return True + if codepoint <= 0xFDEF: + return False + if codepoint >= 0x10FFFE: + return False + return (codepoint & 0xFFFF) < 0xFFFE + + +def replace_invalid_doc_id_characters(text: str) -> str: + """Replaces invalid document ID characters in text.""" + # There may be a more complete set of replacements that need to be made but Vespa docs are unclear + # and users only seem to be running into this error with single quotes + return text.replace("'", "_") def remove_invalid_unicode_chars(text: str) -> str: """Vespa does not take in unicode chars that aren't valid for XML. This removes them.""" + _illegal_xml_chars_RE: re.Pattern = re.compile( + "[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]" + ) return _illegal_xml_chars_RE.sub("", text)