mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-27 04:18:35 +02:00
Vespa remove apostrophe in URLs (#1618)
This commit is contained in:
@@ -61,6 +61,7 @@ from danswer.document_index.interfaces import DocumentIndex
|
||||
from danswer.document_index.interfaces import DocumentInsertionRecord
|
||||
from danswer.document_index.interfaces import UpdateRequest
|
||||
from danswer.document_index.vespa.utils import remove_invalid_unicode_chars
|
||||
from danswer.document_index.vespa.utils import replace_invalid_doc_id_characters
|
||||
from danswer.indexing.models import DocMetadataAwareIndexChunk
|
||||
from danswer.search.models import IndexFilters
|
||||
from danswer.search.models import InferenceChunk
|
||||
@@ -708,6 +709,21 @@ def _create_document_xml_lines(doc_names: list[str | None]) -> str:
|
||||
return "\n".join(doc_lines)
|
||||
|
||||
|
||||
def _clean_chunk_id_copy(
    chunk: DocMetadataAwareIndexChunk,
) -> DocMetadataAwareIndexChunk:
    """Build a copy of ``chunk`` whose source document ID is Vespa-safe.

    The source document is copied with its ``id`` passed through
    ``replace_invalid_doc_id_characters``, and the chunk is then copied with
    that document swapped in. Both copies are made via ``.copy(update=...)``.
    """
    original_doc = chunk.source_document
    sanitized_id = replace_invalid_doc_id_characters(original_doc.id)
    sanitized_doc = original_doc.copy(update={"id": sanitized_id})
    return chunk.copy(update={"source_document": sanitized_doc})
|
||||
|
||||
|
||||
class VespaIndex(DocumentIndex):
|
||||
yql_base = (
|
||||
f"select "
|
||||
@@ -801,7 +817,10 @@ class VespaIndex(DocumentIndex):
|
||||
chunks: list[DocMetadataAwareIndexChunk],
|
||||
) -> set[DocumentInsertionRecord]:
|
||||
# IMPORTANT: This must be done one index at a time, do not use secondary index here
|
||||
return _clear_and_index_vespa_chunks(chunks=chunks, index_name=self.index_name)
|
||||
cleaned_chunks = [_clean_chunk_id_copy(chunk) for chunk in chunks]
|
||||
return _clear_and_index_vespa_chunks(
|
||||
chunks=cleaned_chunks, index_name=self.index_name
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _apply_updates_batched(
|
||||
@@ -847,6 +866,15 @@ class VespaIndex(DocumentIndex):
|
||||
|
||||
def update(self, update_requests: list[UpdateRequest]) -> None:
|
||||
logger.info(f"Updating {len(update_requests)} documents in Vespa")
|
||||
|
||||
# Handle Vespa character limitations
|
||||
# Mutating update_requests but it's not used later anyway
|
||||
for update_request in update_requests:
|
||||
update_request.document_ids = [
|
||||
replace_invalid_doc_id_characters(doc_id)
|
||||
for doc_id in update_request.document_ids
|
||||
]
|
||||
|
||||
update_start = time.monotonic()
|
||||
|
||||
processed_updates_requests: list[_VespaUpdateRequest] = []
|
||||
@@ -929,6 +957,8 @@ class VespaIndex(DocumentIndex):
|
||||
def delete(self, doc_ids: list[str]) -> None:
|
||||
logger.info(f"Deleting {len(doc_ids)} documents from Vespa")
|
||||
|
||||
doc_ids = [replace_invalid_doc_id_characters(doc_id) for doc_id in doc_ids]
|
||||
|
||||
# NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
|
||||
# indexing / updates / deletes since we have to make a large volume of requests.
|
||||
with httpx.Client(http2=True) as http_client:
|
||||
@@ -948,6 +978,8 @@ class VespaIndex(DocumentIndex):
|
||||
max_chunk_ind: int | None,
|
||||
user_access_control_list: list[str] | None = None,
|
||||
) -> list[InferenceChunk]:
|
||||
document_id = replace_invalid_doc_id_characters(document_id)
|
||||
|
||||
vespa_chunks = _get_vespa_chunks_by_document_id(
|
||||
document_id=document_id,
|
||||
index_name=self.index_name,
|
||||
|
@@ -1,12 +1,47 @@
|
||||
import re
|
||||
|
||||
# NOTE: This does not seem to be used in reality despite the Vespa Docs pointing to this code
|
||||
# See here for reference: https://docs.vespa.ai/en/documents.html
|
||||
# https://github.com/vespa-engine/vespa/blob/master/vespajlib/src/main/java/com/yahoo/text/Text.java
|
||||
|
||||
# Character class matching the code points this module strips as illegal for
# XML: U+0000-U+0008, U+000B, U+000C and U+000E-U+001F (every control
# character below U+0020 except tab, line feed and carriage return), the
# surrogate range U+D800-U+DFFF, and U+FFFE / U+FFFF.
_illegal_xml_chars_RE = re.compile(
    "[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]"
)
|
||||
# Define allowed ASCII characters.
# ALLOWED_ASCII_CHARS[cp] is True when the code point ``cp`` (0x00-0x7F) is
# accepted as a text character by ``is_text_character``.
ALLOWED_ASCII_CHARS: list[bool] = [False] * 0x80
ALLOWED_ASCII_CHARS[0x9] = True  # tab
ALLOWED_ASCII_CHARS[0xA] = True  # newline
ALLOWED_ASCII_CHARS[0xD] = True  # carriage return
for i in range(0x20, 0x7F):
    ALLOWED_ASCII_CHARS[i] = True  # printable ASCII chars
ALLOWED_ASCII_CHARS[0x7F] = True  # del - discouraged, but allowed


def is_text_character(codepoint: int) -> bool:
    """Returns whether the given codepoint is a valid text character.

    Args:
        codepoint: The integer code point to classify.

    Returns:
        True when the code point is accepted as text, False otherwise.
        Rejected values are:

        * negative numbers (not code points at all),
        * ASCII control characters other than tab, newline and carriage
          return (looked up in ``ALLOWED_ASCII_CHARS``),
        * the surrogate range 0xD800-0xDFFF,
        * the range 0xFDD0-0xFDEF,
        * any code point whose low 16 bits are 0xFFFE or 0xFFFF,
        * everything from 0x10FFFE upwards.
    """
    # Guard first: a negative value would otherwise pass the ``< 0x80`` check
    # and index the table from the end (e.g. -1 would hit the DEL slot and
    # report True, and values below -0x80 would raise IndexError).
    if codepoint < 0:
        return False
    if codepoint < 0x80:
        return ALLOWED_ASCII_CHARS[codepoint]
    if codepoint < 0xD800:
        return True
    if codepoint <= 0xDFFF:
        return False
    if codepoint < 0xFDD0:
        return True
    if codepoint <= 0xFDEF:
        return False
    if codepoint >= 0x10FFFE:
        return False
    # The last two code points of every 64K plane (xxFFFE, xxFFFF) are rejected.
    return (codepoint & 0xFFFF) < 0xFFFE
|
||||
|
||||
|
||||
def replace_invalid_doc_id_characters(text: str) -> str:
    """Return ``text`` with each single quote swapped for an underscore."""
    # Only the apostrophe is handled: the Vespa docs are unclear about the
    # full set of characters that are invalid in a document ID, and single
    # quotes are the only case users have reported hitting.
    quote_to_underscore = str.maketrans("'", "_")
    return text.translate(quote_to_underscore)
|
||||
|
||||
|
||||
def remove_invalid_unicode_chars(text: str) -> str:
    """Strip characters that Vespa rejects because they are not valid for XML.

    Every match of the character class below is deleted from ``text``:
    control characters under U+0020 other than tab, line feed and carriage
    return, the surrogate range U+D800-U+DFFF, and U+FFFE / U+FFFF.
    """
    illegal_xml_chars = "[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]"
    return re.sub(illegal_xml_chars, "", text)
|
||||
|
Reference in New Issue
Block a user