From b82123563bca5ef653f9de6731fb4700021aa985 Mon Sep 17 00:00:00 2001
From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Wed, 29 Jan 2025 18:32:00 +0000
Subject: [PATCH] Fix Unicode sanitization for Vespa document indexing (#3831)

* Add support for filtering 0xFDD0-0xFDEF Unicode range

- Update remove_invalid_unicode_chars to handle 0xFDD0-0xFDEF range
- Add comprehensive test cases for Unicode character sanitization
- Fix issue with illegal code point 0xFDDB in Vespa indexing

Co-Authored-By: Chris Weaver

* Remove unused pytest import

Co-Authored-By: Chris Weaver

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Chris Weaver
---
 .../vespa/shared_utils/utils.py               |  2 +-
 .../vespa/shared_utils/test_utils.py          | 20 +++++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)
 create mode 100644 backend/tests/unit/onyx/document_index/vespa/shared_utils/test_utils.py

diff --git a/backend/onyx/document_index/vespa/shared_utils/utils.py b/backend/onyx/document_index/vespa/shared_utils/utils.py
index c8a382e72..e8dd83a76 100644
--- a/backend/onyx/document_index/vespa/shared_utils/utils.py
+++ b/backend/onyx/document_index/vespa/shared_utils/utils.py
@@ -55,7 +55,7 @@ def remove_invalid_unicode_chars(text: str) -> str:
     """Vespa does not take in unicode chars that aren't valid for XML.
     This removes them."""
     _illegal_xml_chars_RE: re.Pattern = re.compile(
-        "[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]"
+        "[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF]"
     )
     return _illegal_xml_chars_RE.sub("", text)
 
diff --git a/backend/tests/unit/onyx/document_index/vespa/shared_utils/test_utils.py b/backend/tests/unit/onyx/document_index/vespa/shared_utils/test_utils.py
new file mode 100644
index 000000000..c5e4d9df5
--- /dev/null
+++ b/backend/tests/unit/onyx/document_index/vespa/shared_utils/test_utils.py
@@ -0,0 +1,20 @@
+from onyx.document_index.vespa.shared_utils.utils import remove_invalid_unicode_chars
+
+
+def test_remove_invalid_unicode_chars() -> None:
+    """Test that invalid Unicode characters are properly removed."""
+    # Test removal of illegal XML character 0xFDDB
+    text_with_illegal_char = "Valid text \uFDDB more text"
+    sanitized = remove_invalid_unicode_chars(text_with_illegal_char)
+    assert "\uFDDB" not in sanitized
+    assert sanitized == "Valid text  more text"
+
+    # Test that valid characters are preserved
+    valid_text = "Hello, world! 你好世界"
+    assert remove_invalid_unicode_chars(valid_text) == valid_text
+
+    # Test multiple invalid characters including 0xFDDB
+    text_with_multiple_illegal = "\x00Hello\uFDDB World\uFFFE!"
+    sanitized = remove_invalid_unicode_chars(text_with_multiple_illegal)
+    assert all(c not in sanitized for c in ["\x00", "\uFDDB", "\uFFFE"])
+    assert sanitized == "Hello World!"