Fix Unicode sanitization for Vespa document indexing (#3831)

* Add support for filtering the U+FDD0-U+FDEF (0xFDD0-0xFDEF) Unicode noncharacter range

- Update remove_invalid_unicode_chars to handle 0xFDD0-0xFDEF range
- Add comprehensive test cases for Unicode character sanitization
- Fix issue with illegal code point 0xFDDB in Vespa indexing

Co-Authored-By: Chris Weaver <chris@onyx.app>

* Remove unused pytest import

Co-Authored-By: Chris Weaver <chris@onyx.app>

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Chris Weaver <chris@onyx.app>
This commit is contained in:
devin-ai-integration[bot] 2025-01-29 18:32:00 +00:00 committed by GitHub
parent 787e25cd78
commit b82123563b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 21 additions and 1 deletions

View File

def remove_invalid_unicode_chars(text: str) -> str:
    """Vespa does not take in unicode chars that aren't valid for XML.
    This removes them.

    Characters that are deleted:
      * control characters 0x00-0x1F, except tab (0x09), line feed (0x0A)
        and carriage return (0x0D), which are kept
      * UTF-16 surrogate code points U+D800-U+DFFF
      * the code points U+FDD0-U+FDEF, U+FFFE and U+FFFF

    Args:
        text: The string to sanitize.

    Returns:
        ``text`` with every matching character removed. Nothing is inserted
        in place of a removed character, so its neighbours become adjacent.
    """
    # Exactly one character class: every illegal character has to match on
    # its own. Leaving the older, narrower class literal next to this one
    # would make Python concatenate the two string literals into a pattern
    # that only matches two illegal characters in a row.
    _illegal_xml_chars_RE: re.Pattern = re.compile(
        "[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF]"
    )
    return _illegal_xml_chars_RE.sub("", text)

View File

@ -0,0 +1,20 @@
from onyx.document_index.vespa.shared_utils.utils import remove_invalid_unicode_chars
def test_remove_invalid_unicode_chars() -> None:
    """Test that invalid Unicode characters are properly removed.

    Covers a single illegal code point (0xFDDB), both ends of the
    0xFDD0-0xFDEF range, text that must pass through unchanged, and a
    string mixing several kinds of illegal characters.
    """
    # Test removal of illegal XML character 0xFDDB
    text_with_illegal_char = "Valid text \uFDDB more text"
    sanitized = remove_invalid_unicode_chars(text_with_illegal_char)
    assert "\uFDDB" not in sanitized
    # The character is deleted, not replaced, so the spaces that surrounded
    # it end up next to each other (two spaces in the expected value).
    assert sanitized == "Valid text  more text"

    # Test both boundaries of the 0xFDD0-0xFDEF range
    assert remove_invalid_unicode_chars("a\uFDD0b\uFDEFc") == "abc"

    # Test that valid characters are preserved
    valid_text = "Hello, world! 你好世界"
    assert remove_invalid_unicode_chars(valid_text) == valid_text

    # Test multiple invalid characters including 0xFDDB
    text_with_multiple_illegal = "\x00Hello\uFDDB World\uFFFE!"
    sanitized = remove_invalid_unicode_chars(text_with_multiple_illegal)
    assert all(c not in sanitized for c in ["\x00", "\uFDDB", "\uFFFE"])
    assert sanitized == "Hello World!"