mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-04-01 00:18:18 +02:00
Fix Unicode sanitization for Vespa document indexing (#3831)
* Add support for filtering 0xFDD0-0xFDEF Unicode range - Update remove_invalid_unicode_chars to handle 0xFDD0-0xFDEF range - Add comprehensive test cases for Unicode character sanitization - Fix issue with illegal code point 0xFDDB in Vespa indexing Co-Authored-By: Chris Weaver <chris@onyx.app> * Remove unused pytest import Co-Authored-By: Chris Weaver <chris@onyx.app> --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: Chris Weaver <chris@onyx.app>
This commit is contained in:
parent
787e25cd78
commit
b82123563b
@ -55,7 +55,7 @@ def remove_invalid_unicode_chars(text: str) -> str:
|
||||
"""Vespa does not take in unicode chars that aren't valid for XML.
|
||||
This removes them."""
|
||||
_illegal_xml_chars_RE: re.Pattern = re.compile(
|
||||
"[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]"
|
||||
"[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF]"
|
||||
)
|
||||
return _illegal_xml_chars_RE.sub("", text)
|
||||
|
||||
|
@ -0,0 +1,20 @@
|
||||
from onyx.document_index.vespa.shared_utils.utils import remove_invalid_unicode_chars
|
||||
|
||||
|
||||
def test_remove_invalid_unicode_chars() -> None:
    """Test that invalid Unicode characters are properly removed.

    Covers three cases: a single illegal code point (U+FDDB) embedded in
    otherwise valid text, text that contains nothing to strip, and a string
    mixing several different illegal code points.
    """
    # Test removal of illegal XML character 0xFDDB
    text_with_illegal_char = "Valid text \uFDDB more text"
    sanitized = remove_invalid_unicode_chars(text_with_illegal_char)
    assert "\uFDDB" not in sanitized
    # Only the illegal code point itself is dropped; the spaces on either
    # side of it stay, so the expected text has two consecutive spaces.
    assert sanitized == "Valid text  more text"

    # Test that valid characters are preserved
    valid_text = "Hello, world! 你好世界"
    assert remove_invalid_unicode_chars(valid_text) == valid_text

    # Test multiple invalid characters including 0xFDDB
    text_with_multiple_illegal = "\x00Hello\uFDDB World\uFFFE!"
    sanitized = remove_invalid_unicode_chars(text_with_multiple_illegal)
    assert all(c not in sanitized for c in ["\x00", "\uFDDB", "\uFFFE"])
    assert sanitized == "Hello World!"
|
Loading…
x
Reference in New Issue
Block a user