fall back to ignoring unicode chars when huggingface tokenizer fails (#4394)

evan-danswer
2025-03-30 16:45:20 -07:00
committed by GitHub
parent db8d6ce538
commit a1cef389aa

@@ -3,6 +3,8 @@ from abc import ABC
 from abc import abstractmethod
 from copy import copy

+from tokenizers import Encoding  # type: ignore
+from tokenizers import Tokenizer  # type: ignore
 from transformers import logging as transformer_logging  # type:ignore

 from onyx.configs.model_configs import DOC_EMBEDDING_CONTEXT_SIZE
@@ -69,16 +71,27 @@ class TiktokenTokenizer(BaseTokenizer):

 class HuggingFaceTokenizer(BaseTokenizer):
     def __init__(self, model_name: str):
-        from tokenizers import Tokenizer  # type: ignore
-
-        self.encoder: Tokenizer = Tokenizer.from_pretrained(model_name)
+        self.encoder = Tokenizer.from_pretrained(model_name)
+
+    def _safer_encode(self, string: str) -> Encoding:
+        """
+        Encode a string using the HuggingFaceTokenizer, but if it fails,
+        encode the string as ASCII and decode it back to a string. This helps
+        in cases where the string has weird characters like \udeb4.
+        """
+        try:
+            return self.encoder.encode(string, add_special_tokens=False)
+        except Exception:
+            return self.encoder.encode(
+                string.encode("ascii", "ignore").decode(), add_special_tokens=False
+            )

     def encode(self, string: str) -> list[int]:
         # this returns no special tokens
-        return self.encoder.encode(string, add_special_tokens=False).ids
+        return self._safer_encode(string).ids

     def tokenize(self, string: str) -> list[str]:
-        return self.encoder.encode(string, add_special_tokens=False).tokens
+        return self._safer_encode(string).tokens

     def decode(self, tokens: list[int]) -> str:
         return self.encoder.decode(tokens)
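
For context, a minimal sketch of the failure mode this fallback handles (illustrative only, not part of the commit): a lone UTF-16 surrogate such as \udeb4 is a legal code point inside a Python str, but it cannot be encoded as UTF-8, so it can make tokenizers that serialize their input internally raise. str.encode("ascii", "ignore") silently drops it, together with every other non-ASCII character:

# Illustrative sketch, not part of the commit: the failure mode
# and the ASCII-ignore fallback used by _safer_encode above.
bad = "hello \udeb4 world"  # contains an unpaired UTF-16 surrogate

try:
    bad.encode("utf-8")
except UnicodeEncodeError as exc:
    print(f"utf-8 encode failed: {exc}")  # "... surrogates not allowed"

# The fallback path: drop everything that is not plain ASCII.
clean = bad.encode("ascii", "ignore").decode()
print(repr(clean))  # 'hello  world' -- the surrogate is gone

The trade-off is that the fallback strips all non-ASCII characters (accented letters, CJK text, emoji), not only the invalid surrogates, so token counts for such strings become approximate. Since the fallback only runs after the normal encode path has already raised, that seems a reasonable price for not failing outright.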