fallback to ignoring unicode chars when huggingface tokenizer fails (#4394)
@@ -3,6 +3,8 @@ from abc import ABC
 from abc import abstractmethod
 from copy import copy
 
+from tokenizers import Encoding  # type: ignore
+from tokenizers import Tokenizer  # type: ignore
 from transformers import logging as transformer_logging  # type:ignore
 
 from onyx.configs.model_configs import DOC_EMBEDDING_CONTEXT_SIZE
@@ -69,16 +71,27 @@ class TiktokenTokenizer(BaseTokenizer):
 
 class HuggingFaceTokenizer(BaseTokenizer):
     def __init__(self, model_name: str):
-        from tokenizers import Tokenizer  # type: ignore
-
-        self.encoder: Tokenizer = Tokenizer.from_pretrained(model_name)
+        self.encoder = Tokenizer.from_pretrained(model_name)
+
+    def _safer_encode(self, string: str) -> Encoding:
+        """
+        Encode a string using the HuggingFaceTokenizer, but if it fails,
+        encode the string as ASCII and decode it back to a string. This helps
+        in cases where the string has weird characters like \udeb4.
+        """
+        try:
+            return self.encoder.encode(string, add_special_tokens=False)
+        except Exception:
+            return self.encoder.encode(
+                string.encode("ascii", "ignore").decode(), add_special_tokens=False
+            )
 
     def encode(self, string: str) -> list[int]:
         # this returns no special tokens
-        return self.encoder.encode(string, add_special_tokens=False).ids
+        return self._safer_encode(string).ids
 
     def tokenize(self, string: str) -> list[str]:
-        return self.encoder.encode(string, add_special_tokens=False).tokens
+        return self._safer_encode(string).tokens
 
     def decode(self, tokens: list[int]) -> str:
         return self.encoder.decode(tokens)
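For context, a minimal standalone sketch of the fallback pattern this commit introduces, using the tokenizers library directly. The model name "bert-base-uncased" and the helper name safer_encode are assumptions for illustration only; in the repo the tokenizer is loaded from whichever embedding model is configured, and the logic lives in the _safer_encode method shown in the diff above.

from tokenizers import Encoding, Tokenizer

# Model name is an assumption for illustration only.
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")


def safer_encode(tokenizer: Tokenizer, string: str) -> Encoding:
    # Try the normal path first; if the tokenizer rejects the input
    # (e.g. a lone surrogate that cannot be converted to valid UTF-8),
    # drop the offending characters via an ASCII round-trip and retry.
    try:
        return tokenizer.encode(string, add_special_tokens=False)
    except Exception:
        return tokenizer.encode(
            string.encode("ascii", "ignore").decode(), add_special_tokens=False
        )


# "hello \udeb4 world" contains a lone surrogate; instead of raising,
# the fallback encodes the ASCII-only copy "hello  world".
print(safer_encode(tokenizer, "hello \udeb4 world").ids)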