Empty embedding fix (#2853)

* account for malformed URLs (backslashes in hrefs)

* fix

* k
This commit is contained in:
pablodanswer 2024-10-19 10:55:39 -07:00 committed by GitHub
parent 8b220d2dba
commit f7d77a3c76
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 14 additions and 8 deletions

View File

@ -128,6 +128,9 @@ def get_internal_links(
if not href:
continue
# Account for malformed backslashes in URLs
href = href.replace("\\", "/")
if should_ignore_pound and "#" in href:
href = href.split("#")[0]

View File

@ -50,23 +50,26 @@ def clean_model_name(model_str: str) -> str:
return model_str.replace("/", "_").replace("-", "_").replace(".", "_")
_WHITELIST = set(
" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\n\t"
)
# Unicode ranges stripped from text before it is embedded. The ranges are kept
# narrow on purpose: a blanket "everything beyond ASCII" range would delete all
# non-Latin text and leave an empty string behind.
_INITIAL_FILTER = re.compile(
    "["
    "\U0000FFF0-\U0000FFFF"  # Specials
    "\U0001F000-\U0001F9FF"  # Emoticons
    "\U00002000-\U0000206F"  # General Punctuation
    "\U00002190-\U000021FF"  # Arrows
    "\U00002700-\U000027BF"  # Dingbats
    "]+",
    flags=re.UNICODE,
)


def clean_openai_text(text: str) -> str:
    """Strip problematic characters from *text* while keeping its content.

    Two passes are applied:

    1. Every character matched by ``_INITIAL_FILTER`` is removed.
    2. Every character below U+0020 is removed, except newline and tab.

    All other characters, including non-ASCII letters, are preserved.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text; an empty string if nothing survives the filters.
    """
    # Remove specific Unicode ranges that might cause issues
    cleaned = _INITIAL_FILTER.sub("", text)

    # Remove any control characters except for newline and tab
    return "".join(ch for ch in cleaned if ch >= " " or ch in "\n\t")
def build_model_server_url(