From f7d77a3c7627aaf1f055133c7ad3ed95f27f23c9 Mon Sep 17 00:00:00 2001
From: pablodanswer
Date: Sat, 19 Oct 2024 10:55:39 -0700
Subject: [PATCH] Empty embedding fix (#2853)

* account for malformed urls

* fix

* k
---
 backend/danswer/connectors/web/connector.py |  3 +++
 .../search_nlp_models.py                    | 19 +++++++++++-------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py
index bb1f64efdf..9e0671ea24 100644
--- a/backend/danswer/connectors/web/connector.py
+++ b/backend/danswer/connectors/web/connector.py
@@ -128,6 +128,9 @@ def get_internal_links(
         if not href:
             continue
 
+        # Account for malformed backslashes in URLs
+        href = href.replace("\\", "/")
+
         if should_ignore_pound and "#" in href:
             href = href.split("#")[0]
 
diff --git a/backend/danswer/natural_language_processing/search_nlp_models.py b/backend/danswer/natural_language_processing/search_nlp_models.py
index 700c8c08cf..d75fce304d 100644
--- a/backend/danswer/natural_language_processing/search_nlp_models.py
+++ b/backend/danswer/natural_language_processing/search_nlp_models.py
@@ -50,23 +50,26 @@ def clean_model_name(model_str: str) -> str:
     return model_str.replace("/", "_").replace("-", "_").replace(".", "_")
 
 
-_WHITELIST = set(
-    " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\n\t"
-)
 _INITIAL_FILTER = re.compile(
     "["
-    "\U00000080-\U0000FFFF"  # All Unicode characters beyond ASCII
-    "\U00010000-\U0010FFFF"  # All Unicode characters in supplementary planes
+    "\U0000FFF0-\U0000FFFF"  # Specials
+    "\U0001F000-\U0001F9FF"  # Emoticons
+    "\U00002000-\U0000206F"  # General Punctuation
+    "\U00002190-\U000021FF"  # Arrows
+    "\U00002700-\U000027BF"  # Dingbats
     "]+",
     flags=re.UNICODE,
 )
 
 
 def clean_openai_text(text: str) -> str:
-    # First, remove all weird characters
+    # Remove specific Unicode ranges that might cause issues
     cleaned = _INITIAL_FILTER.sub("", text)
-    # Then, keep only whitelisted characters
-    return "".join(char for char in cleaned if char in _WHITELIST)
+
+    # Remove any control characters except for newline and tab
+    cleaned = "".join(ch for ch in cleaned if ch >= " " or ch in "\n\t")
+
+    return cleaned
 
 
 def build_model_server_url(