mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-10-10 21:26:01 +02:00
57 lines
1.6 KiB
Python
57 lines
1.6 KiB
Python
import json
|
|
import re
|
|
|
|
|
|
def has_unescaped_quote(s: str) -> bool:
    """Return True if *s* contains a double quote not directly preceded by a backslash.

    Only the single character immediately before the quote is inspected, so a
    quote that follows any backslash (even one that is itself escaped) counts
    as escaped.
    """
    # Negative lookbehind: the quote must not have a backslash right before it.
    found = re.search(r'(?<!\\)"', s)
    return found is not None
|
|
|
|
|
|
def escape_newlines(s: str) -> str:
    """Turn each newline that is not preceded by a backslash into an escaped newline.

    Every matching newline character is replaced by the two-character sequence
    "backslash" + "n". A newline that directly follows a backslash is left as is.
    """
    unescaped_newline = re.compile(r"(?<!\\)\n")
    # The replacement template's doubled backslash yields one literal backslash.
    return unescaped_newline.sub("\\\\n", s)
|
|
|
|
|
|
def replace_whitespaces_w_space(s: str) -> str:
    """Replace every individual whitespace character in *s* with a single space.

    Runs of whitespace are not collapsed: each whitespace character maps to
    exactly one space, so the length of the string is preserved.
    """
    any_whitespace = re.compile(r"\s")
    return any_whitespace.sub(" ", s)
|
|
|
|
|
|
def extract_embedded_json(s: str) -> dict:
    """Parse the JSON object embedded in *s*.

    The candidate text runs from the first "{" to the last "}" in *s*,
    inclusive; anything outside that span is ignored.

    Raises:
        ValueError: if *s* has no "{" or no "}". Parsing failures from
            ``json.loads`` propagate unchanged.
    """
    start = s.find("{")
    end = s.rfind("}")

    if -1 in (start, end):
        raise ValueError("No valid json found")

    candidate = s[start : end + 1]
    # strict=False makes the parser accept control characters inside strings.
    return json.loads(candidate, strict=False)
|
|
|
|
|
|
def clean_up_code_blocks(model_out_raw: str) -> str:
    """Strip surrounding whitespace and backtick fences from raw model output."""
    trimmed = model_out_raw.strip()
    # str.strip takes a character set, so this removes any run of backticks
    # at either end of the string.
    unfenced = trimmed.strip("```")
    body = unfenced.strip()
    # Drops the literal four-character text \xa0 (backslash, x, a, 0); an
    # actual non-breaking space character is not touched by this replace.
    return body.replace("\\xa0", "")
|
|
|
|
|
|
def clean_model_quote(quote: str, trim_length: int) -> str:
    """Strip whitespace and one pair of wrapping double quotes from a model quote.

    Args:
        quote: The raw quote text.
        trim_length: If greater than zero, the cleaned quote is truncated to
            at most this many characters; otherwise no truncation is applied.

    Returns:
        The cleaned (and possibly truncated) quote. Empty or whitespace-only
        input yields an empty string.
    """
    quote_clean = quote.strip()
    # startswith/endswith are safe on an empty string, unlike indexing [0] / [-1],
    # which raised IndexError for "" and for a lone '"'.
    if quote_clean.startswith('"'):
        quote_clean = quote_clean[1:]
    if quote_clean.endswith('"'):
        quote_clean = quote_clean[:-1]
    if trim_length > 0:
        quote_clean = quote_clean[:trim_length]
    return quote_clean
|
|
|
|
|
|
def shared_precompare_cleanup(text: str) -> str:
    """Normalize text so that quotes can be compared despite cosmetic differences.

    LLMs sometimes restructure whitespace or alter special characters to fit a
    more likely character distribution, which hurts exact quote matching. This
    lowercases the text and drops the characters that tend to vary.
    """
    # Alternatives in the pattern, left to right:
    #   \s            any whitespace character (spaces, tabs, newlines, etc.)
    #   \*            a literal asterisk
    #   \\"           a backslash followed by a double quote
    #   [.,:`"#-]     any single character listed inside the brackets
    noise = re.compile(r'\s|\*|\\"|[.,:`"#-]')
    return noise.sub("", text.lower())
|