diff --git a/backend/danswer/utils/text_processing.py b/backend/danswer/utils/text_processing.py index f11388f53f64..718988953e20 100644 --- a/backend/danswer/utils/text_processing.py +++ b/backend/danswer/utils/text_processing.py @@ -1,3 +1,5 @@ +import re + from bs4 import BeautifulSoup from danswer.configs.constants import HTML_SEPARATOR @@ -15,23 +17,17 @@ def clean_model_quote(quote: str, trim_length: int) -> str: def shared_precompare_cleanup(text: str) -> str: + """LLMs sometimes restructure whitespace or edit special characters to fit a more likely + distribution of characters found in their training data, but this hurts exact quote matching. + """ text = text.lower() - # GPT models like to return cleaner spacing, not good for quote matching - text = "".join(text.split()) - - # GPT models sometimes like to clean up bulletpoints represented by * - text = text.replace("*", "") - - # GPT models sometimes like to edit the quoting, ie "Title: Contents" becomes Title: "Contents" - text = text.replace('\\"', "") - text = text.replace('"', "") - - # GPT models often change up punctuations to make the text flow better. - text = text.replace(".", "") - text = text.replace(":", "") - text = text.replace(",", "") - text = text.replace("-", "") + # \s: matches any whitespace character (spaces, tabs, newlines, etc.) + # |: acts as an OR. + # \*: matches the asterisk character. + # \\": matches the \" sequence. + # [.,:`"#-]: matches any character inside the square brackets. + text = re.sub(r'\s|\*|\\"|[.,:`"#-]', "", text) return text