More permissive quote matching (#295)

2025-09-20 04:37:09 +02:00 · 2023-08-14 15:03:21 -07:00
parent 59db40cf36
commit 848e5653a9
1 changed files with 11 additions and 15 deletions
--- a/backend/danswer/utils/text_processing.py
+++ b/backend/danswer/utils/text_processing.py
@@ -1,3 +1,5 @@
+import re
+
 from bs4 import BeautifulSoup

 from danswer.configs.constants import HTML_SEPARATOR
@@ -15,23 +17,17 @@ def clean_model_quote(quote: str, trim_length: int) -> str:


 def shared_precompare_cleanup(text: str) -> str:
+    """LLMs sometimes restructure whitespace or edit special characters to fit a more likely
+    distribution of characters found in their training data, but this hurts exact quote matching
+    """
    text = text.lower()

-    # GPT models like to return cleaner spacing, not good for quote matching
-    text = "".join(text.split())
-
-    # GPT models sometimes like to clean up bulletpoints represented by *
-    text = text.replace("*", "")
-
-    # GPT models sometimes like to edit the quoting, ie "Title: Contents" becomes Title: "Contents"
-    text = text.replace('\\"', "")
-    text = text.replace('"', "")
-
-    # GPT models often change up punctuations to make the text flow better.
-    text = text.replace(".", "")
-    text = text.replace(":", "")
-    text = text.replace(",", "")
-    text = text.replace("-", "")
+    # \s: matches any whitespace character (spaces, tabs, newlines, etc.)
+    # |: acts as an OR.
+    # \*: matches the asterisk character.
+    # \\": matches the \" sequence.
+    # [.,:`"#-]: matches any character inside the square brackets.
+    text = re.sub(r'\s|\*|\\"|[.,:`"#-]', "", text)

    return text