More permissive quote matching (#295)

2025-09-22 17:16:20 +02:00 · 2023-08-14 15:03:21 -07:00
parent 59db40cf36
commit 848e5653a9
1 changed files with 11 additions and 15 deletions
--- a/backend/danswer/utils/text_processing.py
+++ b/backend/danswer/utils/text_processing.py
@@ -1,3 +1,5 @@
 import re
 from bs4 import BeautifulSoup
 from danswer.configs.constants import HTML_SEPARATOR
@@ -15,23 +17,17 @@ def clean_model_quote(quote: str, trim_length: int) -> str:
 def shared_precompare_cleanup(text: str) -> str:
    """LLMs models sometime restructure whitespaces or edits special characters to fit a more likely
    distribution of characters found in its training data, but this hurts exact quote matching
    """
    text = text.lower()
-    # GPT models like to return cleaner spacing, not good for quote matching
+    # \s: matches any whitespace character (spaces, tabs, newlines, etc.)
-    text = "".join(text.split())
+    # |: acts as an OR.
-
+    # \*: matches the asterisk character.
-    # GPT models sometimes like to clean up bullet points represented by *
+    # \\": matches the \" sequence.
-    text = text.replace("*", "")
+    # [.,:`"#-]: matches any character inside the square brackets.
-
+    text = re.sub(r'\s|\*|\\"|[.,:`"#-]', "", text)
    # GPT models sometimes like to edit the quoting, e.g. "Title: Contents" becomes Title: "Contents"
    text = text.replace('\\"', "")
    text = text.replace('"', "")
    # GPT models often change up punctuation to make the text flow better.
    text = text.replace(".", "")
    text = text.replace(":", "")
    text = text.replace(",", "")
    text = text.replace("-", "")
    return text