mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-22 17:16:20 +02:00
More permissive quote matching (#295)
This commit is contained in:
@@ -1,3 +1,5 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from danswer.configs.constants import HTML_SEPARATOR
|
from danswer.configs.constants import HTML_SEPARATOR
|
||||||
@@ -15,23 +17,17 @@ def clean_model_quote(quote: str, trim_length: int) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def shared_precompare_cleanup(text: str) -> str:
|
def shared_precompare_cleanup(text: str) -> str:
|
||||||
|
"""LLMs sometimes restructure whitespace or edit special characters to fit a more likely
|
||||||
|
distribution of characters found in their training data, but this hurts exact quote matching
|
||||||
|
"""
|
||||||
text = text.lower()
|
text = text.lower()
|
||||||
|
|
||||||
# GPT models like to return cleaner spacing, not good for quote matching
|
# \s: matches any whitespace character (spaces, tabs, newlines, etc.)
|
||||||
text = "".join(text.split())
|
# |: acts as an OR.
|
||||||
|
# \*: matches the asterisk character.
|
||||||
# GPT models sometimes like to clean up bullet points represented by *
|
# \\": matches the \" sequence.
|
||||||
text = text.replace("*", "")
|
# [.,:`"#-]: matches any character inside the square brackets.
|
||||||
|
text = re.sub(r'\s|\*|\\"|[.,:`"#-]', "", text)
|
||||||
# GPT models sometimes like to edit the quoting, i.e., "Title: Contents" becomes Title: "Contents"
|
|
||||||
text = text.replace('\\"', "")
|
|
||||||
text = text.replace('"', "")
|
|
||||||
|
|
||||||
# GPT models often change up punctuation to make the text flow better.
|
|
||||||
text = text.replace(".", "")
|
|
||||||
text = text.replace(":", "")
|
|
||||||
text = text.replace(",", "")
|
|
||||||
text = text.replace("-", "")
|
|
||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user