improve llm-generated citations (account for edge case) (#2096)

* improve llm-generated citations (account for edge case)

* additional test case
This commit is contained in:
pablodanswer 2024-08-09 19:06:39 -07:00 committed by GitHub
parent 54d4526b73
commit cc8a6da8e3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 54 additions and 0 deletions

View File

@ -125,6 +125,30 @@ def extract_citations_from_stream(
length_to_add -= diff
continue
# Handle the edge case where the LLM itself emits an already-formatted
# citation such as "[[1]]": look up the cited document and yield it directly.
if curr_segment.startswith("[["):
match = re.match(r"\[\[(\d+)\]\]", curr_segment)
if match:
try:
doc_id = int(match.group(1))
context_llm_doc = context_docs[doc_id - 1]
yield CitationInfo(
citation_num=target_citation_num,
document_id=context_llm_doc.document_id,
)
except Exception as e:
logger.warning(
f"Manual LLM citation didn't properly cite documents {e}"
)
else:
# Brackets are not closed yet; the match is attempted again on later loops
logger.warning(
"Manual LLM citation wasn't able to close brackets"
)
continue
link = context_llm_doc.link
# Replace the citation in the current segment
@ -162,6 +186,7 @@ def extract_citations_from_stream(
+ curr_segment[end + length_to_add :]
)
length_to_add += len(curr_segment) - prev_length
last_citation_end = end + length_to_add
if last_citation_end > 0:

View File

@ -257,6 +257,35 @@ def process_text(
"[[1]](https://0.com)[[2]]()t]",
["doc_0", "doc_1"],
),
(
"Citations with extraneous citations",
[
"[[1]](https://0.com) Citation",
" at ",
"the beginning. ",
"[",
"3",
"]",
" In the mid",
"dle. At the end ",
"[",
"5",
"]",
".",
],
"[[1]](https://0.com) Citation at the beginning. [[2]]() In the middle. At the end [[3]](https://2.com).",
["doc_0", "doc_1", "doc_2"],
),
(
"Citations with extraneous citations, split up",
[
"[[1]](",
"https://0.com) Citation at ",
"the beginning. ",
],
"[[1]](https://0.com) Citation at the beginning. ",
["doc_0"],
),
],
)
def test_citation_extraction(