Fix citations + unit tests (#1760)

pablodanswer
2024-07-10 10:05:20 -07:00
committed by GitHub
parent aa0f7abdac
commit 09a11b5e1a
11 changed files with 595 additions and 318 deletions

View File

@@ -1,38 +0,0 @@
-import unittest
-
-
-class TestChatLlm(unittest.TestCase):
-    def test_citation_extraction(self) -> None:
-        pass  # May fix these tests some day
-        """
-        links: list[str | None] = [f"link_{i}" for i in range(1, 21)]
-
-        test_1 = "Something [1]"
-        res = "".join(list(extract_citations_from_stream(iter(test_1), links)))
-        self.assertEqual(res, "Something [[1]](link_1)")
-
-        test_2 = "Something [14]"
-        res = "".join(list(extract_citations_from_stream(iter(test_2), links)))
-        self.assertEqual(res, "Something [[14]](link_14)")
-
-        test_3 = "Something [14][15]"
-        res = "".join(list(extract_citations_from_stream(iter(test_3), links)))
-        self.assertEqual(res, "Something [[14]](link_14)[[15]](link_15)")
-
-        test_4 = ["Something ", "[", "3", "][", "4", "]."]
-        res = "".join(list(extract_citations_from_stream(iter(test_4), links)))
-        self.assertEqual(res, "Something [[3]](link_3)[[4]](link_4).")
-
-        test_5 = ["Something ", "[", "31", "][", "4", "]."]
-        res = "".join(list(extract_citations_from_stream(iter(test_5), links)))
-        self.assertEqual(res, "Something [31][[4]](link_4).")
-
-        links[3] = None
-        test_1 = "Something [2][4][5]"
-        res = "".join(list(extract_citations_from_stream(iter(test_1), links)))
-        self.assertEqual(res, "Something [[2]](link_2)[4][[5]](link_5)")
-        """
-
-
-if __name__ == "__main__":
-    unittest.main()

View File

@@ -1,19 +1,13 @@
 import pathlib
-import unittest
 
 from danswer.file_processing.html_utils import parse_html_page_basic
 
 
-class TestQAPostprocessing(unittest.TestCase):
-    def test_parse_table(self) -> None:
-        dir_path = pathlib.Path(__file__).parent.resolve()
-        with open(f"{dir_path}/test_table.html", "r") as file:
-            content = file.read()
+def test_parse_table() -> None:
+    dir_path = pathlib.Path(__file__).parent.resolve()
+    with open(f"{dir_path}/test_table.html", "r") as file:
+        content = file.read()
 
-        parsed = parse_html_page_basic(content)
-        expected = "\n\thello\tthere\tgeneral\n\tkenobi\ta\tb\n\tc\td\te"
-        self.assertIn(expected, parsed)
-
-
-if __name__ == "__main__":
-    unittest.main()
+    parsed = parse_html_page_basic(content)
+    expected = "\n\thello\tthere\tgeneral\n\tkenobi\ta\tb\n\tc\td\te"
+    assert expected in parsed
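
The test_table.html fixture itself is not part of this diff. For orientation, a table consistent with the tab-separated expected string might look like the following (a guess at the fixture, inlined as a Python string; each table row parses to one tab-prefixed, tab-separated line):

# Hypothetical contents of test_table.html, inferred from the expected
# string "\n\thello\tthere\tgeneral\n\tkenobi\ta\tb\n\tc\td\te".
SAMPLE_TABLE_HTML = """
<table>
  <tr><td>hello</td><td>there</td><td>general</td></tr>
  <tr><td>kenobi</td><td>a</td><td>b</td></tr>
  <tr><td>c</td><td>d</td><td>e</td></tr>
</table>
"""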

View File

@@ -1,36 +1,29 @@
 import time
-import unittest
 
 from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
     rate_limit_builder,
 )
 
 
-class TestRateLimit(unittest.TestCase):
-    def test_rate_limit_basic(self) -> None:
-        self.call_cnt = 0
+def test_rate_limit_basic() -> None:
+    call_cnt = 0
 
-        @rate_limit_builder(max_calls=2, period=5)
-        def func() -> None:
-            self.call_cnt += 1
+    @rate_limit_builder(max_calls=2, period=5)
+    def func() -> None:
+        nonlocal call_cnt
+        call_cnt += 1
 
-        start = time.time()
+    start = time.time()
 
-        # make calls that shouldn't be rate-limited
-        func()
-        func()
-        time_to_finish_non_ratelimited = time.time() - start
+    # Make calls that shouldn't be rate-limited
+    func()
+    func()
+    time_to_finish_non_ratelimited = time.time() - start
 
-        # make a call which SHOULD be rate-limited
-        func()
-        time_to_finish_ratelimited = time.time() - start
+    # Make a call which SHOULD be rate-limited
+    func()
+    time_to_finish_ratelimited = time.time() - start
 
-        self.assertEqual(self.call_cnt, 3)
-        self.assertLess(time_to_finish_non_ratelimited, 1)
-        self.assertGreater(time_to_finish_ratelimited, 5)
-
-
-if __name__ == "__main__":
-    unittest.main()
+    assert call_cnt == 3
+    assert time_to_finish_non_ratelimited < 1
+    assert time_to_finish_ratelimited > 5
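
For context on what the timing asserts exercise: rate_limit_builder only needs to remember recent call timestamps and sleep once the window is full. A minimal sketch of that idea (an illustration, not the actual implementation in danswer.connectors.cross_connector_utils.rate_limit_wrapper):

import time
from collections.abc import Callable
from functools import wraps
from typing import Any


def rate_limit_builder(max_calls: int, period: float) -> Callable:
    """Sketch: block the caller once max_calls have happened within period seconds."""

    def decorator(func: Callable) -> Callable:
        call_times: list[float] = []

        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            now = time.time()
            # Forget calls that have aged out of the sliding window
            while call_times and now - call_times[0] >= period:
                call_times.pop(0)
            # Window full: sleep until the oldest call expires
            if len(call_times) >= max_calls:
                time.sleep(period - (now - call_times[0]))
            call_times.append(time.time())
            return func(*args, **kwargs)

        return wrapper

    return decorator

With max_calls=2 and period=5, the first two calls return immediately and the third sleeps for roughly the full period, which is what the test's two timing assertions check.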

View File

@@ -1,5 +1,4 @@
 import datetime
-from unittest.mock import MagicMock
 
 import pytest
 from pytest_mock import MockFixture
@@ -100,7 +99,7 @@ def test_fetch_mails_from_gmail_empty(mocker: MockFixture) -> None:
         "messages": []
     }
     connector = GmailConnector()
-    connector.creds = MagicMock()
+    connector.creds = mocker.Mock()
     with pytest.raises(StopIteration):
         next(connector.load_from_state())
@@ -178,7 +177,7 @@ def test_fetch_mails_from_gmail(mocker: MockFixture) -> None:
     }
     connector = GmailConnector()
-    connector.creds = MagicMock()
+    connector.creds = mocker.Mock()
     docs = next(connector.load_from_state())
     assert len(docs) == 1
     doc: Document = docs[0]

View File

@@ -1,7 +1,7 @@
 from typing import Final
-from unittest import mock
 
 import pytest
+from pytest_mock import MockFixture
 from pywikibot.families.wikipedia_family import Family as WikipediaFamily  # type: ignore[import-untyped]
 from pywikibot.family import Family  # type: ignore[import-untyped]
@@ -50,13 +50,11 @@ def test_family_class_dispatch_builtins(
 
 @pytest.mark.parametrize("url, name", NON_BUILTIN_WIKIS)
 def test_family_class_dispatch_on_non_builtins_generates_new_class_fast(
-    url: str, name: str
+    url: str, name: str, mocker: MockFixture
 ) -> None:
     """Test that using the family class dispatch function on an unknown url generates a new family class."""
-    with mock.patch.object(
-        family, "generate_family_class"
-    ) as mock_generate_family_class:
-        family.family_class_dispatch(url, name)
+    mock_generate_family_class = mocker.patch.object(family, "generate_family_class")
+    family.family_class_dispatch(url, name)
     mock_generate_family_class.assert_called_once_with(url, name)
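
Both mock-related changes above follow the same pytest-mock migration pattern: the mocker fixture stands in for unittest.mock, and its patches are undone automatically when the test ends, which is why the with block can be flattened. A self-contained sketch of the pattern (the mylib stand-in is hypothetical, not part of this repo):

import types

from pytest_mock import MockFixture

# Stand-in for a real module with a patchable attribute
mylib = types.SimpleNamespace(fetch=lambda url: "real")


def get(url: str) -> str:
    return mylib.fetch(url)


def test_get_uses_fetch(mocker: MockFixture) -> None:
    # The patch is reverted automatically at test teardown
    mock_fetch = mocker.patch.object(mylib, "fetch", return_value="fake")
    assert get("http://example.com") == "fake"
    mock_fetch.assert_called_once_with("http://example.com")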

View File

@@ -1,5 +1,6 @@
 import textwrap
-import unittest
+
+import pytest
 
 from danswer.configs.constants import DocumentSource
 from danswer.llm.answering.stream_processing.quotes_processing import (
@@ -11,194 +12,181 @@ from danswer.llm.answering.stream_processing.quotes_processing import (
 from danswer.search.models import InferenceChunk
 
 
-class TestQAPostprocessing(unittest.TestCase):
-    def test_separate_answer_quotes(self) -> None:
-        test_answer = textwrap.dedent(
-            """
-            It seems many people love dogs
-            Quote: A dog is a man's best friend
-            Quote: Air Bud was a movie about dogs and people loved it
-            """
-        ).strip()
-        answer, quotes = separate_answer_quotes(test_answer)
-        self.assertEqual(answer, "It seems many people love dogs")
-        self.assertEqual(quotes[0], "A dog is a man's best friend")  # type: ignore
-        self.assertEqual(
-            quotes[1], "Air Bud was a movie about dogs and people loved it"  # type: ignore
-        )
+def test_separate_answer_quotes() -> None:
+    # Test case 1: Basic quote separation
+    test_answer = textwrap.dedent(
+        """
+        It seems many people love dogs
+        Quote: A dog is a man's best friend
+        Quote: Air Bud was a movie about dogs and people loved it
+        """
+    ).strip()
+    answer, quotes = separate_answer_quotes(test_answer)
+    assert answer == "It seems many people love dogs"
+    assert isinstance(quotes, list)
+    assert quotes[0] == "A dog is a man's best friend"
+    assert quotes[1] == "Air Bud was a movie about dogs and people loved it"
 
-        # Lowercase should be allowed
-        test_answer = textwrap.dedent(
-            """
-            It seems many people love dogs
-            quote: A dog is a man's best friend
-            Quote: Air Bud was a movie about dogs and people loved it
-            """
-        ).strip()
-        answer, quotes = separate_answer_quotes(test_answer)
-        self.assertEqual(answer, "It seems many people love dogs")
-        self.assertEqual(quotes[0], "A dog is a man's best friend")  # type: ignore
-        self.assertEqual(
-            quotes[1], "Air Bud was a movie about dogs and people loved it"  # type: ignore
-        )
+    # Test case 2: Lowercase 'quote' allowed
+    test_answer = textwrap.dedent(
+        """
+        It seems many people love dogs
+        quote: A dog is a man's best friend
+        Quote: Air Bud was a movie about dogs and people loved it
+        """
+    ).strip()
+    answer, quotes = separate_answer_quotes(test_answer)
+    assert answer == "It seems many people love dogs"
+    assert isinstance(quotes, list)
+    assert quotes[0] == "A dog is a man's best friend"
+    assert quotes[1] == "Air Bud was a movie about dogs and people loved it"
 
-        # No Answer
-        test_answer = textwrap.dedent(
-            """
-            Quote: This one has no answer
-            """
-        ).strip()
-        answer, quotes = separate_answer_quotes(test_answer)
-        self.assertIsNone(answer)
-        self.assertIsNone(quotes)
+    # Test case 3: No Answer
+    test_answer = textwrap.dedent(
+        """
+        Quote: This one has no answer
+        """
+    ).strip()
+    answer, quotes = separate_answer_quotes(test_answer)
+    assert answer is None
+    assert quotes is None
 
-        # Multiline Quote
-        test_answer = textwrap.dedent(
-            """
-            It seems many people love dogs
-            quote: A well known saying is:
-            A dog is a man's best friend
-            Quote: Air Bud was a movie about dogs and people loved it
-            """
-        ).strip()
-        answer, quotes = separate_answer_quotes(test_answer)
-        self.assertEqual(answer, "It seems many people love dogs")
-        self.assertEqual(
-            quotes[0], "A well known saying is:\nA dog is a man's best friend"  # type: ignore
-        )
-        self.assertEqual(
-            quotes[1], "Air Bud was a movie about dogs and people loved it"  # type: ignore
-        )
+    # Test case 4: Multiline Quote
+    test_answer = textwrap.dedent(
+        """
+        It seems many people love dogs
+        quote: A well known saying is:
+        A dog is a man's best friend
+        Quote: Air Bud was a movie about dogs and people loved it
+        """
+    ).strip()
+    answer, quotes = separate_answer_quotes(test_answer)
+    assert answer == "It seems many people love dogs"
+    assert isinstance(quotes, list)
+    assert quotes[0] == "A well known saying is:\nA dog is a man's best friend"
+    assert quotes[1] == "Air Bud was a movie about dogs and people loved it"
 
-        # Random patterns not picked up
-        test_answer = textwrap.dedent(
-            """
-            It seems many people love quote: dogs
-            quote: Quote: A well known saying is:
-            A dog is a man's best friend
-            Quote: Answer: Air Bud was a movie about dogs and quote: people loved it
-            """
-        ).strip()
-        answer, quotes = separate_answer_quotes(test_answer)
-        self.assertEqual(answer, "It seems many people love quote: dogs")
-        self.assertEqual(
-            quotes[0], "Quote: A well known saying is:\nA dog is a man's best friend"  # type: ignore
-        )
-        self.assertEqual(
-            quotes[1],  # type: ignore
-            "Answer: Air Bud was a movie about dogs and quote: people loved it",
-        )
+    # Test case 5: Random patterns not picked up
+    test_answer = textwrap.dedent(
+        """
+        It seems many people love quote: dogs
+        quote: Quote: A well known saying is:
+        A dog is a man's best friend
+        Quote: Answer: Air Bud was a movie about dogs and quote: people loved it
+        """
+    ).strip()
+    answer, quotes = separate_answer_quotes(test_answer)
+    assert answer == "It seems many people love quote: dogs"
+    assert isinstance(quotes, list)
+    assert quotes[0] == "Quote: A well known saying is:\nA dog is a man's best friend"
+    assert (
+        quotes[1] == "Answer: Air Bud was a movie about dogs and quote: people loved it"
+    )
 
-    @unittest.skip(
-        "Using fuzzy match is too slow anyway, doesn't matter if it's broken"
-    )
-    def test_fuzzy_match_quotes_to_docs(self) -> None:
-        chunk_0_text = textwrap.dedent(
-            """
-            Here's a doc with some LINK embedded in the text
-            THIS SECTION IS A LINK
-            Some more text
-            """
-        ).strip()
-        chunk_1_text = textwrap.dedent(
-            """
-            Some completely different text here
-            ANOTHER LINK embedded in this text
-            ending in a DIFFERENT-LINK
-            """
-        ).strip()
-        test_chunk_0 = InferenceChunk(
-            document_id="test doc 0",
-            source_type=DocumentSource.FILE,
-            chunk_id=0,
-            content=chunk_0_text,
-            source_links={
-                0: "doc 0 base",
-                23: "first line link",
-                49: "second line link",
-            },
-            blurb="anything",
-            semantic_identifier="anything",
-            section_continuation=False,
-            recency_bias=1,
-            boost=0,
-            hidden=False,
-            score=1,
-            metadata={},
-            match_highlights=[],
-            updated_at=None,
-        )
-        test_chunk_1 = InferenceChunk(
-            document_id="test doc 1",
-            source_type=DocumentSource.FILE,
-            chunk_id=0,
-            content=chunk_1_text,
-            source_links={0: "doc 1 base", 36: "2nd line link", 82: "last link"},
-            blurb="whatever",
-            semantic_identifier="whatever",
-            section_continuation=False,
-            recency_bias=1,
-            boost=0,
-            hidden=False,
-            score=1,
-            metadata={},
-            match_highlights=[],
-            updated_at=None,
-        )
-
-        test_quotes = [
-            "a doc with some",  # Basic case
-            "a doc with some LINK",  # Should take the start of quote, even if a link is in it
-            "a doc with some \nLINK",  # Requires a newline deletion fuzzy match
-            "a doc with some link",  # Capitalization insensitive
-            "embedded in this text",  # Fuzzy match to first doc
-            "SECTION IS A LINK",  # Match exact link
-            "some more text",  # Match the end, after every link offset
-            "different taxt",  # Substitution
-            "embedded in this texts",  # Cannot fuzzy match to first doc, fuzzy match to second doc
-            "DIFFERENT-LINK",  # Exact link match at the end
-            "Some complitali",  # Too many edits, shouldn't match anything
-        ]
-
-        results = match_quotes_to_docs(
-            test_quotes, [test_chunk_0, test_chunk_1], fuzzy_search=True
-        )
-        self.assertEqual(
-            results,
-            {
-                "a doc with some": {"document": "test doc 0", "link": "doc 0 base"},
-                "a doc with some LINK": {
-                    "document": "test doc 0",
-                    "link": "doc 0 base",
-                },
-                "a doc with some \nLINK": {
-                    "document": "test doc 0",
-                    "link": "doc 0 base",
-                },
-                "a doc with some link": {
-                    "document": "test doc 0",
-                    "link": "doc 0 base",
-                },
-                "embedded in this text": {
-                    "document": "test doc 0",
-                    "link": "first line link",
-                },
-                "SECTION IS A LINK": {
-                    "document": "test doc 0",
-                    "link": "second line link",
-                },
-                "some more text": {
-                    "document": "test doc 0",
-                    "link": "second line link",
-                },
-                "different taxt": {"document": "test doc 1", "link": "doc 1 base"},
-                "embedded in this texts": {
-                    "document": "test doc 1",
-                    "link": "2nd line link",
-                },
-                "DIFFERENT-LINK": {"document": "test doc 1", "link": "last link"},
-            },
-        )
-
-
-if __name__ == "__main__":
-    unittest.main()
+@pytest.mark.skip(
+    reason="Using fuzzy match is too slow anyway, doesn't matter if it's broken"
+)
+def test_fuzzy_match_quotes_to_docs() -> None:
+    chunk_0_text = textwrap.dedent(
+        """
+        Here's a doc with some LINK embedded in the text
+        THIS SECTION IS A LINK
+        Some more text
+        """
+    ).strip()
+    chunk_1_text = textwrap.dedent(
+        """
+        Some completely different text here
+        ANOTHER LINK embedded in this text
+        ending in a DIFFERENT-LINK
+        """
+    ).strip()
+    test_chunk_0 = InferenceChunk(
+        document_id="test doc 0",
+        source_type=DocumentSource.FILE,
+        chunk_id=0,
+        content=chunk_0_text,
+        source_links={
+            0: "doc 0 base",
+            23: "first line link",
+            49: "second line link",
+        },
+        blurb="anything",
+        semantic_identifier="anything",
+        section_continuation=False,
+        recency_bias=1,
+        boost=0,
+        hidden=False,
+        score=1,
+        metadata={},
+        match_highlights=[],
+        updated_at=None,
+    )
+    test_chunk_1 = InferenceChunk(
+        document_id="test doc 1",
+        source_type=DocumentSource.FILE,
+        chunk_id=0,
+        content=chunk_1_text,
+        source_links={0: "doc 1 base", 36: "2nd line link", 82: "last link"},
+        blurb="whatever",
+        semantic_identifier="whatever",
+        section_continuation=False,
+        recency_bias=1,
+        boost=0,
+        hidden=False,
+        score=1,
+        metadata={},
+        match_highlights=[],
+        updated_at=None,
+    )
+
+    test_quotes = [
+        "a doc with some",  # Basic case
+        "a doc with some LINK",  # Should take the start of quote, even if a link is in it
+        "a doc with some \nLINK",  # Requires a newline deletion fuzzy match
+        "a doc with some link",  # Capitalization insensitive
+        "embedded in this text",  # Fuzzy match to first doc
+        "SECTION IS A LINK",  # Match exact link
+        "some more text",  # Match the end, after every link offset
+        "different taxt",  # Substitution
+        "embedded in this texts",  # Cannot fuzzy match to first doc, fuzzy match to second doc
+        "DIFFERENT-LINK",  # Exact link match at the end
+        "Some complitali",  # Too many edits, shouldn't match anything
+    ]
+
+    results = match_quotes_to_docs(
+        test_quotes, [test_chunk_0, test_chunk_1], fuzzy_search=True
+    )
+    assert results == {
+        "a doc with some": {"document": "test doc 0", "link": "doc 0 base"},
+        "a doc with some LINK": {
+            "document": "test doc 0",
+            "link": "doc 0 base",
+        },
+        "a doc with some \nLINK": {
+            "document": "test doc 0",
+            "link": "doc 0 base",
+        },
+        "a doc with some link": {
+            "document": "test doc 0",
+            "link": "doc 0 base",
+        },
+        "embedded in this text": {
+            "document": "test doc 0",
+            "link": "first line link",
+        },
+        "SECTION IS A LINK": {
+            "document": "test doc 0",
+            "link": "second line link",
+        },
+        "some more text": {
+            "document": "test doc 0",
+            "link": "second line link",
+        },
+        "different taxt": {"document": "test doc 1", "link": "doc 1 base"},
+        "embedded in this texts": {
+            "document": "test doc 1",
+            "link": "2nd line link",
+        },
+        "DIFFERENT-LINK": {"document": "test doc 1", "link": "last link"},
+    }
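
Taken together, these cases pin down the contract of separate_answer_quotes: everything before the first line-initial "quote:" is the answer, each "quote:"-prefixed section (case-insensitive, possibly multiline) is a quote, and "quote:" appearing mid-line is ignored. A minimal sketch that satisfies the five cases above (an illustration only, not the real implementation in quotes_processing):

import re


def separate_answer_quotes_sketch(text: str) -> tuple[str | None, list[str] | None]:
    # Split only on lines that *start* with "quote:" (case-insensitive);
    # inline "quote:" occurrences mid-line are left untouched.
    sections = re.split(r"(?im)^quote:\s*", text)
    answer = sections[0].strip()
    if not answer:
        # No answer means no quotes either (matches test case 3)
        return None, None
    quotes = [q.strip() for q in sections[1:]]
    return answer, quotes or None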

View File

@@ -0,0 +1,277 @@
+from datetime import datetime
+
+import pytest
+
+from danswer.chat.models import CitationInfo
+from danswer.chat.models import DanswerAnswerPiece
+from danswer.chat.models import LlmDoc
+from danswer.configs.constants import DocumentSource
+from danswer.llm.answering.stream_processing.citation_processing import (
+    extract_citations_from_stream,
+)
+from danswer.llm.answering.stream_processing.utils import DocumentIdOrderMapping
+
+"""
+This module contains tests for the citation extraction functionality in Danswer.
+
+The tests focus on the `extract_citations_from_stream` function, which processes
+a stream of tokens and extracts citations, replacing them with properly formatted
+versions including links where available.
+
+Key components:
+- mock_docs: A list of mock LlmDoc objects used for testing.
+- mock_doc_mapping: A dictionary mapping document IDs to their ranks.
+- process_text: A helper function that simulates the citation extraction process.
+- test_citation_extraction: A parametrized test function covering various citation scenarios.
+
+To add new test cases:
+1. Add a new tuple to the @pytest.mark.parametrize decorator of test_citation_extraction.
+2. Each tuple should contain:
+   - A descriptive test name (string)
+   - Input tokens (list of strings)
+   - Expected output text (string)
+   - Expected citations (list of document IDs)
+"""
+
+mock_docs = [
+    LlmDoc(
+        document_id=f"doc_{int(id/2)}",
+        content="Document is a doc",
+        blurb=f"Document #{id}",
+        semantic_identifier=f"Doc {id}",
+        source_type=DocumentSource.WEB,
+        metadata={},
+        updated_at=datetime.now(),
+        link=f"https://{int(id/2)}.com" if int(id / 2) % 2 == 0 else None,
+        source_links={0: "https://mintlify.com/docs/settings/broken-links"},
+    )
+    for id in range(10)
+]
+
+mock_doc_mapping = {
+    "doc_0": 1,
+    "doc_1": 2,
+    "doc_2": 3,
+    "doc_3": 4,
+    "doc_4": 5,
+    "doc_5": 6,
+}
+
+
+@pytest.fixture
+def mock_data() -> tuple[list[LlmDoc], dict[str, int]]:
+    return mock_docs, mock_doc_mapping
+
+
+def process_text(
+    tokens: list[str], mock_data: tuple[list[LlmDoc], dict[str, int]]
+) -> tuple[str, list[CitationInfo]]:
+    mock_docs, mock_doc_id_to_rank_map = mock_data
+    mapping = DocumentIdOrderMapping(order_mapping=mock_doc_id_to_rank_map)
+    result = list(
+        extract_citations_from_stream(
+            tokens=iter(tokens),
+            context_docs=mock_docs,
+            doc_id_to_rank_map=mapping,
+            stop_stream=None,
+        )
+    )
+    final_answer_text = ""
+    citations = []
+    for piece in result:
+        if isinstance(piece, DanswerAnswerPiece):
+            final_answer_text += piece.answer_piece or ""
+        elif isinstance(piece, CitationInfo):
+            citations.append(piece)
+    return final_answer_text, citations
+
+
+@pytest.mark.parametrize(
+    "test_name, input_tokens, expected_text, expected_citations",
+    [
+        (
+            "Single citation",
+            ["Gro", "wth! [", "1", "]", "."],
+            "Growth! [[1]](https://0.com).",
+            ["doc_0"],
+        ),
+        (
+            "Repeated citations",
+            ["Test! ", "[", "1", "]", ". And so", "me more ", "[", "2", "]", "."],
+            "Test! [[1]](https://0.com). And some more [[1]](https://0.com).",
+            ["doc_0"],
+        ),
+        (
+            "Citations at sentence boundaries",
+            [
+                "Citation at the ",
+                "end of a sen",
+                "tence.",
+                "[",
+                "2",
+                "]",
+                " Another sen",
+                "tence.",
+                "[",
+                "4",
+                "]",
+            ],
+            "Citation at the end of a sentence.[[1]](https://0.com) Another sentence.[[2]]()",
+            ["doc_0", "doc_1"],
+        ),
+        (
+            "Citations at beginning, middle, and end",
+            [
+                "[",
+                "1",
+                "]",
+                " Citation at ",
+                "the beginning. ",
+                "[",
+                "3",
+                "]",
+                " In the mid",
+                "dle. At the end ",
+                "[",
+                "5",
+                "]",
+                ".",
+            ],
+            "[[1]](https://0.com) Citation at the beginning. [[2]]() In the middle. At the end [[3]](https://2.com).",
+            ["doc_0", "doc_1", "doc_2"],
+        ),
+        (
+            "Mixed valid and invalid citations",
+            [
+                "Mixed valid and in",
+                "valid citations ",
+                "[",
+                "1",
+                "]",
+                "[",
+                "99",
+                "]",
+                "[",
+                "3",
+                "]",
+                "[",
+                "100",
+                "]",
+                "[",
+                "5",
+                "]",
+                ".",
+            ],
+            "Mixed valid and invalid citations [[1]](https://0.com)[99][[2]]()[100][[3]](https://2.com).",
+            ["doc_0", "doc_1", "doc_2"],
+        ),
+        (
+            "Hardest!",
+            [
+                "Multiple cit",
+                "ations in one ",
+                "sentence [",
+                "1",
+                "]",
+                "[",
+                "4",
+                "]",
+                "[",
+                "5",
+                "]",
+                ". ",
+            ],
+            "Multiple citations in one sentence [[1]](https://0.com)[[2]]()[[3]](https://2.com).",
+            ["doc_0", "doc_1", "doc_2"],
+        ),
+        (
+            "Repeated citations with text",
+            ["[", "1", "]", "Aasf", "asda", "sff ", "[", "1", "]", " ."],
+            "[[1]](https://0.com)Aasfasdasff [[1]](https://0.com) .",
+            ["doc_0"],
+        ),
+        (
+            "Consecutive identical citations!",
+            [
+                "Citations [",
+                "1",
+                "]",
+                "[",
+                "1]",
+                "",
+                "[2",
+                "",
+                "]",
+                ". ",
+            ],
+            "Citations [[1]](https://0.com).",
+            ["doc_0"],
+        ),
+        (
+            "Consecutive identical citations!",
+            [
+                "test [1]tt[1]t",
+                "",
+            ],
+            "test [[1]](https://0.com)ttt",
+            ["doc_0"],
+        ),
+        (
+            "Consecutive identical citations!",
+            [
+                "test [1]t[1]t[1]",
+                "",
+            ],
+            "test [[1]](https://0.com)tt",
+            ["doc_0"],
+        ),
+        (
+            "Repeated citations with text",
+            ["[", "1", "]", "Aasf", "asda", "sff ", "[", "1", "]", " ."],
+            "[[1]](https://0.com)Aasfasdasff [[1]](https://0.com) .",
+            ["doc_0"],
+        ),
+        (
+            "Repeated citations with text",
+            ["[1][", "1", "]t", "[2]"],
+            "[[1]](https://0.com)t",
+            ["doc_0"],
+        ),
+        (
+            "Repeated citations with text",
+            ["[1][", "1", "]t]", "[2]"],
+            "[[1]](https://0.com)t]",
+            ["doc_0"],
+        ),
+        (
+            "Repeated citations with text",
+            ["[1][", "3", "]t]", "[2]"],
+            "[[1]](https://0.com)[[2]]()t]",
+            ["doc_0", "doc_1"],
+        ),
+        (
+            "Repeated citations with text",
+            ["[1", "][", "3", "]t]", "[2]"],
+            "[[1]](https://0.com)[[2]]()t]",
+            ["doc_0", "doc_1"],
+        ),
+    ],
+)
+def test_citation_extraction(
+    mock_data: tuple[list[LlmDoc], dict[str, int]],
+    test_name: str,
+    input_tokens: list[str],
+    expected_text: str,
+    expected_citations: list[str],
+) -> None:
+    final_answer_text, citations = process_text(input_tokens, mock_data)
+    assert (
+        final_answer_text.strip() == expected_text.strip()
+    ), f"Test '{test_name}' failed: Final answer text does not match expected output."
+    assert [
+        citation.document_id for citation in citations
+    ] == expected_citations, (
+        f"Test '{test_name}' failed: Citations do not match expected output."
+    )
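
Following the module docstring's recipe, a new scenario is just one more tuple in the parametrize list. For example, an untested, hypothetical case for two adjacent distinct citations, using the mapping established above where input [1] resolves to doc_0 (rank 1, linked) and input [3] resolves to doc_1 (rank 2, no link):

        # Hypothetical additional case, not part of this commit
        (
            "Adjacent distinct citations",
            ["See [", "1", "][", "3", "]", "."],
            "See [[1]](https://0.com)[[2]]().",
            ["doc_0", "doc_1"],
        ),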