import unittest from danswer.chunking.chunk import chunk_document from danswer.chunking.chunk import chunk_large_section from danswer.configs.constants import DocumentSource from danswer.connectors.models import Document from danswer.connectors.models import Section WAR_AND_PEACE = ( "Well, Prince, so Genoa and Lucca are now just family estates of the Buonapartes. But I warn you, " "if you don’t tell me that this means war, if you still try to defend the infamies and horrors perpetrated by " "that Antichrist—I really believe he is Antichrist—I will have nothing more to do with you and you are no longer " "my friend, no longer my ‘faithful slave,’ as you call yourself! But how do you do? I see I have frightened " "you—sit down and tell me all the news." ) class TestDocumentChunking(unittest.TestCase): def setUp(self) -> None: self.large_section = Section(text=WAR_AND_PEACE, link="https://www.test.com/") self.large_unbroken_section = Section( text="0123456789" * 40, link="https://www.test.com/" ) self.document = Document( id="test_document", sections=[ Section( text="Here is some testing text", link="https://www.test.com/0" ), Section( text="Some more text, still under 100 chars", link="https://www.test.com/1", ), Section( text="Now with this section it's longer than the chunk size", link="https://www.test.com/2", ), self.large_section, Section(text="These last 2 sections", link="https://www.test.com/4"), Section( text="should be combined into one", link="https://www.test.com/5" ), ], source=DocumentSource.WEB, # arbitrary picking web, doens't matter for this test semantic_identifier="Whatever", metadata={}, ) def test_chunk_large_section(self) -> None: chunks = chunk_large_section( section=self.large_section, document=self.document, start_chunk_id=5, chunk_size=100, word_overlap=3, ) contents = [chunk.content for chunk in chunks] self.assertEqual(len(contents), 5) self.assertEqual(contents[0], WAR_AND_PEACE[:100]) self.assertEqual( contents[-2], WAR_AND_PEACE[-172:-62] ) # slightly longer than 100 due to overlap self.assertEqual( contents[-1], WAR_AND_PEACE[-125:] ) # large overlap with second to last segment self.assertFalse(chunks[0].section_continuation) self.assertTrue(chunks[1].section_continuation) self.assertTrue(chunks[-1].section_continuation) def test_chunk_max_overflow(self) -> None: chunks = chunk_large_section( section=self.large_unbroken_section, document=self.document, start_chunk_id=5, chunk_size=100, word_overlap=3, ) contents = [chunk.content for chunk in chunks] self.assertEqual(len(contents), 4) self.assertEqual(contents[0], self.large_unbroken_section.text[:150]) self.assertEqual(contents[1], self.large_unbroken_section.text[50:250]) self.assertEqual(contents[2], self.large_unbroken_section.text[150:350]) # Last chunk counts back from the end, full chunk size (100) + 50 overlap => 400 - 150 = 250 self.assertEqual(contents[3], self.large_unbroken_section.text[250:]) def test_chunk_document(self) -> None: chunks = chunk_document(self.document, chunk_size=100, subsection_overlap=3) self.assertEqual(len(chunks), 8) self.assertEqual( chunks[0].content, self.document.sections[0].text + "\n\n" + self.document.sections[1].text, ) self.assertEqual( chunks[0].source_links, {0: "https://www.test.com/0", 21: "https://www.test.com/1"}, ) self.assertEqual( chunks[-1].source_links, {0: "https://www.test.com/4", 18: "https://www.test.com/5"}, ) self.assertEqual(chunks[5].chunk_id, 5) self.assertEqual(chunks[6].source_document, self.document) if __name__ == "__main__": unittest.main()