DAN-50 References should include blurb (#26)

This commit is contained in:
Yuhong Sun 2023-05-10 21:03:15 -07:00 committed by GitHub
parent 38bcb3ee6b
commit 279c5e0eb1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 54 additions and 1 deletion

View File

@ -1,7 +1,9 @@
import abc
import re
from collections.abc import Callable
from danswer.chunking.models import IndexChunk
from danswer.configs.app_configs import BLURB_LENGTH
from danswer.configs.app_configs import CHUNK_OVERLAP
from danswer.configs.app_configs import CHUNK_SIZE
from danswer.connectors.models import Document
@ -12,14 +14,44 @@ SECTION_SEPARATOR = "\n\n"
ChunkFunc = Callable[[Document], list[IndexChunk]]
def extract_blurb(text: str, blurb_len: int) -> str:
    """Return a short preview ("blurb") of *text*.

    The blurb ends at the first sentence-ending punctuation ('.', '!', '?',
    ':') found after ``blurb_len`` characters, capped at ``2 * blurb_len``
    characters. If that cut would split a word, it backs up to the last
    space before the cut; if the text is one unbroken word, the whole text
    is returned. Newlines/carriage returns are replaced by spaces and runs
    of spaces are collapsed to a single space.

    Args:
        text: Source text to summarize.
        blurb_len: Minimum number of characters before looking for a
            sentence boundary.

    Returns:
        The extracted, whitespace-normalized blurb.
    """
    if len(text) < blurb_len:
        return text
    match = re.search(r"[.!?:]", text[blurb_len:])
    max_blurb_len = min(2 * blurb_len, len(text))
    end_index = (
        max_blurb_len
        if match is None
        else min(blurb_len + match.start() + 1, max_blurb_len)
    )
    # Avoid cutting mid-word: if the character just past the cut is not
    # whitespace (or end-of-string), back up to the previous space.
    if text[end_index : end_index + 1] not in [" ", "", "\r", "\n"]:
        last_space = text.rfind(" ", 0, end_index)
        # If there's no space in the text (single word longer than blurb_len), return the whole text
        end_index = last_space if last_space != -1 else len(text)
    blurb = text[:end_index]
    blurb = blurb.replace("\n", " ")
    blurb = blurb.replace("\r", " ")
    # Collapse runs of spaces to one. NOTE: the original checked/replaced a
    # SINGLE space with a single space, which never terminates once any
    # space is present — the intended target is the double space.
    while "  " in blurb:
        blurb = blurb.replace("  ", " ")
    return blurb
def chunk_large_section(
section: Section,
document: Document,
start_chunk_id: int,
chunk_size: int = CHUNK_SIZE,
word_overlap: int = CHUNK_OVERLAP,
blurb_len: int = BLURB_LENGTH,
) -> list[IndexChunk]:
section_text = section.text
blurb = extract_blurb(section_text, blurb_len)
char_count = len(section_text)
chunk_strs: list[str] = []
start_pos = segment_start_pos = 0
@ -61,6 +93,7 @@ def chunk_large_section(
IndexChunk(
source_document=document,
chunk_id=start_chunk_id + chunk_ind,
blurb=blurb,
content=chunk_str,
source_links={0: section.link},
section_continuation=(chunk_ind != 0),
@ -73,6 +106,7 @@ def chunk_document(
document: Document,
chunk_size: int = CHUNK_SIZE,
subsection_overlap: int = CHUNK_OVERLAP,
blurb_len=BLURB_LENGTH,
) -> list[IndexChunk]:
chunks: list[IndexChunk] = []
link_offsets: dict[int, str] = {}
@ -90,6 +124,7 @@ def chunk_document(
IndexChunk(
source_document=document,
chunk_id=len(chunks),
blurb=extract_blurb(chunk_text, blurb_len),
content=chunk_text,
source_links=link_offsets,
section_continuation=False,
@ -104,6 +139,7 @@ def chunk_document(
start_chunk_id=len(chunks),
chunk_size=chunk_size,
word_overlap=subsection_overlap,
blurb_len=blurb_len,
)
chunks.extend(large_section_chunks)
continue
@ -119,6 +155,7 @@ def chunk_document(
IndexChunk(
source_document=document,
chunk_id=len(chunks),
blurb=extract_blurb(chunk_text, blurb_len),
content=chunk_text,
source_links=link_offsets,
section_continuation=False,
@ -133,6 +170,7 @@ def chunk_document(
IndexChunk(
source_document=document,
chunk_id=len(chunks),
blurb=extract_blurb(chunk_text, blurb_len),
content=chunk_text,
source_links=link_offsets,
section_continuation=False,

View File

@ -8,6 +8,7 @@ from danswer.connectors.models import Document
@dataclass
class BaseChunk:
chunk_id: int
blurb: str # The first sentence(s) of the first Section of the chunk
content: str
source_links: Optional[
dict[int, str]

View File

@ -7,6 +7,12 @@ APP_HOST = "0.0.0.0"
APP_PORT = 8080
#####
# User Facing Features Configs
#####
BLURB_LENGTH = 200 # Characters. Blurbs will be truncated at the first punctuation after this many characters.
#####
# Vector DB Configs
#####

View File

@ -2,6 +2,7 @@ from enum import Enum
DOCUMENT_ID = "document_id"
CHUNK_ID = "chunk_id"
BLURB = "blurb"
CONTENT = "content"
SOURCE_TYPE = "source_type"
SOURCE_LINKS = "source_links"

View File

@ -121,7 +121,7 @@ class BatchGoogleDriveLoader(BatchLoader):
doc_batch = []
for file in files_batch:
text_contents = extract_text(file, service)
full_context = file["name"] + " " + text_contents
full_context = file["name"] + " - " + text_contents
doc_batch.append(
Document(

View File

@ -3,6 +3,7 @@ import uuid
from danswer.chunking.models import EmbeddedIndexChunk
from danswer.configs.constants import ALLOWED_GROUPS
from danswer.configs.constants import ALLOWED_USERS
from danswer.configs.constants import BLURB
from danswer.configs.constants import CHUNK_ID
from danswer.configs.constants import CONTENT
from danswer.configs.constants import DOCUMENT_ID
@ -58,6 +59,7 @@ def index_chunks(
payload={
DOCUMENT_ID: document.id,
CHUNK_ID: chunk.chunk_id,
BLURB: chunk.blurb,
CONTENT: chunk.content,
SOURCE_TYPE: str(document.source.value),
SOURCE_LINKS: chunk.source_links,

View File

@ -12,6 +12,7 @@ import regex
from danswer.chunking.models import InferenceChunk
from danswer.configs.app_configs import OPENAI_API_KEY
from danswer.configs.app_configs import QUOTE_ALLOWED_ERROR_PERCENT
from danswer.configs.constants import BLURB
from danswer.configs.constants import DOCUMENT_ID
from danswer.configs.constants import SEMANTIC_IDENTIFIER
from danswer.configs.constants import SOURCE_LINK
@ -140,6 +141,7 @@ def match_quotes_to_docs(
SOURCE_LINK: curr_link,
SOURCE_TYPE: chunk.source_type,
SEMANTIC_IDENTIFIER: chunk.semantic_identifier,
BLURB: chunk.blurb,
}
break
quotes_dict[quote] = {
@ -147,6 +149,7 @@ def match_quotes_to_docs(
SOURCE_LINK: curr_link,
SOURCE_TYPE: chunk.source_type,
SEMANTIC_IDENTIFIER: chunk.semantic_identifier,
BLURB: chunk.blurb,
}
break
return quotes_dict

View File

@ -4,6 +4,7 @@ import json
import requests
from danswer.configs.app_configs import APP_PORT
from danswer.configs.app_configs import QDRANT_DEFAULT_COLLECTION
from danswer.configs.constants import BLURB
from danswer.configs.constants import SEMANTIC_IDENTIFIER
from danswer.configs.constants import SOURCE_LINK
from danswer.configs.constants import SOURCE_TYPE
@ -84,6 +85,7 @@ if __name__ == "__main__":
):
print(f"Quote {str(ind + 1)}:\n{quote}")
print(f"Semantic Identifier: {quote_info[SEMANTIC_IDENTIFIER]}")
print(f"Blurb: {quote_info[BLURB]}")
print(f"Link: {quote_info[SOURCE_LINK]}")
print(f"Source: {quote_info[SOURCE_TYPE]}")
else: