From 9651ea828b02f3832f1eeaf3fb6fd032cc839051 Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Wed, 24 Jul 2024 11:05:56 -0700
Subject: [PATCH] Handling Metadata by Vector and Keyword (#1909)

---
 backend/danswer/configs/constants.py          |   1 -
 backend/danswer/connectors/models.py          |   6 +-
 backend/danswer/document_index/vespa/index.py |  10 +-
 backend/danswer/indexing/chunker.py           | 147 +++++++++++-------
 backend/danswer/indexing/embedder.py          |   7 +-
 backend/danswer/indexing/models.py            |   8 +-
 .../search/postprocessing/postprocessing.py   |  12 +-
 .../unit/danswer/indexing/test_chunker.py     |   4 +-
 8 files changed, 119 insertions(+), 76 deletions(-)

diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py
index fb14bb4b3d..c536d9189d 100644
--- a/backend/danswer/configs/constants.py
+++ b/backend/danswer/configs/constants.py
@@ -44,7 +44,6 @@ QUERY_EVENT_ID = "query_event_id"
 LLM_CHUNKS = "llm_chunks"
 
 # For chunking/processing chunks
-MAX_CHUNK_TITLE_LEN = 1000
 RETURN_SEPARATOR = "\n\r\n"
 SECTION_SEPARATOR = "\n\n"
 # For combining attributes, doesn't have to be unique/perfect to work
diff --git a/backend/danswer/connectors/models.py b/backend/danswer/connectors/models.py
index 11925bdc6a..1f4a1d2ae3 100644
--- a/backend/danswer/connectors/models.py
+++ b/backend/danswer/connectors/models.py
@@ -114,7 +114,9 @@ class DocumentBase(BaseModel):
     title: str | None = None
     from_ingestion_api: bool = False
 
-    def get_title_for_document_index(self) -> str | None:
+    def get_title_for_document_index(
+        self,
+    ) -> str | None:
         # If title is explicitly empty, return a None here for embedding purposes
         if self.title == "":
             return None
@@ -123,8 +125,6 @@ class DocumentBase(BaseModel):
         for char in replace_chars:
             title = title.replace(char, " ")
         title = title.strip()
-        # Title could be quite long here as there is no truncation done
-        # just prior to embedding, it could be truncated
         return title
 
     def get_metadata_str_attributes(self) -> list[str] | None:
diff --git a/backend/danswer/document_index/vespa/index.py b/backend/danswer/document_index/vespa/index.py
index d5815be601..c238da6e7d 100644
--- a/backend/danswer/document_index/vespa/index.py
+++ b/backend/danswer/document_index/vespa/index.py
@@ -352,11 +352,15 @@ def _index_vespa_chunk(
         BLURB: remove_invalid_unicode_chars(chunk.blurb),
         TITLE: remove_invalid_unicode_chars(title) if title else None,
         SKIP_TITLE_EMBEDDING: not title,
-        CONTENT: remove_invalid_unicode_chars(chunk.content),
+        # For the BM25 index, the keyword suffix is used; the vector is already generated with the
+        # more natural language representation of the metadata section
+        CONTENT: remove_invalid_unicode_chars(
+            f"{chunk.title_prefix}{chunk.content}{chunk.metadata_suffix_keyword}"
+        ),
         # This duplication of `content` is needed for keyword highlighting
         # Note that it's not exactly the same as the actual content
         # which contains the title prefix and metadata suffix
-        CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content_summary),
+        CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content),
         SOURCE_TYPE: str(document.source.value),
         SOURCE_LINKS: json.dumps(chunk.source_links),
         SEMANTIC_IDENTIFIER: remove_invalid_unicode_chars(document.semantic_identifier),
@@ -364,7 +368,7 @@ def _index_vespa_chunk(
         METADATA: json.dumps(document.metadata),
         # Save as a list for efficient extraction as an Attribute
         METADATA_LIST: chunk.source_document.get_metadata_str_attributes(),
-        METADATA_SUFFIX: chunk.metadata_suffix,
+        METADATA_SUFFIX: chunk.metadata_suffix_keyword,
         EMBEDDINGS: embeddings_name_vector_map,
         TITLE_EMBEDDING: chunk.title_embedding,
         BOOST: chunk.boost,
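# Illustrative sketch (not part of the patch), with assumed example values, of what the two
# Vespa fields above end up holding for one chunk: CONTENT (the BM25-indexed text) now carries
# the title prefix and the keyword-style metadata suffix, while CONTENT_SUMMARY keeps the bare
# chunk text so keyword highlighting is not polluted by the prefix/suffix.
title_prefix = "Onboarding Guide\n\r\n"
chunk_content = "Step 1: request laptop access..."
metadata_suffix_keyword = "\n\r\nJohn Doe Engineering"

content_field = f"{title_prefix}{chunk_content}{metadata_suffix_keyword}"  # indexed for BM25
content_summary_field = chunk_content  # used only for keyword highlighting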
diff --git a/backend/danswer/indexing/chunker.py b/backend/danswer/indexing/chunker.py
index 1b4d65afbc..cddcd0c195 100644
--- a/backend/danswer/indexing/chunker.py
+++ b/backend/danswer/indexing/chunker.py
@@ -6,7 +6,6 @@ from danswer.configs.app_configs import BLURB_SIZE
 from danswer.configs.app_configs import MINI_CHUNK_SIZE
 from danswer.configs.app_configs import SKIP_METADATA_IN_CHUNK
 from danswer.configs.constants import DocumentSource
-from danswer.configs.constants import MAX_CHUNK_TITLE_LEN
 from danswer.configs.constants import RETURN_SEPARATOR
 from danswer.configs.constants import SECTION_SEPARATOR
 from danswer.configs.model_configs import DOC_EMBEDDING_CONTEXT_SIZE
@@ -20,7 +19,7 @@ from danswer.utils.logger import setup_logger
 from danswer.utils.text_processing import shared_precompare_cleanup
 
 if TYPE_CHECKING:
-    from transformers import AutoTokenizer  # type:ignore
+    from llama_index.text_splitter import SentenceSplitter  # type:ignore
 
 
 # Not supporting overlaps, we need a clean combination of chunks and it is unclear if overlaps
@@ -28,6 +27,8 @@ if TYPE_CHECKING:
 CHUNK_OVERLAP = 0
 # Fairly arbitrary numbers but the general concept is we don't want the title/metadata to
 # overwhelm the actual contents of the chunk
+# For example, in a rare case the metadata could be 128 tokens of a 512 token chunk and the title prefix
+# could be another 128 tokens, leaving 256 for the actual contents
 MAX_METADATA_PERCENTAGE = 0.25
 CHUNK_MIN_CONTENT = 256
 
@@ -36,14 +37,7 @@
 logger = setup_logger()
 
 ChunkFunc = Callable[[Document], list[DocAwareChunk]]
 
 
-def extract_blurb(text: str, blurb_size: int) -> str:
-    from llama_index.text_splitter import SentenceSplitter
-
-    token_count_func = get_default_tokenizer().tokenize
-    blurb_splitter = SentenceSplitter(
-        tokenizer=token_count_func, chunk_size=blurb_size, chunk_overlap=0
-    )
-
+def extract_blurb(text: str, blurb_splitter: "SentenceSplitter") -> str:
     return blurb_splitter.split_text(text)[0]
 
 
@@ -52,33 +46,25 @@ def chunk_large_section(
     section_link_text: str,
     document: Document,
     start_chunk_id: int,
-    tokenizer: "AutoTokenizer",
-    chunk_size: int = DOC_EMBEDDING_CONTEXT_SIZE,
-    chunk_overlap: int = CHUNK_OVERLAP,
-    blurb_size: int = BLURB_SIZE,
+    blurb: str,
+    chunk_splitter: "SentenceSplitter",
     title_prefix: str = "",
-    metadata_suffix: str = "",
+    metadata_suffix_semantic: str = "",
+    metadata_suffix_keyword: str = "",
 ) -> list[DocAwareChunk]:
-    from llama_index.text_splitter import SentenceSplitter
-
-    blurb = extract_blurb(section_text, blurb_size)
-
-    sentence_aware_splitter = SentenceSplitter(
-        tokenizer=tokenizer.tokenize, chunk_size=chunk_size, chunk_overlap=chunk_overlap
-    )
-
-    split_texts = sentence_aware_splitter.split_text(section_text)
+    split_texts = chunk_splitter.split_text(section_text)
 
     chunks = [
         DocAwareChunk(
            source_document=document,
            chunk_id=start_chunk_id + chunk_ind,
            blurb=blurb,
-           content=f"{title_prefix}{chunk_str}{metadata_suffix}",
-           content_summary=chunk_str,
+           content=chunk_str,
            source_links={0: section_link_text},
            section_continuation=(chunk_ind != 0),
-           metadata_suffix=metadata_suffix,
+           title_prefix=title_prefix,
+           metadata_suffix_semantic=metadata_suffix_semantic,
+           metadata_suffix_keyword=metadata_suffix_keyword,
        )
        for chunk_ind, chunk_str in enumerate(split_texts)
    ]
@@ -86,42 +72,87 @@ def chunk_large_section(
 
 
 def _get_metadata_suffix_for_document_index(
-    metadata: dict[str, str | list[str]]
-) -> str:
+    metadata: dict[str, str | list[str]], include_separator: bool = False
+) -> tuple[str, str]:
+    """
+    Returns the metadata as a natural language string representation with all of the keys and values for the vector embedding
+    and a string of all of the values for the keyword search
+
+    For example, if we have the following metadata:
+    {
+        "author": "John Doe",
+        "space": "Engineering"
+    }
+    The vector embedding string should include the relation between the key and value, whereas for keyword we only want John Doe
+    and Engineering. The keys are repetitive and much noisier.
+    """
    if not metadata:
-        return ""
+        return "", ""
+
    metadata_str = "Metadata:\n"
+    metadata_values = []
    for key, value in metadata.items():
        if key in get_metadata_keys_to_ignore():
            continue
+
        value_str = ", ".join(value) if isinstance(value, list) else value
+
+        if isinstance(value, list):
+            metadata_values.extend(value)
+        else:
+            metadata_values.append(value)
+
        metadata_str += f"\t{key} - {value_str}\n"
-    return metadata_str.strip()
+
+    metadata_semantic = metadata_str.strip()
+    metadata_keyword = " ".join(metadata_values)
+
+    if include_separator:
+        return RETURN_SEPARATOR + metadata_semantic, RETURN_SEPARATOR + metadata_keyword
+    return metadata_semantic, metadata_keyword
 
 
 def chunk_document(
    document: Document,
    chunk_tok_size: int = DOC_EMBEDDING_CONTEXT_SIZE,
    subsection_overlap: int = CHUNK_OVERLAP,
-    blurb_size: int = BLURB_SIZE,
+    blurb_size: int = BLURB_SIZE,  # Used for both title and content
    include_metadata: bool = not SKIP_METADATA_IN_CHUNK,
 ) -> list[DocAwareChunk]:
+    from llama_index.text_splitter import SentenceSplitter
+
    tokenizer = get_default_tokenizer()
 
-    title = document.get_title_for_document_index()
-    title_prefix = f"{title[:MAX_CHUNK_TITLE_LEN]}{RETURN_SEPARATOR}" if title else ""
+    blurb_splitter = SentenceSplitter(
+        tokenizer=tokenizer.tokenize, chunk_size=blurb_size, chunk_overlap=0
+    )
+
+    chunk_splitter = SentenceSplitter(
+        tokenizer=tokenizer.tokenize,
+        chunk_size=chunk_tok_size,
+        chunk_overlap=subsection_overlap,
+    )
+
+    title = extract_blurb(document.get_title_for_document_index() or "", blurb_splitter)
+    title_prefix = title + RETURN_SEPARATOR if title else ""
    title_tokens = len(tokenizer.tokenize(title_prefix))
 
-    metadata_suffix = ""
+    metadata_suffix_semantic = ""
+    metadata_suffix_keyword = ""
    metadata_tokens = 0
    if include_metadata:
-        metadata = _get_metadata_suffix_for_document_index(document.metadata)
-        metadata_suffix = RETURN_SEPARATOR + metadata if metadata else ""
-        metadata_tokens = len(tokenizer.tokenize(metadata_suffix))
+        (
+            metadata_suffix_semantic,
+            metadata_suffix_keyword,
+        ) = _get_metadata_suffix_for_document_index(
+            document.metadata, include_separator=True
+        )
+        metadata_tokens = len(tokenizer.tokenize(metadata_suffix_semantic))
 
    if metadata_tokens >= chunk_tok_size * MAX_METADATA_PERCENTAGE:
-        metadata_suffix = ""
+        # Note: we can keep the keyword suffix even if the semantic suffix is too long to fit in the model
+        # context; there is no length limit for the keyword component
+        metadata_suffix_semantic = ""
        metadata_tokens = 0
 
    content_token_limit = chunk_tok_size - title_tokens - metadata_tokens
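# Illustrative sketch (not part of the patch): a minimal standalone rendition of the two
# metadata representations produced by _get_metadata_suffix_for_document_index, using the
# docstring's example. The helper name is hypothetical, and it skips the keys-to-ignore filter
# and the include_separator handling.
def render_metadata(metadata: dict[str, str | list[str]]) -> tuple[str, str]:
    semantic = "Metadata:\n"
    keyword_values: list[str] = []
    for key, value in metadata.items():
        value_str = ", ".join(value) if isinstance(value, list) else value
        keyword_values.extend(value if isinstance(value, list) else [value])
        semantic += f"\t{key} - {value_str}\n"
    return semantic.strip(), " ".join(keyword_values)


semantic, keyword = render_metadata({"author": "John Doe", "space": "Engineering"})
# semantic -> "Metadata:\n\tauthor - John Doe\n\tspace - Engineering"  (embedded with the chunk)
# keyword  -> "John Doe Engineering"                                   (appended for BM25 only)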
@@ -130,7 +161,7 @@
    if content_token_limit <= CHUNK_MIN_CONTENT:
        content_token_limit = chunk_tok_size
        title_prefix = ""
-        metadata_suffix = ""
+        metadata_suffix_semantic = ""
 
    chunks: list[DocAwareChunk] = []
    link_offsets: dict[int, str] = {}
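# Illustrative sketch (not part of the patch) of the token budget above, with assumed example
# numbers: a 512 token chunk, a 20 token title prefix, and a 100 token semantic metadata suffix.
chunk_tok_size = 512
MAX_METADATA_PERCENTAGE = 0.25
CHUNK_MIN_CONTENT = 256

title_tokens = 20
metadata_tokens = 100

# The semantic metadata suffix is only dropped if it would eat more than 25% of the chunk
if metadata_tokens >= chunk_tok_size * MAX_METADATA_PERCENTAGE:  # 100 < 128, so it is kept
    metadata_tokens = 0

content_token_limit = chunk_tok_size - title_tokens - metadata_tokens  # 512 - 20 - 100 = 392

# If too little room is left for the actual content, the prefix/suffix are skipped entirely
if content_token_limit <= CHUNK_MIN_CONTENT:  # 392 > 256, so prefix/suffix stay
    content_token_limit = chunk_tok_size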
@@ -151,12 +182,13 @@
                DocAwareChunk(
                    source_document=document,
                    chunk_id=len(chunks),
-                    blurb=extract_blurb(chunk_text, blurb_size),
-                    content=f"{title_prefix}{chunk_text}{metadata_suffix}",
-                    content_summary=chunk_text,
+                    blurb=extract_blurb(chunk_text, blurb_splitter),
+                    content=chunk_text,
                    source_links=link_offsets,
                    section_continuation=False,
-                    metadata_suffix=metadata_suffix,
+                    title_prefix=title_prefix,
+                    metadata_suffix_semantic=metadata_suffix_semantic,
+                    metadata_suffix_keyword=metadata_suffix_keyword,
                )
            )
            link_offsets = {}
@@ -167,12 +199,11 @@
                section_link_text=section_link_text,
                document=document,
                start_chunk_id=len(chunks),
-                tokenizer=tokenizer,
-                chunk_size=content_token_limit,
-                chunk_overlap=subsection_overlap,
-                blurb_size=blurb_size,
+                chunk_splitter=chunk_splitter,
+                blurb=extract_blurb(section_text, blurb_splitter),
                title_prefix=title_prefix,
-                metadata_suffix=metadata_suffix,
+                metadata_suffix_semantic=metadata_suffix_semantic,
+                metadata_suffix_keyword=metadata_suffix_keyword,
            )
            chunks.extend(large_section_chunks)
            continue
@@ -193,12 +224,13 @@
                DocAwareChunk(
                    source_document=document,
                    chunk_id=len(chunks),
-                    blurb=extract_blurb(chunk_text, blurb_size),
-                    content=f"{title_prefix}{chunk_text}{metadata_suffix}",
-                    content_summary=chunk_text,
+                    blurb=extract_blurb(chunk_text, blurb_splitter),
+                    content=chunk_text,
                    source_links=link_offsets,
                    section_continuation=False,
-                    metadata_suffix=metadata_suffix,
+                    title_prefix=title_prefix,
+                    metadata_suffix_semantic=metadata_suffix_semantic,
+                    metadata_suffix_keyword=metadata_suffix_keyword,
                )
            )
            link_offsets = {0: section_link_text}
@@ -211,12 +243,13 @@
        DocAwareChunk(
            source_document=document,
            chunk_id=len(chunks),
-            blurb=extract_blurb(chunk_text, blurb_size),
-            content=f"{title_prefix}{chunk_text}{metadata_suffix}",
-            content_summary=chunk_text,
+            blurb=extract_blurb(chunk_text, blurb_splitter),
+            content=chunk_text,
            source_links=link_offsets,
            section_continuation=False,
-            metadata_suffix=metadata_suffix,
+            title_prefix=title_prefix,
+            metadata_suffix_semantic=metadata_suffix_semantic,
+            metadata_suffix_keyword=metadata_suffix_keyword,
        )
    )
    return chunks
diff --git a/backend/danswer/indexing/embedder.py b/backend/danswer/indexing/embedder.py
index 3121839832..1244232e43 100644
--- a/backend/danswer/indexing/embedder.py
+++ b/backend/danswer/indexing/embedder.py
@@ -81,9 +81,12 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
        chunk_texts: list[str] = []
        chunk_mini_chunks_count = {}
        for chunk_ind, chunk in enumerate(chunks):
-            chunk_texts.append(chunk.content)
+            # The whole chunk, including the prefix/suffix, is included in the overall vector representation
+            chunk_texts.append(
+                f"{chunk.title_prefix}{chunk.content}{chunk.metadata_suffix_semantic}"
+            )
            mini_chunk_texts = (
-                split_chunk_text_into_mini_chunks(chunk.content_summary)
+                split_chunk_text_into_mini_chunks(chunk.content)
                if enable_mini_chunk
                else []
            )
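# Illustrative sketch (not part of the patch), with assumed example values, of what the embedder
# above now feeds to the embedding model versus what the mini-chunks are split from.
title_prefix = "Onboarding Guide\n\r\n"
chunk_content = "Step 1: request laptop access..."
metadata_suffix_semantic = "\n\r\nMetadata:\n\tauthor - John Doe\n\tspace - Engineering"

embedded_text = f"{title_prefix}{chunk_content}{metadata_suffix_semantic}"  # full vector text
mini_chunk_source = chunk_content  # mini-chunks are built from the bare chunk content only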
diff --git a/backend/danswer/indexing/models.py b/backend/danswer/indexing/models.py
index 44fe2dd5a6..207ea30938 100644
--- a/backend/danswer/indexing/models.py
+++ b/backend/danswer/indexing/models.py
@@ -36,15 +36,13 @@ class DocAwareChunk(BaseChunk):
    # During inference we only have access to the document id and do not reconstruct the Document
    source_document: Document
 
-    # The Vespa documents require a separate highlight field. Since it is stored as a duplicate anyway,
-    # it's easier to just store a not prefixed/suffixed string for the highlighting
-    # Also during the chunking, this non-prefixed/suffixed string is used for mini-chunks
-    content_summary: str
+    title_prefix: str
 
    # During indexing we also (optionally) build a metadata string from the metadata dict
    # This is also indexed so that we can strip it out after indexing, this way it supports
    # multiple iterations of metadata representation for backwards compatibility
-    metadata_suffix: str
+    metadata_suffix_semantic: str
+    metadata_suffix_keyword: str
 
    def to_short_descriptor(self) -> str:
        """Used when logging the identity of a chunk"""
diff --git a/backend/danswer/search/postprocessing/postprocessing.py b/backend/danswer/search/postprocessing/postprocessing.py
index 2ab5976127..3a13689c93 100644
--- a/backend/danswer/search/postprocessing/postprocessing.py
+++ b/backend/danswer/search/postprocessing/postprocessing.py
@@ -4,7 +4,7 @@ from typing import cast
 
 import numpy
 
-from danswer.configs.constants import MAX_CHUNK_TITLE_LEN
+from danswer.configs.app_configs import BLURB_SIZE
 from danswer.configs.constants import RETURN_SEPARATOR
 from danswer.configs.model_configs import CROSS_ENCODER_RANGE_MAX
 from danswer.configs.model_configs import CROSS_ENCODER_RANGE_MIN
@@ -60,8 +60,14 @@ def cleanup_chunks(chunks: list[InferenceChunkUncleaned]) -> list[InferenceChunk
 
        if chunk.content.startswith(chunk.title):
            return chunk.content[len(chunk.title) :].lstrip()
 
-        if chunk.content.startswith(chunk.title[:MAX_CHUNK_TITLE_LEN]):
-            return chunk.content[MAX_CHUNK_TITLE_LEN:].lstrip()
+        # BLURB_SIZE is by token instead of char, but each token is at least 1 char
+        # If this prefix matches the content, it's assumed the title was prepended
+        if chunk.content.startswith(chunk.title[:BLURB_SIZE]):
+            return (
+                chunk.content.split(RETURN_SEPARATOR, 1)[-1]
+                if RETURN_SEPARATOR in chunk.content
+                else chunk.content
+            )
 
        return chunk.content
diff --git a/backend/tests/unit/danswer/indexing/test_chunker.py b/backend/tests/unit/danswer/indexing/test_chunker.py
index 1cd6f66f0c..54e902eb08 100644
--- a/backend/tests/unit/danswer/indexing/test_chunker.py
+++ b/backend/tests/unit/danswer/indexing/test_chunker.py
@@ -31,8 +31,8 @@ def test_chunk_document() -> None:
    chunks = chunk_document(document)
 
    assert len(chunks) == 5
-    assert all(semantic_identifier in chunk.content for chunk in chunks)
    assert short_section_1 in chunks[0].content
    assert short_section_3 in chunks[-1].content
    assert short_section_4 in chunks[-1].content
-    assert "tag1" in chunks[0].content
+    assert "tag1" in chunks[0].metadata_suffix_keyword
+    assert "tag2" in chunks[0].metadata_suffix_semantic
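# Illustrative sketch (not part of the patch) of the new title-stripping branch in
# cleanup_chunks, using an assumed small BLURB_SIZE so the example stays short. The content
# starts with a blurb-truncated form of the title, so the exact-title check fails, but the
# BLURB_SIZE-prefix check matches and everything before the first RETURN_SEPARATOR is dropped.
RETURN_SEPARATOR = "\n\r\n"
BLURB_SIZE = 16  # assumed value for the example; the real config value is larger

title = "Onboarding Guide for New Engineering Hires"
prepended_title = "Onboarding Guide"  # hypothetical blurb-truncated title prefix
content = f"{prepended_title}{RETURN_SEPARATOR}Step 1: request laptop access..."

assert not content.startswith(title)
assert content.startswith(title[:BLURB_SIZE])

cleaned = content.split(RETURN_SEPARATOR, 1)[-1] if RETURN_SEPARATOR in content else content
# cleaned -> "Step 1: request laptop access..."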