Mirror of https://github.com/danswer-ai/danswer.git

Include Titles in Chunks (#1817)
parent 8c312482c1
commit e90c66c1b6
@@ -10,8 +10,8 @@ import sqlalchemy as sa

revision = "7aea705850d5"
down_revision = "4505fd7302e1"
branch_labels = None
depends_on = None
branch_labels: None = None
depends_on: None = None


def upgrade() -> None:
@@ -243,13 +243,15 @@ DISABLE_INDEX_UPDATE_ON_SWAP = (
# fairly large amount of memory in order to increase substantially, since
# each worker loads the embedding models into memory.
NUM_INDEXING_WORKERS = int(os.environ.get("NUM_INDEXING_WORKERS") or 1)
CHUNK_OVERLAP = 0
# More accurate results at the expense of indexing speed and index size (stores additional 4 MINI_CHUNK vectors)
ENABLE_MINI_CHUNK = os.environ.get("ENABLE_MINI_CHUNK", "").lower() == "true"
# Finer grained chunking for more detail retention
# Slightly larger since the sentence aware split is a max cutoff so most minichunks will be under MINI_CHUNK_SIZE
# tokens. But we need it to be at least as big as 1/4th chunk size to avoid having a tiny mini-chunk at the end
MINI_CHUNK_SIZE = 150
# Include the document level metadata in each chunk. If the metadata is too long, then it is thrown out
# We don't want the metadata to overwhelm the actual contents of the chunk
SKIP_METADATA_IN_CHUNK = os.environ.get("SKIP_METADATA_IN_CHUNK", "").lower() == "true"
# Timeout to wait for job's last update before killing it, in hours
CLEANUP_INDEXING_JOBS_TIMEOUT = int(os.environ.get("CLEANUP_INDEXING_JOBS_TIMEOUT", 3))
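Illustrative sketch, not part of the commit: the flags above are plain string comparisons, so only the exact value "true" (any casing) turns a feature on; unset or any other value keeps the default behavior (mini-chunks off, metadata kept in chunks).

import os

# Hypothetical deployment values; the variable names come from the hunk above.
os.environ["ENABLE_MINI_CHUNK"] = "True"
os.environ["SKIP_METADATA_IN_CHUNK"] = "false"

assert os.environ.get("ENABLE_MINI_CHUNK", "").lower() == "true"       # mini-chunk vectors enabled
assert os.environ.get("SKIP_METADATA_IN_CHUNK", "").lower() != "true"  # metadata still included in chunks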
@@ -19,6 +19,7 @@ DOCUMENT_SETS = "document_sets"
TIME_FILTER = "time_filter"
METADATA = "metadata"
METADATA_LIST = "metadata_list"
METADATA_SUFFIX = "metadata_suffix"
MATCH_HIGHLIGHTS = "match_highlights"
# stored in the `metadata` of a chunk. Used to signify that this chunk should
# not be used for QA. For example, Google Drive file types which can't be parsed
@@ -43,7 +44,8 @@ QUERY_EVENT_ID = "query_event_id"
LLM_CHUNKS = "llm_chunks"

# For chunking/processing chunks
TITLE_SEPARATOR = "\n\r\n"
MAX_CHUNK_TITLE_LEN = 1000
RETURN_SEPARATOR = "\n\r\n"
SECTION_SEPARATOR = "\n\n"
# For combining attributes, doesn't have to be unique/perfect to work
INDEX_SEPARATOR = "==="
@@ -6,6 +6,7 @@ from typing import TypeVar

from dateutil.parser import parse

from danswer.configs.constants import IGNORE_FOR_QA
from danswer.connectors.models import BasicExpertInfo
from danswer.utils.text_processing import is_valid_email
@@ -57,3 +58,7 @@ def process_in_batches(
) -> Iterator[list[U]]:
    for i in range(0, len(objects), batch_size):
        yield [process_function(obj) for obj in objects[i : i + batch_size]]


def get_metadata_keys_to_ignore() -> list[str]:
    return [IGNORE_FOR_QA]
@@ -6,6 +6,7 @@ from pydantic import BaseModel

from danswer.configs.constants import DocumentSource
from danswer.configs.constants import INDEX_SEPARATOR
from danswer.configs.constants import RETURN_SEPARATOR
from danswer.utils.text_processing import make_url_compatible


@@ -117,7 +118,12 @@ class DocumentBase(BaseModel):
        # If title is explicitly empty, return a None here for embedding purposes
        if self.title == "":
            return None
        return self.semantic_identifier if self.title is None else self.title
        replace_chars = set(RETURN_SEPARATOR)
        title = self.semantic_identifier if self.title is None else self.title
        for char in replace_chars:
            title = title.replace(char, " ")
        title = title.strip()
        return title

    def get_metadata_str_attributes(self) -> list[str] | None:
        if not self.metadata:
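Illustrative sketch, not part of the commit: how the new get_title_for_document_index cleans a title. With RETURN_SEPARATOR = "\n\r\n", replace_chars is the set {"\n", "\r"}, so any newline or carriage return in the title is flattened to a space before the title is embedded and prefixed onto chunks.

RETURN_SEPARATOR = "\n\r\n"

def clean_title(title: str) -> str:
    # Replace each separator character with a space, then trim the result.
    for char in set(RETURN_SEPARATOR):
        title = title.replace(char, " ")
    return title.strip()

assert clean_title("Q3\nEngineering\rReport") == "Q3 Engineering Report"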
@@ -6,7 +6,7 @@ from typing import Any

from danswer.access.models import DocumentAccess
from danswer.indexing.models import DocMetadataAwareIndexChunk
from danswer.search.models import IndexFilters
from danswer.search.models import InferenceChunk
from danswer.search.models import InferenceChunkUncleaned


@dataclass(frozen=True)
@@ -186,7 +186,7 @@ class IdRetrievalCapable(abc.ABC):
        min_chunk_ind: int | None,
        max_chunk_ind: int | None,
        user_access_control_list: list[str] | None = None,
    ) -> list[InferenceChunk]:
    ) -> list[InferenceChunkUncleaned]:
        """
        Fetch chunk(s) based on document id
@@ -222,7 +222,7 @@ class KeywordCapable(abc.ABC):
        time_decay_multiplier: float,
        num_to_retrieve: int,
        offset: int = 0,
    ) -> list[InferenceChunk]:
    ) -> list[InferenceChunkUncleaned]:
        """
        Run keyword search and return a list of chunks. Inference chunks are chunks with all of the
        information required for query time purposes. For example, some details of the document
@@ -262,7 +262,7 @@ class VectorCapable(abc.ABC):
        time_decay_multiplier: float,
        num_to_retrieve: int,
        offset: int = 0,
    ) -> list[InferenceChunk]:
    ) -> list[InferenceChunkUncleaned]:
        """
        Run vector/semantic search and return a list of inference chunks.
@@ -298,7 +298,7 @@ class HybridCapable(abc.ABC):
        num_to_retrieve: int,
        offset: int = 0,
        hybrid_alpha: float | None = None,
    ) -> list[InferenceChunk]:
    ) -> list[InferenceChunkUncleaned]:
        """
        Run hybrid search and return a list of inference chunks.
@@ -348,7 +348,7 @@ class AdminCapable(abc.ABC):
        filters: IndexFilters,
        num_to_retrieve: int,
        offset: int = 0,
    ) -> list[InferenceChunk]:
    ) -> list[InferenceChunkUncleaned]:
        """
        Run the special search for the admin document explorer page
@@ -91,6 +91,9 @@ schema DANSWER_CHUNK_NAME {
    field metadata type string {
        indexing: summary | attribute
    }
    field metadata_suffix type string {
        indexing: summary | attribute
    }
    field doc_updated_at type int {
        indexing: summary | attribute
    }
@@ -41,6 +41,7 @@ from danswer.configs.constants import HIDDEN
from danswer.configs.constants import INDEX_SEPARATOR
from danswer.configs.constants import METADATA
from danswer.configs.constants import METADATA_LIST
from danswer.configs.constants import METADATA_SUFFIX
from danswer.configs.constants import PRIMARY_OWNERS
from danswer.configs.constants import RECENCY_BIAS
from danswer.configs.constants import SECONDARY_OWNERS
@@ -51,7 +52,6 @@ from danswer.configs.constants import SOURCE_LINKS
from danswer.configs.constants import SOURCE_TYPE
from danswer.configs.constants import TITLE
from danswer.configs.constants import TITLE_EMBEDDING
from danswer.configs.constants import TITLE_SEPARATOR
from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF
from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
    get_experts_stores_representations,
@@ -64,7 +64,7 @@ from danswer.document_index.vespa.utils import remove_invalid_unicode_chars
from danswer.document_index.vespa.utils import replace_invalid_doc_id_characters
from danswer.indexing.models import DocMetadataAwareIndexChunk
from danswer.search.models import IndexFilters
from danswer.search.models import InferenceChunk
from danswer.search.models import InferenceChunkUncleaned
from danswer.search.retrieval.search_runner import query_processing
from danswer.search.retrieval.search_runner import remove_stop_words_and_punctuation
from danswer.utils.batching import batch_generator
@@ -347,8 +347,10 @@ def _index_vespa_chunk(
        TITLE: remove_invalid_unicode_chars(title) if title else None,
        SKIP_TITLE_EMBEDDING: not title,
        CONTENT: remove_invalid_unicode_chars(chunk.content),
        # This duplication of `content` is needed for keyword highlighting :(
        CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content),
        # This duplication of `content` is needed for keyword highlighting
        # Note that it's not exactly the same as the actual content
        # which contains the title prefix and metadata suffix
        CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content_summary),
        SOURCE_TYPE: str(document.source.value),
        SOURCE_LINKS: json.dumps(chunk.source_links),
        SEMANTIC_IDENTIFIER: remove_invalid_unicode_chars(document.semantic_identifier),
@@ -356,6 +358,7 @@ def _index_vespa_chunk(
        METADATA: json.dumps(document.metadata),
        # Save as a list for efficient extraction as an Attribute
        METADATA_LIST: chunk.source_document.get_metadata_str_attributes(),
        METADATA_SUFFIX: chunk.metadata_suffix,
        EMBEDDINGS: embeddings_name_vector_map,
        TITLE_EMBEDDING: chunk.title_embedding,
        BOOST: chunk.boost,
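Illustrative sketch with made-up values, not part of the commit: after this change the two Vespa fields diverge. CONTENT is the full indexed text (title prefix + chunk body + metadata suffix), while CONTENT_SUMMARY holds only the raw chunk body, which is what keyword match highlighting runs against.

# Hypothetical example for one chunk of a document titled "Widget Install Guide"
vespa_fields = {
    "content": "Widget Install Guide\n\r\nStep 1: unpack the widget...\n\r\nMetadata:\n\ttags - hardware, setup",
    "content_summary": "Step 1: unpack the widget...",  # no title prefix / metadata suffix
}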
@@ -562,7 +565,7 @@ def _process_dynamic_summary(


def _vespa_hit_to_inference_chunk(
    hit: dict[str, Any], null_score: bool = False
) -> InferenceChunk:
) -> InferenceChunkUncleaned:
    fields = cast(dict[str, Any], hit["fields"])

    # parse fields that are stored as strings, but are really json / datetime
@@ -585,19 +588,6 @@ def _vespa_hit_to_inference_chunk(
            f"Chunk with blurb: {fields.get(BLURB, 'Unknown')[:50]}... has no Semantic Identifier"
        )

    # Remove the title from the first chunk as every chunk already included
    # its semantic identifier for LLM
    content = fields[CONTENT]
    if fields[CHUNK_ID] == 0:
        parts = content.split(TITLE_SEPARATOR, maxsplit=1)
        content = parts[1] if len(parts) > 1 and "\n" not in parts[0] else content

    # User ran into this, not sure why this could happen, error checking here
    blurb = fields.get(BLURB)
    if not blurb:
        logger.error(f"Chunk with id {fields.get(semantic_identifier)} ")
        blurb = ""

    source_links = fields.get(SOURCE_LINKS, {})
    source_links_dict_unprocessed = (
        json.loads(source_links) if isinstance(source_links, str) else source_links
@@ -607,14 +597,15 @@ def _vespa_hit_to_inference_chunk(
        for k, v in cast(dict[str, str], source_links_dict_unprocessed).items()
    }

    return InferenceChunk(
    return InferenceChunkUncleaned(
        chunk_id=fields[CHUNK_ID],
        blurb=blurb,
        content=content,
        blurb=fields.get(BLURB, ""),  # Unused
        content=fields[CONTENT],  # Includes extra title prefix and metadata suffix
        source_links=source_links_dict,
        section_continuation=fields[SECTION_CONTINUATION],
        document_id=fields[DOCUMENT_ID],
        source_type=fields[SOURCE_TYPE],
        title=fields[TITLE],
        semantic_identifier=fields[SEMANTIC_IDENTIFIER],
        boost=fields.get(BOOST, 1),
        recency_bias=fields.get("matchfeatures", {}).get(RECENCY_BIAS, 1.0),
@@ -623,13 +614,16 @@ def _vespa_hit_to_inference_chunk(
        primary_owners=fields.get(PRIMARY_OWNERS),
        secondary_owners=fields.get(SECONDARY_OWNERS),
        metadata=metadata,
        metadata_suffix=fields.get(METADATA_SUFFIX) or "",
        match_highlights=match_highlights,
        updated_at=updated_at,
    )


@retry(tries=3, delay=1, backoff=2)
def _query_vespa(query_params: Mapping[str, str | int | float]) -> list[InferenceChunk]:
def _query_vespa(
    query_params: Mapping[str, str | int | float]
) -> list[InferenceChunkUncleaned]:
    if "query" in query_params and not cast(str, query_params["query"]).strip():
        raise ValueError("No/empty query received")

@@ -684,16 +678,6 @@ def _query_vespa(query_params: Mapping[str, str | int | float]) -> list[Inferenc
    return inference_chunks


@retry(tries=3, delay=1, backoff=2)
def _inference_chunk_by_vespa_id(vespa_id: str, index_name: str) -> InferenceChunk:
    res = requests.get(
        f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{vespa_id}"
    )
    res.raise_for_status()

    return _vespa_hit_to_inference_chunk(res.json())


def in_memory_zip_from_file_bytes(file_contents: dict[str, bytes]) -> BinaryIO:
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zipf:
@@ -738,6 +722,7 @@ class VespaIndex(DocumentIndex):
        f"{SOURCE_TYPE}, "
        f"{SOURCE_LINKS}, "
        f"{SEMANTIC_IDENTIFIER}, "
        f"{TITLE}, "
        f"{SECTION_CONTINUATION}, "
        f"{BOOST}, "
        f"{HIDDEN}, "
@@ -745,6 +730,7 @@ class VespaIndex(DocumentIndex):
        f"{PRIMARY_OWNERS}, "
        f"{SECONDARY_OWNERS}, "
        f"{METADATA}, "
        f"{METADATA_SUFFIX}, "
        f"{CONTENT_SUMMARY} "
        f"from {{index_name}} where "
    )
@@ -980,7 +966,7 @@ class VespaIndex(DocumentIndex):
        min_chunk_ind: int | None,
        max_chunk_ind: int | None,
        user_access_control_list: list[str] | None = None,
    ) -> list[InferenceChunk]:
    ) -> list[InferenceChunkUncleaned]:
        document_id = replace_invalid_doc_id_characters(document_id)

        vespa_chunks = _get_vespa_chunks_by_document_id(
@@ -1009,7 +995,7 @@ class VespaIndex(DocumentIndex):
        num_to_retrieve: int = NUM_RETURNED_HITS,
        offset: int = 0,
        edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
    ) -> list[InferenceChunk]:
    ) -> list[InferenceChunkUncleaned]:
        # IMPORTANT: THIS FUNCTION IS NOT UP TO DATE, DOES NOT WORK CORRECTLY
        vespa_where_clauses = _build_vespa_filters(filters)
        yql = (
@@ -1046,7 +1032,7 @@ class VespaIndex(DocumentIndex):
        offset: int = 0,
        distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF,
        edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
    ) -> list[InferenceChunk]:
    ) -> list[InferenceChunkUncleaned]:
        # IMPORTANT: THIS FUNCTION IS NOT UP TO DATE, DOES NOT WORK CORRECTLY
        vespa_where_clauses = _build_vespa_filters(filters)
        yql = (
@@ -1090,7 +1076,7 @@ class VespaIndex(DocumentIndex):
        title_content_ratio: float | None = TITLE_CONTENT_RATIO,
        distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF,
        edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
    ) -> list[InferenceChunk]:
    ) -> list[InferenceChunkUncleaned]:
        vespa_where_clauses = _build_vespa_filters(filters)
        # Needs to be at least as much as the value set in Vespa schema config
        target_hits = max(10 * num_to_retrieve, 1000)
@@ -1134,7 +1120,7 @@ class VespaIndex(DocumentIndex):
        filters: IndexFilters,
        num_to_retrieve: int = NUM_RETURNED_HITS,
        offset: int = 0,
    ) -> list[InferenceChunk]:
    ) -> list[InferenceChunkUncleaned]:
        vespa_where_clauses = _build_vespa_filters(filters, include_hidden=True)
        yql = (
            VespaIndex.yql_base.format(index_name=self.index_name)
@@ -3,12 +3,16 @@ from collections.abc import Callable
from typing import TYPE_CHECKING

from danswer.configs.app_configs import BLURB_SIZE
from danswer.configs.app_configs import CHUNK_OVERLAP
from danswer.configs.app_configs import MINI_CHUNK_SIZE
from danswer.configs.app_configs import SKIP_METADATA_IN_CHUNK
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import MAX_CHUNK_TITLE_LEN
from danswer.configs.constants import RETURN_SEPARATOR
from danswer.configs.constants import SECTION_SEPARATOR
from danswer.configs.constants import TITLE_SEPARATOR
from danswer.configs.model_configs import DOC_EMBEDDING_CONTEXT_SIZE
from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
    get_metadata_keys_to_ignore,
)
from danswer.connectors.models import Document
from danswer.indexing.models import DocAwareChunk
from danswer.search.search_nlp_models import get_default_tokenizer
@@ -19,6 +23,14 @@ if TYPE_CHECKING:
    from transformers import AutoTokenizer  # type:ignore


# Not supporting overlaps, we need a clean combination of chunks and it is unclear if overlaps
# actually help quality at all
CHUNK_OVERLAP = 0
# Fairly arbitrary numbers but the general concept is we don't want the title/metadata to
# overwhelm the actual contents of the chunk
MAX_METADATA_PERCENTAGE = 0.25
CHUNK_MIN_CONTENT = 256

logger = setup_logger()

ChunkFunc = Callable[[Document], list[DocAwareChunk]]
@@ -44,6 +56,8 @@ def chunk_large_section(
    chunk_size: int = DOC_EMBEDDING_CONTEXT_SIZE,
    chunk_overlap: int = CHUNK_OVERLAP,
    blurb_size: int = BLURB_SIZE,
    title_prefix: str = "",
    metadata_suffix: str = "",
) -> list[DocAwareChunk]:
    from llama_index.text_splitter import SentenceSplitter

@@ -60,30 +74,69 @@ def chunk_large_section(
            source_document=document,
            chunk_id=start_chunk_id + chunk_ind,
            blurb=blurb,
            content=chunk_str,
            content=f"{title_prefix}{chunk_str}{metadata_suffix}",
            content_summary=chunk_str,
            source_links={0: section_link_text},
            section_continuation=(chunk_ind != 0),
            metadata_suffix=metadata_suffix,
        )
        for chunk_ind, chunk_str in enumerate(split_texts)
    ]
    return chunks


def _get_metadata_suffix_for_document_index(
    metadata: dict[str, str | list[str]]
) -> str:
    if not metadata:
        return ""
    metadata_str = "Metadata:\n"
    for key, value in metadata.items():
        if key in get_metadata_keys_to_ignore():
            continue

        value_str = ", ".join(value) if isinstance(value, list) else value
        metadata_str += f"\t{key} - {value_str}\n"
    return metadata_str.strip()


def chunk_document(
    document: Document,
    chunk_tok_size: int = DOC_EMBEDDING_CONTEXT_SIZE,
    subsection_overlap: int = CHUNK_OVERLAP,
    blurb_size: int = BLURB_SIZE,
    include_metadata: bool = not SKIP_METADATA_IN_CHUNK,
) -> list[DocAwareChunk]:
    title = document.get_title_for_document_index()
    title_prefix = title.replace("\n", " ") + TITLE_SEPARATOR if title else ""
    tokenizer = get_default_tokenizer()

    title = document.get_title_for_document_index()
    title_prefix = f"{title}{RETURN_SEPARATOR}"[:MAX_CHUNK_TITLE_LEN] if title else ""
    title_tokens = len(tokenizer.tokenize(title_prefix))

    metadata_suffix = ""
    metadata_tokens = 0
    if include_metadata:
        metadata = _get_metadata_suffix_for_document_index(document.metadata)
        metadata_suffix = RETURN_SEPARATOR + metadata if metadata else ""
        metadata_tokens = len(tokenizer.tokenize(metadata_suffix))

    if metadata_tokens >= chunk_tok_size * MAX_METADATA_PERCENTAGE:
        metadata_suffix = ""
        metadata_tokens = 0

    content_token_limit = chunk_tok_size - title_tokens - metadata_tokens

    # If there is not enough context remaining then just index the chunk with no prefix/suffix
    if content_token_limit <= CHUNK_MIN_CONTENT:
        content_token_limit = chunk_tok_size
        title_prefix = ""
        metadata_suffix = ""

    chunks: list[DocAwareChunk] = []
    link_offsets: dict[int, str] = {}
    chunk_text = ""
    for ind, section in enumerate(document.sections):
        section_text = title_prefix + section.text if ind == 0 else section.text
    for section in document.sections:
        section_text = section.text
        section_link_text = section.link or ""

        section_tok_length = len(tokenizer.tokenize(section_text))
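Illustrative sketch, not part of the commit, of the two pieces computed above. For metadata {"tags": ["tag1", "tag2"], "owner": "alice"} the helper builds "Metadata:\n\ttags - tag1, tag2\n\towner - alice" (ignored keys skipped, lists comma-joined). The token budget for the chunk body is whatever remains after the title prefix and metadata suffix are accounted for; the numbers below are hypothetical.

chunk_tok_size = 512    # assumed embedding context size
title_tokens = 8        # tokens in the title prefix (assumed)
metadata_tokens = 14    # tokens in the metadata suffix (assumed)

# MAX_METADATA_PERCENTAGE = 0.25, so 14 < 512 * 0.25 keeps the suffix.
content_token_limit = chunk_tok_size - title_tokens - metadata_tokens  # 490 tokens left for the body
# Only if content_token_limit dropped to CHUNK_MIN_CONTENT (256) or below would the
# prefix/suffix be discarded and the full 512 tokens given back to the body.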
@@ -92,16 +145,18 @@ def chunk_document(

        # Large sections are considered self-contained/unique therefore they start a new chunk and are not concatenated
        # at the end by other sections
        if section_tok_length > chunk_tok_size:
        if section_tok_length > content_token_limit:
            if chunk_text:
                chunks.append(
                    DocAwareChunk(
                        source_document=document,
                        chunk_id=len(chunks),
                        blurb=extract_blurb(chunk_text, blurb_size),
                        content=chunk_text,
                        content=f"{title_prefix}{chunk_text}{metadata_suffix}",
                        content_summary=chunk_text,
                        source_links=link_offsets,
                        section_continuation=False,
                        metadata_suffix=metadata_suffix,
                    )
                )
                link_offsets = {}
@@ -113,9 +168,11 @@ def chunk_document(
                document=document,
                start_chunk_id=len(chunks),
                tokenizer=tokenizer,
                chunk_size=chunk_tok_size,
                chunk_size=content_token_limit,
                chunk_overlap=subsection_overlap,
                blurb_size=blurb_size,
                title_prefix=title_prefix,
                metadata_suffix=metadata_suffix,
            )
            chunks.extend(large_section_chunks)
            continue
@@ -125,7 +182,7 @@ def chunk_document(
            current_tok_length
            + len(tokenizer.tokenize(SECTION_SEPARATOR))
            + section_tok_length
            <= chunk_tok_size
            <= content_token_limit
        ):
            chunk_text += (
                SECTION_SEPARATOR + section_text if chunk_text else section_text
@@ -137,9 +194,11 @@ def chunk_document(
                    source_document=document,
                    chunk_id=len(chunks),
                    blurb=extract_blurb(chunk_text, blurb_size),
                    content=chunk_text,
                    content=f"{title_prefix}{chunk_text}{metadata_suffix}",
                    content_summary=chunk_text,
                    source_links=link_offsets,
                    section_continuation=False,
                    metadata_suffix=metadata_suffix,
                )
            )
            link_offsets = {0: section_link_text}
@@ -153,9 +212,11 @@ def chunk_document(
                source_document=document,
                chunk_id=len(chunks),
                blurb=extract_blurb(chunk_text, blurb_size),
                content=chunk_text,
                content=f"{title_prefix}{chunk_text}{metadata_suffix}",
                content_summary=chunk_text,
                source_links=link_offsets,
                section_continuation=False,
                metadata_suffix=metadata_suffix,
            )
        )
    return chunks
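Illustrative sketch, not part of the commit: what one emitted chunk looks like after this change, assuming a document titled "Widget Install Guide" with a tags metadata entry. The content field is what gets embedded and keyword-indexed; content_summary and metadata_suffix are stored alongside it so the extra text can be stripped back out at query time.

RETURN_SEPARATOR = "\n\r\n"

title_prefix = "Widget Install Guide" + RETURN_SEPARATOR
metadata_suffix = RETURN_SEPARATOR + "Metadata:\n\ttags - hardware, setup"
chunk_text = "Step 1: unpack the widget and check the contents."

chunk_fields = {
    "content": f"{title_prefix}{chunk_text}{metadata_suffix}",  # embedded / searched
    "content_summary": chunk_text,                              # highlighting + mini-chunks
    "metadata_suffix": metadata_suffix,                         # kept (with its leading separator) for later stripping
}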
@@ -164,6 +225,9 @@ def chunk_document(
def split_chunk_text_into_mini_chunks(
    chunk_text: str, mini_chunk_size: int = MINI_CHUNK_SIZE
) -> list[str]:
    """The minichunks won't all have the title prefix or metadata suffix
    It could be a significant percentage of every minichunk so better to not include it
    """
    from llama_index.text_splitter import SentenceSplitter

    token_count_func = get_default_tokenizer().tokenize
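Brief illustrative note, not part of the commit: mini-chunks are cut from the bare chunk text (content_summary), so a 150-token mini-chunk is never dominated by a long title prefix or metadata suffix; only the full-size chunk carries those. The embedder hunk below switches its call accordingly, roughly:

mini_chunk_texts = split_chunk_text_into_mini_chunks(chunk.content_summary)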
@@ -81,7 +81,7 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
        for chunk_ind, chunk in enumerate(chunks):
            chunk_texts.append(chunk.content)
            mini_chunk_texts = (
                split_chunk_text_into_mini_chunks(chunk.content)
                split_chunk_text_into_mini_chunks(chunk.content_summary)
                if enable_mini_chunk
                else []
            )
@@ -36,6 +36,16 @@ class DocAwareChunk(BaseChunk):
    # During inference we only have access to the document id and do not reconstruct the Document
    source_document: Document

    # The Vespa documents require a separate highlight field. Since it is stored as a duplicate anyway,
    # it's easier to just store a not prefixed/suffixed string for the highlighting
    # Also during the chunking, this non-prefixed/suffixed string is used for mini-chunks
    content_summary: str

    # During indexing we also (optionally) build a metadata string from the metadata dict
    # This is also indexed so that we can strip it out after indexing, this way it supports
    # multiple iterations of metadata representation for backwards compatibility
    metadata_suffix: str

    def to_short_descriptor(self) -> str:
        """Used when logging the identity of a chunk"""
        return (
@@ -189,6 +189,20 @@ class InferenceChunk(BaseChunk):
        return self.score > other.score


class InferenceChunkUncleaned(InferenceChunk):
    title: str  # Separate from Semantic Identifier though often same
    metadata_suffix: str

    def to_inference_chunk(self) -> InferenceChunk:
        # Create a dict of all fields except 'title' and 'metadata_suffix'
        inference_chunk_data = {
            k: v
            for k, v in self.dict().items()
            if k not in ["title", "metadata_suffix"]
        }
        return InferenceChunk(**inference_chunk_data)


class InferenceSection(BaseModel):
    """Section list of chunks with a combined content. A section could be a single chunk, several
    chunks from the same document or the entire document."""
@@ -4,6 +4,8 @@ from typing import cast

import numpy

from danswer.configs.constants import MAX_CHUNK_TITLE_LEN
from danswer.configs.constants import RETURN_SEPARATOR
from danswer.configs.model_configs import CROSS_ENCODER_RANGE_MAX
from danswer.configs.model_configs import CROSS_ENCODER_RANGE_MIN
from danswer.document_index.document_index_utils import (
@@ -12,6 +14,7 @@ from danswer.document_index.document_index_utils import (
from danswer.llm.interfaces import LLM
from danswer.search.models import ChunkMetric
from danswer.search.models import InferenceChunk
from danswer.search.models import InferenceChunkUncleaned
from danswer.search.models import InferenceSection
from danswer.search.models import MAX_METRICS_CONTENT
from danswer.search.models import RerankMetricsContainer
@@ -47,6 +50,33 @@ def should_apply_llm_based_relevance_filter(query: SearchQuery) -> bool:
    return not query.skip_llm_chunk_filter


def cleanup_chunks(chunks: list[InferenceChunkUncleaned]) -> list[InferenceChunk]:
    def _remove_title(chunk: InferenceChunkUncleaned) -> str:
        if not chunk.title or not chunk.content:
            return chunk.content

        if chunk.content.startswith(chunk.title):
            return chunk.content[len(chunk.title) :].lstrip()

        if chunk.content.startswith(chunk.title[:MAX_CHUNK_TITLE_LEN]):
            return chunk.content[MAX_CHUNK_TITLE_LEN:].lstrip()

        return chunk.content

    def _remove_metadata_suffix(chunk: InferenceChunkUncleaned) -> str:
        if not chunk.metadata_suffix:
            return chunk.content
        return chunk.content.removesuffix(chunk.metadata_suffix).rstrip(
            RETURN_SEPARATOR
        )

    for chunk in chunks:
        chunk.content = _remove_title(chunk)
        chunk.content = _remove_metadata_suffix(chunk)

    return [chunk.to_inference_chunk() for chunk in chunks]


@log_function_time(print_only=True)
def semantic_reranking(
    query: str,
@@ -20,6 +20,7 @@ from danswer.search.models import MAX_METRICS_CONTENT
from danswer.search.models import RetrievalMetricsContainer
from danswer.search.models import SearchQuery
from danswer.search.models import SearchType
from danswer.search.postprocessing.postprocessing import cleanup_chunks
from danswer.search.search_nlp_models import EmbeddingModel
from danswer.search.utils import inference_section_from_chunks
from danswer.secondary_llm_flows.query_expansion import multilingual_query_expansion
@@ -160,7 +161,7 @@ def doc_index_retrieval(
    else:
        raise RuntimeError("Invalid Search Flow")

    return top_chunks
    return cleanup_chunks(top_chunks)


def _simplify_text(text: str) -> str:
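Illustrative sketch, not part of the commit: the end-to-end effect of the query-side change. Retrieval now returns InferenceChunkUncleaned objects whose content still carries the title prefix and metadata suffix added at indexing time; cleanup_chunks strips both and downgrades each object to a plain InferenceChunk before anything downstream (LLM prompting, display) sees it. The values below are made up.

RETURN_SEPARATOR = "\n\r\n"

title = "Widget Install Guide"
metadata_suffix = RETURN_SEPARATOR + "Metadata:\n\ttags - hardware, setup"
raw_content = (
    title + RETURN_SEPARATOR
    + "Step 1: unpack the widget and check the contents."
    + metadata_suffix
)

# What cleanup_chunks effectively does to the content field:
cleaned = raw_content
if cleaned.startswith(title):
    cleaned = cleaned[len(title):].lstrip()                                # drop the title prefix
cleaned = cleaned.removesuffix(metadata_suffix).rstrip(RETURN_SEPARATOR)   # drop the metadata suffix

assert cleaned == "Step 1: unpack the widget and check the contents."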
backend/tests/unit/danswer/indexing/test_chunker.py (new file, 38 lines)
@@ -0,0 +1,38 @@
from danswer.configs.constants import DocumentSource
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.indexing.chunker import chunk_document


def test_chunk_document() -> None:
    short_section_1 = "This is a short section."
    long_section = (
        "This is a long section that should be split into multiple chunks. " * 100
    )
    short_section_2 = "This is another short section."
    short_section_3 = "This is another short section again."
    short_section_4 = "Final short section."
    semantic_identifier = "Test Document"

    document = Document(
        id="test_doc",
        source=DocumentSource.WEB,
        semantic_identifier=semantic_identifier,
        metadata={"tags": ["tag1", "tag2"]},
        doc_updated_at=None,
        sections=[
            Section(text=short_section_1, link="link1"),
            Section(text=short_section_2, link="link2"),
            Section(text=long_section, link="link3"),
            Section(text=short_section_3, link="link4"),
            Section(text=short_section_4, link="link5"),
        ],
    )

    chunks = chunk_document(document)
    assert len(chunks) == 5
    assert all(semantic_identifier in chunk.content for chunk in chunks)
    assert short_section_1 in chunks[0].content
    assert short_section_3 in chunks[-1].content
    assert short_section_4 in chunks[-1].content
    assert "tag1" in chunks[0].content