Metadata and Title Search (#903)

Yuhong Sun 2024-01-02 11:25:50 -08:00 committed by GitHub
parent 615bb7b095
commit d7141df5fc
38 changed files with 639 additions and 162 deletions

View File

@ -0,0 +1,61 @@
"""Tags
Revision ID: 904e5138fffb
Revises: 891cd83c87a8
Create Date: 2024-01-01 10:44:43.733974
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = "904e5138fffb"
down_revision = "891cd83c87a8"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.create_table(
"tag",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("tag_key", sa.String(), nullable=False),
sa.Column("tag_value", sa.String(), nullable=False),
sa.Column("source", sa.String(), nullable=False),
sa.PrimaryKeyConstraint("id"),
sa.UniqueConstraint(
"tag_key", "tag_value", "source", name="_tag_key_value_source_uc"
),
)
op.create_table(
"document__tag",
sa.Column("document_id", sa.String(), nullable=False),
sa.Column("tag_id", sa.Integer(), nullable=False),
sa.ForeignKeyConstraint(
["document_id"],
["document.id"],
),
sa.ForeignKeyConstraint(
["tag_id"],
["tag.id"],
),
sa.PrimaryKeyConstraint("document_id", "tag_id"),
)
op.add_column(
"search_doc",
sa.Column(
"doc_metadata",
postgresql.JSONB(astext_type=sa.Text()),
nullable=True,
),
)
op.execute("UPDATE search_doc SET doc_metadata = '{}' WHERE doc_metadata IS NULL")
op.alter_column("search_doc", "doc_metadata", nullable=False)
def downgrade() -> None:
op.drop_table("document__tag")
op.drop_table("tag")
op.drop_column("search_doc", "doc_metadata")
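For reference, a minimal sketch of applying or rolling back this revision programmatically via Alembic's command API (the alembic.ini path is an assumption about a standard setup; adjust for your environment):

from alembic import command
from alembic.config import Config

alembic_cfg = Config("alembic.ini")  # path is an assumption

# Apply the Tags revision (and everything before it)
command.upgrade(alembic_cfg, "904e5138fffb")

# Roll back to the previous revision if needed
command.downgrade(alembic_cfg, "891cd83c87a8")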

View File

@ -59,7 +59,14 @@ if os.environ.get("EDIT_KEYWORD_QUERY"):
else:
EDIT_KEYWORD_QUERY = not os.environ.get("DOCUMENT_ENCODER_MODEL")
# Weighting factor between Vector and Keyword Search, 1 for completely vector search
HYBRID_ALPHA = max(0, min(1, float(os.environ.get("HYBRID_ALPHA") or 0.6)))
HYBRID_ALPHA = max(0, min(1, float(os.environ.get("HYBRID_ALPHA") or 0.66)))
# Weighting factor between Title and Content of documents during search, 1 for completely
# Title based. The default heavily favors Content because the Title is also included at the top of
# the Content. This avoids cases where the Content is highly relevant but the match would be missed
# if the Title were weighted separately. The Title acts more as a "boost" than a separate field.
TITLE_CONTENT_RATIO = max(
0, min(1, float(os.environ.get("TITLE_CONTENT_RATIO") or 0.20))
)
# A list of languages passed to the LLM to rephrase the query
# For example "English,French,Spanish"; be sure to use "," as the separator
MULTILINGUAL_QUERY_EXPANSION = os.environ.get("MULTILINGUAL_QUERY_EXPANSION") or None
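As a rough illustration of how HYBRID_ALPHA and TITLE_CONTENT_RATIO interact, here is a sketch of the blended score (score normalization and the boost/recency multipliers are omitted; the Vespa rank profile later in this commit is authoritative):

def blended_score(
    vec_title: float,
    vec_content: float,
    bm25_title: float,
    bm25_content: float,
    alpha: float = 0.66,        # HYBRID_ALPHA: vector vs. keyword
    title_ratio: float = 0.20,  # TITLE_CONTENT_RATIO: title vs. content
) -> float:
    # Each modality is itself a title/content blend
    vector_score = title_ratio * vec_title + (1 - title_ratio) * vec_content
    keyword_score = title_ratio * bm25_title + (1 - title_ratio) * bm25_content
    # HYBRID_ALPHA then weights vector search against keyword (BM-25) search
    return alpha * vector_score + (1 - alpha) * keyword_score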

View File

@ -11,11 +11,13 @@ SEMANTIC_IDENTIFIER = "semantic_identifier"
TITLE = "title"
SECTION_CONTINUATION = "section_continuation"
EMBEDDINGS = "embeddings"
TITLE_EMBEDDING = "title_embedding"
ALLOWED_USERS = "allowed_users"
ACCESS_CONTROL_LIST = "access_control_list"
DOCUMENT_SETS = "document_sets"
TIME_FILTER = "time_filter"
METADATA = "metadata"
METADATA_LIST = "metadata_list"
MATCH_HIGHLIGHTS = "match_highlights"
# stored in the `metadata` of a chunk. Used to signify that this chunk should
# not be used for QA. For example, Google Drive file types which can't be parsed
@ -38,6 +40,12 @@ SESSION_KEY = "session"
QUERY_EVENT_ID = "query_event_id"
LLM_CHUNKS = "llm_chunks"
# For chunking/processing chunks
TITLE_SEPARATOR = "\n\r\n"
SECTION_SEPARATOR = "\n\n"
# For combining attributes, doesn't have to be unique/perfect to work
INDEX_SEPARATOR = "==="
class DocumentSource(str, Enum):
# Special case, document passed in via Danswer APIs without specifying a source type

View File

@ -8,6 +8,7 @@ from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.bookstack.client import BookStackApiClient
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
@ -72,13 +73,21 @@ class BookstackConnector(LoadConnector, PollConnector):
bookstack_client: BookStackApiClient, book: dict[str, Any]
) -> Document:
url = bookstack_client.build_app_url("/books/" + str(book.get("slug")))
title = str(book.get("name", ""))
text = book.get("name", "") + "\n" + book.get("description", "")
updated_at_str = (
str(book.get("updated_at")) if book.get("updated_at") is not None else None
)
return Document(
id="book:" + str(book.get("id")),
id="book__" + str(book.get("id")),
sections=[Section(link=url, text=text)],
source=DocumentSource.BOOKSTACK,
semantic_identifier="Book: " + str(book.get("name")),
metadata={"type": "book", "updated_at": str(book.get("updated_at"))},
semantic_identifier="Book: " + title,
title=title,
doc_updated_at=time_str_to_utc(updated_at_str)
if updated_at_str is not None
else None,
metadata={"type": "book"},
)
@staticmethod
@ -91,13 +100,23 @@ class BookstackConnector(LoadConnector, PollConnector):
+ "/chapter/"
+ str(chapter.get("slug"))
)
title = str(chapter.get("name", ""))
text = chapter.get("name", "") + "\n" + chapter.get("description", "")
updated_at_str = (
str(chapter.get("updated_at"))
if chapter.get("updated_at") is not None
else None
)
return Document(
id="chapter:" + str(chapter.get("id")),
id="chapter__" + str(chapter.get("id")),
sections=[Section(link=url, text=text)],
source=DocumentSource.BOOKSTACK,
semantic_identifier="Chapter: " + str(chapter.get("name")),
metadata={"type": "chapter", "updated_at": str(chapter.get("updated_at"))},
semantic_identifier="Chapter: " + title,
title=title,
doc_updated_at=time_str_to_utc(updated_at_str)
if updated_at_str is not None
else None,
metadata={"type": "chapter"},
)
@staticmethod
@ -105,13 +124,23 @@ class BookstackConnector(LoadConnector, PollConnector):
bookstack_client: BookStackApiClient, shelf: dict[str, Any]
) -> Document:
url = bookstack_client.build_app_url("/shelves/" + str(shelf.get("slug")))
title = str(shelf.get("name", ""))
text = shelf.get("name", "") + "\n" + shelf.get("description", "")
updated_at_str = (
str(shelf.get("updated_at"))
if shelf.get("updated_at") is not None
else None
)
return Document(
id="shelf:" + str(shelf.get("id")),
sections=[Section(link=url, text=text)],
source=DocumentSource.BOOKSTACK,
semantic_identifier="Shelf: " + str(shelf.get("name")),
metadata={"type": "shelf", "updated_at": shelf.get("updated_at")},
semantic_identifier="Shelf: " + title,
title=title,
doc_updated_at=time_str_to_utc(updated_at_str)
if updated_at_str is not None
else None,
metadata={"type": "shelf"},
)
@staticmethod
@ -119,7 +148,7 @@ class BookstackConnector(LoadConnector, PollConnector):
bookstack_client: BookStackApiClient, page: dict[str, Any]
) -> Document:
page_id = str(page.get("id"))
page_name = str(page.get("name"))
title = str(page.get("name", ""))
page_data = bookstack_client.get("/pages/" + page_id, {})
url = bookstack_client.build_app_url(
"/books/"
@ -127,17 +156,24 @@ class BookstackConnector(LoadConnector, PollConnector):
+ "/page/"
+ str(page_data.get("slug"))
)
page_html = (
"<h1>" + html.escape(page_name) + "</h1>" + str(page_data.get("html"))
)
page_html = "<h1>" + html.escape(title) + "</h1>" + str(page_data.get("html"))
text = parse_html_page_basic(page_html)
updated_at_str = (
str(page_data.get("updated_at"))
if page_data.get("updated_at") is not None
else None
)
time.sleep(0.1)
return Document(
id="page:" + page_id,
sections=[Section(link=url, text=text)],
source=DocumentSource.BOOKSTACK,
semantic_identifier="Page: " + str(page_name),
metadata={"type": "page", "updated_at": page_data.get("updated_at")},
semantic_identifier="Page: " + str(title),
title=str(title),
doc_updated_at=time_str_to_utc(updated_at_str)
if updated_at_str is not None
else None,
metadata={"type": "page"},
)
def load_from_state(self) -> GenerateDocumentsOutput:
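The connector now routes BookStack timestamps through time_str_to_utc instead of storing raw strings in metadata. A sketch of the expected behavior (the timestamp value is invented and the helper is assumed to return a timezone-aware UTC datetime):

from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc

# Example timestamp shape; the real value comes from the BookStack API
dt = time_str_to_utc("2024-01-01T10:44:43.000000Z")
assert dt.tzinfo is not None  # expected to be UTC-aware for doc_updated_at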

View File

@ -333,11 +333,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
if not page_html:
logger.debug("Page is empty, skipping: %s", page_url)
continue
page_text = (
page.get("title", "")
+ "\n"
+ parse_html_page(page_html, self.confluence_client)
)
page_text = parse_html_page(page_html, self.confluence_client)
comments_text = self._fetch_comments(self.confluence_client, page_id)
page_text += comments_text

View File

@ -3,16 +3,17 @@ from datetime import timezone
from typing import Any
from urllib.parse import urlparse
from dateutil.parser import parse
from jira import JIRA
from jira.resources import Issue
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
@ -60,26 +61,32 @@ def fetch_jira_issues_batch(
logger.warning(f"Found Jira object not of type Issue {jira}")
continue
ticket_updated_time = parse(jira.fields.updated)
semantic_rep = (
f"Jira Ticket Summary: {jira.fields.summary}\n"
f"Description: {jira.fields.description}\n"
+ "\n".join(
[f"Comment: {comment.body}" for comment in jira.fields.comment.comments]
)
semantic_rep = f"{jira.fields.description}\n" + "\n".join(
[f"Comment: {comment.body}" for comment in jira.fields.comment.comments]
)
page_url = f"{jira_client.client_info()}/browse/{jira.key}"
author = None
try:
author = BasicExpertInfo(
display_name=jira.fields.creator.displayName,
email=jira.fields.creator.emailAddress,
)
except Exception:
# Author should exist but if not, doesn't matter
pass
doc_batch.append(
Document(
id=page_url,
sections=[Section(link=page_url, text=semantic_rep)],
source=DocumentSource.JIRA,
semantic_identifier=jira.fields.summary,
doc_updated_at=ticket_updated_time.astimezone(timezone.utc),
metadata={},
doc_updated_at=time_str_to_utc(jira.fields.updated),
primary_owners=[author] if author is not None else None,
# TODO add secondary_owners if needed
metadata={"label": jira.fields.labels} if jira.fields.labels else {},
)
)
return doc_batch, len(batch)

View File

@ -140,11 +140,7 @@ class Document360Connector(LoadConnector, PollConnector):
html_content = article_details["html_content"]
article_content = parse_html_page_basic(html_content)
doc_text = (
f"workspace: {self.workspace}\n"
f"category: {article['category_name']}\n"
f"article: {article_details['title']} - "
f"{article_details.get('description', '')}\n"
f"{article_content}"
f"{article_details.get('description', '')}\n{article_content}".strip()
)
document = Document(
@ -154,7 +150,10 @@ class Document360Connector(LoadConnector, PollConnector):
semantic_identifier=article_details["title"],
doc_updated_at=updated_at,
primary_owners=authors,
metadata={},
metadata={
"workspace": self.workspace,
"category": article["category_name"],
},
)
doc_batch.append(document)

View File

@ -37,10 +37,9 @@ def _batch_github_objects(
def _convert_pr_to_document(pull_request: PullRequest) -> Document:
full_context = f"Pull-Request {pull_request.title}\n{pull_request.body}"
return Document(
id=pull_request.html_url,
sections=[Section(link=pull_request.html_url, text=full_context)],
sections=[Section(link=pull_request.html_url, text=pull_request.body or "")],
source=DocumentSource.GITHUB,
semantic_identifier=pull_request.title,
# updated_at is UTC time but is timezone unaware, explicitly add UTC
@ -48,7 +47,7 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document:
# due to local time discrepancies with UTC
doc_updated_at=pull_request.updated_at.replace(tzinfo=timezone.utc),
metadata={
"merged": pull_request.merged,
"merged": str(pull_request.merged),
"state": pull_request.state,
},
)
@ -60,10 +59,9 @@ def _fetch_issue_comments(issue: Issue) -> str:
def _convert_issue_to_document(issue: Issue) -> Document:
full_context = f"Issue {issue.title}\n{issue.body}"
return Document(
id=issue.html_url,
sections=[Section(link=issue.html_url, text=full_context)],
sections=[Section(link=issue.html_url, text=issue.body or "")],
source=DocumentSource.GITHUB,
semantic_identifier=issue.title,
# updated_at is UTC time but is timezone unaware

View File

@ -206,9 +206,6 @@ class GongConnector(LoadConnector, PollConnector):
speaker_to_name: dict[str, str] = {}
transcript_text = ""
if call_title:
transcript_text += f"Call Title: {call_title}\n\n"
call_purpose = call_metadata["purpose"]
if call_purpose:
transcript_text += f"Call Description: {call_purpose}\n\n"
@ -234,6 +231,11 @@ class GongConnector(LoadConnector, PollConnector):
)
transcript_text += f"{speaker_name}: {monolog}\n\n"
metadata = {}
if call_metadata.get("system"):
metadata["client"] = call_metadata.get("system")
# TODO calls have a clientUniqueId field, can pull that in later
doc_batch.append(
Document(
id=call_id,
@ -246,7 +248,7 @@ class GongConnector(LoadConnector, PollConnector):
doc_updated_at=datetime.fromisoformat(call_time_str).astimezone(
timezone.utc
),
metadata={},
metadata={"client": call_metadata.get("system")},
)
)
yield doc_batch

View File

@ -466,24 +466,20 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
doc_batch = []
for file in files_batch:
try:
text_contents = extract_text(file, service)
if text_contents:
full_context = file["name"] + " - " + text_contents
else:
full_context = file["name"]
text_contents = extract_text(file, service) or ""
doc_batch.append(
Document(
id=file["webViewLink"],
sections=[
Section(link=file["webViewLink"], text=full_context)
Section(link=file["webViewLink"], text=text_contents)
],
source=DocumentSource.GOOGLE_DRIVE,
semantic_identifier=file["name"],
doc_updated_at=datetime.fromisoformat(
file["modifiedTime"]
).astimezone(timezone.utc),
metadata={} if text_contents else {IGNORE_FOR_QA: True},
metadata={} if text_contents else {IGNORE_FOR_QA: "True"},
)
)
except Exception as e:

View File

@ -77,7 +77,7 @@ class GuruConnector(LoadConnector, PollConnector):
for card in cards:
title = card["preferredPhrase"]
link = GURU_CARDS_URL + card["slug"]
content_text = title + "\n" + parse_html_page_basic(card["content"])
content_text = parse_html_page_basic(card["content"])
last_updated = time_str_to_utc(card["lastModified"])
last_verified = (
time_str_to_utc(card.get("lastVerified"))

View File

@ -73,7 +73,7 @@ class HubSpotConnector(LoadConnector, PollConnector):
title = ticket.properties["subject"]
link = self.ticket_base_url + ticket.id
content_text = title + "\n" + ticket.properties["content"]
content_text = ticket.properties["content"]
associated_emails: list[str] = []
associated_notes: list[str] = []

View File

@ -8,6 +8,7 @@ import requests
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
@ -30,7 +31,6 @@ def _make_query(request_body: dict[str, Any], api_key: str) -> requests.Response
"Content-Type": "application/json",
}
response: requests.Response | None = None
for i in range(_NUM_RETRIES):
try:
response = requests.post(
@ -187,8 +187,8 @@ class LinearConnector(LoadConnector, PollConnector):
],
source=DocumentSource.LINEAR,
semantic_identifier=node["identifier"],
doc_updated_at=time_str_to_utc(node["updatedAt"]),
metadata={
"updated_at": node["updatedAt"],
"team": node["team"]["name"],
},
)

View File

@ -1,10 +1,10 @@
from datetime import datetime
from enum import Enum
from typing import Any
from pydantic import BaseModel
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import INDEX_SEPARATOR
from danswer.utils.text_processing import make_url_compatible
@ -50,21 +50,38 @@ class DocumentBase(BaseModel):
sections: list[Section]
source: DocumentSource | None = None
semantic_identifier: str # displayed in the UI as the main identifier for the doc
metadata: dict[str, Any]
metadata: dict[str, str | list[str]]
# UTC time
doc_updated_at: datetime | None = None
# Owner, creator, etc.
primary_owners: list[BasicExpertInfo] | None = None
# Assignee, space owner, etc.
secondary_owners: list[BasicExpertInfo] | None = None
# `title` is used when computing best matches for a query
# if `None`, then we will use the `semantic_identifier` as the title in Vespa
# title is used for search whereas semantic_identifier is used for displaying in the UI
# They differ because a Slack message may display as #general, but "general" should not be part
# of the search the way a document title would be for a source like Confluence
# The title defaults to the semantic_identifier unless otherwise specified
title: str | None = None
from_ingestion_api: bool = False
def get_title_for_document_index(self) -> str:
def get_title_for_document_index(self) -> str | None:
# If title is explicitly empty, return a None here for embedding purposes
if self.title == "":
return None
return self.semantic_identifier if self.title is None else self.title
def get_metadata_str_attributes(self) -> list[str] | None:
if not self.metadata:
return None
# Combined string for the key/value for easy filtering
attributes: list[str] = []
for k, v in self.metadata.items():
if isinstance(v, list):
attributes.extend([k + INDEX_SEPARATOR + vi for vi in v])
else:
attributes.append(k + INDEX_SEPARATOR + v)
return attributes
class Document(DocumentBase):
id: str # This must be unique or during indexing/reindexing, chunks will be overwritten
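A quick sketch of what the new get_metadata_str_attributes helper produces for the metadata shapes connectors now emit (the document values are invented; the model is assumed importable from danswer.connectors.models per the imports above):

from danswer.connectors.models import DocumentBase

doc = DocumentBase(
    sections=[],
    semantic_identifier="Book: Onboarding",
    metadata={"type": "book", "labels": ["hr", "eng"]},
)

# Each key/value pair is flattened with INDEX_SEPARATOR ("===") for Vespa-side filtering
assert doc.get_metadata_str_attributes() == ["type===book", "labels===hr", "labels===eng"]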

View File

@ -267,7 +267,8 @@ class NotionConnector(LoadConnector, PollConnector):
yield (
Document(
id=page.id,
sections=[Section(link=page.url, text=f"{page_title}\n")]
# Will add title to the first section later in processing
sections=[Section(link=page.url, text="")]
+ [
Section(
link=f"{page.url}#{block_id.replace('-', '')}",

View File

@ -14,6 +14,7 @@ from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_st
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger
@ -94,26 +95,24 @@ class ProductboardConnector(PollConnector):
for feature in self._fetch_documents(
initial_link=f"{_PRODUCT_BOARD_BASE_URL}/features"
):
owner = self._get_owner_email(feature)
experts = [BasicExpertInfo(email=owner)] if owner else None
yield Document(
id=feature["id"],
sections=[
Section(
link=feature["links"]["html"],
text=" - ".join(
(
feature["name"],
self._parse_description_html(feature["description"]),
)
),
text=self._parse_description_html(feature["description"]),
)
],
semantic_identifier=feature["name"],
source=DocumentSource.PRODUCTBOARD,
doc_updated_at=time_str_to_utc(feature["updatedAt"]),
primary_owners=experts,
metadata={
"productboard_entity_type": feature["type"],
"entity_type": feature["type"],
"status": feature["status"]["name"],
"owner": self._get_owner_email(feature),
},
)
@ -122,25 +121,23 @@ class ProductboardConnector(PollConnector):
for component in self._fetch_documents(
initial_link=f"{_PRODUCT_BOARD_BASE_URL}/components"
):
owner = self._get_owner_email(component)
experts = [BasicExpertInfo(email=owner)] if owner else None
yield Document(
id=component["id"],
sections=[
Section(
link=component["links"]["html"],
text=" - ".join(
(
component["name"],
self._parse_description_html(component["description"]),
)
),
text=self._parse_description_html(component["description"]),
)
],
semantic_identifier=component["name"],
source=DocumentSource.PRODUCTBOARD,
doc_updated_at=time_str_to_utc(component["updatedAt"]),
primary_owners=experts,
metadata={
"productboard_entity_type": "component",
"owner": self._get_owner_email(component),
"entity_type": "component",
},
)
@ -150,25 +147,23 @@ class ProductboardConnector(PollConnector):
for product in self._fetch_documents(
initial_link=f"{_PRODUCT_BOARD_BASE_URL}/products"
):
owner = self._get_owner_email(product)
experts = [BasicExpertInfo(email=owner)] if owner else None
yield Document(
id=product["id"],
sections=[
Section(
link=product["links"]["html"],
text=" - ".join(
(
product["name"],
self._parse_description_html(product["description"]),
)
),
text=self._parse_description_html(product["description"]),
)
],
semantic_identifier=product["name"],
source=DocumentSource.PRODUCTBOARD,
doc_updated_at=time_str_to_utc(product["updatedAt"]),
primary_owners=experts,
metadata={
"productboard_entity_type": "product",
"owner": self._get_owner_email(product),
"entity_type": "product",
},
)
@ -176,26 +171,24 @@ class ProductboardConnector(PollConnector):
for objective in self._fetch_documents(
initial_link=f"{_PRODUCT_BOARD_BASE_URL}/objectives"
):
owner = self._get_owner_email(objective)
experts = [BasicExpertInfo(email=owner)] if owner else None
yield Document(
id=objective["id"],
sections=[
Section(
link=objective["links"]["html"],
text=" - ".join(
(
objective["name"],
self._parse_description_html(objective["description"]),
)
),
text=self._parse_description_html(objective["description"]),
)
],
semantic_identifier=objective["name"],
source=DocumentSource.PRODUCTBOARD,
doc_updated_at=time_str_to_utc(objective["updatedAt"]),
primary_owners=experts,
metadata={
"productboard_entity_type": "release",
"entity_type": "release",
"state": objective["state"],
"owner": self._get_owner_email(objective),
},
)

View File

@ -97,7 +97,8 @@ class RequestTrackerConnector(PollConnector):
logger.info(f"Processing ticket {tid}")
doc = Document(
id=ticket["id"],
sections=[Section(link=ticketLink, text=f"{ticket['Subject']}\n")]
# Will add title to the first section later in processing
sections=[Section(link=ticketLink, text="")]
+ self.build_doc_sections_from_txn(Rt0, tid),
source=DocumentSource.REQUESTTRACKER,
semantic_identifier=ticket["Subject"],

View File

@ -642,6 +642,7 @@ def create_db_search_doc(
source_type=server_search_doc.source_type,
boost=server_search_doc.boost,
hidden=server_search_doc.hidden,
doc_metadata=server_search_doc.metadata,
score=server_search_doc.score,
match_highlights=server_search_doc.match_highlights,
updated_at=server_search_doc.updated_at,
@ -674,6 +675,7 @@ def translate_db_search_doc_to_server_search_doc(
source_type=db_search_doc.source_type,
boost=db_search_doc.boost,
hidden=db_search_doc.hidden,
metadata=db_search_doc.doc_metadata,
score=db_search_doc.score,
match_highlights=db_search_doc.match_highlights,
updated_at=db_search_doc.updated_at,

View File

@ -17,6 +17,7 @@ from danswer.db.models import ConnectorCredentialPair
from danswer.db.models import Credential
from danswer.db.models import Document as DbDocument
from danswer.db.models import DocumentByConnectorCredentialPair
from danswer.db.tag import delete_document_tags_for_documents
from danswer.db.utils import model_to_dict
from danswer.document_index.interfaces import DocumentMetadata
from danswer.server.documents.models import ConnectorCredentialPairIdentifier
@ -272,6 +273,7 @@ def delete_documents_complete(db_session: Session, document_ids: list[str]) -> N
delete_document_feedback_for_documents(
document_ids=document_ids, db_session=db_session
)
delete_document_tags_for_documents(document_ids=document_ids, db_session=db_session)
delete_documents(db_session, document_ids)
db_session.commit()

View File

@ -22,6 +22,7 @@ from sqlalchemy import Integer
from sqlalchemy import Sequence
from sqlalchemy import String
from sqlalchemy import Text
from sqlalchemy import UniqueConstraint
from sqlalchemy.dialects import postgresql
from sqlalchemy.orm import DeclarativeBase
from sqlalchemy.orm import Mapped
@ -153,6 +154,15 @@ class ChatMessage__SearchDoc(Base):
)
class Document__Tag(Base):
__tablename__ = "document__tag"
document_id: Mapped[str] = mapped_column(
ForeignKey("document.id"), primary_key=True
)
tag_id: Mapped[int] = mapped_column(ForeignKey("tag.id"), primary_key=True)
"""
Documents/Indexing Tables
"""
@ -247,6 +257,32 @@ class Document(Base):
retrieval_feedbacks: Mapped[List["DocumentRetrievalFeedback"]] = relationship(
"DocumentRetrievalFeedback", back_populates="document"
)
tags = relationship(
"Tag",
secondary="document__tag",
back_populates="documents",
)
class Tag(Base):
__tablename__ = "tag"
id: Mapped[int] = mapped_column(primary_key=True)
tag_key: Mapped[str] = mapped_column(String)
tag_value: Mapped[str] = mapped_column(String)
source: Mapped[DocumentSource] = mapped_column(Enum(DocumentSource))
documents = relationship(
"Document",
secondary="document__tag",
back_populates="tags",
)
__table_args__ = (
UniqueConstraint(
"tag_key", "tag_value", "source", name="_tag_key_value_source_uc"
),
)
class Connector(Base):
@ -424,6 +460,7 @@ class SearchDoc(Base):
boost: Mapped[int] = mapped_column(Integer)
source_type: Mapped[DocumentSource] = mapped_column(Enum(DocumentSource))
hidden: Mapped[bool] = mapped_column(Boolean)
doc_metadata: Mapped[dict[str, str | list[str]]] = mapped_column(postgresql.JSONB())
score: Mapped[float] = mapped_column(Float)
match_highlights: Mapped[list[str]] = mapped_column(postgresql.ARRAY(String))
# This is for the document, not this row in the table

backend/danswer/db/tag.py (new file, 116 lines)
View File

@ -0,0 +1,116 @@
from sqlalchemy import delete
from sqlalchemy import func
from sqlalchemy import select
from sqlalchemy.orm import Session
from danswer.configs.constants import DocumentSource
from danswer.db.models import Document
from danswer.db.models import Document__Tag
from danswer.db.models import Tag
from danswer.utils.logger import setup_logger
logger = setup_logger()
def create_or_add_document_tag(
tag_key: str,
tag_value: str,
source: DocumentSource,
document_id: str,
db_session: Session,
) -> Tag:
document = db_session.get(Document, document_id)
if not document:
raise ValueError("Invalid Document, cannot attach Tags")
tag_stmt = select(Tag).where(
Tag.tag_key == tag_key,
Tag.tag_value == tag_value,
Tag.source == source,
)
tag = db_session.execute(tag_stmt).scalar_one_or_none()
if not tag:
tag = Tag(tag_key=tag_key, tag_value=tag_value, source=source)
db_session.add(tag)
if tag not in document.tags:
document.tags.append(tag)
db_session.commit()
return tag
def create_or_add_document_tag_list(
tag_key: str,
tag_values: list[str],
source: DocumentSource,
document_id: str,
db_session: Session,
) -> list[Tag]:
document = db_session.get(Document, document_id)
if not document:
raise ValueError("Invalid Document, cannot attach Tags")
existing_tags_stmt = select(Tag).where(
Tag.tag_key == tag_key, Tag.tag_value.in_(tag_values), Tag.source == source
)
existing_tags = list(db_session.execute(existing_tags_stmt).scalars().all())
existing_tag_values = {tag.tag_value for tag in existing_tags}
new_tags = []
for tag_value in tag_values:
if tag_value not in existing_tag_values:
new_tag = Tag(tag_key=tag_key, tag_value=tag_value, source=source)
db_session.add(new_tag)
new_tags.append(new_tag)
all_tags = existing_tags + new_tags
for tag in all_tags:
if tag not in document.tags:
document.tags.append(tag)
db_session.commit()
return all_tags
def get_tags_by_value_prefix_for_source_types(
tag_value_prefix: str | None,
sources: list[DocumentSource] | None,
db_session: Session,
) -> list[Tag]:
query = select(Tag)
if tag_value_prefix:
query = query.where(Tag.tag_value.startswith(tag_value_prefix))
if sources:
query = query.where(Tag.source.in_(sources))
result = db_session.execute(query)
tags = result.scalars().all()
return list(tags)
def delete_document_tags_for_documents(
document_ids: list[str], db_session: Session
) -> None:
stmt = delete(Document__Tag).where(Document__Tag.document_id.in_(document_ids))
db_session.execute(stmt)
db_session.commit()
orphan_tags_query = (
select(Tag.id)
.outerjoin(Document__Tag, Tag.id == Document__Tag.tag_id)
.group_by(Tag.id)
.having(func.count(Document__Tag.document_id) == 0)
)
orphan_tags = db_session.execute(orphan_tags_query).scalars().all()
if orphan_tags:
delete_orphan_tags_stmt = delete(Tag).where(Tag.id.in_(orphan_tags))
db_session.execute(delete_orphan_tags_stmt)
db_session.commit()
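A minimal usage sketch of the new tag helpers (the engine setup and document id are assumptions; the document must already exist in Postgres or a ValueError is raised):

from sqlalchemy.orm import Session

from danswer.configs.constants import DocumentSource
from danswer.db.engine import get_sqlalchemy_engine
from danswer.db.tag import (
    create_or_add_document_tag_list,
    get_tags_by_value_prefix_for_source_types,
)

with Session(get_sqlalchemy_engine()) as db_session:
    # Attach a multi-valued tag to an already-indexed document (id is illustrative)
    create_or_add_document_tag_list(
        tag_key="labels",
        tag_values=["hr", "eng"],
        source=DocumentSource.BOOKSTACK,
        document_id="book__123",
        db_session=db_session,
    )

    # Prefix search over tag values, optionally restricted to certain sources
    tags = get_tags_by_value_prefix_for_source_types(
        tag_value_prefix="en",
        sources=[DocumentSource.BOOKSTACK],
        db_session=db_session,
    )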

View File

@ -7,12 +7,20 @@ schema danswer_chunk {
field chunk_id type int {
indexing: summary | attribute
}
field blurb type string {
# Displayed in the UI as the main identifier for the doc
field semantic_identifier type string {
indexing: summary | attribute
}
# Can separate out title in the future and give heavier bm-25 weighting
# Need to consider that not every doc has a separable title (i.e. a Slack message)
# Set summary options to enable bolding
# May not always match the `semantic_identifier` e.g. for Slack docs the
# `semantic_identifier` will be the channel name, but the `title` will be empty
field title type string {
indexing: summary | index
match {
gram
gram-size: 3
}
index: enable-bm25
}
field content type string {
indexing: summary | index
match {
@ -28,6 +36,25 @@ schema danswer_chunk {
indexing: summary | index
summary: dynamic
}
# Title embedding (x1)
field title_embedding type tensor<float>(x[384]) {
indexing: attribute
attribute {
distance-metric: angular
}
}
# Content embeddings (chunk + optional mini chunks embeddings)
# "t" and "x" are arbitrary names, not special keywords
field embeddings type tensor<float>(t{},x[384]) {
indexing: attribute
attribute {
distance-metric: angular
}
}
# Starting section of the doc, currently unused as it has been replaced by match highlighting
field blurb type string {
indexing: summary | attribute
}
# https://docs.vespa.ai/en/attributes.html potential enum store for speed, but probably not worth it
field source_type type string {
indexing: summary | attribute
@ -39,21 +66,6 @@ schema danswer_chunk {
field source_links type string {
indexing: summary | attribute
}
# displayed in the UI as the main identifier for the doc
field semantic_identifier type string {
indexing: summary | attribute
}
# this is used when computing best matches based on the title of the document
# may not always match the `semantic_identifier` e.g. for Slack docs the
# `semantic_identifier` will be the channel name, but the `title` will be empty
field title type string {
indexing: summary | index
match {
gram
gram-size: 3
}
index: enable-bm25
}
field section_continuation type bool {
indexing: summary | attribute
}
@ -65,15 +77,15 @@ schema danswer_chunk {
indexing: summary | attribute
rank: filter
}
# Needs to have a separate Attribute list for efficient filtering
field metadata_list type array<string> {
indexing: summary | attribute
rank:filter
attribute: fast-search
}
field metadata type string {
indexing: summary | attribute
}
field embeddings type tensor<float>(t{},x[384]) {
indexing: attribute
attribute {
distance-metric: angular
}
}
field doc_updated_at type int {
indexing: summary | attribute
}
@ -95,6 +107,11 @@ schema danswer_chunk {
}
}
# If using different tokenization settings, the fieldset has to be removed, and the field must
# be specified in the yql like:
# + 'or ({grammar: "weakAnd", defaultIndex:"title"}userInput(@query)) '
# + 'or ({grammar: "weakAnd", defaultIndex:"content"}userInput(@query)) '
# Note: for BM-25, the ngram size (and whether ngrams are used) changes the range of the scores
fieldset default {
fields: content, title
}
@ -124,6 +141,79 @@ schema danswer_chunk {
match-features: recency_bias
}
rank-profile hybrid_search inherits default, default_rank {
inputs {
query(query_embedding) tensor<float>(x[384])
}
# This must be a separate function for normalize_linear to work
function vector_score() {
expression {
(query(title_content_ratio) * closeness(field, title_embedding)) +
((1 - query(title_content_ratio)) * closeness(field, embeddings))
}
}
# This must be a separate function for normalize_linear to work
function keyword_score() {
expression {
(query(title_content_ratio) * bm25(title)) +
((1 - query(title_content_ratio)) * bm25(content))
}
}
first-phase {
expression: vector_score
}
# Weighted average between Vector Search and BM-25
# Each is a weighted average between the Title and Content fields
# Finally, each doc is boosted by its user-feedback-based boost and recency
# If any embedding or index field is missing, it just receives a score of 0
# Assumptions:
# - For a given query + corpus, the BM-25 scores will be relatively similar in distribution,
# therefore they are not normalized before combining.
# - Documents without a title get a score of 0 for the title component, which is acceptable
# since documents without any title match should be penalized.
global-phase {
expression {
(
# Weighted Vector Similarity Score
(query(alpha) * normalize_linear(vector_score)) +
# Weighted Keyword Similarity Score
((1 - query(alpha)) * normalize_linear(keyword_score))
)
# Boost based on user feedback
* document_boost
# Decay factor based on time document was last updated
* recency_bias
}
rerank-count: 1000
}
match-features {
bm25(title)
bm25(content)
closeness(field, title_embedding)
closeness(field, embeddings)
keyword_score
vector_score
document_boost
recency_bias
closest(embeddings)
}
}
# Used when searching from the admin UI for a specific doc to hide / boost
# Very heavily prioritize title
rank-profile admin_search inherits default, default_rank {
first-phase {
expression: bm25(content) + (5 * bm25(title))
}
}
# THE ONES BELOW ARE OUT OF DATE, DO NOT USE
# THEY MIGHT NOT EVEN WORK AT ALL
rank-profile keyword_search inherits default, default_rank {
first-phase {
expression: bm25(content) * document_boost * recency_bias
@ -145,29 +235,4 @@ schema danswer_chunk {
match-features: recency_bias document_boost closest(embeddings)
}
rank-profile hybrid_search inherits default, default_rank {
inputs {
query(query_embedding) tensor<float>(x[384])
}
first-phase {
expression: closeness(field, embeddings)
}
global-phase {
expression: ((query(alpha) * normalize_linear(closeness(field, embeddings))) + ((1 - query(alpha)) * normalize_linear(bm25(content)))) * document_boost * recency_bias
rerank-count: 1000
}
# Cannot pass normalize_linear features in match-features
match-features: recency_bias document_boost closest(embeddings)
}
# used when searching from the admin UI for a specific doc to hide / boost
rank-profile admin_search inherits default, default_rank {
first-phase {
expression: bm25(content) + (5 * bm25(title))
}
}
}

View File

@ -25,6 +25,7 @@ from danswer.configs.chat_configs import DOC_TIME_DECAY
from danswer.configs.chat_configs import EDIT_KEYWORD_QUERY
from danswer.configs.chat_configs import HYBRID_ALPHA
from danswer.configs.chat_configs import NUM_RETURNED_HITS
from danswer.configs.chat_configs import TITLE_CONTENT_RATIO
from danswer.configs.constants import ACCESS_CONTROL_LIST
from danswer.configs.constants import BLURB
from danswer.configs.constants import BOOST
@ -35,7 +36,9 @@ from danswer.configs.constants import DOCUMENT_ID
from danswer.configs.constants import DOCUMENT_SETS
from danswer.configs.constants import EMBEDDINGS
from danswer.configs.constants import HIDDEN
from danswer.configs.constants import INDEX_SEPARATOR
from danswer.configs.constants import METADATA
from danswer.configs.constants import METADATA_LIST
from danswer.configs.constants import PRIMARY_OWNERS
from danswer.configs.constants import RECENCY_BIAS
from danswer.configs.constants import SECONDARY_OWNERS
@ -44,6 +47,8 @@ from danswer.configs.constants import SEMANTIC_IDENTIFIER
from danswer.configs.constants import SOURCE_LINKS
from danswer.configs.constants import SOURCE_TYPE
from danswer.configs.constants import TITLE
from danswer.configs.constants import TITLE_EMBEDDING
from danswer.configs.constants import TITLE_SEPARATOR
from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF
from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
get_experts_stores_representations,
@ -239,20 +244,25 @@ def _index_vespa_chunk(
for ind, m_c_embed in enumerate(embeddings.mini_chunk_embeddings):
embeddings_name_vector_map[f"mini_chunk_{ind}"] = m_c_embed
title = document.get_title_for_document_index()
vespa_document_fields = {
DOCUMENT_ID: document.id,
CHUNK_ID: chunk.chunk_id,
BLURB: remove_invalid_unicode_chars(chunk.blurb),
# this duplication of `content` is needed for keyword highlighting :(
TITLE: remove_invalid_unicode_chars(title) if title else None,
CONTENT: remove_invalid_unicode_chars(chunk.content),
# This duplication of `content` is needed for keyword highlighting :(
CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content),
SOURCE_TYPE: str(document.source.value),
SOURCE_LINKS: json.dumps(chunk.source_links),
SEMANTIC_IDENTIFIER: remove_invalid_unicode_chars(document.semantic_identifier),
TITLE: remove_invalid_unicode_chars(document.get_title_for_document_index()),
SECTION_CONTINUATION: chunk.section_continuation,
METADATA: json.dumps(document.metadata),
# Save as a list for efficient extraction as an Attribute
METADATA_LIST: chunk.source_document.get_metadata_str_attributes(),
EMBEDDINGS: embeddings_name_vector_map,
TITLE_EMBEDDING: chunk.title_embedding,
BOOST: chunk.boost,
DOC_UPDATED_AT: _vespa_get_updated_at_attribute(document.doc_updated_at),
PRIMARY_OWNERS: get_experts_stores_representations(document.primary_owners),
@ -394,6 +404,12 @@ def _build_vespa_filters(filters: IndexFilters, include_hidden: bool = False) ->
)
filter_str += _build_or_filters(SOURCE_TYPE, source_strs)
tag_attributes = None
tags = filters.tags
if tags:
tag_attributes = [tag.tag_key + INDEX_SEPARATOR + tag.tag_value for tag in tags]
filter_str += _build_or_filters(METADATA_LIST, tag_attributes)
filter_str += _build_or_filters(DOCUMENT_SETS, filters.document_set)
filter_str += _build_time_filter(filters.time_cutoff)
@ -448,6 +464,8 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk:
if DOC_UPDATED_AT in fields
else None
)
# The highlights might include the title, but this is the best way we have so far to show highlighting
match_highlights = _process_dynamic_summary(
# fallback to regular `content` if the `content_summary` field
# isn't present
@ -459,6 +477,13 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk:
f"Chunk with blurb: {fields.get(BLURB, 'Unknown')[:50]}... has no Semantic Identifier"
)
# Remove the title from the first chunk, as every chunk already includes
# its semantic identifier for the LLM
content = fields[CONTENT]
if fields[CHUNK_ID] == 0:
parts = content.split(TITLE_SEPARATOR, maxsplit=1)
content = parts[1] if len(parts) > 1 and "\n" not in parts[0] else content
# A user ran into this; not sure why it could happen, so adding error checking here
blurb = fields.get(BLURB)
if not blurb:
@ -477,7 +502,7 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk:
return InferenceChunk(
chunk_id=fields[CHUNK_ID],
blurb=blurb,
content=fields[CONTENT],
content=content,
source_links=source_links_dict,
section_continuation=fields[SECTION_CONTINUATION],
document_id=fields[DOCUMENT_ID],
@ -725,6 +750,7 @@ class VespaIndex(DocumentIndex):
num_to_retrieve: int = NUM_RETURNED_HITS,
edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
) -> list[InferenceChunk]:
# IMPORTANT: THIS FUNCTION IS NOT UP TO DATE, DOES NOT WORK CORRECTLY
vespa_where_clauses = _build_vespa_filters(filters)
yql = (
VespaIndex.yql_base
@ -759,6 +785,7 @@ class VespaIndex(DocumentIndex):
distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF,
edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
) -> list[InferenceChunk]:
# IMPORTANT: THIS FUNCTION IS NOT UP TO DATE, DOES NOT WORK CORRECTLY
vespa_where_clauses = _build_vespa_filters(filters)
yql = (
VespaIndex.yql_base
@ -798,6 +825,7 @@ class VespaIndex(DocumentIndex):
time_decay_multiplier: float,
num_to_retrieve: int,
hybrid_alpha: float | None = HYBRID_ALPHA,
title_content_ratio: float | None = TITLE_CONTENT_RATIO,
distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF,
edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
) -> list[InferenceChunk]:
@ -808,6 +836,7 @@ class VespaIndex(DocumentIndex):
VespaIndex.yql_base
+ vespa_where_clauses
+ f"(({{targetHits: {target_hits}}}nearestNeighbor(embeddings, query_embedding)) "
+ f"or ({{targetHits: {target_hits}}}nearestNeighbor(title_embedding, query_embedding)) "
+ 'or ({grammar: "weakAnd"}userInput(@query)) '
+ f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
)
@ -828,6 +857,9 @@ class VespaIndex(DocumentIndex):
"input.query(alpha)": hybrid_alpha
if hybrid_alpha is not None
else HYBRID_ALPHA,
"input.query(title_content_ratio)": title_content_ratio
if title_content_ratio is not None
else TITLE_CONTENT_RATIO,
"hits": num_to_retrieve,
"offset": 0,
"ranking.profile": "hybrid_search",

View File

@ -7,15 +7,15 @@ from transformers import AutoTokenizer # type:ignore
from danswer.configs.app_configs import BLURB_SIZE
from danswer.configs.app_configs import CHUNK_OVERLAP
from danswer.configs.app_configs import MINI_CHUNK_SIZE
from danswer.configs.constants import SECTION_SEPARATOR
from danswer.configs.constants import TITLE_SEPARATOR
from danswer.configs.model_configs import CHUNK_SIZE
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.indexing.models import DocAwareChunk
from danswer.search.search_nlp_models import get_default_tokenizer
from danswer.utils.text_processing import shared_precompare_cleanup
SECTION_SEPARATOR = "\n\n"
ChunkFunc = Callable[[Document], list[DocAwareChunk]]
@ -29,7 +29,8 @@ def extract_blurb(text: str, blurb_size: int) -> str:
def chunk_large_section(
section: Section,
section_text: str,
section_link_text: str,
document: Document,
start_chunk_id: int,
tokenizer: AutoTokenizer,
@ -37,8 +38,6 @@ def chunk_large_section(
chunk_overlap: int = CHUNK_OVERLAP,
blurb_size: int = BLURB_SIZE,
) -> list[DocAwareChunk]:
section_text = section.text
section_link_text = section.link or ""
blurb = extract_blurb(section_text, blurb_size)
sentence_aware_splitter = SentenceSplitter(
@ -67,14 +66,18 @@ def chunk_document(
subsection_overlap: int = CHUNK_OVERLAP,
blurb_size: int = BLURB_SIZE,
) -> list[DocAwareChunk]:
title = document.get_title_for_document_index()
title_prefix = title.replace("\n", " ") + TITLE_SEPARATOR if title else ""
tokenizer = get_default_tokenizer()
chunks: list[DocAwareChunk] = []
link_offsets: dict[int, str] = {}
chunk_text = ""
for section in document.sections:
for ind, section in enumerate(document.sections):
section_text = title_prefix + section.text if ind == 0 else section.text
section_link_text = section.link or ""
section_tok_length = len(tokenizer.tokenize(section.text))
section_tok_length = len(tokenizer.tokenize(section_text))
current_tok_length = len(tokenizer.tokenize(chunk_text))
curr_offset_len = len(shared_precompare_cleanup(chunk_text))
@ -96,7 +99,8 @@ def chunk_document(
chunk_text = ""
large_section_chunks = chunk_large_section(
section=section,
section_text=section_text,
section_link_text=section_link_text,
document=document,
start_chunk_id=len(chunks),
tokenizer=tokenizer,
@ -115,7 +119,7 @@ def chunk_document(
<= chunk_tok_size
):
chunk_text += (
SECTION_SEPARATOR + section.text if chunk_text else section.text
SECTION_SEPARATOR + section_text if chunk_text else section_text
)
link_offsets[curr_offset_len] = section_link_text
else:
@ -130,7 +134,7 @@ def chunk_document(
)
)
link_offsets = {0: section_link_text}
chunk_text = section.text
chunk_text = section_text
# Once we hit the end, if we're still in the process of building a chunk, add what we have
if chunk_text:
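To make the title handling concrete, here is a small sketch of what chunk 0's text looks like after chunking and how the Vespa read path shown earlier strips the prefix again at inference time (the title and section text are invented):

TITLE_SEPARATOR = "\n\r\n"  # as defined in danswer.configs.constants

title = "Onboarding Guide"
first_section_text = "Welcome to the team..."

# chunk_document() prefixes the title onto the first section only
chunk_0_text = title + TITLE_SEPARATOR + first_section_text

# _vespa_hit_to_inference_chunk() later removes it for chunk_id == 0
parts = chunk_0_text.split(TITLE_SEPARATOR, maxsplit=1)
content = parts[1] if len(parts) > 1 and "\n" not in parts[0] else chunk_0_text

assert content == first_section_text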

View File

@ -21,6 +21,9 @@ def embed_chunks(
enable_mini_chunk: bool = ENABLE_MINI_CHUNK,
passage_prefix: str = ASYM_PASSAGE_PREFIX,
) -> list[IndexChunk]:
# Cache the Title embeddings so each distinct title only has to be embedded once
title_embed_dict: dict[str, list[float]] = {}
embedded_chunks: list[IndexChunk] = []
if embedding_model is None:
embedding_model = EmbeddingModel()
@ -58,12 +61,24 @@ def embed_chunks(
chunk_embeddings = embeddings[
embedding_ind_start : embedding_ind_start + num_embeddings
]
title = chunk.source_document.get_title_for_document_index()
title_embedding = None
if title:
if title in title_embed_dict:
title_embedding = title_embed_dict[title]
else:
title_embedding = embedding_model.encode([title])[0]
title_embed_dict[title] = title_embedding
new_embedded_chunk = IndexChunk(
**{k: getattr(chunk, k) for k in chunk.__dataclass_fields__},
embeddings=ChunkEmbedding(
full_embedding=chunk_embeddings[0],
mini_chunk_embeddings=chunk_embeddings[1:],
),
title_embedding=title_embedding,
)
embedded_chunks.append(new_embedded_chunk)
embedding_ind_start += num_embeddings

View File

@ -17,6 +17,8 @@ from danswer.db.document import update_docs_updated_at
from danswer.db.document import upsert_documents_complete
from danswer.db.document_set import fetch_document_sets_for_documents
from danswer.db.engine import get_sqlalchemy_engine
from danswer.db.tag import create_or_add_document_tag
from danswer.db.tag import create_or_add_document_tag_list
from danswer.document_index.factory import get_default_document_index
from danswer.document_index.interfaces import DocumentIndex
from danswer.document_index.interfaces import DocumentMetadata
@ -44,6 +46,7 @@ def upsert_documents_in_db(
index_attempt_metadata: IndexAttemptMetadata,
db_session: Session,
) -> None:
# Metadata here refers to basic document info, not metadata about the actual content
doc_m_batch: list[DocumentMetadata] = []
for doc in documents:
first_link = next(
@ -66,6 +69,26 @@ def upsert_documents_in_db(
document_metadata_batch=doc_m_batch,
)
# Insert document content metadata
for doc in documents:
for k, v in doc.metadata.items():
if isinstance(v, list):
create_or_add_document_tag_list(
tag_key=k,
tag_values=v,
source=doc.source,
document_id=doc.id,
db_session=db_session,
)
else:
create_or_add_document_tag(
tag_key=k,
tag_value=v,
source=doc.source,
document_id=doc.id,
db_session=db_session,
)
@log_function_time()
def index_doc_batch(
@ -121,6 +144,8 @@ def index_doc_batch(
)
logger.debug("Starting chunking")
# The first chunk additionally contains the Title of the Document
chunks: list[DocAwareChunk] = list(
chain(*[chunker.chunk(document=document) for document in updatable_docs])
)

View File

@ -1,7 +1,6 @@
from dataclasses import dataclass
from dataclasses import fields
from datetime import datetime
from typing import Any
from danswer.access.models import DocumentAccess
from danswer.configs.constants import DocumentSource
@ -48,6 +47,7 @@ class DocAwareChunk(BaseChunk):
@dataclass
class IndexChunk(DocAwareChunk):
embeddings: ChunkEmbedding
title_embedding: Embedding | None
@dataclass
@ -95,7 +95,7 @@ class InferenceChunk(BaseChunk):
recency_bias: float
score: float | None
hidden: bool
metadata: dict[str, Any]
metadata: dict[str, str | list[str]]
# Matched sections in the chunk. Uses Vespa syntax e.g. <hi>TEXT</hi>
# to specify that a set of words should be highlighted. For example:
# ["<hi>the</hi> <hi>answer</hi> is 42", "he couldn't find an <hi>answer</hi>"]

View File

@ -48,10 +48,16 @@ class Embedder:
raise NotImplementedError
class Tag(BaseModel):
tag_key: str
tag_value: str
class BaseFilters(BaseModel):
source_type: list[DocumentSource] | None = None
document_set: list[str] | None = None
time_cutoff: datetime | None = None
tags: list[Tag] | None = None
class IndexFilters(BaseFilters):
@ -110,6 +116,7 @@ class SearchDoc(BaseModel):
# since a standard search will never find a hidden doc, this can only ever
# be `True` when doing an admin search
hidden: bool
metadata: dict[str, str | list[str]]
score: float | None
# Matched sections in the doc. Uses Vespa syntax e.g. <hi>TEXT</hi>
# to specify that a set of words should be highlighted. For example:
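A small sketch of how a caller expresses a tag filter with the new models (field values are illustrative):

from danswer.configs.constants import DocumentSource
from danswer.search.models import BaseFilters, Tag

filters = BaseFilters(
    source_type=[DocumentSource.BOOKSTACK],
    tags=[Tag(tag_key="type", tag_value="book")],
)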

View File

@ -121,6 +121,7 @@ def retrieval_preprocessing(
source_type=preset_filters.source_type or predicted_source_filters,
document_set=preset_filters.document_set,
time_cutoff=preset_filters.time_cutoff or predicted_time_cutoff,
tags=preset_filters.tags, # Tags are never auto-extracted
access_control_list=user_acl_filters,
)

View File

@ -96,6 +96,7 @@ def chunks_to_search_docs(chunks: list[InferenceChunk] | None) -> list[SearchDoc
source_type=chunk.source_type,
boost=chunk.boost,
hidden=chunk.hidden,
metadata=chunk.metadata,
score=chunk.score,
match_highlights=chunk.match_highlights,
updated_at=chunk.updated_at,

View File

@ -5,12 +5,29 @@ from pydantic import BaseModel
from pydantic import root_validator
from danswer.chat.models import RetrievalDocs
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import MessageType
from danswer.configs.constants import SearchFeedbackType
from danswer.search.models import BaseFilters
from danswer.search.models import RetrievalDetails
from danswer.search.models import SearchDoc
from danswer.search.models import SearchType
from danswer.search.models import Tag
class TagRequest(BaseModel):
match_pattern: str | None
# If this is empty or None, then tags for all sources are considered
sources: list[DocumentSource] | None
allow_prefix: bool = True # This is currently the only option
class SourceTag(Tag):
source: DocumentSource
class TagResponse(BaseModel):
tags: list[SourceTag]
class SimpleQueryRequest(BaseModel):

View File

@ -9,6 +9,7 @@ from danswer.auth.users import current_user
from danswer.configs.chat_configs import DISABLE_LLM_CHUNK_FILTER
from danswer.db.engine import get_session
from danswer.db.models import User
from danswer.db.tag import get_tags_by_value_prefix_for_source_types
from danswer.document_index.factory import get_default_document_index
from danswer.document_index.vespa.index import VespaIndex
from danswer.one_shot_answer.answer_question import stream_search_answer
@ -30,6 +31,9 @@ from danswer.server.query_and_chat.models import DocumentSearchRequest
from danswer.server.query_and_chat.models import HelperResponse
from danswer.server.query_and_chat.models import QueryValidationResponse
from danswer.server.query_and_chat.models import SimpleQueryRequest
from danswer.server.query_and_chat.models import SourceTag
from danswer.server.query_and_chat.models import TagRequest
from danswer.server.query_and_chat.models import TagResponse
from danswer.utils.logger import setup_logger
logger = setup_logger()
@ -75,6 +79,29 @@ def admin_search(
return AdminSearchResponse(documents=deduplicated_documents)
@basic_router.post("/valid-tags")
def get_tags(
tag_request: TagRequest,
_: User = Depends(current_user),
db_session: Session = Depends(get_session),
) -> TagResponse:
if not tag_request.allow_prefix:
raise NotImplementedError("Cannot disable prefix match for now")
db_tags = get_tags_by_value_prefix_for_source_types(
tag_value_prefix=tag_request.match_pattern,
sources=tag_request.sources,
db_session=db_session,
)
server_tags = [
SourceTag(
tag_key=db_tag.tag_key, tag_value=db_tag.tag_value, source=db_tag.source
)
for db_tag in db_tags
]
return TagResponse(tags=server_tags)
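For illustration, a hedged example of calling the new endpoint with the requests library (the base URL and router prefix are guesses about a local deployment, and the source/tag values are invented; the response shape follows TagResponse/SourceTag above):

import requests

resp = requests.post(
    "http://localhost:8080/query/valid-tags",  # URL and "/query" prefix are assumptions; adjust to your deployment
    json={"match_pattern": "en", "sources": ["bookstack"], "allow_prefix": True},
)
print(resp.json())
# e.g. {"tags": [{"tag_key": "labels", "tag_value": "eng", "source": "bookstack"}, ...]}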
@basic_router.post("/search-intent")
def get_search_type(
simple_query: SimpleQueryRequest, _: User = Depends(current_user)

View File

@ -30,7 +30,11 @@ def send_chat_message(
"chat_session_id": chat_session_id,
"parent_message_id": parent_message,
"prompt_id": 0, # Global default Prompt
"retrieval_options": {"run_search": "always", "real_time": True},
"retrieval_options": {
"run_search": "always",
"real_time": True,
"filters": {"tags": []},
},
}
docs: list[dict] | None = None
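To actually restrict a chat search by tag, the retrieval_options payload above would carry tag objects in its filters, for example (the key/value pair is invented):

retrieval_options_with_tag_filter = {
    "run_search": "always",
    "real_time": True,
    # Tag key/value must match tags that were stored at indexing time
    "filters": {"tags": [{"tag_key": "type", "tag_value": "book"}]},
}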