From d7141df5fc28ef14e608e04dc9d8c3cca619d0aa Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Tue, 2 Jan 2024 11:25:50 -0800 Subject: [PATCH] Metadata and Title Search (#903) --- backend/alembic/versions/904e5138fffb_tags.py | 61 +++++++ backend/danswer/configs/chat_configs.py | 9 +- backend/danswer/configs/constants.py | 8 + .../danswer/connectors/bookstack/connector.py | 64 +++++-- .../connectors/confluence/connector.py | 6 +- .../cross_connector_utils/__init__.py | 0 .../connectors/danswer_jira/connector.py | 29 +-- .../connectors/document360/connector.py | 11 +- backend/danswer/connectors/file/__init__.py | 0 .../danswer/connectors/github/connector.py | 8 +- backend/danswer/connectors/gong/connector.py | 10 +- .../connectors/google_drive/connector.py | 10 +- .../connectors/google_site/__init__.py | 0 backend/danswer/connectors/guru/connector.py | 2 +- .../danswer/connectors/hubspot/connector.py | 2 +- backend/danswer/connectors/linear/__init__.py | 0 .../danswer/connectors/linear/connector.py | 4 +- backend/danswer/connectors/models.py | 27 ++- .../danswer/connectors/notion/connector.py | 3 +- .../connectors/productboard/connector.py | 57 +++--- .../connectors/requesttracker/connector.py | 3 +- backend/danswer/connectors/zulip/__init__.py | 0 backend/danswer/db/chat.py | 2 + backend/danswer/db/document.py | 2 + backend/danswer/db/models.py | 37 ++++ backend/danswer/db/tag.py | 116 ++++++++++++ .../vespa/app_config/schemas/danswer_chunk.sd | 165 ++++++++++++------ backend/danswer/document_index/vespa/index.py | 38 +++- backend/danswer/indexing/chunker.py | 24 +-- backend/danswer/indexing/embedder.py | 15 ++ backend/danswer/indexing/indexing_pipeline.py | 25 +++ backend/danswer/indexing/models.py | 4 +- backend/danswer/search/models.py | 7 + .../danswer/search/request_preprocessing.py | 1 + backend/danswer/search/search_runner.py | 1 + .../danswer/server/query_and_chat/models.py | 17 ++ .../server/query_and_chat/query_backend.py | 27 +++ backend/scripts/simulate_chat_frontend.py | 6 +- 38 files changed, 639 insertions(+), 162 deletions(-) create mode 100644 backend/alembic/versions/904e5138fffb_tags.py create mode 100644 backend/danswer/connectors/cross_connector_utils/__init__.py create mode 100644 backend/danswer/connectors/file/__init__.py create mode 100644 backend/danswer/connectors/google_site/__init__.py create mode 100644 backend/danswer/connectors/linear/__init__.py create mode 100644 backend/danswer/connectors/zulip/__init__.py create mode 100644 backend/danswer/db/tag.py diff --git a/backend/alembic/versions/904e5138fffb_tags.py b/backend/alembic/versions/904e5138fffb_tags.py new file mode 100644 index 000000000..aaf4bd51f --- /dev/null +++ b/backend/alembic/versions/904e5138fffb_tags.py @@ -0,0 +1,61 @@ +"""Tags + +Revision ID: 904e5138fffb +Revises: 891cd83c87a8 +Create Date: 2024-01-01 10:44:43.733974 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. 
+revision = "904e5138fffb"
+down_revision = "891cd83c87a8"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.create_table(
+        "tag",
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column("tag_key", sa.String(), nullable=False),
+        sa.Column("tag_value", sa.String(), nullable=False),
+        sa.Column("source", sa.String(), nullable=False),
+        sa.PrimaryKeyConstraint("id"),
+        sa.UniqueConstraint(
+            "tag_key", "tag_value", "source", name="_tag_key_value_source_uc"
+        ),
+    )
+    op.create_table(
+        "document__tag",
+        sa.Column("document_id", sa.String(), nullable=False),
+        sa.Column("tag_id", sa.Integer(), nullable=False),
+        sa.ForeignKeyConstraint(
+            ["document_id"],
+            ["document.id"],
+        ),
+        sa.ForeignKeyConstraint(
+            ["tag_id"],
+            ["tag.id"],
+        ),
+        sa.PrimaryKeyConstraint("document_id", "tag_id"),
+    )
+
+    op.add_column(
+        "search_doc",
+        sa.Column(
+            "doc_metadata",
+            postgresql.JSONB(astext_type=sa.Text()),
+            nullable=True,
+        ),
+    )
+    op.execute("UPDATE search_doc SET doc_metadata = '{}' WHERE doc_metadata IS NULL")
+    op.alter_column("search_doc", "doc_metadata", nullable=False)
+
+
+def downgrade() -> None:
+    op.drop_table("document__tag")
+    op.drop_table("tag")
+    op.drop_column("search_doc", "doc_metadata")
diff --git a/backend/danswer/configs/chat_configs.py b/backend/danswer/configs/chat_configs.py
index 872e54e38..21d7b8c28 100644
--- a/backend/danswer/configs/chat_configs.py
+++ b/backend/danswer/configs/chat_configs.py
@@ -59,7 +59,14 @@ if os.environ.get("EDIT_KEYWORD_QUERY"):
 else:
     EDIT_KEYWORD_QUERY = not os.environ.get("DOCUMENT_ENCODER_MODEL")
 # Weighting factor between Vector and Keyword Search, 1 for completely vector search
-HYBRID_ALPHA = max(0, min(1, float(os.environ.get("HYBRID_ALPHA") or 0.6)))
+HYBRID_ALPHA = max(0, min(1, float(os.environ.get("HYBRID_ALPHA") or 0.66)))
+# Weighting factor between Title and Content of documents during search, 1 for completely
+# Title based. Default heavily favors Content because Title is also included at the top of
+# Content. This is to avoid cases where the Content is very relevant but that may not be
+# clear from the Title alone. Title is more of a "boost" than a separate field.
+TITLE_CONTENT_RATIO = max(
+    0, min(1, float(os.environ.get("TITLE_CONTENT_RATIO") or 0.20))
+)
 # A list of languages passed to the LLM to rephrase the query
 # For example "English,French,Spanish", be sure to use the "," separator
 MULTILINGUAL_QUERY_EXPANSION = os.environ.get("MULTILINGUAL_QUERY_EXPANSION") or None
diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py
index dd1019d7e..d86f980c9 100644
--- a/backend/danswer/configs/constants.py
+++ b/backend/danswer/configs/constants.py
@@ -11,11 +11,13 @@ SEMANTIC_IDENTIFIER = "semantic_identifier"
 TITLE = "title"
 SECTION_CONTINUATION = "section_continuation"
 EMBEDDINGS = "embeddings"
+TITLE_EMBEDDING = "title_embedding"
 ALLOWED_USERS = "allowed_users"
 ACCESS_CONTROL_LIST = "access_control_list"
 DOCUMENT_SETS = "document_sets"
 TIME_FILTER = "time_filter"
 METADATA = "metadata"
+METADATA_LIST = "metadata_list"
 MATCH_HIGHLIGHTS = "match_highlights"
 # stored in the `metadata` of a chunk. Used to signify that this chunk should
 # not be used for QA.
For example, Google Drive file types which can't be parsed @@ -38,6 +40,12 @@ SESSION_KEY = "session" QUERY_EVENT_ID = "query_event_id" LLM_CHUNKS = "llm_chunks" +# For chunking/processing chunks +TITLE_SEPARATOR = "\n\r\n" +SECTION_SEPARATOR = "\n\n" +# For combining attributes, doesn't have to be unique/perfect to work +INDEX_SEPARATOR = "===" + class DocumentSource(str, Enum): # Special case, document passed in via Danswer APIs without specifying a source type diff --git a/backend/danswer/connectors/bookstack/connector.py b/backend/danswer/connectors/bookstack/connector.py index 1bc8d3f9e..606866b42 100644 --- a/backend/danswer/connectors/bookstack/connector.py +++ b/backend/danswer/connectors/bookstack/connector.py @@ -8,6 +8,7 @@ from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource from danswer.connectors.bookstack.client import BookStackApiClient from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic +from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector @@ -72,13 +73,21 @@ class BookstackConnector(LoadConnector, PollConnector): bookstack_client: BookStackApiClient, book: dict[str, Any] ) -> Document: url = bookstack_client.build_app_url("/books/" + str(book.get("slug"))) + title = str(book.get("name", "")) text = book.get("name", "") + "\n" + book.get("description", "") + updated_at_str = ( + str(book.get("updated_at")) if book.get("updated_at") is not None else None + ) return Document( - id="book:" + str(book.get("id")), + id="book__" + str(book.get("id")), sections=[Section(link=url, text=text)], source=DocumentSource.BOOKSTACK, - semantic_identifier="Book: " + str(book.get("name")), - metadata={"type": "book", "updated_at": str(book.get("updated_at"))}, + semantic_identifier="Book: " + title, + title=title, + doc_updated_at=time_str_to_utc(updated_at_str) + if updated_at_str is not None + else None, + metadata={"type": "book"}, ) @staticmethod @@ -91,13 +100,23 @@ class BookstackConnector(LoadConnector, PollConnector): + "/chapter/" + str(chapter.get("slug")) ) + title = str(chapter.get("name", "")) text = chapter.get("name", "") + "\n" + chapter.get("description", "") + updated_at_str = ( + str(chapter.get("updated_at")) + if chapter.get("updated_at") is not None + else None + ) return Document( - id="chapter:" + str(chapter.get("id")), + id="chapter__" + str(chapter.get("id")), sections=[Section(link=url, text=text)], source=DocumentSource.BOOKSTACK, - semantic_identifier="Chapter: " + str(chapter.get("name")), - metadata={"type": "chapter", "updated_at": str(chapter.get("updated_at"))}, + semantic_identifier="Chapter: " + title, + title=title, + doc_updated_at=time_str_to_utc(updated_at_str) + if updated_at_str is not None + else None, + metadata={"type": "chapter"}, ) @staticmethod @@ -105,13 +124,23 @@ class BookstackConnector(LoadConnector, PollConnector): bookstack_client: BookStackApiClient, shelf: dict[str, Any] ) -> Document: url = bookstack_client.build_app_url("/shelves/" + str(shelf.get("slug"))) + title = str(shelf.get("name", "")) text = shelf.get("name", "") + "\n" + shelf.get("description", "") + updated_at_str = ( + str(shelf.get("updated_at")) + if shelf.get("updated_at") is not None + else None + ) return Document( id="shelf:" + str(shelf.get("id")), 
sections=[Section(link=url, text=text)], source=DocumentSource.BOOKSTACK, - semantic_identifier="Shelf: " + str(shelf.get("name")), - metadata={"type": "shelf", "updated_at": shelf.get("updated_at")}, + semantic_identifier="Shelf: " + title, + title=title, + doc_updated_at=time_str_to_utc(updated_at_str) + if updated_at_str is not None + else None, + metadata={"type": "shelf"}, ) @staticmethod @@ -119,7 +148,7 @@ class BookstackConnector(LoadConnector, PollConnector): bookstack_client: BookStackApiClient, page: dict[str, Any] ) -> Document: page_id = str(page.get("id")) - page_name = str(page.get("name")) + title = str(page.get("name", "")) page_data = bookstack_client.get("/pages/" + page_id, {}) url = bookstack_client.build_app_url( "/books/" @@ -127,17 +156,24 @@ class BookstackConnector(LoadConnector, PollConnector): + "/page/" + str(page_data.get("slug")) ) - page_html = ( - "
<h1>" + html.escape(page_name) + "</h1>" + str(page_data.get("html"))
-        )
+        page_html = "<h1>" + html.escape(title) + "</h1>
" + str(page_data.get("html")) text = parse_html_page_basic(page_html) + updated_at_str = ( + str(page_data.get("updated_at")) + if page_data.get("updated_at") is not None + else None + ) time.sleep(0.1) return Document( id="page:" + page_id, sections=[Section(link=url, text=text)], source=DocumentSource.BOOKSTACK, - semantic_identifier="Page: " + str(page_name), - metadata={"type": "page", "updated_at": page_data.get("updated_at")}, + semantic_identifier="Page: " + str(title), + title=str(title), + doc_updated_at=time_str_to_utc(updated_at_str) + if updated_at_str is not None + else None, + metadata={"type": "page"}, ) def load_from_state(self) -> GenerateDocumentsOutput: diff --git a/backend/danswer/connectors/confluence/connector.py b/backend/danswer/connectors/confluence/connector.py index bf9b9c00f..f2d091e4d 100644 --- a/backend/danswer/connectors/confluence/connector.py +++ b/backend/danswer/connectors/confluence/connector.py @@ -333,11 +333,7 @@ class ConfluenceConnector(LoadConnector, PollConnector): if not page_html: logger.debug("Page is empty, skipping: %s", page_url) continue - page_text = ( - page.get("title", "") - + "\n" - + parse_html_page(page_html, self.confluence_client) - ) + page_text = parse_html_page(page_html, self.confluence_client) comments_text = self._fetch_comments(self.confluence_client, page_id) page_text += comments_text diff --git a/backend/danswer/connectors/cross_connector_utils/__init__.py b/backend/danswer/connectors/cross_connector_utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/danswer/connectors/danswer_jira/connector.py b/backend/danswer/connectors/danswer_jira/connector.py index 0bfd74f60..8d82fd8b4 100644 --- a/backend/danswer/connectors/danswer_jira/connector.py +++ b/backend/danswer/connectors/danswer_jira/connector.py @@ -3,16 +3,17 @@ from datetime import timezone from typing import Any from urllib.parse import urlparse -from dateutil.parser import parse from jira import JIRA from jira.resources import Issue from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource +from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import SecondsSinceUnixEpoch +from danswer.connectors.models import BasicExpertInfo from danswer.connectors.models import ConnectorMissingCredentialError from danswer.connectors.models import Document from danswer.connectors.models import Section @@ -60,26 +61,32 @@ def fetch_jira_issues_batch( logger.warning(f"Found Jira object not of type Issue {jira}") continue - ticket_updated_time = parse(jira.fields.updated) - - semantic_rep = ( - f"Jira Ticket Summary: {jira.fields.summary}\n" - f"Description: {jira.fields.description}\n" - + "\n".join( - [f"Comment: {comment.body}" for comment in jira.fields.comment.comments] - ) + semantic_rep = f"{jira.fields.description}\n" + "\n".join( + [f"Comment: {comment.body}" for comment in jira.fields.comment.comments] ) page_url = f"{jira_client.client_info()}/browse/{jira.key}" + author = None + try: + author = BasicExpertInfo( + display_name=jira.fields.creator.displayName, + email=jira.fields.creator.emailAddress, + ) + except Exception: + # Author should exist but if not, doesn't matter + pass + doc_batch.append( Document( id=page_url, 
sections=[Section(link=page_url, text=semantic_rep)], source=DocumentSource.JIRA, semantic_identifier=jira.fields.summary, - doc_updated_at=ticket_updated_time.astimezone(timezone.utc), - metadata={}, + doc_updated_at=time_str_to_utc(jira.fields.updated), + primary_owners=[author] if author is not None else None, + # TODO add secondary_owners if needed + metadata={"label": jira.fields.labels} if jira.fields.labels else {}, ) ) return doc_batch, len(batch) diff --git a/backend/danswer/connectors/document360/connector.py b/backend/danswer/connectors/document360/connector.py index 5324aa1d7..82ac51f17 100644 --- a/backend/danswer/connectors/document360/connector.py +++ b/backend/danswer/connectors/document360/connector.py @@ -140,11 +140,7 @@ class Document360Connector(LoadConnector, PollConnector): html_content = article_details["html_content"] article_content = parse_html_page_basic(html_content) doc_text = ( - f"workspace: {self.workspace}\n" - f"category: {article['category_name']}\n" - f"article: {article_details['title']} - " - f"{article_details.get('description', '')}\n" - f"{article_content}" + f"{article_details.get('description', '')}\n{article_content}".strip() ) document = Document( @@ -154,7 +150,10 @@ class Document360Connector(LoadConnector, PollConnector): semantic_identifier=article_details["title"], doc_updated_at=updated_at, primary_owners=authors, - metadata={}, + metadata={ + "workspace": self.workspace, + "category": article["category_name"], + }, ) doc_batch.append(document) diff --git a/backend/danswer/connectors/file/__init__.py b/backend/danswer/connectors/file/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/danswer/connectors/github/connector.py b/backend/danswer/connectors/github/connector.py index 7fa19b1c4..4fd104e29 100644 --- a/backend/danswer/connectors/github/connector.py +++ b/backend/danswer/connectors/github/connector.py @@ -37,10 +37,9 @@ def _batch_github_objects( def _convert_pr_to_document(pull_request: PullRequest) -> Document: - full_context = f"Pull-Request {pull_request.title}\n{pull_request.body}" return Document( id=pull_request.html_url, - sections=[Section(link=pull_request.html_url, text=full_context)], + sections=[Section(link=pull_request.html_url, text=pull_request.body or "")], source=DocumentSource.GITHUB, semantic_identifier=pull_request.title, # updated_at is UTC time but is timezone unaware, explicitly add UTC @@ -48,7 +47,7 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document: # due to local time discrepancies with UTC doc_updated_at=pull_request.updated_at.replace(tzinfo=timezone.utc), metadata={ - "merged": pull_request.merged, + "merged": str(pull_request.merged), "state": pull_request.state, }, ) @@ -60,10 +59,9 @@ def _fetch_issue_comments(issue: Issue) -> str: def _convert_issue_to_document(issue: Issue) -> Document: - full_context = f"Issue {issue.title}\n{issue.body}" return Document( id=issue.html_url, - sections=[Section(link=issue.html_url, text=full_context)], + sections=[Section(link=issue.html_url, text=issue.body or "")], source=DocumentSource.GITHUB, semantic_identifier=issue.title, # updated_at is UTC time but is timezone unaware diff --git a/backend/danswer/connectors/gong/connector.py b/backend/danswer/connectors/gong/connector.py index e7691bdf3..711bdf11b 100644 --- a/backend/danswer/connectors/gong/connector.py +++ b/backend/danswer/connectors/gong/connector.py @@ -206,9 +206,6 @@ class GongConnector(LoadConnector, PollConnector): speaker_to_name: dict[str, str] 
= {}
         transcript_text = ""
 
-        if call_title:
-            transcript_text += f"Call Title: {call_title}\n\n"
-
         call_purpose = call_metadata["purpose"]
         if call_purpose:
             transcript_text += f"Call Description: {call_purpose}\n\n"
@@ -234,6 +231,11 @@
                 )
                 transcript_text += f"{speaker_name}: {monolog}\n\n"
 
+            metadata = {}
+            if call_metadata.get("system"):
+                metadata["client"] = call_metadata.get("system")
+            # TODO calls have a clientUniqueId field, can pull that in later
+
             doc_batch.append(
                 Document(
                     id=call_id,
@@ -246,7 +248,7 @@
                     doc_updated_at=datetime.fromisoformat(call_time_str).astimezone(
                         timezone.utc
                     ),
-                    metadata={},
+                    metadata=metadata,
                 )
             )
         yield doc_batch
diff --git a/backend/danswer/connectors/google_drive/connector.py b/backend/danswer/connectors/google_drive/connector.py
index 4a745a8b1..4eda2d532 100644
--- a/backend/danswer/connectors/google_drive/connector.py
+++ b/backend/danswer/connectors/google_drive/connector.py
@@ -466,24 +466,20 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
             doc_batch = []
             for file in files_batch:
                 try:
-                    text_contents = extract_text(file, service)
-                    if text_contents:
-                        full_context = file["name"] + " - " + text_contents
-                    else:
-                        full_context = file["name"]
+                    text_contents = extract_text(file, service) or ""
 
                     doc_batch.append(
                         Document(
                             id=file["webViewLink"],
                             sections=[
-                                Section(link=file["webViewLink"], text=full_context)
+                                Section(link=file["webViewLink"], text=text_contents)
                             ],
                             source=DocumentSource.GOOGLE_DRIVE,
                             semantic_identifier=file["name"],
                             doc_updated_at=datetime.fromisoformat(
                                 file["modifiedTime"]
                             ).astimezone(timezone.utc),
-                            metadata={} if text_contents else {IGNORE_FOR_QA: True},
+                            metadata={} if text_contents else {IGNORE_FOR_QA: "True"},
                         )
                     )
                 except Exception as e:
diff --git a/backend/danswer/connectors/google_site/__init__.py b/backend/danswer/connectors/google_site/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/backend/danswer/connectors/guru/connector.py b/backend/danswer/connectors/guru/connector.py
index dff2c366e..89f63b131 100644
--- a/backend/danswer/connectors/guru/connector.py
+++ b/backend/danswer/connectors/guru/connector.py
@@ -77,7 +77,7 @@ class GuruConnector(LoadConnector, PollConnector):
         for card in cards:
             title = card["preferredPhrase"]
             link = GURU_CARDS_URL + card["slug"]
-            content_text = title + "\n" + parse_html_page_basic(card["content"])
+            content_text = parse_html_page_basic(card["content"])
             last_updated = time_str_to_utc(card["lastModified"])
             last_verified = (
                 time_str_to_utc(card.get("lastVerified"))
diff --git a/backend/danswer/connectors/hubspot/connector.py b/backend/danswer/connectors/hubspot/connector.py
index e59a4c9eb..861f53ee6 100644
--- a/backend/danswer/connectors/hubspot/connector.py
+++ b/backend/danswer/connectors/hubspot/connector.py
@@ -73,7 +73,7 @@ class HubSpotConnector(LoadConnector, PollConnector):
         title = ticket.properties["subject"]
         link = self.ticket_base_url + ticket.id
-        content_text = title + "\n" + ticket.properties["content"]
+        content_text = ticket.properties["content"]
 
         associated_emails: list[str] = []
         associated_notes: list[str] = []
diff --git a/backend/danswer/connectors/linear/__init__.py b/backend/danswer/connectors/linear/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/backend/danswer/connectors/linear/connector.py b/backend/danswer/connectors/linear/connector.py
index 067fb561d..7d81a2286
100644 --- a/backend/danswer/connectors/linear/connector.py +++ b/backend/danswer/connectors/linear/connector.py @@ -8,6 +8,7 @@ import requests from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource +from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector @@ -30,7 +31,6 @@ def _make_query(request_body: dict[str, Any], api_key: str) -> requests.Response "Content-Type": "application/json", } - response: requests.Response | None = None for i in range(_NUM_RETRIES): try: response = requests.post( @@ -187,8 +187,8 @@ class LinearConnector(LoadConnector, PollConnector): ], source=DocumentSource.LINEAR, semantic_identifier=node["identifier"], + doc_updated_at=time_str_to_utc(node["updatedAt"]), metadata={ - "updated_at": node["updatedAt"], "team": node["team"]["name"], }, ) diff --git a/backend/danswer/connectors/models.py b/backend/danswer/connectors/models.py index 871382f49..2934d2fa2 100644 --- a/backend/danswer/connectors/models.py +++ b/backend/danswer/connectors/models.py @@ -1,10 +1,10 @@ from datetime import datetime from enum import Enum -from typing import Any from pydantic import BaseModel from danswer.configs.constants import DocumentSource +from danswer.configs.constants import INDEX_SEPARATOR from danswer.utils.text_processing import make_url_compatible @@ -50,21 +50,38 @@ class DocumentBase(BaseModel): sections: list[Section] source: DocumentSource | None = None semantic_identifier: str # displayed in the UI as the main identifier for the doc - metadata: dict[str, Any] + metadata: dict[str, str | list[str]] # UTC time doc_updated_at: datetime | None = None # Owner, creator, etc. primary_owners: list[BasicExpertInfo] | None = None # Assignee, space owner, etc. 
secondary_owners: list[BasicExpertInfo] | None = None - # `title` is used when computing best matches for a query - # if `None`, then we will use the `semantic_identifier` as the title in Vespa + # title is used for search whereas semantic_identifier is used for displaying in the UI + # different because Slack message may display as #general but general should not be part + # of the search, at least not in the same way as a document title should be for like Confluence + # The default title is semantic_identifier though unless otherwise specified title: str | None = None from_ingestion_api: bool = False - def get_title_for_document_index(self) -> str: + def get_title_for_document_index(self) -> str | None: + # If title is explicitly empty, return a None here for embedding purposes + if self.title == "": + return None return self.semantic_identifier if self.title is None else self.title + def get_metadata_str_attributes(self) -> list[str] | None: + if not self.metadata: + return None + # Combined string for the key/value for easy filtering + attributes: list[str] = [] + for k, v in self.metadata.items(): + if isinstance(v, list): + attributes.extend([k + INDEX_SEPARATOR + vi for vi in v]) + else: + attributes.append(k + INDEX_SEPARATOR + v) + return attributes + class Document(DocumentBase): id: str # This must be unique or during indexing/reindexing, chunks will be overwritten diff --git a/backend/danswer/connectors/notion/connector.py b/backend/danswer/connectors/notion/connector.py index fd3de72b1..aa8a4001b 100644 --- a/backend/danswer/connectors/notion/connector.py +++ b/backend/danswer/connectors/notion/connector.py @@ -267,7 +267,8 @@ class NotionConnector(LoadConnector, PollConnector): yield ( Document( id=page.id, - sections=[Section(link=page.url, text=f"{page_title}\n")] + # Will add title to the first section later in processing + sections=[Section(link=page.url, text="")] + [ Section( link=f"{page.url}#{block_id.replace('-', '')}", diff --git a/backend/danswer/connectors/productboard/connector.py b/backend/danswer/connectors/productboard/connector.py index c5003951b..1c013f42b 100644 --- a/backend/danswer/connectors/productboard/connector.py +++ b/backend/danswer/connectors/productboard/connector.py @@ -14,6 +14,7 @@ from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_st from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import SecondsSinceUnixEpoch +from danswer.connectors.models import BasicExpertInfo from danswer.connectors.models import Document from danswer.connectors.models import Section from danswer.utils.logger import setup_logger @@ -94,26 +95,24 @@ class ProductboardConnector(PollConnector): for feature in self._fetch_documents( initial_link=f"{_PRODUCT_BOARD_BASE_URL}/features" ): + owner = self._get_owner_email(feature) + experts = [BasicExpertInfo(email=owner)] if owner else None + yield Document( id=feature["id"], sections=[ Section( link=feature["links"]["html"], - text=" - ".join( - ( - feature["name"], - self._parse_description_html(feature["description"]), - ) - ), + text=self._parse_description_html(feature["description"]), ) ], semantic_identifier=feature["name"], source=DocumentSource.PRODUCTBOARD, doc_updated_at=time_str_to_utc(feature["updatedAt"]), + primary_owners=experts, metadata={ - "productboard_entity_type": feature["type"], + "entity_type": feature["type"], "status": feature["status"]["name"], - "owner": 
self._get_owner_email(feature), }, ) @@ -122,25 +121,23 @@ class ProductboardConnector(PollConnector): for component in self._fetch_documents( initial_link=f"{_PRODUCT_BOARD_BASE_URL}/components" ): + owner = self._get_owner_email(component) + experts = [BasicExpertInfo(email=owner)] if owner else None + yield Document( id=component["id"], sections=[ Section( link=component["links"]["html"], - text=" - ".join( - ( - component["name"], - self._parse_description_html(component["description"]), - ) - ), + text=self._parse_description_html(component["description"]), ) ], semantic_identifier=component["name"], source=DocumentSource.PRODUCTBOARD, doc_updated_at=time_str_to_utc(component["updatedAt"]), + primary_owners=experts, metadata={ - "productboard_entity_type": "component", - "owner": self._get_owner_email(component), + "entity_type": "component", }, ) @@ -150,25 +147,23 @@ class ProductboardConnector(PollConnector): for product in self._fetch_documents( initial_link=f"{_PRODUCT_BOARD_BASE_URL}/products" ): + owner = self._get_owner_email(product) + experts = [BasicExpertInfo(email=owner)] if owner else None + yield Document( id=product["id"], sections=[ Section( link=product["links"]["html"], - text=" - ".join( - ( - product["name"], - self._parse_description_html(product["description"]), - ) - ), + text=self._parse_description_html(product["description"]), ) ], semantic_identifier=product["name"], source=DocumentSource.PRODUCTBOARD, doc_updated_at=time_str_to_utc(product["updatedAt"]), + primary_owners=experts, metadata={ - "productboard_entity_type": "product", - "owner": self._get_owner_email(product), + "entity_type": "product", }, ) @@ -176,26 +171,24 @@ class ProductboardConnector(PollConnector): for objective in self._fetch_documents( initial_link=f"{_PRODUCT_BOARD_BASE_URL}/objectives" ): + owner = self._get_owner_email(objective) + experts = [BasicExpertInfo(email=owner)] if owner else None + yield Document( id=objective["id"], sections=[ Section( link=objective["links"]["html"], - text=" - ".join( - ( - objective["name"], - self._parse_description_html(objective["description"]), - ) - ), + text=self._parse_description_html(objective["description"]), ) ], semantic_identifier=objective["name"], source=DocumentSource.PRODUCTBOARD, doc_updated_at=time_str_to_utc(objective["updatedAt"]), + primary_owners=experts, metadata={ - "productboard_entity_type": "release", + "entity_type": "release", "state": objective["state"], - "owner": self._get_owner_email(objective), }, ) diff --git a/backend/danswer/connectors/requesttracker/connector.py b/backend/danswer/connectors/requesttracker/connector.py index 6b20504d3..9c4590fc2 100644 --- a/backend/danswer/connectors/requesttracker/connector.py +++ b/backend/danswer/connectors/requesttracker/connector.py @@ -97,7 +97,8 @@ class RequestTrackerConnector(PollConnector): logger.info(f"Processing ticket {tid}") doc = Document( id=ticket["id"], - sections=[Section(link=ticketLink, text=f"{ticket['Subject']}\n")] + # Will add title to the first section later in processing + sections=[Section(link=ticketLink, text="")] + self.build_doc_sections_from_txn(Rt0, tid), source=DocumentSource.REQUESTTRACKER, semantic_identifier=ticket["Subject"], diff --git a/backend/danswer/connectors/zulip/__init__.py b/backend/danswer/connectors/zulip/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/danswer/db/chat.py b/backend/danswer/db/chat.py index be1d7a56b..15236b599 100644 --- a/backend/danswer/db/chat.py +++ 
b/backend/danswer/db/chat.py @@ -642,6 +642,7 @@ def create_db_search_doc( source_type=server_search_doc.source_type, boost=server_search_doc.boost, hidden=server_search_doc.hidden, + doc_metadata=server_search_doc.metadata, score=server_search_doc.score, match_highlights=server_search_doc.match_highlights, updated_at=server_search_doc.updated_at, @@ -674,6 +675,7 @@ def translate_db_search_doc_to_server_search_doc( source_type=db_search_doc.source_type, boost=db_search_doc.boost, hidden=db_search_doc.hidden, + metadata=db_search_doc.doc_metadata, score=db_search_doc.score, match_highlights=db_search_doc.match_highlights, updated_at=db_search_doc.updated_at, diff --git a/backend/danswer/db/document.py b/backend/danswer/db/document.py index 58b9c837b..b1620fb60 100644 --- a/backend/danswer/db/document.py +++ b/backend/danswer/db/document.py @@ -17,6 +17,7 @@ from danswer.db.models import ConnectorCredentialPair from danswer.db.models import Credential from danswer.db.models import Document as DbDocument from danswer.db.models import DocumentByConnectorCredentialPair +from danswer.db.tag import delete_document_tags_for_documents from danswer.db.utils import model_to_dict from danswer.document_index.interfaces import DocumentMetadata from danswer.server.documents.models import ConnectorCredentialPairIdentifier @@ -272,6 +273,7 @@ def delete_documents_complete(db_session: Session, document_ids: list[str]) -> N delete_document_feedback_for_documents( document_ids=document_ids, db_session=db_session ) + delete_document_tags_for_documents(document_ids=document_ids, db_session=db_session) delete_documents(db_session, document_ids) db_session.commit() diff --git a/backend/danswer/db/models.py b/backend/danswer/db/models.py index 8937210ba..dabe66d21 100644 --- a/backend/danswer/db/models.py +++ b/backend/danswer/db/models.py @@ -22,6 +22,7 @@ from sqlalchemy import Integer from sqlalchemy import Sequence from sqlalchemy import String from sqlalchemy import Text +from sqlalchemy import UniqueConstraint from sqlalchemy.dialects import postgresql from sqlalchemy.orm import DeclarativeBase from sqlalchemy.orm import Mapped @@ -153,6 +154,15 @@ class ChatMessage__SearchDoc(Base): ) +class Document__Tag(Base): + __tablename__ = "document__tag" + + document_id: Mapped[str] = mapped_column( + ForeignKey("document.id"), primary_key=True + ) + tag_id: Mapped[int] = mapped_column(ForeignKey("tag.id"), primary_key=True) + + """ Documents/Indexing Tables """ @@ -247,6 +257,32 @@ class Document(Base): retrieval_feedbacks: Mapped[List["DocumentRetrievalFeedback"]] = relationship( "DocumentRetrievalFeedback", back_populates="document" ) + tags = relationship( + "Tag", + secondary="document__tag", + back_populates="documents", + ) + + +class Tag(Base): + __tablename__ = "tag" + + id: Mapped[int] = mapped_column(primary_key=True) + tag_key: Mapped[str] = mapped_column(String) + tag_value: Mapped[str] = mapped_column(String) + source: Mapped[DocumentSource] = mapped_column(Enum(DocumentSource)) + + documents = relationship( + "Document", + secondary="document__tag", + back_populates="tags", + ) + + __table_args__ = ( + UniqueConstraint( + "tag_key", "tag_value", "source", name="_tag_key_value_source_uc" + ), + ) class Connector(Base): @@ -424,6 +460,7 @@ class SearchDoc(Base): boost: Mapped[int] = mapped_column(Integer) source_type: Mapped[DocumentSource] = mapped_column(Enum(DocumentSource)) hidden: Mapped[bool] = mapped_column(Boolean) + doc_metadata: Mapped[dict[str, str | list[str]]] = 
mapped_column(postgresql.JSONB()) score: Mapped[float] = mapped_column(Float) match_highlights: Mapped[list[str]] = mapped_column(postgresql.ARRAY(String)) # This is for the document, not this row in the table diff --git a/backend/danswer/db/tag.py b/backend/danswer/db/tag.py new file mode 100644 index 000000000..bf70f7308 --- /dev/null +++ b/backend/danswer/db/tag.py @@ -0,0 +1,116 @@ +from sqlalchemy import delete +from sqlalchemy import func +from sqlalchemy import select +from sqlalchemy.orm import Session + +from danswer.configs.constants import DocumentSource +from danswer.db.models import Document +from danswer.db.models import Document__Tag +from danswer.db.models import Tag +from danswer.utils.logger import setup_logger + +logger = setup_logger() + + +def create_or_add_document_tag( + tag_key: str, + tag_value: str, + source: DocumentSource, + document_id: str, + db_session: Session, +) -> Tag: + document = db_session.get(Document, document_id) + if not document: + raise ValueError("Invalid Document, cannot attach Tags") + + tag_stmt = select(Tag).where( + Tag.tag_key == tag_key, + Tag.tag_value == tag_value, + Tag.source == source, + ) + tag = db_session.execute(tag_stmt).scalar_one_or_none() + + if not tag: + tag = Tag(tag_key=tag_key, tag_value=tag_value, source=source) + db_session.add(tag) + + if tag not in document.tags: + document.tags.append(tag) + + db_session.commit() + return tag + + +def create_or_add_document_tag_list( + tag_key: str, + tag_values: list[str], + source: DocumentSource, + document_id: str, + db_session: Session, +) -> list[Tag]: + document = db_session.get(Document, document_id) + if not document: + raise ValueError("Invalid Document, cannot attach Tags") + + existing_tags_stmt = select(Tag).where( + Tag.tag_key == tag_key, Tag.tag_value.in_(tag_values), Tag.source == source + ) + existing_tags = list(db_session.execute(existing_tags_stmt).scalars().all()) + existing_tag_values = {tag.tag_value for tag in existing_tags} + + new_tags = [] + for tag_value in tag_values: + if tag_value not in existing_tag_values: + new_tag = Tag(tag_key=tag_key, tag_value=tag_value, source=source) + db_session.add(new_tag) + new_tags.append(new_tag) + + all_tags = existing_tags + new_tags + + for tag in all_tags: + if tag not in document.tags: + document.tags.append(tag) + + db_session.commit() + return all_tags + + +def get_tags_by_value_prefix_for_source_types( + tag_value_prefix: str | None, + sources: list[DocumentSource] | None, + db_session: Session, +) -> list[Tag]: + query = select(Tag) + + if tag_value_prefix: + query = query.where(Tag.tag_value.startswith(tag_value_prefix)) + + if sources: + query = query.where(Tag.source.in_(sources)) + + result = db_session.execute(query) + + tags = result.scalars().all() + return list(tags) + + +def delete_document_tags_for_documents( + document_ids: list[str], db_session: Session +) -> None: + stmt = delete(Document__Tag).where(Document__Tag.document_id.in_(document_ids)) + db_session.execute(stmt) + db_session.commit() + + orphan_tags_query = ( + select(Tag.id) + .outerjoin(Document__Tag, Tag.id == Document__Tag.tag_id) + .group_by(Tag.id) + .having(func.count(Document__Tag.document_id) == 0) + ) + + orphan_tags = db_session.execute(orphan_tags_query).scalars().all() + + if orphan_tags: + delete_orphan_tags_stmt = delete(Tag).where(Tag.id.in_(orphan_tags)) + db_session.execute(delete_orphan_tags_stmt) + db_session.commit() diff --git a/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd 
b/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
index 8d68fb497..dcdda7a85 100644
--- a/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
+++ b/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
@@ -7,12 +7,20 @@ schema danswer_chunk {
         field chunk_id type int {
             indexing: summary | attribute
         }
-        field blurb type string {
+        # Displayed in the UI as the main identifier for the doc
+        field semantic_identifier type string {
             indexing: summary | attribute
         }
-        # Can separate out title in the future and give heavier bm-25 weighting
-        # Need to consider that not every doc has a separable title (ie. slack message)
-        # Set summary options to enable bolding
+        # May not always match the `semantic_identifier` e.g. for Slack docs the
+        # `semantic_identifier` will be the channel name, but the `title` will be empty
+        field title type string {
+            indexing: summary | index
+            match {
+                gram
+                gram-size: 3
+            }
+            index: enable-bm25
+        }
         field content type string {
             indexing: summary | index
             match {
@@ -28,6 +36,25 @@
             indexing: summary | index
             summary: dynamic
         }
+        # Title embedding (x1)
+        field title_embedding type tensor<float>(x[384]) {
+            indexing: attribute
+            attribute {
+                distance-metric: angular
+            }
+        }
+        # Content embeddings (chunk + optional mini chunks embeddings)
+        # "t" and "x" are arbitrary names, not special keywords
+        field embeddings type tensor<float>(t{},x[384]) {
+            indexing: attribute
+            attribute {
+                distance-metric: angular
+            }
+        }
+        # Starting section of the doc, currently unused as it has been replaced by match highlighting
+        field blurb type string {
+            indexing: summary | attribute
+        }
         # https://docs.vespa.ai/en/attributes.html potential enum store for speed, but probably not worth it
         field source_type type string {
             indexing: summary | attribute
@@ -39,21 +66,6 @@
         field source_links type string {
             indexing: summary | attribute
         }
-        # displayed in the UI as the main identifier for the doc
-        field semantic_identifier type string {
-            indexing: summary | attribute
-        }
-        # this is used when computing best matches based on the title of the document
-        # may not always match the `semantic_identifier` e.g. for Slack docs the
-        # `semantic_identifier` will be the channel name, but the `title` will be empty
-        field title type string {
-            indexing: summary | index
-            match {
-                gram
-                gram-size: 3
-            }
-            index: enable-bm25
-        }
         field section_continuation type bool {
             indexing: summary | attribute
         }
@@ -65,15 +77,15 @@ schema danswer_chunk {
             indexing: summary | attribute
             rank: filter
         }
+        # Needs to have a separate Attribute list for efficient filtering
+        field metadata_list type array<string> {
+            indexing: summary | attribute
+            rank: filter
+            attribute: fast-search
+        }
         field metadata type string {
             indexing: summary | attribute
         }
-        field embeddings type tensor<float>(t{},x[384]) {
-            indexing: attribute
-            attribute {
-                distance-metric: angular
-            }
-        }
         field doc_updated_at type int {
             indexing: summary | attribute
         }
@@ -95,6 +107,11 @@
         }
     }
 
+    # If using different tokenization settings, the fieldset has to be removed, and the field must
+    # be specified in the yql like:
+    # + 'or ({grammar: "weakAnd", defaultIndex:"title"}userInput(@query)) '
+    # + 'or ({grammar: "weakAnd", defaultIndex:"content"}userInput(@query)) '
+    # Note: for BM-25, the ngram size (and whether ngrams are used) changes the range of the scores
     fieldset default {
         fields: content, title
     }
@@ -124,6 +141,79 @@
         match-features: recency_bias
     }
 
+    rank-profile hybrid_search inherits default, default_rank {
+        inputs {
+            query(query_embedding) tensor<float>(x[384])
+        }
+
+        # This must be a separate function for normalize_linear to work
+        function vector_score() {
+            expression {
+                (query(title_content_ratio) * closeness(field, title_embedding)) +
+                ((1 - query(title_content_ratio)) * closeness(field, embeddings))
+            }
+        }
+
+        # This must be a separate function for normalize_linear to work
+        function keyword_score() {
+            expression {
+                (query(title_content_ratio) * bm25(title)) +
+                ((1 - query(title_content_ratio)) * bm25(content))
+            }
+        }
+
+        first-phase {
+            expression: vector_score
+        }
+
+        # Weighted average between Vector Search and BM-25
+        # Each is a weighted average between the Title and Content fields
+        # Finally each doc is boosted by its user-feedback-based boost and recency
+        # If any embedding or index field is missing, it just receives a score of 0
+        # Assumptions:
+        # - For a given query + corpus, the BM-25 scores will be relatively similar in distribution,
+        #   therefore not normalizing before combining.
+        # - Documents without a title get a score of 0 for the title component; this is ok as
+        #   documents without any title match should be penalized.
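+        # Illustrative arithmetic (hypothetical numbers, not from this patch): with the default
+        # alpha = 0.66 and title_content_ratio = 0.2, a chunk with normalized vector_score 0.9
+        # and normalized keyword_score 0.5 lands at 0.66 * 0.9 + 0.34 * 0.5 = 0.764 before the
+        # document_boost and recency_bias multipliers below are applied.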
+        global-phase {
+            expression {
+                (
+                    # Weighted Vector Similarity Score
+                    (query(alpha) * normalize_linear(vector_score))
+                    +
+                    # Weighted Keyword Similarity Score
+                    ((1 - query(alpha)) * normalize_linear(keyword_score))
+                )
+                # Boost based on user feedback
+                * document_boost
+                # Decay factor based on time document was last updated
+                * recency_bias
+            }
+            rerank-count: 1000
+        }
+
+        match-features {
+            bm25(title)
+            bm25(content)
+            closeness(field, title_embedding)
+            closeness(field, embeddings)
+            keyword_score
+            vector_score
+            document_boost
+            recency_bias
+            closest(embeddings)
+        }
+    }
+
+    # Used when searching from the admin UI for a specific doc to hide / boost
+    # Very heavily prioritize title
+    rank-profile admin_search inherits default, default_rank {
+        first-phase {
+            expression: bm25(content) + (5 * bm25(title))
+        }
+    }
+
+    # THE ONES BELOW ARE OUT OF DATE, DO NOT USE
+    # THEY MIGHT NOT EVEN WORK AT ALL
     rank-profile keyword_search inherits default, default_rank {
         first-phase {
             expression: bm25(content) * document_boost * recency_bias
@@ -145,29 +235,4 @@
         match-features: recency_bias document_boost closest(embeddings)
     }
-
-    rank-profile hybrid_search inherits default, default_rank {
-        inputs {
-            query(query_embedding) tensor<float>(x[384])
-        }
-
-        first-phase {
-            expression: closeness(field, embeddings)
-        }
-
-        global-phase {
-            expression: ((query(alpha) * normalize_linear(closeness(field, embeddings))) + ((1 - query(alpha)) * normalize_linear(bm25(content)))) * document_boost * recency_bias
-            rerank-count: 1000
-        }
-
-        # Cannot pass normalize_linear features in match-features
-        match-features: recency_bias document_boost closest(embeddings)
-    }
-
-    # used when searching from the admin UI for a specific doc to hide / boost
-    rank-profile admin_search inherits default, default_rank {
-        first-phase {
-            expression: bm25(content) + (5 * bm25(title))
-        }
-    }
 }
diff --git a/backend/danswer/document_index/vespa/index.py b/backend/danswer/document_index/vespa/index.py
index f649621bf..de053f71e 100644
--- a/backend/danswer/document_index/vespa/index.py
+++ b/backend/danswer/document_index/vespa/index.py
@@ -25,6 +25,7 @@ from danswer.configs.chat_configs import DOC_TIME_DECAY
 from danswer.configs.chat_configs import EDIT_KEYWORD_QUERY
 from danswer.configs.chat_configs import HYBRID_ALPHA
 from danswer.configs.chat_configs import NUM_RETURNED_HITS
+from danswer.configs.chat_configs import TITLE_CONTENT_RATIO
 from danswer.configs.constants import ACCESS_CONTROL_LIST
 from danswer.configs.constants import BLURB
 from danswer.configs.constants import BOOST
@@ -35,7 +36,9 @@ from danswer.configs.constants import DOCUMENT_ID
 from danswer.configs.constants import DOCUMENT_SETS
 from danswer.configs.constants import EMBEDDINGS
 from danswer.configs.constants import HIDDEN
+from danswer.configs.constants import INDEX_SEPARATOR
 from danswer.configs.constants import METADATA
+from danswer.configs.constants import METADATA_LIST
 from danswer.configs.constants import PRIMARY_OWNERS
 from danswer.configs.constants import RECENCY_BIAS
 from danswer.configs.constants import SECONDARY_OWNERS
@@ -44,6 +47,8 @@ from danswer.configs.constants import SEMANTIC_IDENTIFIER
 from danswer.configs.constants import SOURCE_LINKS
 from danswer.configs.constants import SOURCE_TYPE
 from danswer.configs.constants import TITLE
+from danswer.configs.constants import TITLE_EMBEDDING
+from danswer.configs.constants import TITLE_SEPARATOR
 from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF
 from
danswer.connectors.cross_connector_utils.miscellaneous_utils import ( get_experts_stores_representations, @@ -239,20 +244,25 @@ def _index_vespa_chunk( for ind, m_c_embed in enumerate(embeddings.mini_chunk_embeddings): embeddings_name_vector_map[f"mini_chunk_{ind}"] = m_c_embed + title = document.get_title_for_document_index() + vespa_document_fields = { DOCUMENT_ID: document.id, CHUNK_ID: chunk.chunk_id, BLURB: remove_invalid_unicode_chars(chunk.blurb), - # this duplication of `content` is needed for keyword highlighting :( + TITLE: remove_invalid_unicode_chars(title) if title else None, CONTENT: remove_invalid_unicode_chars(chunk.content), + # This duplication of `content` is needed for keyword highlighting :( CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content), SOURCE_TYPE: str(document.source.value), SOURCE_LINKS: json.dumps(chunk.source_links), SEMANTIC_IDENTIFIER: remove_invalid_unicode_chars(document.semantic_identifier), - TITLE: remove_invalid_unicode_chars(document.get_title_for_document_index()), SECTION_CONTINUATION: chunk.section_continuation, METADATA: json.dumps(document.metadata), + # Save as a list for efficient extraction as an Attribute + METADATA_LIST: chunk.source_document.get_metadata_str_attributes(), EMBEDDINGS: embeddings_name_vector_map, + TITLE_EMBEDDING: chunk.title_embedding, BOOST: chunk.boost, DOC_UPDATED_AT: _vespa_get_updated_at_attribute(document.doc_updated_at), PRIMARY_OWNERS: get_experts_stores_representations(document.primary_owners), @@ -394,6 +404,12 @@ def _build_vespa_filters(filters: IndexFilters, include_hidden: bool = False) -> ) filter_str += _build_or_filters(SOURCE_TYPE, source_strs) + tag_attributes = None + tags = filters.tags + if tags: + tag_attributes = [tag.tag_key + INDEX_SEPARATOR + tag.tag_value for tag in tags] + filter_str += _build_or_filters(METADATA_LIST, tag_attributes) + filter_str += _build_or_filters(DOCUMENT_SETS, filters.document_set) filter_str += _build_time_filter(filters.time_cutoff) @@ -448,6 +464,8 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk: if DOC_UPDATED_AT in fields else None ) + + # The highlights might include the title but this is the best way we have so far to show the highlighting match_highlights = _process_dynamic_summary( # fallback to regular `content` if the `content_summary` field # isn't present @@ -459,6 +477,13 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk: f"Chunk with blurb: {fields.get(BLURB, 'Unknown')[:50]}... 
has no Semantic Identifier" ) + # Remove the title from the first chunk as every chunk already included + # its semantic identifier for LLM + content = fields[CONTENT] + if fields[CHUNK_ID] == 0: + parts = content.split(TITLE_SEPARATOR, maxsplit=1) + content = parts[1] if len(parts) > 1 and "\n" not in parts[0] else content + # User ran into this, not sure why this could happen, error checking here blurb = fields.get(BLURB) if not blurb: @@ -477,7 +502,7 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk: return InferenceChunk( chunk_id=fields[CHUNK_ID], blurb=blurb, - content=fields[CONTENT], + content=content, source_links=source_links_dict, section_continuation=fields[SECTION_CONTINUATION], document_id=fields[DOCUMENT_ID], @@ -725,6 +750,7 @@ class VespaIndex(DocumentIndex): num_to_retrieve: int = NUM_RETURNED_HITS, edit_keyword_query: bool = EDIT_KEYWORD_QUERY, ) -> list[InferenceChunk]: + # IMPORTANT: THIS FUNCTION IS NOT UP TO DATE, DOES NOT WORK CORRECTLY vespa_where_clauses = _build_vespa_filters(filters) yql = ( VespaIndex.yql_base @@ -759,6 +785,7 @@ class VespaIndex(DocumentIndex): distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF, edit_keyword_query: bool = EDIT_KEYWORD_QUERY, ) -> list[InferenceChunk]: + # IMPORTANT: THIS FUNCTION IS NOT UP TO DATE, DOES NOT WORK CORRECTLY vespa_where_clauses = _build_vespa_filters(filters) yql = ( VespaIndex.yql_base @@ -798,6 +825,7 @@ class VespaIndex(DocumentIndex): time_decay_multiplier: float, num_to_retrieve: int, hybrid_alpha: float | None = HYBRID_ALPHA, + title_content_ratio: float | None = TITLE_CONTENT_RATIO, distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF, edit_keyword_query: bool = EDIT_KEYWORD_QUERY, ) -> list[InferenceChunk]: @@ -808,6 +836,7 @@ class VespaIndex(DocumentIndex): VespaIndex.yql_base + vespa_where_clauses + f"(({{targetHits: {target_hits}}}nearestNeighbor(embeddings, query_embedding)) " + + f"or ({{targetHits: {target_hits}}}nearestNeighbor(title_embedding, query_embedding)) " + 'or ({grammar: "weakAnd"}userInput(@query)) ' + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))' ) @@ -828,6 +857,9 @@ class VespaIndex(DocumentIndex): "input.query(alpha)": hybrid_alpha if hybrid_alpha is not None else HYBRID_ALPHA, + "input.query(title_content_ratio)": title_content_ratio + if title_content_ratio is not None + else TITLE_CONTENT_RATIO, "hits": num_to_retrieve, "offset": 0, "ranking.profile": "hybrid_search", diff --git a/backend/danswer/indexing/chunker.py b/backend/danswer/indexing/chunker.py index ae45725b1..9d27885d3 100644 --- a/backend/danswer/indexing/chunker.py +++ b/backend/danswer/indexing/chunker.py @@ -7,15 +7,15 @@ from transformers import AutoTokenizer # type:ignore from danswer.configs.app_configs import BLURB_SIZE from danswer.configs.app_configs import CHUNK_OVERLAP from danswer.configs.app_configs import MINI_CHUNK_SIZE +from danswer.configs.constants import SECTION_SEPARATOR +from danswer.configs.constants import TITLE_SEPARATOR from danswer.configs.model_configs import CHUNK_SIZE from danswer.connectors.models import Document -from danswer.connectors.models import Section from danswer.indexing.models import DocAwareChunk from danswer.search.search_nlp_models import get_default_tokenizer from danswer.utils.text_processing import shared_precompare_cleanup -SECTION_SEPARATOR = "\n\n" ChunkFunc = Callable[[Document], list[DocAwareChunk]] @@ -29,7 +29,8 @@ def extract_blurb(text: str, blurb_size: int) -> str: def chunk_large_section( - section: Section, + 
section_text: str, + section_link_text: str, document: Document, start_chunk_id: int, tokenizer: AutoTokenizer, @@ -37,8 +38,6 @@ def chunk_large_section( chunk_overlap: int = CHUNK_OVERLAP, blurb_size: int = BLURB_SIZE, ) -> list[DocAwareChunk]: - section_text = section.text - section_link_text = section.link or "" blurb = extract_blurb(section_text, blurb_size) sentence_aware_splitter = SentenceSplitter( @@ -67,14 +66,18 @@ def chunk_document( subsection_overlap: int = CHUNK_OVERLAP, blurb_size: int = BLURB_SIZE, ) -> list[DocAwareChunk]: + title = document.get_title_for_document_index() + title_prefix = title.replace("\n", " ") + TITLE_SEPARATOR if title else "" tokenizer = get_default_tokenizer() chunks: list[DocAwareChunk] = [] link_offsets: dict[int, str] = {} chunk_text = "" - for section in document.sections: + for ind, section in enumerate(document.sections): + section_text = title_prefix + section.text if ind == 0 else section.text section_link_text = section.link or "" - section_tok_length = len(tokenizer.tokenize(section.text)) + + section_tok_length = len(tokenizer.tokenize(section_text)) current_tok_length = len(tokenizer.tokenize(chunk_text)) curr_offset_len = len(shared_precompare_cleanup(chunk_text)) @@ -96,7 +99,8 @@ def chunk_document( chunk_text = "" large_section_chunks = chunk_large_section( - section=section, + section_text=section_text, + section_link_text=section_link_text, document=document, start_chunk_id=len(chunks), tokenizer=tokenizer, @@ -115,7 +119,7 @@ def chunk_document( <= chunk_tok_size ): chunk_text += ( - SECTION_SEPARATOR + section.text if chunk_text else section.text + SECTION_SEPARATOR + section_text if chunk_text else section_text ) link_offsets[curr_offset_len] = section_link_text else: @@ -130,7 +134,7 @@ def chunk_document( ) ) link_offsets = {0: section_link_text} - chunk_text = section.text + chunk_text = section_text # Once we hit the end, if we're still in the process of building a chunk, add what we have if chunk_text: diff --git a/backend/danswer/indexing/embedder.py b/backend/danswer/indexing/embedder.py index 2ca1092b9..63cb569ee 100644 --- a/backend/danswer/indexing/embedder.py +++ b/backend/danswer/indexing/embedder.py @@ -21,6 +21,9 @@ def embed_chunks( enable_mini_chunk: bool = ENABLE_MINI_CHUNK, passage_prefix: str = ASYM_PASSAGE_PREFIX, ) -> list[IndexChunk]: + # Cache the Title embeddings to only have to do it once + title_embed_dict: dict[str, list[float]] = {} + embedded_chunks: list[IndexChunk] = [] if embedding_model is None: embedding_model = EmbeddingModel() @@ -58,12 +61,24 @@ def embed_chunks( chunk_embeddings = embeddings[ embedding_ind_start : embedding_ind_start + num_embeddings ] + + title = chunk.source_document.get_title_for_document_index() + + title_embedding = None + if title: + if title in title_embed_dict: + title_embedding = title_embed_dict[title] + else: + title_embedding = embedding_model.encode([title])[0] + title_embed_dict[title] = title_embedding + new_embedded_chunk = IndexChunk( **{k: getattr(chunk, k) for k in chunk.__dataclass_fields__}, embeddings=ChunkEmbedding( full_embedding=chunk_embeddings[0], mini_chunk_embeddings=chunk_embeddings[1:], ), + title_embedding=title_embedding, ) embedded_chunks.append(new_embedded_chunk) embedding_ind_start += num_embeddings diff --git a/backend/danswer/indexing/indexing_pipeline.py b/backend/danswer/indexing/indexing_pipeline.py index c6ffbc278..be676e035 100644 --- a/backend/danswer/indexing/indexing_pipeline.py +++ 
b/backend/danswer/indexing/indexing_pipeline.py
@@ -17,6 +17,8 @@ from danswer.db.document import update_docs_updated_at
 from danswer.db.document import upsert_documents_complete
 from danswer.db.document_set import fetch_document_sets_for_documents
 from danswer.db.engine import get_sqlalchemy_engine
+from danswer.db.tag import create_or_add_document_tag
+from danswer.db.tag import create_or_add_document_tag_list
 from danswer.document_index.factory import get_default_document_index
 from danswer.document_index.interfaces import DocumentIndex
 from danswer.document_index.interfaces import DocumentMetadata
@@ -44,6 +46,7 @@ def upsert_documents_in_db(
     index_attempt_metadata: IndexAttemptMetadata,
     db_session: Session,
 ) -> None:
+    # Metadata here refers to basic document info, not metadata about the actual content
     doc_m_batch: list[DocumentMetadata] = []
     for doc in documents:
         first_link = next(
@@ -66,6 +69,26 @@ def upsert_documents_in_db(
         document_metadata_batch=doc_m_batch,
     )
 
+    # Insert document content metadata
+    for doc in documents:
+        for k, v in doc.metadata.items():
+            if isinstance(v, list):
+                create_or_add_document_tag_list(
+                    tag_key=k,
+                    tag_values=v,
+                    source=doc.source,
+                    document_id=doc.id,
+                    db_session=db_session,
+                )
+            else:
+                create_or_add_document_tag(
+                    tag_key=k,
+                    tag_value=v,
+                    source=doc.source,
+                    document_id=doc.id,
+                    db_session=db_session,
+                )
+
 
 @log_function_time()
 def index_doc_batch(
@@ -121,6 +144,8 @@
     )
 
     logger.debug("Starting chunking")
+
+    # The first chunk additionally contains the Title of the Document
     chunks: list[DocAwareChunk] = list(
         chain(*[chunker.chunk(document=document) for document in updatable_docs])
     )
diff --git a/backend/danswer/indexing/models.py b/backend/danswer/indexing/models.py
index 66def05f3..331a5fed4 100644
--- a/backend/danswer/indexing/models.py
+++ b/backend/danswer/indexing/models.py
@@ -1,7 +1,6 @@
 from dataclasses import dataclass
 from dataclasses import fields
 from datetime import datetime
-from typing import Any
 
 from danswer.access.models import DocumentAccess
 from danswer.configs.constants import DocumentSource
@@ -48,6 +47,7 @@ class DocAwareChunk(BaseChunk):
 @dataclass
 class IndexChunk(DocAwareChunk):
     embeddings: ChunkEmbedding
+    title_embedding: Embedding | None
 
 
 @dataclass
@@ -95,7 +95,7 @@ class InferenceChunk(BaseChunk):
     recency_bias: float
     score: float | None
     hidden: bool
-    metadata: dict[str, Any]
+    metadata: dict[str, str | list[str]]
     # Matched sections in the chunk. Uses Vespa syntax e.g. <hi>TEXT</hi>
     # to specify that a set of words should be highlighted. For example:
     # ["the answer is <hi>42</hi>", "he couldn't find an <hi>answer</hi>"]
diff --git a/backend/danswer/search/models.py b/backend/danswer/search/models.py
index 3fbf1bbb1..0b6cef70d 100644
--- a/backend/danswer/search/models.py
+++ b/backend/danswer/search/models.py
@@ -48,10 +48,16 @@ class Embedder:
         raise NotImplementedError
 
 
+class Tag(BaseModel):
+    tag_key: str
+    tag_value: str
+
+
 class BaseFilters(BaseModel):
     source_type: list[DocumentSource] | None = None
     document_set: list[str] | None = None
     time_cutoff: datetime | None = None
+    tags: list[Tag] | None = None
 
 
 class IndexFilters(BaseFilters):
@@ -110,6 +116,7 @@ class SearchDoc(BaseModel):
     # since a standard search will never find a hidden doc, this can only ever
     # be `True` when doing an admin search
     hidden: bool
+    metadata: dict[str, str | list[str]]
     score: float | None
     # Matched sections in the doc. Uses Vespa syntax e.g. <hi>TEXT</hi>
     # to specify that a set of words should be highlighted.
For example: diff --git a/backend/danswer/search/request_preprocessing.py b/backend/danswer/search/request_preprocessing.py index ea131eee7..77dee972f 100644 --- a/backend/danswer/search/request_preprocessing.py +++ b/backend/danswer/search/request_preprocessing.py @@ -121,6 +121,7 @@ def retrieval_preprocessing( source_type=preset_filters.source_type or predicted_source_filters, document_set=preset_filters.document_set, time_cutoff=preset_filters.time_cutoff or predicted_time_cutoff, + tags=preset_filters.tags, # Tags are never auto-extracted access_control_list=user_acl_filters, ) diff --git a/backend/danswer/search/search_runner.py b/backend/danswer/search/search_runner.py index f77aa2e6c..3ca5280e2 100644 --- a/backend/danswer/search/search_runner.py +++ b/backend/danswer/search/search_runner.py @@ -96,6 +96,7 @@ def chunks_to_search_docs(chunks: list[InferenceChunk] | None) -> list[SearchDoc source_type=chunk.source_type, boost=chunk.boost, hidden=chunk.hidden, + metadata=chunk.metadata, score=chunk.score, match_highlights=chunk.match_highlights, updated_at=chunk.updated_at, diff --git a/backend/danswer/server/query_and_chat/models.py b/backend/danswer/server/query_and_chat/models.py index 035331a17..cf0d009f3 100644 --- a/backend/danswer/server/query_and_chat/models.py +++ b/backend/danswer/server/query_and_chat/models.py @@ -5,12 +5,29 @@ from pydantic import BaseModel from pydantic import root_validator from danswer.chat.models import RetrievalDocs +from danswer.configs.constants import DocumentSource from danswer.configs.constants import MessageType from danswer.configs.constants import SearchFeedbackType from danswer.search.models import BaseFilters from danswer.search.models import RetrievalDetails from danswer.search.models import SearchDoc from danswer.search.models import SearchType +from danswer.search.models import Tag + + +class TagRequest(BaseModel): + match_pattern: str | None + # If this is empty or None, then tags for all sources are considered + sources: list[DocumentSource] | None + allow_prefix: bool = True # This is currently the only option + + +class SourceTag(Tag): + source: DocumentSource + + +class TagResponse(BaseModel): + tags: list[SourceTag] class SimpleQueryRequest(BaseModel): diff --git a/backend/danswer/server/query_and_chat/query_backend.py b/backend/danswer/server/query_and_chat/query_backend.py index 576ac87dc..305a7de01 100644 --- a/backend/danswer/server/query_and_chat/query_backend.py +++ b/backend/danswer/server/query_and_chat/query_backend.py @@ -9,6 +9,7 @@ from danswer.auth.users import current_user from danswer.configs.chat_configs import DISABLE_LLM_CHUNK_FILTER from danswer.db.engine import get_session from danswer.db.models import User +from danswer.db.tag import get_tags_by_value_prefix_for_source_types from danswer.document_index.factory import get_default_document_index from danswer.document_index.vespa.index import VespaIndex from danswer.one_shot_answer.answer_question import stream_search_answer @@ -30,6 +31,9 @@ from danswer.server.query_and_chat.models import DocumentSearchRequest from danswer.server.query_and_chat.models import HelperResponse from danswer.server.query_and_chat.models import QueryValidationResponse from danswer.server.query_and_chat.models import SimpleQueryRequest +from danswer.server.query_and_chat.models import SourceTag +from danswer.server.query_and_chat.models import TagRequest +from danswer.server.query_and_chat.models import TagResponse from danswer.utils.logger import setup_logger logger = setup_logger() 
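The next hunk adds a `/valid-tags` endpoint on top of `get_tags_by_value_prefix_for_source_types`. As a rough sketch of the request/response shape it implies (the base URL and the `/query` router prefix are assumptions here, not shown in this patch):

import requests

# TagRequest body: prefix-match tag values, optionally restricted to certain sources
response = requests.post(
    "http://localhost:8080/query/valid-tags",  # assumed host/port and router prefix
    json={"match_pattern": "eng", "sources": ["jira"], "allow_prefix": True},
)
# TagResponse body: {"tags": [{"tag_key": ..., "tag_value": ..., "source": ...}, ...]}
for tag in response.json()["tags"]:
    print(tag["tag_key"], tag["tag_value"], tag["source"])
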
@@ -75,6 +79,29 @@ def admin_search( return AdminSearchResponse(documents=deduplicated_documents) +@basic_router.post("/valid-tags") +def get_tags( + tag_request: TagRequest, + _: User = Depends(current_user), + db_session: Session = Depends(get_session), +) -> TagResponse: + if not tag_request.allow_prefix: + raise NotImplementedError("Cannot disable prefix match for now") + + db_tags = get_tags_by_value_prefix_for_source_types( + tag_value_prefix=tag_request.match_pattern, + sources=tag_request.sources, + db_session=db_session, + ) + server_tags = [ + SourceTag( + tag_key=db_tag.tag_key, tag_value=db_tag.tag_value, source=db_tag.source + ) + for db_tag in db_tags + ] + return TagResponse(tags=server_tags) + + @basic_router.post("/search-intent") def get_search_type( simple_query: SimpleQueryRequest, _: User = Depends(current_user) diff --git a/backend/scripts/simulate_chat_frontend.py b/backend/scripts/simulate_chat_frontend.py index 51c077d29..2344c166d 100644 --- a/backend/scripts/simulate_chat_frontend.py +++ b/backend/scripts/simulate_chat_frontend.py @@ -30,7 +30,11 @@ def send_chat_message( "chat_session_id": chat_session_id, "parent_message_id": parent_message, "prompt_id": 0, # Global default Prompt - "retrieval_options": {"run_search": "always", "real_time": True}, + "retrieval_options": { + "run_search": "always", + "real_time": True, + "filters": {"tags": []}, + }, } docs: list[dict] | None = None
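
Taken together, a document's metadata now flows both into Postgres as Tag rows and into Vespa's metadata_list field as flattened strings. A minimal sketch of the flattening this patch performs, mirroring DocumentBase.get_metadata_str_attributes (the sample metadata dict is made up for illustration):

INDEX_SEPARATOR = "==="  # from danswer/configs/constants.py


def metadata_to_attributes(metadata: dict[str, str | list[str]]) -> list[str]:
    # One combined "key===value" string per value, for efficient attribute filtering
    attributes: list[str] = []
    for k, v in metadata.items():
        if isinstance(v, list):
            attributes.extend(k + INDEX_SEPARATOR + vi for vi in v)
        else:
            attributes.append(k + INDEX_SEPARATOR + v)
    return attributes


# Example: a Jira issue with two labels and a status
print(metadata_to_attributes({"label": ["backend", "search"], "status": "Done"}))
# -> ['label===backend', 'label===search', 'status===Done']

At query time, these same "key===value" strings are what _build_vespa_filters matches against when a search request carries tags filters.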