From d7141df5fc28ef14e608e04dc9d8c3cca619d0aa Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Tue, 2 Jan 2024 11:25:50 -0800 Subject: [PATCH] Metadata and Title Search (#903) --- backend/alembic/versions/904e5138fffb_tags.py | 61 +++++++ backend/danswer/configs/chat_configs.py | 9 +- backend/danswer/configs/constants.py | 8 + .../danswer/connectors/bookstack/connector.py | 64 +++++-- .../connectors/confluence/connector.py | 6 +- .../cross_connector_utils/__init__.py | 0 .../connectors/danswer_jira/connector.py | 29 +-- .../connectors/document360/connector.py | 11 +- backend/danswer/connectors/file/__init__.py | 0 .../danswer/connectors/github/connector.py | 8 +- backend/danswer/connectors/gong/connector.py | 10 +- .../connectors/google_drive/connector.py | 10 +- .../connectors/google_site/__init__.py | 0 backend/danswer/connectors/guru/connector.py | 2 +- .../danswer/connectors/hubspot/connector.py | 2 +- backend/danswer/connectors/linear/__init__.py | 0 .../danswer/connectors/linear/connector.py | 4 +- backend/danswer/connectors/models.py | 27 ++- .../danswer/connectors/notion/connector.py | 3 +- .../connectors/productboard/connector.py | 57 +++--- .../connectors/requesttracker/connector.py | 3 +- backend/danswer/connectors/zulip/__init__.py | 0 backend/danswer/db/chat.py | 2 + backend/danswer/db/document.py | 2 + backend/danswer/db/models.py | 37 ++++ backend/danswer/db/tag.py | 116 ++++++++++++ .../vespa/app_config/schemas/danswer_chunk.sd | 165 ++++++++++++------ backend/danswer/document_index/vespa/index.py | 38 +++- backend/danswer/indexing/chunker.py | 24 +-- backend/danswer/indexing/embedder.py | 15 ++ backend/danswer/indexing/indexing_pipeline.py | 25 +++ backend/danswer/indexing/models.py | 4 +- backend/danswer/search/models.py | 7 + .../danswer/search/request_preprocessing.py | 1 + backend/danswer/search/search_runner.py | 1 + .../danswer/server/query_and_chat/models.py | 17 ++ .../server/query_and_chat/query_backend.py | 27 +++ backend/scripts/simulate_chat_frontend.py | 6 +- 38 files changed, 639 insertions(+), 162 deletions(-) create mode 100644 backend/alembic/versions/904e5138fffb_tags.py create mode 100644 backend/danswer/connectors/cross_connector_utils/__init__.py create mode 100644 backend/danswer/connectors/file/__init__.py create mode 100644 backend/danswer/connectors/google_site/__init__.py create mode 100644 backend/danswer/connectors/linear/__init__.py create mode 100644 backend/danswer/connectors/zulip/__init__.py create mode 100644 backend/danswer/db/tag.py diff --git a/backend/alembic/versions/904e5138fffb_tags.py b/backend/alembic/versions/904e5138fffb_tags.py new file mode 100644 index 000000000..aaf4bd51f --- /dev/null +++ b/backend/alembic/versions/904e5138fffb_tags.py @@ -0,0 +1,61 @@ +"""Tags + +Revision ID: 904e5138fffb +Revises: 891cd83c87a8 +Create Date: 2024-01-01 10:44:43.733974 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. 
+revision = "904e5138fffb"
+down_revision = "891cd83c87a8"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.create_table(
+        "tag",
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column("tag_key", sa.String(), nullable=False),
+        sa.Column("tag_value", sa.String(), nullable=False),
+        sa.Column("source", sa.String(), nullable=False),
+        sa.PrimaryKeyConstraint("id"),
+        sa.UniqueConstraint(
+            "tag_key", "tag_value", "source", name="_tag_key_value_source_uc"
+        ),
+    )
+    op.create_table(
+        "document__tag",
+        sa.Column("document_id", sa.String(), nullable=False),
+        sa.Column("tag_id", sa.Integer(), nullable=False),
+        sa.ForeignKeyConstraint(
+            ["document_id"],
+            ["document.id"],
+        ),
+        sa.ForeignKeyConstraint(
+            ["tag_id"],
+            ["tag.id"],
+        ),
+        sa.PrimaryKeyConstraint("document_id", "tag_id"),
+    )
+
+    op.add_column(
+        "search_doc",
+        sa.Column(
+            "doc_metadata",
+            postgresql.JSONB(astext_type=sa.Text()),
+            nullable=True,
+        ),
+    )
+    op.execute("UPDATE search_doc SET doc_metadata = '{}' WHERE doc_metadata IS NULL")
+    op.alter_column("search_doc", "doc_metadata", nullable=False)
+
+
+def downgrade() -> None:
+    op.drop_table("document__tag")
+    op.drop_table("tag")
+    op.drop_column("search_doc", "doc_metadata")
diff --git a/backend/danswer/configs/chat_configs.py b/backend/danswer/configs/chat_configs.py
index 872e54e38..21d7b8c28 100644
--- a/backend/danswer/configs/chat_configs.py
+++ b/backend/danswer/configs/chat_configs.py
@@ -59,7 +59,14 @@ if os.environ.get("EDIT_KEYWORD_QUERY"):
 else:
     EDIT_KEYWORD_QUERY = not os.environ.get("DOCUMENT_ENCODER_MODEL")
 # Weighting factor between Vector and Keyword Search, 1 for completely vector search
-HYBRID_ALPHA = max(0, min(1, float(os.environ.get("HYBRID_ALPHA") or 0.6)))
+HYBRID_ALPHA = max(0, min(1, float(os.environ.get("HYBRID_ALPHA") or 0.66)))
+# Weighting factor between Title and Content of documents during search, 1 for completely
+# Title based. Default heavily favors Content because Title is also included at the top of
+# Content. This is to avoid cases where the Content is very relevant but that may not be
+# clear from the Title alone. Title is more of a "boost" than a separate field.
+TITLE_CONTENT_RATIO = max(
+    0, min(1, float(os.environ.get("TITLE_CONTENT_RATIO") or 0.20))
+)
 # A list of languages passed to the LLM to rephrase the query
 # For example "English,French,Spanish", be sure to use the "," separator
 MULTILINGUAL_QUERY_EXPANSION = os.environ.get("MULTILINGUAL_QUERY_EXPANSION") or None
diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py
index dd1019d7e..d86f980c9 100644
--- a/backend/danswer/configs/constants.py
+++ b/backend/danswer/configs/constants.py
@@ -11,11 +11,13 @@ SEMANTIC_IDENTIFIER = "semantic_identifier"
 TITLE = "title"
 SECTION_CONTINUATION = "section_continuation"
 EMBEDDINGS = "embeddings"
+TITLE_EMBEDDING = "title_embedding"
 ALLOWED_USERS = "allowed_users"
 ACCESS_CONTROL_LIST = "access_control_list"
 DOCUMENT_SETS = "document_sets"
 TIME_FILTER = "time_filter"
 METADATA = "metadata"
+METADATA_LIST = "metadata_list"
 MATCH_HIGHLIGHTS = "match_highlights"
 # stored in the `metadata` of a chunk. Used to signify that this chunk should
 # not be used for QA.
For example, Google Drive file types which can't be parsed @@ -38,6 +40,12 @@ SESSION_KEY = "session" QUERY_EVENT_ID = "query_event_id" LLM_CHUNKS = "llm_chunks" +# For chunking/processing chunks +TITLE_SEPARATOR = "\n\r\n" +SECTION_SEPARATOR = "\n\n" +# For combining attributes, doesn't have to be unique/perfect to work +INDEX_SEPARATOR = "===" + class DocumentSource(str, Enum): # Special case, document passed in via Danswer APIs without specifying a source type diff --git a/backend/danswer/connectors/bookstack/connector.py b/backend/danswer/connectors/bookstack/connector.py index 1bc8d3f9e..606866b42 100644 --- a/backend/danswer/connectors/bookstack/connector.py +++ b/backend/danswer/connectors/bookstack/connector.py @@ -8,6 +8,7 @@ from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource from danswer.connectors.bookstack.client import BookStackApiClient from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic +from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector @@ -72,13 +73,21 @@ class BookstackConnector(LoadConnector, PollConnector): bookstack_client: BookStackApiClient, book: dict[str, Any] ) -> Document: url = bookstack_client.build_app_url("/books/" + str(book.get("slug"))) + title = str(book.get("name", "")) text = book.get("name", "") + "\n" + book.get("description", "") + updated_at_str = ( + str(book.get("updated_at")) if book.get("updated_at") is not None else None + ) return Document( - id="book:" + str(book.get("id")), + id="book__" + str(book.get("id")), sections=[Section(link=url, text=text)], source=DocumentSource.BOOKSTACK, - semantic_identifier="Book: " + str(book.get("name")), - metadata={"type": "book", "updated_at": str(book.get("updated_at"))}, + semantic_identifier="Book: " + title, + title=title, + doc_updated_at=time_str_to_utc(updated_at_str) + if updated_at_str is not None + else None, + metadata={"type": "book"}, ) @staticmethod @@ -91,13 +100,23 @@ class BookstackConnector(LoadConnector, PollConnector): + "/chapter/" + str(chapter.get("slug")) ) + title = str(chapter.get("name", "")) text = chapter.get("name", "") + "\n" + chapter.get("description", "") + updated_at_str = ( + str(chapter.get("updated_at")) + if chapter.get("updated_at") is not None + else None + ) return Document( - id="chapter:" + str(chapter.get("id")), + id="chapter__" + str(chapter.get("id")), sections=[Section(link=url, text=text)], source=DocumentSource.BOOKSTACK, - semantic_identifier="Chapter: " + str(chapter.get("name")), - metadata={"type": "chapter", "updated_at": str(chapter.get("updated_at"))}, + semantic_identifier="Chapter: " + title, + title=title, + doc_updated_at=time_str_to_utc(updated_at_str) + if updated_at_str is not None + else None, + metadata={"type": "chapter"}, ) @staticmethod @@ -105,13 +124,23 @@ class BookstackConnector(LoadConnector, PollConnector): bookstack_client: BookStackApiClient, shelf: dict[str, Any] ) -> Document: url = bookstack_client.build_app_url("/shelves/" + str(shelf.get("slug"))) + title = str(shelf.get("name", "")) text = shelf.get("name", "") + "\n" + shelf.get("description", "") + updated_at_str = ( + str(shelf.get("updated_at")) + if shelf.get("updated_at") is not None + else None + ) return Document( id="shelf:" + str(shelf.get("id")), 
sections=[Section(link=url, text=text)], source=DocumentSource.BOOKSTACK, - semantic_identifier="Shelf: " + str(shelf.get("name")), - metadata={"type": "shelf", "updated_at": shelf.get("updated_at")}, + semantic_identifier="Shelf: " + title, + title=title, + doc_updated_at=time_str_to_utc(updated_at_str) + if updated_at_str is not None + else None, + metadata={"type": "shelf"}, ) @staticmethod @@ -119,7 +148,7 @@ class BookstackConnector(LoadConnector, PollConnector): bookstack_client: BookStackApiClient, page: dict[str, Any] ) -> Document: page_id = str(page.get("id")) - page_name = str(page.get("name")) + title = str(page.get("name", "")) page_data = bookstack_client.get("/pages/" + page_id, {}) url = bookstack_client.build_app_url( "/books/" @@ -127,17 +156,24 @@ class BookstackConnector(LoadConnector, PollConnector): + "/page/" + str(page_data.get("slug")) ) - page_html = ( - "
<h1>" + html.escape(page_name) + "</h1>" + str(page_data.get("html"))
-        )
+        page_html = "<h1>" + html.escape(title) + "</h1>
" + str(page_data.get("html")) text = parse_html_page_basic(page_html) + updated_at_str = ( + str(page_data.get("updated_at")) + if page_data.get("updated_at") is not None + else None + ) time.sleep(0.1) return Document( id="page:" + page_id, sections=[Section(link=url, text=text)], source=DocumentSource.BOOKSTACK, - semantic_identifier="Page: " + str(page_name), - metadata={"type": "page", "updated_at": page_data.get("updated_at")}, + semantic_identifier="Page: " + str(title), + title=str(title), + doc_updated_at=time_str_to_utc(updated_at_str) + if updated_at_str is not None + else None, + metadata={"type": "page"}, ) def load_from_state(self) -> GenerateDocumentsOutput: diff --git a/backend/danswer/connectors/confluence/connector.py b/backend/danswer/connectors/confluence/connector.py index bf9b9c00f..f2d091e4d 100644 --- a/backend/danswer/connectors/confluence/connector.py +++ b/backend/danswer/connectors/confluence/connector.py @@ -333,11 +333,7 @@ class ConfluenceConnector(LoadConnector, PollConnector): if not page_html: logger.debug("Page is empty, skipping: %s", page_url) continue - page_text = ( - page.get("title", "") - + "\n" - + parse_html_page(page_html, self.confluence_client) - ) + page_text = parse_html_page(page_html, self.confluence_client) comments_text = self._fetch_comments(self.confluence_client, page_id) page_text += comments_text diff --git a/backend/danswer/connectors/cross_connector_utils/__init__.py b/backend/danswer/connectors/cross_connector_utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/danswer/connectors/danswer_jira/connector.py b/backend/danswer/connectors/danswer_jira/connector.py index 0bfd74f60..8d82fd8b4 100644 --- a/backend/danswer/connectors/danswer_jira/connector.py +++ b/backend/danswer/connectors/danswer_jira/connector.py @@ -3,16 +3,17 @@ from datetime import timezone from typing import Any from urllib.parse import urlparse -from dateutil.parser import parse from jira import JIRA from jira.resources import Issue from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource +from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import SecondsSinceUnixEpoch +from danswer.connectors.models import BasicExpertInfo from danswer.connectors.models import ConnectorMissingCredentialError from danswer.connectors.models import Document from danswer.connectors.models import Section @@ -60,26 +61,32 @@ def fetch_jira_issues_batch( logger.warning(f"Found Jira object not of type Issue {jira}") continue - ticket_updated_time = parse(jira.fields.updated) - - semantic_rep = ( - f"Jira Ticket Summary: {jira.fields.summary}\n" - f"Description: {jira.fields.description}\n" - + "\n".join( - [f"Comment: {comment.body}" for comment in jira.fields.comment.comments] - ) + semantic_rep = f"{jira.fields.description}\n" + "\n".join( + [f"Comment: {comment.body}" for comment in jira.fields.comment.comments] ) page_url = f"{jira_client.client_info()}/browse/{jira.key}" + author = None + try: + author = BasicExpertInfo( + display_name=jira.fields.creator.displayName, + email=jira.fields.creator.emailAddress, + ) + except Exception: + # Author should exist but if not, doesn't matter + pass + doc_batch.append( Document( id=page_url, 
sections=[Section(link=page_url, text=semantic_rep)], source=DocumentSource.JIRA, semantic_identifier=jira.fields.summary, - doc_updated_at=ticket_updated_time.astimezone(timezone.utc), - metadata={}, + doc_updated_at=time_str_to_utc(jira.fields.updated), + primary_owners=[author] if author is not None else None, + # TODO add secondary_owners if needed + metadata={"label": jira.fields.labels} if jira.fields.labels else {}, ) ) return doc_batch, len(batch) diff --git a/backend/danswer/connectors/document360/connector.py b/backend/danswer/connectors/document360/connector.py index 5324aa1d7..82ac51f17 100644 --- a/backend/danswer/connectors/document360/connector.py +++ b/backend/danswer/connectors/document360/connector.py @@ -140,11 +140,7 @@ class Document360Connector(LoadConnector, PollConnector): html_content = article_details["html_content"] article_content = parse_html_page_basic(html_content) doc_text = ( - f"workspace: {self.workspace}\n" - f"category: {article['category_name']}\n" - f"article: {article_details['title']} - " - f"{article_details.get('description', '')}\n" - f"{article_content}" + f"{article_details.get('description', '')}\n{article_content}".strip() ) document = Document( @@ -154,7 +150,10 @@ class Document360Connector(LoadConnector, PollConnector): semantic_identifier=article_details["title"], doc_updated_at=updated_at, primary_owners=authors, - metadata={}, + metadata={ + "workspace": self.workspace, + "category": article["category_name"], + }, ) doc_batch.append(document) diff --git a/backend/danswer/connectors/file/__init__.py b/backend/danswer/connectors/file/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/danswer/connectors/github/connector.py b/backend/danswer/connectors/github/connector.py index 7fa19b1c4..4fd104e29 100644 --- a/backend/danswer/connectors/github/connector.py +++ b/backend/danswer/connectors/github/connector.py @@ -37,10 +37,9 @@ def _batch_github_objects( def _convert_pr_to_document(pull_request: PullRequest) -> Document: - full_context = f"Pull-Request {pull_request.title}\n{pull_request.body}" return Document( id=pull_request.html_url, - sections=[Section(link=pull_request.html_url, text=full_context)], + sections=[Section(link=pull_request.html_url, text=pull_request.body or "")], source=DocumentSource.GITHUB, semantic_identifier=pull_request.title, # updated_at is UTC time but is timezone unaware, explicitly add UTC @@ -48,7 +47,7 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document: # due to local time discrepancies with UTC doc_updated_at=pull_request.updated_at.replace(tzinfo=timezone.utc), metadata={ - "merged": pull_request.merged, + "merged": str(pull_request.merged), "state": pull_request.state, }, ) @@ -60,10 +59,9 @@ def _fetch_issue_comments(issue: Issue) -> str: def _convert_issue_to_document(issue: Issue) -> Document: - full_context = f"Issue {issue.title}\n{issue.body}" return Document( id=issue.html_url, - sections=[Section(link=issue.html_url, text=full_context)], + sections=[Section(link=issue.html_url, text=issue.body or "")], source=DocumentSource.GITHUB, semantic_identifier=issue.title, # updated_at is UTC time but is timezone unaware diff --git a/backend/danswer/connectors/gong/connector.py b/backend/danswer/connectors/gong/connector.py index e7691bdf3..711bdf11b 100644 --- a/backend/danswer/connectors/gong/connector.py +++ b/backend/danswer/connectors/gong/connector.py @@ -206,9 +206,6 @@ class GongConnector(LoadConnector, PollConnector): speaker_to_name: dict[str, str] 
= {}
         transcript_text = ""
 
-        if call_title:
-            transcript_text += f"Call Title: {call_title}\n\n"
-
         call_purpose = call_metadata["purpose"]
         if call_purpose:
             transcript_text += f"Call Description: {call_purpose}\n\n"
@@ -234,6 +231,11 @@
                 )
                 transcript_text += f"{speaker_name}: {monolog}\n\n"
 
+            metadata = {}
+            if call_metadata.get("system"):
+                metadata["client"] = call_metadata.get("system")
+            # TODO calls have a clientUniqueId field, can pull that in later
+
             doc_batch.append(
                 Document(
                     id=call_id,
@@ -246,7 +248,7 @@
                     doc_updated_at=datetime.fromisoformat(call_time_str).astimezone(
                         timezone.utc
                     ),
-                    metadata={},
+                    metadata=metadata,
                 )
             )
         yield doc_batch
diff --git a/backend/danswer/connectors/google_drive/connector.py b/backend/danswer/connectors/google_drive/connector.py
index 4a745a8b1..4eda2d532 100644
--- a/backend/danswer/connectors/google_drive/connector.py
+++ b/backend/danswer/connectors/google_drive/connector.py
@@ -466,24 +466,20 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
             doc_batch = []
             for file in files_batch:
                 try:
-                    text_contents = extract_text(file, service)
-                    if text_contents:
-                        full_context = file["name"] + " - " + text_contents
-                    else:
-                        full_context = file["name"]
+                    text_contents = extract_text(file, service) or ""
 
                     doc_batch.append(
                         Document(
                             id=file["webViewLink"],
                             sections=[
-                                Section(link=file["webViewLink"], text=full_context)
+                                Section(link=file["webViewLink"], text=text_contents)
                             ],
                             source=DocumentSource.GOOGLE_DRIVE,
                             semantic_identifier=file["name"],
                             doc_updated_at=datetime.fromisoformat(
                                 file["modifiedTime"]
                             ).astimezone(timezone.utc),
-                            metadata={} if text_contents else {IGNORE_FOR_QA: True},
+                            metadata={} if text_contents else {IGNORE_FOR_QA: "True"},
                         )
                     )
                 except Exception as e:
diff --git a/backend/danswer/connectors/google_site/__init__.py b/backend/danswer/connectors/google_site/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/backend/danswer/connectors/guru/connector.py b/backend/danswer/connectors/guru/connector.py
index dff2c366e..89f63b131 100644
--- a/backend/danswer/connectors/guru/connector.py
+++ b/backend/danswer/connectors/guru/connector.py
@@ -77,7 +77,7 @@ class GuruConnector(LoadConnector, PollConnector):
         for card in cards:
             title = card["preferredPhrase"]
             link = GURU_CARDS_URL + card["slug"]
-            content_text = title + "\n" + parse_html_page_basic(card["content"])
+            content_text = parse_html_page_basic(card["content"])
             last_updated = time_str_to_utc(card["lastModified"])
             last_verified = (
                 time_str_to_utc(card.get("lastVerified"))
diff --git a/backend/danswer/connectors/hubspot/connector.py b/backend/danswer/connectors/hubspot/connector.py
index e59a4c9eb..861f53ee6 100644
--- a/backend/danswer/connectors/hubspot/connector.py
+++ b/backend/danswer/connectors/hubspot/connector.py
@@ -73,7 +73,7 @@ class HubSpotConnector(LoadConnector, PollConnector):
         title = ticket.properties["subject"]
         link = self.ticket_base_url + ticket.id
-        content_text = title + "\n" + ticket.properties["content"]
+        content_text = ticket.properties["content"]
 
         associated_emails: list[str] = []
         associated_notes: list[str] = []
diff --git a/backend/danswer/connectors/linear/__init__.py b/backend/danswer/connectors/linear/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/backend/danswer/connectors/linear/connector.py b/backend/danswer/connectors/linear/connector.py
index 067fb561d..7d81a2286
100644 --- a/backend/danswer/connectors/linear/connector.py +++ b/backend/danswer/connectors/linear/connector.py @@ -8,6 +8,7 @@ import requests from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource +from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector @@ -30,7 +31,6 @@ def _make_query(request_body: dict[str, Any], api_key: str) -> requests.Response "Content-Type": "application/json", } - response: requests.Response | None = None for i in range(_NUM_RETRIES): try: response = requests.post( @@ -187,8 +187,8 @@ class LinearConnector(LoadConnector, PollConnector): ], source=DocumentSource.LINEAR, semantic_identifier=node["identifier"], + doc_updated_at=time_str_to_utc(node["updatedAt"]), metadata={ - "updated_at": node["updatedAt"], "team": node["team"]["name"], }, ) diff --git a/backend/danswer/connectors/models.py b/backend/danswer/connectors/models.py index 871382f49..2934d2fa2 100644 --- a/backend/danswer/connectors/models.py +++ b/backend/danswer/connectors/models.py @@ -1,10 +1,10 @@ from datetime import datetime from enum import Enum -from typing import Any from pydantic import BaseModel from danswer.configs.constants import DocumentSource +from danswer.configs.constants import INDEX_SEPARATOR from danswer.utils.text_processing import make_url_compatible @@ -50,21 +50,38 @@ class DocumentBase(BaseModel): sections: list[Section] source: DocumentSource | None = None semantic_identifier: str # displayed in the UI as the main identifier for the doc - metadata: dict[str, Any] + metadata: dict[str, str | list[str]] # UTC time doc_updated_at: datetime | None = None # Owner, creator, etc. primary_owners: list[BasicExpertInfo] | None = None # Assignee, space owner, etc. 
secondary_owners: list[BasicExpertInfo] | None = None - # `title` is used when computing best matches for a query - # if `None`, then we will use the `semantic_identifier` as the title in Vespa + # title is used for search whereas semantic_identifier is used for displaying in the UI + # different because Slack message may display as #general but general should not be part + # of the search, at least not in the same way as a document title should be for like Confluence + # The default title is semantic_identifier though unless otherwise specified title: str | None = None from_ingestion_api: bool = False - def get_title_for_document_index(self) -> str: + def get_title_for_document_index(self) -> str | None: + # If title is explicitly empty, return a None here for embedding purposes + if self.title == "": + return None return self.semantic_identifier if self.title is None else self.title + def get_metadata_str_attributes(self) -> list[str] | None: + if not self.metadata: + return None + # Combined string for the key/value for easy filtering + attributes: list[str] = [] + for k, v in self.metadata.items(): + if isinstance(v, list): + attributes.extend([k + INDEX_SEPARATOR + vi for vi in v]) + else: + attributes.append(k + INDEX_SEPARATOR + v) + return attributes + class Document(DocumentBase): id: str # This must be unique or during indexing/reindexing, chunks will be overwritten diff --git a/backend/danswer/connectors/notion/connector.py b/backend/danswer/connectors/notion/connector.py index fd3de72b1..aa8a4001b 100644 --- a/backend/danswer/connectors/notion/connector.py +++ b/backend/danswer/connectors/notion/connector.py @@ -267,7 +267,8 @@ class NotionConnector(LoadConnector, PollConnector): yield ( Document( id=page.id, - sections=[Section(link=page.url, text=f"{page_title}\n")] + # Will add title to the first section later in processing + sections=[Section(link=page.url, text="")] + [ Section( link=f"{page.url}#{block_id.replace('-', '')}", diff --git a/backend/danswer/connectors/productboard/connector.py b/backend/danswer/connectors/productboard/connector.py index c5003951b..1c013f42b 100644 --- a/backend/danswer/connectors/productboard/connector.py +++ b/backend/danswer/connectors/productboard/connector.py @@ -14,6 +14,7 @@ from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_st from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import SecondsSinceUnixEpoch +from danswer.connectors.models import BasicExpertInfo from danswer.connectors.models import Document from danswer.connectors.models import Section from danswer.utils.logger import setup_logger @@ -94,26 +95,24 @@ class ProductboardConnector(PollConnector): for feature in self._fetch_documents( initial_link=f"{_PRODUCT_BOARD_BASE_URL}/features" ): + owner = self._get_owner_email(feature) + experts = [BasicExpertInfo(email=owner)] if owner else None + yield Document( id=feature["id"], sections=[ Section( link=feature["links"]["html"], - text=" - ".join( - ( - feature["name"], - self._parse_description_html(feature["description"]), - ) - ), + text=self._parse_description_html(feature["description"]), ) ], semantic_identifier=feature["name"], source=DocumentSource.PRODUCTBOARD, doc_updated_at=time_str_to_utc(feature["updatedAt"]), + primary_owners=experts, metadata={ - "productboard_entity_type": feature["type"], + "entity_type": feature["type"], "status": feature["status"]["name"], - "owner": 
self._get_owner_email(feature), }, ) @@ -122,25 +121,23 @@ class ProductboardConnector(PollConnector): for component in self._fetch_documents( initial_link=f"{_PRODUCT_BOARD_BASE_URL}/components" ): + owner = self._get_owner_email(component) + experts = [BasicExpertInfo(email=owner)] if owner else None + yield Document( id=component["id"], sections=[ Section( link=component["links"]["html"], - text=" - ".join( - ( - component["name"], - self._parse_description_html(component["description"]), - ) - ), + text=self._parse_description_html(component["description"]), ) ], semantic_identifier=component["name"], source=DocumentSource.PRODUCTBOARD, doc_updated_at=time_str_to_utc(component["updatedAt"]), + primary_owners=experts, metadata={ - "productboard_entity_type": "component", - "owner": self._get_owner_email(component), + "entity_type": "component", }, ) @@ -150,25 +147,23 @@ class ProductboardConnector(PollConnector): for product in self._fetch_documents( initial_link=f"{_PRODUCT_BOARD_BASE_URL}/products" ): + owner = self._get_owner_email(product) + experts = [BasicExpertInfo(email=owner)] if owner else None + yield Document( id=product["id"], sections=[ Section( link=product["links"]["html"], - text=" - ".join( - ( - product["name"], - self._parse_description_html(product["description"]), - ) - ), + text=self._parse_description_html(product["description"]), ) ], semantic_identifier=product["name"], source=DocumentSource.PRODUCTBOARD, doc_updated_at=time_str_to_utc(product["updatedAt"]), + primary_owners=experts, metadata={ - "productboard_entity_type": "product", - "owner": self._get_owner_email(product), + "entity_type": "product", }, ) @@ -176,26 +171,24 @@ class ProductboardConnector(PollConnector): for objective in self._fetch_documents( initial_link=f"{_PRODUCT_BOARD_BASE_URL}/objectives" ): + owner = self._get_owner_email(objective) + experts = [BasicExpertInfo(email=owner)] if owner else None + yield Document( id=objective["id"], sections=[ Section( link=objective["links"]["html"], - text=" - ".join( - ( - objective["name"], - self._parse_description_html(objective["description"]), - ) - ), + text=self._parse_description_html(objective["description"]), ) ], semantic_identifier=objective["name"], source=DocumentSource.PRODUCTBOARD, doc_updated_at=time_str_to_utc(objective["updatedAt"]), + primary_owners=experts, metadata={ - "productboard_entity_type": "release", + "entity_type": "release", "state": objective["state"], - "owner": self._get_owner_email(objective), }, ) diff --git a/backend/danswer/connectors/requesttracker/connector.py b/backend/danswer/connectors/requesttracker/connector.py index 6b20504d3..9c4590fc2 100644 --- a/backend/danswer/connectors/requesttracker/connector.py +++ b/backend/danswer/connectors/requesttracker/connector.py @@ -97,7 +97,8 @@ class RequestTrackerConnector(PollConnector): logger.info(f"Processing ticket {tid}") doc = Document( id=ticket["id"], - sections=[Section(link=ticketLink, text=f"{ticket['Subject']}\n")] + # Will add title to the first section later in processing + sections=[Section(link=ticketLink, text="")] + self.build_doc_sections_from_txn(Rt0, tid), source=DocumentSource.REQUESTTRACKER, semantic_identifier=ticket["Subject"], diff --git a/backend/danswer/connectors/zulip/__init__.py b/backend/danswer/connectors/zulip/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/danswer/db/chat.py b/backend/danswer/db/chat.py index be1d7a56b..15236b599 100644 --- a/backend/danswer/db/chat.py +++ 
b/backend/danswer/db/chat.py @@ -642,6 +642,7 @@ def create_db_search_doc( source_type=server_search_doc.source_type, boost=server_search_doc.boost, hidden=server_search_doc.hidden, + doc_metadata=server_search_doc.metadata, score=server_search_doc.score, match_highlights=server_search_doc.match_highlights, updated_at=server_search_doc.updated_at, @@ -674,6 +675,7 @@ def translate_db_search_doc_to_server_search_doc( source_type=db_search_doc.source_type, boost=db_search_doc.boost, hidden=db_search_doc.hidden, + metadata=db_search_doc.doc_metadata, score=db_search_doc.score, match_highlights=db_search_doc.match_highlights, updated_at=db_search_doc.updated_at, diff --git a/backend/danswer/db/document.py b/backend/danswer/db/document.py index 58b9c837b..b1620fb60 100644 --- a/backend/danswer/db/document.py +++ b/backend/danswer/db/document.py @@ -17,6 +17,7 @@ from danswer.db.models import ConnectorCredentialPair from danswer.db.models import Credential from danswer.db.models import Document as DbDocument from danswer.db.models import DocumentByConnectorCredentialPair +from danswer.db.tag import delete_document_tags_for_documents from danswer.db.utils import model_to_dict from danswer.document_index.interfaces import DocumentMetadata from danswer.server.documents.models import ConnectorCredentialPairIdentifier @@ -272,6 +273,7 @@ def delete_documents_complete(db_session: Session, document_ids: list[str]) -> N delete_document_feedback_for_documents( document_ids=document_ids, db_session=db_session ) + delete_document_tags_for_documents(document_ids=document_ids, db_session=db_session) delete_documents(db_session, document_ids) db_session.commit() diff --git a/backend/danswer/db/models.py b/backend/danswer/db/models.py index 8937210ba..dabe66d21 100644 --- a/backend/danswer/db/models.py +++ b/backend/danswer/db/models.py @@ -22,6 +22,7 @@ from sqlalchemy import Integer from sqlalchemy import Sequence from sqlalchemy import String from sqlalchemy import Text +from sqlalchemy import UniqueConstraint from sqlalchemy.dialects import postgresql from sqlalchemy.orm import DeclarativeBase from sqlalchemy.orm import Mapped @@ -153,6 +154,15 @@ class ChatMessage__SearchDoc(Base): ) +class Document__Tag(Base): + __tablename__ = "document__tag" + + document_id: Mapped[str] = mapped_column( + ForeignKey("document.id"), primary_key=True + ) + tag_id: Mapped[int] = mapped_column(ForeignKey("tag.id"), primary_key=True) + + """ Documents/Indexing Tables """ @@ -247,6 +257,32 @@ class Document(Base): retrieval_feedbacks: Mapped[List["DocumentRetrievalFeedback"]] = relationship( "DocumentRetrievalFeedback", back_populates="document" ) + tags = relationship( + "Tag", + secondary="document__tag", + back_populates="documents", + ) + + +class Tag(Base): + __tablename__ = "tag" + + id: Mapped[int] = mapped_column(primary_key=True) + tag_key: Mapped[str] = mapped_column(String) + tag_value: Mapped[str] = mapped_column(String) + source: Mapped[DocumentSource] = mapped_column(Enum(DocumentSource)) + + documents = relationship( + "Document", + secondary="document__tag", + back_populates="tags", + ) + + __table_args__ = ( + UniqueConstraint( + "tag_key", "tag_value", "source", name="_tag_key_value_source_uc" + ), + ) class Connector(Base): @@ -424,6 +460,7 @@ class SearchDoc(Base): boost: Mapped[int] = mapped_column(Integer) source_type: Mapped[DocumentSource] = mapped_column(Enum(DocumentSource)) hidden: Mapped[bool] = mapped_column(Boolean) + doc_metadata: Mapped[dict[str, str | list[str]]] = 
mapped_column(postgresql.JSONB()) score: Mapped[float] = mapped_column(Float) match_highlights: Mapped[list[str]] = mapped_column(postgresql.ARRAY(String)) # This is for the document, not this row in the table diff --git a/backend/danswer/db/tag.py b/backend/danswer/db/tag.py new file mode 100644 index 000000000..bf70f7308 --- /dev/null +++ b/backend/danswer/db/tag.py @@ -0,0 +1,116 @@ +from sqlalchemy import delete +from sqlalchemy import func +from sqlalchemy import select +from sqlalchemy.orm import Session + +from danswer.configs.constants import DocumentSource +from danswer.db.models import Document +from danswer.db.models import Document__Tag +from danswer.db.models import Tag +from danswer.utils.logger import setup_logger + +logger = setup_logger() + + +def create_or_add_document_tag( + tag_key: str, + tag_value: str, + source: DocumentSource, + document_id: str, + db_session: Session, +) -> Tag: + document = db_session.get(Document, document_id) + if not document: + raise ValueError("Invalid Document, cannot attach Tags") + + tag_stmt = select(Tag).where( + Tag.tag_key == tag_key, + Tag.tag_value == tag_value, + Tag.source == source, + ) + tag = db_session.execute(tag_stmt).scalar_one_or_none() + + if not tag: + tag = Tag(tag_key=tag_key, tag_value=tag_value, source=source) + db_session.add(tag) + + if tag not in document.tags: + document.tags.append(tag) + + db_session.commit() + return tag + + +def create_or_add_document_tag_list( + tag_key: str, + tag_values: list[str], + source: DocumentSource, + document_id: str, + db_session: Session, +) -> list[Tag]: + document = db_session.get(Document, document_id) + if not document: + raise ValueError("Invalid Document, cannot attach Tags") + + existing_tags_stmt = select(Tag).where( + Tag.tag_key == tag_key, Tag.tag_value.in_(tag_values), Tag.source == source + ) + existing_tags = list(db_session.execute(existing_tags_stmt).scalars().all()) + existing_tag_values = {tag.tag_value for tag in existing_tags} + + new_tags = [] + for tag_value in tag_values: + if tag_value not in existing_tag_values: + new_tag = Tag(tag_key=tag_key, tag_value=tag_value, source=source) + db_session.add(new_tag) + new_tags.append(new_tag) + + all_tags = existing_tags + new_tags + + for tag in all_tags: + if tag not in document.tags: + document.tags.append(tag) + + db_session.commit() + return all_tags + + +def get_tags_by_value_prefix_for_source_types( + tag_value_prefix: str | None, + sources: list[DocumentSource] | None, + db_session: Session, +) -> list[Tag]: + query = select(Tag) + + if tag_value_prefix: + query = query.where(Tag.tag_value.startswith(tag_value_prefix)) + + if sources: + query = query.where(Tag.source.in_(sources)) + + result = db_session.execute(query) + + tags = result.scalars().all() + return list(tags) + + +def delete_document_tags_for_documents( + document_ids: list[str], db_session: Session +) -> None: + stmt = delete(Document__Tag).where(Document__Tag.document_id.in_(document_ids)) + db_session.execute(stmt) + db_session.commit() + + orphan_tags_query = ( + select(Tag.id) + .outerjoin(Document__Tag, Tag.id == Document__Tag.tag_id) + .group_by(Tag.id) + .having(func.count(Document__Tag.document_id) == 0) + ) + + orphan_tags = db_session.execute(orphan_tags_query).scalars().all() + + if orphan_tags: + delete_orphan_tags_stmt = delete(Tag).where(Tag.id.in_(orphan_tags)) + db_session.execute(delete_orphan_tags_stmt) + db_session.commit() diff --git a/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd 
b/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
index 8d68fb497..dcdda7a85 100644
--- a/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
+++ b/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
@@ -7,12 +7,20 @@ schema danswer_chunk {
         field chunk_id type int {
             indexing: summary | attribute
         }
-        field blurb type string {
+        # Displayed in the UI as the main identifier for the doc
+        field semantic_identifier type string {
             indexing: summary | attribute
         }
-        # Can separate out title in the future and give heavier bm-25 weighting
-        # Need to consider that not every doc has a separable title (ie. slack message)
-        # Set summary options to enable bolding
+        # May not always match the `semantic_identifier` e.g. for Slack docs the
+        # `semantic_identifier` will be the channel name, but the `title` will be empty
+        field title type string {
+            indexing: summary | index
+            match {
+                gram
+                gram-size: 3
+            }
+            index: enable-bm25
+        }
         field content type string {
             indexing: summary | index
             match {
@@ -28,6 +36,25 @@
             indexing: summary | index
             summary: dynamic
         }
+        # Title embedding (x1)
+        field title_embedding type tensor<float>(x[384]) {
+            indexing: attribute
+            attribute {
+                distance-metric: angular
+            }
+        }
+        # Content embeddings (chunk + optional mini chunks embeddings)
+        # "t" and "x" are arbitrary names, not special keywords
+        field embeddings type tensor<float>(t{},x[384]) {
+            indexing: attribute
+            attribute {
+                distance-metric: angular
+            }
+        }
+        # Starting section of the doc, currently unused as it has been replaced by match highlighting
+        field blurb type string {
+            indexing: summary | attribute
+        }
         # https://docs.vespa.ai/en/attributes.html potential enum store for speed, but probably not worth it
         field source_type type string {
             indexing: summary | attribute
@@ -39,21 +66,6 @@
         field source_links type string {
             indexing: summary | attribute
         }
-        # displayed in the UI as the main identifier for the doc
-        field semantic_identifier type string {
-            indexing: summary | attribute
-        }
-        # this is used when computing best matches based on the title of the document
-        # may not always match the `semantic_identifier` e.g. for Slack docs the
-        # `semantic_identifier` will be the channel name, but the `title` will be empty
-        field title type string {
-            indexing: summary | index
-            match {
-                gram
-                gram-size: 3
-            }
-            index: enable-bm25
-        }
         field section_continuation type bool {
             indexing: summary | attribute
         }
@@ -65,15 +77,15 @@ schema danswer_chunk {
             indexing: summary | attribute
             rank: filter
         }
+        # Needs to have a separate Attribute list for efficient filtering
+        field metadata_list type array<string> {
+            indexing: summary | attribute
+            rank: filter
+            attribute: fast-search
+        }
         field metadata type string {
             indexing: summary | attribute
         }
-        field embeddings type tensor<float>(t{},x[384]) {
-            indexing: attribute
-            attribute {
-                distance-metric: angular
-            }
-        }
         field doc_updated_at type int {
             indexing: summary | attribute
         }
@@ -95,6 +107,11 @@
         }
     }
 
+    # If using different tokenization settings, the fieldset has to be removed, and the field must
+    # be specified in the yql like:
+    # + 'or ({grammar: "weakAnd", defaultIndex:"title"}userInput(@query)) '
+    # + 'or ({grammar: "weakAnd", defaultIndex:"content"}userInput(@query)) '
+    # Note: for BM-25, the ngram size (and whether ngrams are used) changes the range of the scores
     fieldset default {
         fields: content, title
     }
@@ -124,6 +141,79 @@
         match-features: recency_bias
     }
 
+    rank-profile hybrid_search inherits default, default_rank {
+        inputs {
+            query(query_embedding) tensor<float>(x[384])
+        }
+
+        # This must be a separate function for normalize_linear to work
+        function vector_score() {
+            expression {
+                (query(title_content_ratio) * closeness(field, title_embedding)) +
+                ((1 - query(title_content_ratio)) * closeness(field, embeddings))
+            }
+        }
+
+        # This must be a separate function for normalize_linear to work
+        function keyword_score() {
+            expression {
+                (query(title_content_ratio) * bm25(title)) +
+                ((1 - query(title_content_ratio)) * bm25(content))
+            }
+        }
+
+        first-phase {
+            expression: vector_score
+        }
+
+        # Weighted average between Vector Search and BM-25
+        # Each is a weighted average between the Title and Content fields
+        # Finally each doc is boosted by its user-feedback-based boost and recency
+        # If any embedding or index field is missing, it just receives a score of 0
+        # Assumptions:
+        # - For a given query + corpus, the BM-25 scores will be relatively similar in distribution,
+        #   therefore not normalizing before combining.
+        # - Documents without a title get a score of 0 for the title component; this is ok as
+        #   documents without any title match should be penalized.
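+        # Illustrative arithmetic (hypothetical numbers, not from this patch): with the default
+        # alpha = 0.66 and title_content_ratio = 0.2, a chunk with normalized vector_score 0.9
+        # and normalized keyword_score 0.5 lands at 0.66 * 0.9 + 0.34 * 0.5 = 0.764 before the
+        # document_boost and recency_bias multipliers below are applied.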
+        global-phase {
+            expression {
+                (
+                    # Weighted Vector Similarity Score
+                    (query(alpha) * normalize_linear(vector_score))
+                    +
+                    # Weighted Keyword Similarity Score
+                    ((1 - query(alpha)) * normalize_linear(keyword_score))
+                )
+                # Boost based on user feedback
+                * document_boost
+                # Decay factor based on time document was last updated
+                * recency_bias
+            }
+            rerank-count: 1000
+        }
+
+        match-features {
+            bm25(title)
+            bm25(content)
+            closeness(field, title_embedding)
+            closeness(field, embeddings)
+            keyword_score
+            vector_score
+            document_boost
+            recency_bias
+            closest(embeddings)
+        }
+    }
+
+    # Used when searching from the admin UI for a specific doc to hide / boost
+    # Very heavily prioritize title
+    rank-profile admin_search inherits default, default_rank {
+        first-phase {
+            expression: bm25(content) + (5 * bm25(title))
+        }
+    }
+
+    # THE ONES BELOW ARE OUT OF DATE, DO NOT USE
+    # THEY MIGHT NOT EVEN WORK AT ALL
     rank-profile keyword_search inherits default, default_rank {
         first-phase {
             expression: bm25(content) * document_boost * recency_bias
@@ -145,29 +235,4 @@
         match-features: recency_bias document_boost closest(embeddings)
     }
-
-    rank-profile hybrid_search inherits default, default_rank {
-        inputs {
-            query(query_embedding) tensor<float>(x[384])
-        }
-
-        first-phase {
-            expression: closeness(field, embeddings)
-        }
-
-        global-phase {
-            expression: ((query(alpha) * normalize_linear(closeness(field, embeddings))) + ((1 - query(alpha)) * normalize_linear(bm25(content)))) * document_boost * recency_bias
-            rerank-count: 1000
-        }
-
-        # Cannot pass normalize_linear features in match-features
-        match-features: recency_bias document_boost closest(embeddings)
-    }
-
-    # used when searching from the admin UI for a specific doc to hide / boost
-    rank-profile admin_search inherits default, default_rank {
-        first-phase {
-            expression: bm25(content) + (5 * bm25(title))
-        }
-    }
 }
diff --git a/backend/danswer/document_index/vespa/index.py b/backend/danswer/document_index/vespa/index.py
index f649621bf..de053f71e 100644
--- a/backend/danswer/document_index/vespa/index.py
+++ b/backend/danswer/document_index/vespa/index.py
@@ -25,6 +25,7 @@ from danswer.configs.chat_configs import DOC_TIME_DECAY
 from danswer.configs.chat_configs import EDIT_KEYWORD_QUERY
 from danswer.configs.chat_configs import HYBRID_ALPHA
 from danswer.configs.chat_configs import NUM_RETURNED_HITS
+from danswer.configs.chat_configs import TITLE_CONTENT_RATIO
 from danswer.configs.constants import ACCESS_CONTROL_LIST
 from danswer.configs.constants import BLURB
 from danswer.configs.constants import BOOST
@@ -35,7 +36,9 @@ from danswer.configs.constants import DOCUMENT_ID
 from danswer.configs.constants import DOCUMENT_SETS
 from danswer.configs.constants import EMBEDDINGS
 from danswer.configs.constants import HIDDEN
+from danswer.configs.constants import INDEX_SEPARATOR
 from danswer.configs.constants import METADATA
+from danswer.configs.constants import METADATA_LIST
 from danswer.configs.constants import PRIMARY_OWNERS
 from danswer.configs.constants import RECENCY_BIAS
 from danswer.configs.constants import SECONDARY_OWNERS
@@ -44,6 +47,8 @@ from danswer.configs.constants import SEMANTIC_IDENTIFIER
 from danswer.configs.constants import SOURCE_LINKS
 from danswer.configs.constants import SOURCE_TYPE
 from danswer.configs.constants import TITLE
+from danswer.configs.constants import TITLE_EMBEDDING
+from danswer.configs.constants import TITLE_SEPARATOR
 from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF
 from
danswer.connectors.cross_connector_utils.miscellaneous_utils import ( get_experts_stores_representations, @@ -239,20 +244,25 @@ def _index_vespa_chunk( for ind, m_c_embed in enumerate(embeddings.mini_chunk_embeddings): embeddings_name_vector_map[f"mini_chunk_{ind}"] = m_c_embed + title = document.get_title_for_document_index() + vespa_document_fields = { DOCUMENT_ID: document.id, CHUNK_ID: chunk.chunk_id, BLURB: remove_invalid_unicode_chars(chunk.blurb), - # this duplication of `content` is needed for keyword highlighting :( + TITLE: remove_invalid_unicode_chars(title) if title else None, CONTENT: remove_invalid_unicode_chars(chunk.content), + # This duplication of `content` is needed for keyword highlighting :( CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content), SOURCE_TYPE: str(document.source.value), SOURCE_LINKS: json.dumps(chunk.source_links), SEMANTIC_IDENTIFIER: remove_invalid_unicode_chars(document.semantic_identifier), - TITLE: remove_invalid_unicode_chars(document.get_title_for_document_index()), SECTION_CONTINUATION: chunk.section_continuation, METADATA: json.dumps(document.metadata), + # Save as a list for efficient extraction as an Attribute + METADATA_LIST: chunk.source_document.get_metadata_str_attributes(), EMBEDDINGS: embeddings_name_vector_map, + TITLE_EMBEDDING: chunk.title_embedding, BOOST: chunk.boost, DOC_UPDATED_AT: _vespa_get_updated_at_attribute(document.doc_updated_at), PRIMARY_OWNERS: get_experts_stores_representations(document.primary_owners), @@ -394,6 +404,12 @@ def _build_vespa_filters(filters: IndexFilters, include_hidden: bool = False) -> ) filter_str += _build_or_filters(SOURCE_TYPE, source_strs) + tag_attributes = None + tags = filters.tags + if tags: + tag_attributes = [tag.tag_key + INDEX_SEPARATOR + tag.tag_value for tag in tags] + filter_str += _build_or_filters(METADATA_LIST, tag_attributes) + filter_str += _build_or_filters(DOCUMENT_SETS, filters.document_set) filter_str += _build_time_filter(filters.time_cutoff) @@ -448,6 +464,8 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk: if DOC_UPDATED_AT in fields else None ) + + # The highlights might include the title but this is the best way we have so far to show the highlighting match_highlights = _process_dynamic_summary( # fallback to regular `content` if the `content_summary` field # isn't present @@ -459,6 +477,13 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk: f"Chunk with blurb: {fields.get(BLURB, 'Unknown')[:50]}... 
has no Semantic Identifier" ) + # Remove the title from the first chunk as every chunk already included + # its semantic identifier for LLM + content = fields[CONTENT] + if fields[CHUNK_ID] == 0: + parts = content.split(TITLE_SEPARATOR, maxsplit=1) + content = parts[1] if len(parts) > 1 and "\n" not in parts[0] else content + # User ran into this, not sure why this could happen, error checking here blurb = fields.get(BLURB) if not blurb: @@ -477,7 +502,7 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk: return InferenceChunk( chunk_id=fields[CHUNK_ID], blurb=blurb, - content=fields[CONTENT], + content=content, source_links=source_links_dict, section_continuation=fields[SECTION_CONTINUATION], document_id=fields[DOCUMENT_ID], @@ -725,6 +750,7 @@ class VespaIndex(DocumentIndex): num_to_retrieve: int = NUM_RETURNED_HITS, edit_keyword_query: bool = EDIT_KEYWORD_QUERY, ) -> list[InferenceChunk]: + # IMPORTANT: THIS FUNCTION IS NOT UP TO DATE, DOES NOT WORK CORRECTLY vespa_where_clauses = _build_vespa_filters(filters) yql = ( VespaIndex.yql_base @@ -759,6 +785,7 @@ class VespaIndex(DocumentIndex): distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF, edit_keyword_query: bool = EDIT_KEYWORD_QUERY, ) -> list[InferenceChunk]: + # IMPORTANT: THIS FUNCTION IS NOT UP TO DATE, DOES NOT WORK CORRECTLY vespa_where_clauses = _build_vespa_filters(filters) yql = ( VespaIndex.yql_base @@ -798,6 +825,7 @@ class VespaIndex(DocumentIndex): time_decay_multiplier: float, num_to_retrieve: int, hybrid_alpha: float | None = HYBRID_ALPHA, + title_content_ratio: float | None = TITLE_CONTENT_RATIO, distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF, edit_keyword_query: bool = EDIT_KEYWORD_QUERY, ) -> list[InferenceChunk]: @@ -808,6 +836,7 @@ class VespaIndex(DocumentIndex): VespaIndex.yql_base + vespa_where_clauses + f"(({{targetHits: {target_hits}}}nearestNeighbor(embeddings, query_embedding)) " + + f"or ({{targetHits: {target_hits}}}nearestNeighbor(title_embedding, query_embedding)) " + 'or ({grammar: "weakAnd"}userInput(@query)) ' + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))' ) @@ -828,6 +857,9 @@ class VespaIndex(DocumentIndex): "input.query(alpha)": hybrid_alpha if hybrid_alpha is not None else HYBRID_ALPHA, + "input.query(title_content_ratio)": title_content_ratio + if title_content_ratio is not None + else TITLE_CONTENT_RATIO, "hits": num_to_retrieve, "offset": 0, "ranking.profile": "hybrid_search", diff --git a/backend/danswer/indexing/chunker.py b/backend/danswer/indexing/chunker.py index ae45725b1..9d27885d3 100644 --- a/backend/danswer/indexing/chunker.py +++ b/backend/danswer/indexing/chunker.py @@ -7,15 +7,15 @@ from transformers import AutoTokenizer # type:ignore from danswer.configs.app_configs import BLURB_SIZE from danswer.configs.app_configs import CHUNK_OVERLAP from danswer.configs.app_configs import MINI_CHUNK_SIZE +from danswer.configs.constants import SECTION_SEPARATOR +from danswer.configs.constants import TITLE_SEPARATOR from danswer.configs.model_configs import CHUNK_SIZE from danswer.connectors.models import Document -from danswer.connectors.models import Section from danswer.indexing.models import DocAwareChunk from danswer.search.search_nlp_models import get_default_tokenizer from danswer.utils.text_processing import shared_precompare_cleanup -SECTION_SEPARATOR = "\n\n" ChunkFunc = Callable[[Document], list[DocAwareChunk]] @@ -29,7 +29,8 @@ def extract_blurb(text: str, blurb_size: int) -> str: def chunk_large_section( - section: Section, + 
section_text: str, + section_link_text: str, document: Document, start_chunk_id: int, tokenizer: AutoTokenizer, @@ -37,8 +38,6 @@ def chunk_large_section( chunk_overlap: int = CHUNK_OVERLAP, blurb_size: int = BLURB_SIZE, ) -> list[DocAwareChunk]: - section_text = section.text - section_link_text = section.link or "" blurb = extract_blurb(section_text, blurb_size) sentence_aware_splitter = SentenceSplitter( @@ -67,14 +66,18 @@ def chunk_document( subsection_overlap: int = CHUNK_OVERLAP, blurb_size: int = BLURB_SIZE, ) -> list[DocAwareChunk]: + title = document.get_title_for_document_index() + title_prefix = title.replace("\n", " ") + TITLE_SEPARATOR if title else "" tokenizer = get_default_tokenizer() chunks: list[DocAwareChunk] = [] link_offsets: dict[int, str] = {} chunk_text = "" - for section in document.sections: + for ind, section in enumerate(document.sections): + section_text = title_prefix + section.text if ind == 0 else section.text section_link_text = section.link or "" - section_tok_length = len(tokenizer.tokenize(section.text)) + + section_tok_length = len(tokenizer.tokenize(section_text)) current_tok_length = len(tokenizer.tokenize(chunk_text)) curr_offset_len = len(shared_precompare_cleanup(chunk_text)) @@ -96,7 +99,8 @@ def chunk_document( chunk_text = "" large_section_chunks = chunk_large_section( - section=section, + section_text=section_text, + section_link_text=section_link_text, document=document, start_chunk_id=len(chunks), tokenizer=tokenizer, @@ -115,7 +119,7 @@ def chunk_document( <= chunk_tok_size ): chunk_text += ( - SECTION_SEPARATOR + section.text if chunk_text else section.text + SECTION_SEPARATOR + section_text if chunk_text else section_text ) link_offsets[curr_offset_len] = section_link_text else: @@ -130,7 +134,7 @@ def chunk_document( ) ) link_offsets = {0: section_link_text} - chunk_text = section.text + chunk_text = section_text # Once we hit the end, if we're still in the process of building a chunk, add what we have if chunk_text: diff --git a/backend/danswer/indexing/embedder.py b/backend/danswer/indexing/embedder.py index 2ca1092b9..63cb569ee 100644 --- a/backend/danswer/indexing/embedder.py +++ b/backend/danswer/indexing/embedder.py @@ -21,6 +21,9 @@ def embed_chunks( enable_mini_chunk: bool = ENABLE_MINI_CHUNK, passage_prefix: str = ASYM_PASSAGE_PREFIX, ) -> list[IndexChunk]: + # Cache the Title embeddings to only have to do it once + title_embed_dict: dict[str, list[float]] = {} + embedded_chunks: list[IndexChunk] = [] if embedding_model is None: embedding_model = EmbeddingModel() @@ -58,12 +61,24 @@ def embed_chunks( chunk_embeddings = embeddings[ embedding_ind_start : embedding_ind_start + num_embeddings ] + + title = chunk.source_document.get_title_for_document_index() + + title_embedding = None + if title: + if title in title_embed_dict: + title_embedding = title_embed_dict[title] + else: + title_embedding = embedding_model.encode([title])[0] + title_embed_dict[title] = title_embedding + new_embedded_chunk = IndexChunk( **{k: getattr(chunk, k) for k in chunk.__dataclass_fields__}, embeddings=ChunkEmbedding( full_embedding=chunk_embeddings[0], mini_chunk_embeddings=chunk_embeddings[1:], ), + title_embedding=title_embedding, ) embedded_chunks.append(new_embedded_chunk) embedding_ind_start += num_embeddings diff --git a/backend/danswer/indexing/indexing_pipeline.py b/backend/danswer/indexing/indexing_pipeline.py index c6ffbc278..be676e035 100644 --- a/backend/danswer/indexing/indexing_pipeline.py +++ 
b/backend/danswer/indexing/indexing_pipeline.py
@@ -17,6 +17,8 @@ from danswer.db.document import update_docs_updated_at
 from danswer.db.document import upsert_documents_complete
 from danswer.db.document_set import fetch_document_sets_for_documents
 from danswer.db.engine import get_sqlalchemy_engine
+from danswer.db.tag import create_or_add_document_tag
+from danswer.db.tag import create_or_add_document_tag_list
 from danswer.document_index.factory import get_default_document_index
 from danswer.document_index.interfaces import DocumentIndex
 from danswer.document_index.interfaces import DocumentMetadata
@@ -44,6 +46,7 @@ def upsert_documents_in_db(
     index_attempt_metadata: IndexAttemptMetadata,
     db_session: Session,
 ) -> None:
+    # Metadata here refers to basic document info, not metadata about the actual content
     doc_m_batch: list[DocumentMetadata] = []
     for doc in documents:
         first_link = next(
@@ -66,6 +69,26 @@ def upsert_documents_in_db(
         document_metadata_batch=doc_m_batch,
     )
 
+    # Insert document content metadata
+    for doc in documents:
+        for k, v in doc.metadata.items():
+            if isinstance(v, list):
+                create_or_add_document_tag_list(
+                    tag_key=k,
+                    tag_values=v,
+                    source=doc.source,
+                    document_id=doc.id,
+                    db_session=db_session,
+                )
+            else:
+                create_or_add_document_tag(
+                    tag_key=k,
+                    tag_value=v,
+                    source=doc.source,
+                    document_id=doc.id,
+                    db_session=db_session,
+                )
+
 
 @log_function_time()
 def index_doc_batch(
@@ -121,6 +144,8 @@
     )
 
     logger.debug("Starting chunking")
+
+    # The first chunk additionally contains the Title of the Document
     chunks: list[DocAwareChunk] = list(
         chain(*[chunker.chunk(document=document) for document in updatable_docs])
     )
diff --git a/backend/danswer/indexing/models.py b/backend/danswer/indexing/models.py
index 66def05f3..331a5fed4 100644
--- a/backend/danswer/indexing/models.py
+++ b/backend/danswer/indexing/models.py
@@ -1,7 +1,6 @@
 from dataclasses import dataclass
 from dataclasses import fields
 from datetime import datetime
-from typing import Any
 
 from danswer.access.models import DocumentAccess
 from danswer.configs.constants import DocumentSource
@@ -48,6 +47,7 @@ class DocAwareChunk(BaseChunk):
 @dataclass
 class IndexChunk(DocAwareChunk):
     embeddings: ChunkEmbedding
+    title_embedding: Embedding | None
 
 
 @dataclass
@@ -95,7 +95,7 @@ class InferenceChunk(BaseChunk):
     recency_bias: float
     score: float | None
     hidden: bool
-    metadata: dict[str, Any]
+    metadata: dict[str, str | list[str]]
     # Matched sections in the chunk. Uses Vespa syntax e.g. <hi>TEXT</hi>
     # to specify that a set of words should be highlighted. For example:
     # ["the answer is <hi>42</hi>", "he couldn't find an <hi>answer</hi>"]
diff --git a/backend/danswer/search/models.py b/backend/danswer/search/models.py
index 3fbf1bbb1..0b6cef70d 100644
--- a/backend/danswer/search/models.py
+++ b/backend/danswer/search/models.py
@@ -48,10 +48,16 @@ class Embedder:
         raise NotImplementedError
 
 
+class Tag(BaseModel):
+    tag_key: str
+    tag_value: str
+
+
 class BaseFilters(BaseModel):
     source_type: list[DocumentSource] | None = None
     document_set: list[str] | None = None
     time_cutoff: datetime | None = None
+    tags: list[Tag] | None = None
 
 
 class IndexFilters(BaseFilters):
@@ -110,6 +116,7 @@ class SearchDoc(BaseModel):
     # since a standard search will never find a hidden doc, this can only ever
     # be `True` when doing an admin search
     hidden: bool
+    metadata: dict[str, str | list[str]]
     score: float | None
     # Matched sections in the doc. Uses Vespa syntax e.g. <hi>TEXT</hi>
     # to specify that a set of words should be highlighted.
For example: diff --git a/backend/danswer/search/request_preprocessing.py b/backend/danswer/search/request_preprocessing.py index ea131eee7..77dee972f 100644 --- a/backend/danswer/search/request_preprocessing.py +++ b/backend/danswer/search/request_preprocessing.py @@ -121,6 +121,7 @@ def retrieval_preprocessing( source_type=preset_filters.source_type or predicted_source_filters, document_set=preset_filters.document_set, time_cutoff=preset_filters.time_cutoff or predicted_time_cutoff, + tags=preset_filters.tags, # Tags are never auto-extracted access_control_list=user_acl_filters, ) diff --git a/backend/danswer/search/search_runner.py b/backend/danswer/search/search_runner.py index f77aa2e6c..3ca5280e2 100644 --- a/backend/danswer/search/search_runner.py +++ b/backend/danswer/search/search_runner.py @@ -96,6 +96,7 @@ def chunks_to_search_docs(chunks: list[InferenceChunk] | None) -> list[SearchDoc source_type=chunk.source_type, boost=chunk.boost, hidden=chunk.hidden, + metadata=chunk.metadata, score=chunk.score, match_highlights=chunk.match_highlights, updated_at=chunk.updated_at, diff --git a/backend/danswer/server/query_and_chat/models.py b/backend/danswer/server/query_and_chat/models.py index 035331a17..cf0d009f3 100644 --- a/backend/danswer/server/query_and_chat/models.py +++ b/backend/danswer/server/query_and_chat/models.py @@ -5,12 +5,29 @@ from pydantic import BaseModel from pydantic import root_validator from danswer.chat.models import RetrievalDocs +from danswer.configs.constants import DocumentSource from danswer.configs.constants import MessageType from danswer.configs.constants import SearchFeedbackType from danswer.search.models import BaseFilters from danswer.search.models import RetrievalDetails from danswer.search.models import SearchDoc from danswer.search.models import SearchType +from danswer.search.models import Tag + + +class TagRequest(BaseModel): + match_pattern: str | None + # If this is empty or None, then tags for all sources are considered + sources: list[DocumentSource] | None + allow_prefix: bool = True # This is currently the only option + + +class SourceTag(Tag): + source: DocumentSource + + +class TagResponse(BaseModel): + tags: list[SourceTag] class SimpleQueryRequest(BaseModel): diff --git a/backend/danswer/server/query_and_chat/query_backend.py b/backend/danswer/server/query_and_chat/query_backend.py index 576ac87dc..305a7de01 100644 --- a/backend/danswer/server/query_and_chat/query_backend.py +++ b/backend/danswer/server/query_and_chat/query_backend.py @@ -9,6 +9,7 @@ from danswer.auth.users import current_user from danswer.configs.chat_configs import DISABLE_LLM_CHUNK_FILTER from danswer.db.engine import get_session from danswer.db.models import User +from danswer.db.tag import get_tags_by_value_prefix_for_source_types from danswer.document_index.factory import get_default_document_index from danswer.document_index.vespa.index import VespaIndex from danswer.one_shot_answer.answer_question import stream_search_answer @@ -30,6 +31,9 @@ from danswer.server.query_and_chat.models import DocumentSearchRequest from danswer.server.query_and_chat.models import HelperResponse from danswer.server.query_and_chat.models import QueryValidationResponse from danswer.server.query_and_chat.models import SimpleQueryRequest +from danswer.server.query_and_chat.models import SourceTag +from danswer.server.query_and_chat.models import TagRequest +from danswer.server.query_and_chat.models import TagResponse from danswer.utils.logger import setup_logger logger = setup_logger() 
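The next hunk adds a `/valid-tags` endpoint on top of `get_tags_by_value_prefix_for_source_types`. As a rough sketch of the request/response shape it implies (the base URL and the `/query` router prefix are assumptions here, not shown in this patch):

import requests

# TagRequest body: prefix-match tag values, optionally restricted to certain sources
response = requests.post(
    "http://localhost:8080/query/valid-tags",  # assumed host/port and router prefix
    json={"match_pattern": "eng", "sources": ["jira"], "allow_prefix": True},
)
# TagResponse body: {"tags": [{"tag_key": ..., "tag_value": ..., "source": ...}, ...]}
for tag in response.json()["tags"]:
    print(tag["tag_key"], tag["tag_value"], tag["source"])
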
@@ -75,6 +79,29 @@ def admin_search( return AdminSearchResponse(documents=deduplicated_documents) +@basic_router.post("/valid-tags") +def get_tags( + tag_request: TagRequest, + _: User = Depends(current_user), + db_session: Session = Depends(get_session), +) -> TagResponse: + if not tag_request.allow_prefix: + raise NotImplementedError("Cannot disable prefix match for now") + + db_tags = get_tags_by_value_prefix_for_source_types( + tag_value_prefix=tag_request.match_pattern, + sources=tag_request.sources, + db_session=db_session, + ) + server_tags = [ + SourceTag( + tag_key=db_tag.tag_key, tag_value=db_tag.tag_value, source=db_tag.source + ) + for db_tag in db_tags + ] + return TagResponse(tags=server_tags) + + @basic_router.post("/search-intent") def get_search_type( simple_query: SimpleQueryRequest, _: User = Depends(current_user) diff --git a/backend/scripts/simulate_chat_frontend.py b/backend/scripts/simulate_chat_frontend.py index 51c077d29..2344c166d 100644 --- a/backend/scripts/simulate_chat_frontend.py +++ b/backend/scripts/simulate_chat_frontend.py @@ -30,7 +30,11 @@ def send_chat_message( "chat_session_id": chat_session_id, "parent_message_id": parent_message, "prompt_id": 0, # Global default Prompt - "retrieval_options": {"run_search": "always", "real_time": True}, + "retrieval_options": { + "run_search": "always", + "real_time": True, + "filters": {"tags": []}, + }, } docs: list[dict] | None = None
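
Taken together, a document's metadata now flows both into Postgres as Tag rows and into Vespa's metadata_list field as flattened strings. A minimal sketch of the flattening this patch performs, mirroring DocumentBase.get_metadata_str_attributes (the sample metadata dict is made up for illustration):

INDEX_SEPARATOR = "==="  # from danswer/configs/constants.py


def metadata_to_attributes(metadata: dict[str, str | list[str]]) -> list[str]:
    # One combined "key===value" string per value, for efficient attribute filtering
    attributes: list[str] = []
    for k, v in metadata.items():
        if isinstance(v, list):
            attributes.extend(k + INDEX_SEPARATOR + vi for vi in v)
        else:
            attributes.append(k + INDEX_SEPARATOR + v)
    return attributes


# Example: a Jira issue with two labels and a status
print(metadata_to_attributes({"label": ["backend", "search"], "status": "Done"}))
# -> ['label===backend', 'label===search', 'status===Done']

At query time, these same "key===value" strings are what _build_vespa_filters matches against when a search request carries tags filters.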