Metadata and Title Search (#903)

Yuhong Sun 2024-01-02 11:25:50 -08:00 committed by GitHub
parent 615bb7b095
commit d7141df5fc
38 changed files with 639 additions and 162 deletions

View File

@ -0,0 +1,61 @@
"""Tags
Revision ID: 904e5138fffb
Revises: 891cd83c87a8
Create Date: 2024-01-01 10:44:43.733974
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = "904e5138fffb"
down_revision = "891cd83c87a8"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.create_table(
"tag",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("tag_key", sa.String(), nullable=False),
sa.Column("tag_value", sa.String(), nullable=False),
sa.Column("source", sa.String(), nullable=False),
sa.PrimaryKeyConstraint("id"),
sa.UniqueConstraint(
"tag_key", "tag_value", "source", name="_tag_key_value_source_uc"
),
)
op.create_table(
"document__tag",
sa.Column("document_id", sa.String(), nullable=False),
sa.Column("tag_id", sa.Integer(), nullable=False),
sa.ForeignKeyConstraint(
["document_id"],
["document.id"],
),
sa.ForeignKeyConstraint(
["tag_id"],
["tag.id"],
),
sa.PrimaryKeyConstraint("document_id", "tag_id"),
)
op.add_column(
"search_doc",
sa.Column(
"doc_metadata",
postgresql.JSONB(astext_type=sa.Text()),
nullable=True,
),
)
op.execute("UPDATE search_doc SET doc_metadata = '{}' WHERE doc_metadata IS NULL")
op.alter_column("search_doc", "doc_metadata", nullable=False)
def downgrade() -> None:
op.drop_table("document__tag")
op.drop_table("tag")
op.drop_column("search_doc", "doc_metadata")
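For reference, a minimal sketch of applying or rolling back this revision programmatically via Alembic's command API (the alembic.ini path is an assumption about a standard setup; adjust for your environment):

from alembic import command
from alembic.config import Config

alembic_cfg = Config("alembic.ini")  # path is an assumption

# Apply the Tags revision (and everything before it)
command.upgrade(alembic_cfg, "904e5138fffb")

# Roll back to the previous revision if needed
command.downgrade(alembic_cfg, "891cd83c87a8")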

View File

@ -59,7 +59,14 @@ if os.environ.get("EDIT_KEYWORD_QUERY"):
else:
EDIT_KEYWORD_QUERY = not os.environ.get("DOCUMENT_ENCODER_MODEL")
# Weighting factor between Vector and Keyword Search, 1 for completely vector search
HYBRID_ALPHA = max(0, min(1, float(os.environ.get("HYBRID_ALPHA") or 0.6)))
HYBRID_ALPHA = max(0, min(1, float(os.environ.get("HYBRID_ALPHA") or 0.66)))
# Weighting factor between Title and Content of documents during search, 1 for completely
# Title based. The default heavily favors Content because the Title is also included at the top of
# the Content. This avoids cases where the Content is highly relevant but the match would be missed
# if the Title were weighted separately. The Title acts more as a "boost" than a separate field.
TITLE_CONTENT_RATIO = max(
0, min(1, float(os.environ.get("TITLE_CONTENT_RATIO") or 0.20))
)
# A list of languages passed to the LLM to rephrase the query
# For example "English,French,Spanish"; be sure to use "," as the separator
MULTILINGUAL_QUERY_EXPANSION = os.environ.get("MULTILINGUAL_QUERY_EXPANSION") or None
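As a rough illustration of how HYBRID_ALPHA and TITLE_CONTENT_RATIO interact, here is a sketch of the blended score (score normalization and the boost/recency multipliers are omitted; the Vespa rank profile later in this commit is authoritative):

def blended_score(
    vec_title: float,
    vec_content: float,
    bm25_title: float,
    bm25_content: float,
    alpha: float = 0.66,        # HYBRID_ALPHA: vector vs. keyword
    title_ratio: float = 0.20,  # TITLE_CONTENT_RATIO: title vs. content
) -> float:
    # Each modality is itself a title/content blend
    vector_score = title_ratio * vec_title + (1 - title_ratio) * vec_content
    keyword_score = title_ratio * bm25_title + (1 - title_ratio) * bm25_content
    # HYBRID_ALPHA then weights vector search against keyword (BM-25) search
    return alpha * vector_score + (1 - alpha) * keyword_score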

View File

@ -11,11 +11,13 @@ SEMANTIC_IDENTIFIER = "semantic_identifier"
TITLE = "title"
SECTION_CONTINUATION = "section_continuation"
EMBEDDINGS = "embeddings"
TITLE_EMBEDDING = "title_embedding"
ALLOWED_USERS = "allowed_users"
ACCESS_CONTROL_LIST = "access_control_list"
DOCUMENT_SETS = "document_sets"
TIME_FILTER = "time_filter"
METADATA = "metadata"
METADATA_LIST = "metadata_list"
MATCH_HIGHLIGHTS = "match_highlights"
# stored in the `metadata` of a chunk. Used to signify that this chunk should
# not be used for QA. For example, Google Drive file types which can't be parsed
@ -38,6 +40,12 @@ SESSION_KEY = "session"
QUERY_EVENT_ID = "query_event_id"
LLM_CHUNKS = "llm_chunks"
# For chunking/processing chunks
TITLE_SEPARATOR = "\n\r\n"
SECTION_SEPARATOR = "\n\n"
# For combining attributes, doesn't have to be unique/perfect to work
INDEX_SEPARATOR = "==="
class DocumentSource(str, Enum):
# Special case, document passed in via Danswer APIs without specifying a source type

View File

@ -8,6 +8,7 @@ from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.bookstack.client import BookStackApiClient
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
@ -72,13 +73,21 @@ class BookstackConnector(LoadConnector, PollConnector):
bookstack_client: BookStackApiClient, book: dict[str, Any]
) -> Document:
url = bookstack_client.build_app_url("/books/" + str(book.get("slug")))
title = str(book.get("name", ""))
text = book.get("name", "") + "\n" + book.get("description", "")
updated_at_str = (
str(book.get("updated_at")) if book.get("updated_at") is not None else None
)
return Document(
id="book:" + str(book.get("id")),
id="book__" + str(book.get("id")),
sections=[Section(link=url, text=text)],
source=DocumentSource.BOOKSTACK,
semantic_identifier="Book: " + str(book.get("name")),
metadata={"type": "book", "updated_at": str(book.get("updated_at"))},
semantic_identifier="Book: " + title,
title=title,
doc_updated_at=time_str_to_utc(updated_at_str)
if updated_at_str is not None
else None,
metadata={"type": "book"},
)
@staticmethod
@ -91,13 +100,23 @@ class BookstackConnector(LoadConnector, PollConnector):
+ "/chapter/"
+ str(chapter.get("slug"))
)
title = str(chapter.get("name", ""))
text = chapter.get("name", "") + "\n" + chapter.get("description", "")
updated_at_str = (
str(chapter.get("updated_at"))
if chapter.get("updated_at") is not None
else None
)
return Document(
id="chapter:" + str(chapter.get("id")),
id="chapter__" + str(chapter.get("id")),
sections=[Section(link=url, text=text)],
source=DocumentSource.BOOKSTACK,
semantic_identifier="Chapter: " + str(chapter.get("name")),
metadata={"type": "chapter", "updated_at": str(chapter.get("updated_at"))},
semantic_identifier="Chapter: " + title,
title=title,
doc_updated_at=time_str_to_utc(updated_at_str)
if updated_at_str is not None
else None,
metadata={"type": "chapter"},
)
@staticmethod
@ -105,13 +124,23 @@ class BookstackConnector(LoadConnector, PollConnector):
bookstack_client: BookStackApiClient, shelf: dict[str, Any]
) -> Document:
url = bookstack_client.build_app_url("/shelves/" + str(shelf.get("slug")))
title = str(shelf.get("name", ""))
text = shelf.get("name", "") + "\n" + shelf.get("description", "")
updated_at_str = (
str(shelf.get("updated_at"))
if shelf.get("updated_at") is not None
else None
)
return Document(
id="shelf:" + str(shelf.get("id")),
sections=[Section(link=url, text=text)],
source=DocumentSource.BOOKSTACK,
semantic_identifier="Shelf: " + str(shelf.get("name")),
metadata={"type": "shelf", "updated_at": shelf.get("updated_at")},
semantic_identifier="Shelf: " + title,
title=title,
doc_updated_at=time_str_to_utc(updated_at_str)
if updated_at_str is not None
else None,
metadata={"type": "shelf"},
)
@staticmethod
@ -119,7 +148,7 @@ class BookstackConnector(LoadConnector, PollConnector):
bookstack_client: BookStackApiClient, page: dict[str, Any]
) -> Document:
page_id = str(page.get("id"))
page_name = str(page.get("name"))
title = str(page.get("name", ""))
page_data = bookstack_client.get("/pages/" + page_id, {})
url = bookstack_client.build_app_url(
"/books/"
@ -127,17 +156,24 @@ class BookstackConnector(LoadConnector, PollConnector):
+ "/page/"
+ str(page_data.get("slug"))
)
page_html = (
"<h1>" + html.escape(page_name) + "</h1>" + str(page_data.get("html"))
)
page_html = "<h1>" + html.escape(title) + "</h1>" + str(page_data.get("html"))
text = parse_html_page_basic(page_html)
updated_at_str = (
str(page_data.get("updated_at"))
if page_data.get("updated_at") is not None
else None
)
time.sleep(0.1)
return Document(
id="page:" + page_id,
sections=[Section(link=url, text=text)],
source=DocumentSource.BOOKSTACK,
semantic_identifier="Page: " + str(page_name),
metadata={"type": "page", "updated_at": page_data.get("updated_at")},
semantic_identifier="Page: " + str(title),
title=str(title),
doc_updated_at=time_str_to_utc(updated_at_str)
if updated_at_str is not None
else None,
metadata={"type": "page"},
)
def load_from_state(self) -> GenerateDocumentsOutput:
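The connector now routes BookStack timestamps through time_str_to_utc instead of storing raw strings in metadata. A sketch of the expected behavior (the timestamp value is invented and the helper is assumed to return a timezone-aware UTC datetime):

from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc

# Example timestamp shape; the real value comes from the BookStack API
dt = time_str_to_utc("2024-01-01T10:44:43.000000Z")
assert dt.tzinfo is not None  # expected to be UTC-aware for doc_updated_at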

View File

@ -333,11 +333,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
if not page_html:
logger.debug("Page is empty, skipping: %s", page_url)
continue
page_text = (
page.get("title", "")
+ "\n"
+ parse_html_page(page_html, self.confluence_client)
)
page_text = parse_html_page(page_html, self.confluence_client)
comments_text = self._fetch_comments(self.confluence_client, page_id)
page_text += comments_text

View File

@ -3,16 +3,17 @@ from datetime import timezone
from typing import Any
from urllib.parse import urlparse
from dateutil.parser import parse
from jira import JIRA
from jira.resources import Issue
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
@ -60,26 +61,32 @@ def fetch_jira_issues_batch(
logger.warning(f"Found Jira object not of type Issue {jira}")
continue
ticket_updated_time = parse(jira.fields.updated)
semantic_rep = (
f"Jira Ticket Summary: {jira.fields.summary}\n"
f"Description: {jira.fields.description}\n"
+ "\n".join(
[f"Comment: {comment.body}" for comment in jira.fields.comment.comments]
)
semantic_rep = f"{jira.fields.description}\n" + "\n".join(
[f"Comment: {comment.body}" for comment in jira.fields.comment.comments]
)
page_url = f"{jira_client.client_info()}/browse/{jira.key}"
author = None
try:
author = BasicExpertInfo(
display_name=jira.fields.creator.displayName,
email=jira.fields.creator.emailAddress,
)
except Exception:
# Author should exist but if not, doesn't matter
pass
doc_batch.append(
Document(
id=page_url,
sections=[Section(link=page_url, text=semantic_rep)],
source=DocumentSource.JIRA,
semantic_identifier=jira.fields.summary,
doc_updated_at=ticket_updated_time.astimezone(timezone.utc),
metadata={},
doc_updated_at=time_str_to_utc(jira.fields.updated),
primary_owners=[author] if author is not None else None,
# TODO add secondary_owners if needed
metadata={"label": jira.fields.labels} if jira.fields.labels else {},
)
)
return doc_batch, len(batch)

View File

@ -140,11 +140,7 @@ class Document360Connector(LoadConnector, PollConnector):
html_content = article_details["html_content"]
article_content = parse_html_page_basic(html_content)
doc_text = (
f"workspace: {self.workspace}\n"
f"category: {article['category_name']}\n"
f"article: {article_details['title']} - "
f"{article_details.get('description', '')}\n"
f"{article_content}"
f"{article_details.get('description', '')}\n{article_content}".strip()
)
document = Document(
@ -154,7 +150,10 @@ class Document360Connector(LoadConnector, PollConnector):
semantic_identifier=article_details["title"],
doc_updated_at=updated_at,
primary_owners=authors,
metadata={},
metadata={
"workspace": self.workspace,
"category": article["category_name"],
},
)
doc_batch.append(document)

View File

@ -37,10 +37,9 @@ def _batch_github_objects(
def _convert_pr_to_document(pull_request: PullRequest) -> Document:
full_context = f"Pull-Request {pull_request.title}\n{pull_request.body}"
return Document(
id=pull_request.html_url,
sections=[Section(link=pull_request.html_url, text=full_context)],
sections=[Section(link=pull_request.html_url, text=pull_request.body or "")],
source=DocumentSource.GITHUB,
semantic_identifier=pull_request.title,
# updated_at is UTC time but is timezone unaware, explicitly add UTC
@ -48,7 +47,7 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document:
# due to local time discrepancies with UTC
doc_updated_at=pull_request.updated_at.replace(tzinfo=timezone.utc),
metadata={
"merged": pull_request.merged,
"merged": str(pull_request.merged),
"state": pull_request.state,
},
)
@ -60,10 +59,9 @@ def _fetch_issue_comments(issue: Issue) -> str:
def _convert_issue_to_document(issue: Issue) -> Document:
full_context = f"Issue {issue.title}\n{issue.body}"
return Document(
id=issue.html_url,
sections=[Section(link=issue.html_url, text=full_context)],
sections=[Section(link=issue.html_url, text=issue.body or "")],
source=DocumentSource.GITHUB,
semantic_identifier=issue.title,
# updated_at is UTC time but is timezone unaware

View File

@ -206,9 +206,6 @@ class GongConnector(LoadConnector, PollConnector):
speaker_to_name: dict[str, str] = {}
transcript_text = ""
if call_title:
transcript_text += f"Call Title: {call_title}\n\n"
call_purpose = call_metadata["purpose"]
if call_purpose:
transcript_text += f"Call Description: {call_purpose}\n\n"
@ -234,6 +231,11 @@ class GongConnector(LoadConnector, PollConnector):
)
transcript_text += f"{speaker_name}: {monolog}\n\n"
metadata = {}
if call_metadata.get("system"):
metadata["client"] = call_metadata.get("system")
# TODO calls have a clientUniqueId field, can pull that in later
doc_batch.append(
Document(
id=call_id,
@ -246,7 +248,7 @@ class GongConnector(LoadConnector, PollConnector):
doc_updated_at=datetime.fromisoformat(call_time_str).astimezone(
timezone.utc
),
metadata={},
metadata={"client": call_metadata.get("system")},
)
)
yield doc_batch

View File

@ -466,24 +466,20 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
doc_batch = []
for file in files_batch:
try:
text_contents = extract_text(file, service)
if text_contents:
full_context = file["name"] + " - " + text_contents
else:
full_context = file["name"]
text_contents = extract_text(file, service) or ""
doc_batch.append(
Document(
id=file["webViewLink"],
sections=[
Section(link=file["webViewLink"], text=full_context)
Section(link=file["webViewLink"], text=text_contents)
],
source=DocumentSource.GOOGLE_DRIVE,
semantic_identifier=file["name"],
doc_updated_at=datetime.fromisoformat(
file["modifiedTime"]
).astimezone(timezone.utc),
metadata={} if text_contents else {IGNORE_FOR_QA: True},
metadata={} if text_contents else {IGNORE_FOR_QA: "True"},
)
)
except Exception as e:

View File

@ -77,7 +77,7 @@ class GuruConnector(LoadConnector, PollConnector):
for card in cards:
title = card["preferredPhrase"]
link = GURU_CARDS_URL + card["slug"]
content_text = title + "\n" + parse_html_page_basic(card["content"])
content_text = parse_html_page_basic(card["content"])
last_updated = time_str_to_utc(card["lastModified"])
last_verified = (
time_str_to_utc(card.get("lastVerified"))

View File

@ -73,7 +73,7 @@ class HubSpotConnector(LoadConnector, PollConnector):
title = ticket.properties["subject"]
link = self.ticket_base_url + ticket.id
content_text = title + "\n" + ticket.properties["content"]
content_text = ticket.properties["content"]
associated_emails: list[str] = []
associated_notes: list[str] = []

View File

@ -8,6 +8,7 @@ import requests
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
@ -30,7 +31,6 @@ def _make_query(request_body: dict[str, Any], api_key: str) -> requests.Response
"Content-Type": "application/json",
}
response: requests.Response | None = None
for i in range(_NUM_RETRIES):
try:
response = requests.post(
@ -187,8 +187,8 @@ class LinearConnector(LoadConnector, PollConnector):
],
source=DocumentSource.LINEAR,
semantic_identifier=node["identifier"],
doc_updated_at=time_str_to_utc(node["updatedAt"]),
metadata={
"updated_at": node["updatedAt"],
"team": node["team"]["name"],
},
)

View File

@ -1,10 +1,10 @@
from datetime import datetime
from enum import Enum
from typing import Any
from pydantic import BaseModel
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import INDEX_SEPARATOR
from danswer.utils.text_processing import make_url_compatible
@ -50,21 +50,38 @@ class DocumentBase(BaseModel):
sections: list[Section]
source: DocumentSource | None = None
semantic_identifier: str # displayed in the UI as the main identifier for the doc
metadata: dict[str, Any]
metadata: dict[str, str | list[str]]
# UTC time
doc_updated_at: datetime | None = None
# Owner, creator, etc.
primary_owners: list[BasicExpertInfo] | None = None
# Assignee, space owner, etc.
secondary_owners: list[BasicExpertInfo] | None = None
# `title` is used when computing best matches for a query
# if `None`, then we will use the `semantic_identifier` as the title in Vespa
# title is used for search whereas semantic_identifier is used for displaying in the UI
# They differ because a Slack message may display as #general, but "general" should not be part
# of the search the way a document title would be for a source like Confluence
# The title defaults to the semantic_identifier unless otherwise specified
title: str | None = None
from_ingestion_api: bool = False
def get_title_for_document_index(self) -> str:
def get_title_for_document_index(self) -> str | None:
# If title is explicitly empty, return a None here for embedding purposes
if self.title == "":
return None
return self.semantic_identifier if self.title is None else self.title
def get_metadata_str_attributes(self) -> list[str] | None:
if not self.metadata:
return None
# Combined string for the key/value for easy filtering
attributes: list[str] = []
for k, v in self.metadata.items():
if isinstance(v, list):
attributes.extend([k + INDEX_SEPARATOR + vi for vi in v])
else:
attributes.append(k + INDEX_SEPARATOR + v)
return attributes
class Document(DocumentBase):
id: str # This must be unique or during indexing/reindexing, chunks will be overwritten
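A quick sketch of what the new get_metadata_str_attributes helper produces for the metadata shapes connectors now emit (the document values are invented; the model is assumed importable from danswer.connectors.models per the imports above):

from danswer.connectors.models import DocumentBase

doc = DocumentBase(
    sections=[],
    semantic_identifier="Book: Onboarding",
    metadata={"type": "book", "labels": ["hr", "eng"]},
)

# Each key/value pair is flattened with INDEX_SEPARATOR ("===") for Vespa-side filtering
assert doc.get_metadata_str_attributes() == ["type===book", "labels===hr", "labels===eng"]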

View File

@ -267,7 +267,8 @@ class NotionConnector(LoadConnector, PollConnector):
yield (
Document(
id=page.id,
sections=[Section(link=page.url, text=f"{page_title}\n")]
# Will add title to the first section later in processing
sections=[Section(link=page.url, text="")]
+ [
Section(
link=f"{page.url}#{block_id.replace('-', '')}",

View File

@ -14,6 +14,7 @@ from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_st
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger
@ -94,26 +95,24 @@ class ProductboardConnector(PollConnector):
for feature in self._fetch_documents(
initial_link=f"{_PRODUCT_BOARD_BASE_URL}/features"
):
owner = self._get_owner_email(feature)
experts = [BasicExpertInfo(email=owner)] if owner else None
yield Document(
id=feature["id"],
sections=[
Section(
link=feature["links"]["html"],
text=" - ".join(
(
feature["name"],
self._parse_description_html(feature["description"]),
)
),
text=self._parse_description_html(feature["description"]),
)
],
semantic_identifier=feature["name"],
source=DocumentSource.PRODUCTBOARD,
doc_updated_at=time_str_to_utc(feature["updatedAt"]),
primary_owners=experts,
metadata={
"productboard_entity_type": feature["type"],
"entity_type": feature["type"],
"status": feature["status"]["name"],
"owner": self._get_owner_email(feature),
},
)
@ -122,25 +121,23 @@ class ProductboardConnector(PollConnector):
for component in self._fetch_documents(
initial_link=f"{_PRODUCT_BOARD_BASE_URL}/components"
):
owner = self._get_owner_email(component)
experts = [BasicExpertInfo(email=owner)] if owner else None
yield Document(
id=component["id"],
sections=[
Section(
link=component["links"]["html"],
text=" - ".join(
(
component["name"],
self._parse_description_html(component["description"]),
)
),
text=self._parse_description_html(component["description"]),
)
],
semantic_identifier=component["name"],
source=DocumentSource.PRODUCTBOARD,
doc_updated_at=time_str_to_utc(component["updatedAt"]),
primary_owners=experts,
metadata={
"productboard_entity_type": "component",
"owner": self._get_owner_email(component),
"entity_type": "component",
},
)
@ -150,25 +147,23 @@ class ProductboardConnector(PollConnector):
for product in self._fetch_documents(
initial_link=f"{_PRODUCT_BOARD_BASE_URL}/products"
):
owner = self._get_owner_email(product)
experts = [BasicExpertInfo(email=owner)] if owner else None
yield Document(
id=product["id"],
sections=[
Section(
link=product["links"]["html"],
text=" - ".join(
(
product["name"],
self._parse_description_html(product["description"]),
)
),
text=self._parse_description_html(product["description"]),
)
],
semantic_identifier=product["name"],
source=DocumentSource.PRODUCTBOARD,
doc_updated_at=time_str_to_utc(product["updatedAt"]),
primary_owners=experts,
metadata={
"productboard_entity_type": "product",
"owner": self._get_owner_email(product),
"entity_type": "product",
},
)
@ -176,26 +171,24 @@ class ProductboardConnector(PollConnector):
for objective in self._fetch_documents(
initial_link=f"{_PRODUCT_BOARD_BASE_URL}/objectives"
):
owner = self._get_owner_email(objective)
experts = [BasicExpertInfo(email=owner)] if owner else None
yield Document(
id=objective["id"],
sections=[
Section(
link=objective["links"]["html"],
text=" - ".join(
(
objective["name"],
self._parse_description_html(objective["description"]),
)
),
text=self._parse_description_html(objective["description"]),
)
],
semantic_identifier=objective["name"],
source=DocumentSource.PRODUCTBOARD,
doc_updated_at=time_str_to_utc(objective["updatedAt"]),
primary_owners=experts,
metadata={
"productboard_entity_type": "release",
"entity_type": "release",
"state": objective["state"],
"owner": self._get_owner_email(objective),
},
)

View File

@ -97,7 +97,8 @@ class RequestTrackerConnector(PollConnector):
logger.info(f"Processing ticket {tid}")
doc = Document(
id=ticket["id"],
sections=[Section(link=ticketLink, text=f"{ticket['Subject']}\n")]
# Will add title to the first section later in processing
sections=[Section(link=ticketLink, text="")]
+ self.build_doc_sections_from_txn(Rt0, tid),
source=DocumentSource.REQUESTTRACKER,
semantic_identifier=ticket["Subject"],

View File

@ -642,6 +642,7 @@ def create_db_search_doc(
source_type=server_search_doc.source_type,
boost=server_search_doc.boost,
hidden=server_search_doc.hidden,
doc_metadata=server_search_doc.metadata,
score=server_search_doc.score,
match_highlights=server_search_doc.match_highlights,
updated_at=server_search_doc.updated_at,
@ -674,6 +675,7 @@ def translate_db_search_doc_to_server_search_doc(
source_type=db_search_doc.source_type,
boost=db_search_doc.boost,
hidden=db_search_doc.hidden,
metadata=db_search_doc.doc_metadata,
score=db_search_doc.score,
match_highlights=db_search_doc.match_highlights,
updated_at=db_search_doc.updated_at,

View File

@ -17,6 +17,7 @@ from danswer.db.models import ConnectorCredentialPair
from danswer.db.models import Credential
from danswer.db.models import Document as DbDocument
from danswer.db.models import DocumentByConnectorCredentialPair
from danswer.db.tag import delete_document_tags_for_documents
from danswer.db.utils import model_to_dict
from danswer.document_index.interfaces import DocumentMetadata
from danswer.server.documents.models import ConnectorCredentialPairIdentifier
@ -272,6 +273,7 @@ def delete_documents_complete(db_session: Session, document_ids: list[str]) -> N
delete_document_feedback_for_documents(
document_ids=document_ids, db_session=db_session
)
delete_document_tags_for_documents(document_ids=document_ids, db_session=db_session)
delete_documents(db_session, document_ids)
db_session.commit()

View File

@ -22,6 +22,7 @@ from sqlalchemy import Integer
from sqlalchemy import Sequence
from sqlalchemy import String
from sqlalchemy import Text
from sqlalchemy import UniqueConstraint
from sqlalchemy.dialects import postgresql
from sqlalchemy.orm import DeclarativeBase
from sqlalchemy.orm import Mapped
@ -153,6 +154,15 @@ class ChatMessage__SearchDoc(Base):
)
class Document__Tag(Base):
__tablename__ = "document__tag"
document_id: Mapped[str] = mapped_column(
ForeignKey("document.id"), primary_key=True
)
tag_id: Mapped[int] = mapped_column(ForeignKey("tag.id"), primary_key=True)
"""
Documents/Indexing Tables
"""
@ -247,6 +257,32 @@ class Document(Base):
retrieval_feedbacks: Mapped[List["DocumentRetrievalFeedback"]] = relationship(
"DocumentRetrievalFeedback", back_populates="document"
)
tags = relationship(
"Tag",
secondary="document__tag",
back_populates="documents",
)
class Tag(Base):
__tablename__ = "tag"
id: Mapped[int] = mapped_column(primary_key=True)
tag_key: Mapped[str] = mapped_column(String)
tag_value: Mapped[str] = mapped_column(String)
source: Mapped[DocumentSource] = mapped_column(Enum(DocumentSource))
documents = relationship(
"Document",
secondary="document__tag",
back_populates="tags",
)
__table_args__ = (
UniqueConstraint(
"tag_key", "tag_value", "source", name="_tag_key_value_source_uc"
),
)
class Connector(Base):
@ -424,6 +460,7 @@ class SearchDoc(Base):
boost: Mapped[int] = mapped_column(Integer)
source_type: Mapped[DocumentSource] = mapped_column(Enum(DocumentSource))
hidden: Mapped[bool] = mapped_column(Boolean)
doc_metadata: Mapped[dict[str, str | list[str]]] = mapped_column(postgresql.JSONB())
score: Mapped[float] = mapped_column(Float)
match_highlights: Mapped[list[str]] = mapped_column(postgresql.ARRAY(String))
# This is for the document, not this row in the table

backend/danswer/db/tag.py (new file, 116 lines)
View File

@ -0,0 +1,116 @@
from sqlalchemy import delete
from sqlalchemy import func
from sqlalchemy import select
from sqlalchemy.orm import Session
from danswer.configs.constants import DocumentSource
from danswer.db.models import Document
from danswer.db.models import Document__Tag
from danswer.db.models import Tag
from danswer.utils.logger import setup_logger
logger = setup_logger()
def create_or_add_document_tag(
tag_key: str,
tag_value: str,
source: DocumentSource,
document_id: str,
db_session: Session,
) -> Tag:
document = db_session.get(Document, document_id)
if not document:
raise ValueError("Invalid Document, cannot attach Tags")
tag_stmt = select(Tag).where(
Tag.tag_key == tag_key,
Tag.tag_value == tag_value,
Tag.source == source,
)
tag = db_session.execute(tag_stmt).scalar_one_or_none()
if not tag:
tag = Tag(tag_key=tag_key, tag_value=tag_value, source=source)
db_session.add(tag)
if tag not in document.tags:
document.tags.append(tag)
db_session.commit()
return tag
def create_or_add_document_tag_list(
tag_key: str,
tag_values: list[str],
source: DocumentSource,
document_id: str,
db_session: Session,
) -> list[Tag]:
document = db_session.get(Document, document_id)
if not document:
raise ValueError("Invalid Document, cannot attach Tags")
existing_tags_stmt = select(Tag).where(
Tag.tag_key == tag_key, Tag.tag_value.in_(tag_values), Tag.source == source
)
existing_tags = list(db_session.execute(existing_tags_stmt).scalars().all())
existing_tag_values = {tag.tag_value for tag in existing_tags}
new_tags = []
for tag_value in tag_values:
if tag_value not in existing_tag_values:
new_tag = Tag(tag_key=tag_key, tag_value=tag_value, source=source)
db_session.add(new_tag)
new_tags.append(new_tag)
all_tags = existing_tags + new_tags
for tag in all_tags:
if tag not in document.tags:
document.tags.append(tag)
db_session.commit()
return all_tags
def get_tags_by_value_prefix_for_source_types(
tag_value_prefix: str | None,
sources: list[DocumentSource] | None,
db_session: Session,
) -> list[Tag]:
query = select(Tag)
if tag_value_prefix:
query = query.where(Tag.tag_value.startswith(tag_value_prefix))
if sources:
query = query.where(Tag.source.in_(sources))
result = db_session.execute(query)
tags = result.scalars().all()
return list(tags)
def delete_document_tags_for_documents(
document_ids: list[str], db_session: Session
) -> None:
stmt = delete(Document__Tag).where(Document__Tag.document_id.in_(document_ids))
db_session.execute(stmt)
db_session.commit()
orphan_tags_query = (
select(Tag.id)
.outerjoin(Document__Tag, Tag.id == Document__Tag.tag_id)
.group_by(Tag.id)
.having(func.count(Document__Tag.document_id) == 0)
)
orphan_tags = db_session.execute(orphan_tags_query).scalars().all()
if orphan_tags:
delete_orphan_tags_stmt = delete(Tag).where(Tag.id.in_(orphan_tags))
db_session.execute(delete_orphan_tags_stmt)
db_session.commit()
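A minimal usage sketch of the new tag helpers (the engine setup and document id are assumptions; the document must already exist in Postgres or a ValueError is raised):

from sqlalchemy.orm import Session

from danswer.configs.constants import DocumentSource
from danswer.db.engine import get_sqlalchemy_engine
from danswer.db.tag import (
    create_or_add_document_tag_list,
    get_tags_by_value_prefix_for_source_types,
)

with Session(get_sqlalchemy_engine()) as db_session:
    # Attach a multi-valued tag to an already-indexed document (id is illustrative)
    create_or_add_document_tag_list(
        tag_key="labels",
        tag_values=["hr", "eng"],
        source=DocumentSource.BOOKSTACK,
        document_id="book__123",
        db_session=db_session,
    )

    # Prefix search over tag values, optionally restricted to certain sources
    tags = get_tags_by_value_prefix_for_source_types(
        tag_value_prefix="en",
        sources=[DocumentSource.BOOKSTACK],
        db_session=db_session,
    )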

View File

@ -7,12 +7,20 @@ schema danswer_chunk {
field chunk_id type int {
indexing: summary | attribute
}
field blurb type string {
# Displayed in the UI as the main identifier for the doc
field semantic_identifier type string {
indexing: summary | attribute
}
# Can separate out title in the future and give heavier bm-25 weighting
# Need to consider that not every doc has a separable title (i.e. a Slack message)
# Set summary options to enable bolding
# May not always match the `semantic_identifier` e.g. for Slack docs the
# `semantic_identifier` will be the channel name, but the `title` will be empty
field title type string {
indexing: summary | index
match {
gram
gram-size: 3
}
index: enable-bm25
}
field content type string {
indexing: summary | index
match {
@ -28,6 +36,25 @@ schema danswer_chunk {
indexing: summary | index
summary: dynamic
}
# Title embedding (x1)
field title_embedding type tensor<float>(x[384]) {
indexing: attribute
attribute {
distance-metric: angular
}
}
# Content embeddings (chunk + optional mini chunks embeddings)
# "t" and "x" are arbitrary names, not special keywords
field embeddings type tensor<float>(t{},x[384]) {
indexing: attribute
attribute {
distance-metric: angular
}
}
# Starting section of the doc, currently unused as it has been replaced by match highlighting
field blurb type string {
indexing: summary | attribute
}
# https://docs.vespa.ai/en/attributes.html potential enum store for speed, but probably not worth it
field source_type type string {
indexing: summary | attribute
@ -39,21 +66,6 @@ schema danswer_chunk {
field source_links type string {
indexing: summary | attribute
}
# displayed in the UI as the main identifier for the doc
field semantic_identifier type string {
indexing: summary | attribute
}
# this is used when computing best matches based on the title of the document
# may not always match the `semantic_identifier` e.g. for Slack docs the
# `semantic_identifier` will be the channel name, but the `title` will be empty
field title type string {
indexing: summary | index
match {
gram
gram-size: 3
}
index: enable-bm25
}
field section_continuation type bool {
indexing: summary | attribute
}
@ -65,15 +77,15 @@ schema danswer_chunk {
indexing: summary | attribute
rank: filter
}
# Needs to have a separate Attribute list for efficient filtering
field metadata_list type array<string> {
indexing: summary | attribute
rank:filter
attribute: fast-search
}
field metadata type string {
indexing: summary | attribute
}
field embeddings type tensor<float>(t{},x[384]) {
indexing: attribute
attribute {
distance-metric: angular
}
}
field doc_updated_at type int {
indexing: summary | attribute
}
@ -95,6 +107,11 @@ schema danswer_chunk {
}
}
# If using different tokenization settings, the fieldset has to be removed, and the field must
# be specified in the yql like:
# + 'or ({grammar: "weakAnd", defaultIndex:"title"}userInput(@query)) '
# + 'or ({grammar: "weakAnd", defaultIndex:"content"}userInput(@query)) '
# Note: for BM-25, the ngram size (and whether ngrams are used) changes the range of the scores
fieldset default {
fields: content, title
}
@ -124,6 +141,79 @@ schema danswer_chunk {
match-features: recency_bias
}
rank-profile hybrid_search inherits default, default_rank {
inputs {
query(query_embedding) tensor<float>(x[384])
}
# This must be a separate function for normalize_linear to work
function vector_score() {
expression {
(query(title_content_ratio) * closeness(field, title_embedding)) +
((1 - query(title_content_ratio)) * closeness(field, embeddings))
}
}
# This must be a separate function for normalize_linear to work
function keyword_score() {
expression {
(query(title_content_ratio) * bm25(title)) +
((1 - query(title_content_ratio)) * bm25(content))
}
}
first-phase {
expression: vector_score
}
# Weighted average between Vector Search and BM-25
# Each is a weighted average between the Title and Content fields
# Finally, each doc is boosted by its user-feedback-based boost and recency
# If any embedding or index field is missing, it just receives a score of 0
# Assumptions:
# - For a given query + corpus, the BM-25 scores will be relatively similar in distribution,
# therefore they are not normalized before combining.
# - Documents without a title get a score of 0 for the title component, which is acceptable
# since documents without any title match should be penalized.
global-phase {
expression {
(
# Weighted Vector Similarity Score
(query(alpha) * normalize_linear(vector_score)) +
# Weighted Keyword Similarity Score
((1 - query(alpha)) * normalize_linear(keyword_score))
)
# Boost based on user feedback
* document_boost
# Decay factor based on time document was last updated
* recency_bias
}
rerank-count: 1000
}
match-features {
bm25(title)
bm25(content)
closeness(field, title_embedding)
closeness(field, embeddings)
keyword_score
vector_score
document_boost
recency_bias
closest(embeddings)
}
}
# Used when searching from the admin UI for a specific doc to hide / boost
# Very heavily prioritize title
rank-profile admin_search inherits default, default_rank {
first-phase {
expression: bm25(content) + (5 * bm25(title))
}
}
# THE ONES BELOW ARE OUT OF DATE, DO NOT USE
# THEY MIGHT NOT EVEN WORK AT ALL
rank-profile keyword_search inherits default, default_rank {
first-phase {
expression: bm25(content) * document_boost * recency_bias
@ -145,29 +235,4 @@ schema danswer_chunk {
match-features: recency_bias document_boost closest(embeddings)
}
rank-profile hybrid_search inherits default, default_rank {
inputs {
query(query_embedding) tensor<float>(x[384])
}
first-phase {
expression: closeness(field, embeddings)
}
global-phase {
expression: ((query(alpha) * normalize_linear(closeness(field, embeddings))) + ((1 - query(alpha)) * normalize_linear(bm25(content)))) * document_boost * recency_bias
rerank-count: 1000
}
# Cannot pass normalize_linear features in match-features
match-features: recency_bias document_boost closest(embeddings)
}
# used when searching from the admin UI for a specific doc to hide / boost
rank-profile admin_search inherits default, default_rank {
first-phase {
expression: bm25(content) + (5 * bm25(title))
}
}
}

View File

@ -25,6 +25,7 @@ from danswer.configs.chat_configs import DOC_TIME_DECAY
from danswer.configs.chat_configs import EDIT_KEYWORD_QUERY
from danswer.configs.chat_configs import HYBRID_ALPHA
from danswer.configs.chat_configs import NUM_RETURNED_HITS
from danswer.configs.chat_configs import TITLE_CONTENT_RATIO
from danswer.configs.constants import ACCESS_CONTROL_LIST
from danswer.configs.constants import BLURB
from danswer.configs.constants import BOOST
@ -35,7 +36,9 @@ from danswer.configs.constants import DOCUMENT_ID
from danswer.configs.constants import DOCUMENT_SETS
from danswer.configs.constants import EMBEDDINGS
from danswer.configs.constants import HIDDEN
from danswer.configs.constants import INDEX_SEPARATOR
from danswer.configs.constants import METADATA
from danswer.configs.constants import METADATA_LIST
from danswer.configs.constants import PRIMARY_OWNERS
from danswer.configs.constants import RECENCY_BIAS
from danswer.configs.constants import SECONDARY_OWNERS
@ -44,6 +47,8 @@ from danswer.configs.constants import SEMANTIC_IDENTIFIER
from danswer.configs.constants import SOURCE_LINKS
from danswer.configs.constants import SOURCE_TYPE
from danswer.configs.constants import TITLE
from danswer.configs.constants import TITLE_EMBEDDING
from danswer.configs.constants import TITLE_SEPARATOR
from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF
from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
get_experts_stores_representations,
@ -239,20 +244,25 @@ def _index_vespa_chunk(
for ind, m_c_embed in enumerate(embeddings.mini_chunk_embeddings):
embeddings_name_vector_map[f"mini_chunk_{ind}"] = m_c_embed
title = document.get_title_for_document_index()
vespa_document_fields = {
DOCUMENT_ID: document.id,
CHUNK_ID: chunk.chunk_id,
BLURB: remove_invalid_unicode_chars(chunk.blurb),
# this duplication of `content` is needed for keyword highlighting :(
TITLE: remove_invalid_unicode_chars(title) if title else None,
CONTENT: remove_invalid_unicode_chars(chunk.content),
# This duplication of `content` is needed for keyword highlighting :(
CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content),
SOURCE_TYPE: str(document.source.value),
SOURCE_LINKS: json.dumps(chunk.source_links),
SEMANTIC_IDENTIFIER: remove_invalid_unicode_chars(document.semantic_identifier),
TITLE: remove_invalid_unicode_chars(document.get_title_for_document_index()),
SECTION_CONTINUATION: chunk.section_continuation,
METADATA: json.dumps(document.metadata),
# Save as a list for efficient extraction as an Attribute
METADATA_LIST: chunk.source_document.get_metadata_str_attributes(),
EMBEDDINGS: embeddings_name_vector_map,
TITLE_EMBEDDING: chunk.title_embedding,
BOOST: chunk.boost,
DOC_UPDATED_AT: _vespa_get_updated_at_attribute(document.doc_updated_at),
PRIMARY_OWNERS: get_experts_stores_representations(document.primary_owners),
@ -394,6 +404,12 @@ def _build_vespa_filters(filters: IndexFilters, include_hidden: bool = False) ->
)
filter_str += _build_or_filters(SOURCE_TYPE, source_strs)
tag_attributes = None
tags = filters.tags
if tags:
tag_attributes = [tag.tag_key + INDEX_SEPARATOR + tag.tag_value for tag in tags]
filter_str += _build_or_filters(METADATA_LIST, tag_attributes)
filter_str += _build_or_filters(DOCUMENT_SETS, filters.document_set)
filter_str += _build_time_filter(filters.time_cutoff)
@ -448,6 +464,8 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk:
if DOC_UPDATED_AT in fields
else None
)
# The highlights might include the title, but this is the best way we have so far to show highlighting
match_highlights = _process_dynamic_summary(
# fallback to regular `content` if the `content_summary` field
# isn't present
@ -459,6 +477,13 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk:
f"Chunk with blurb: {fields.get(BLURB, 'Unknown')[:50]}... has no Semantic Identifier"
)
# Remove the title from the first chunk, as every chunk already includes
# its semantic identifier for the LLM
content = fields[CONTENT]
if fields[CHUNK_ID] == 0:
parts = content.split(TITLE_SEPARATOR, maxsplit=1)
content = parts[1] if len(parts) > 1 and "\n" not in parts[0] else content
# A user ran into this; not sure why it could happen, so adding error checking here
blurb = fields.get(BLURB)
if not blurb:
@ -477,7 +502,7 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk:
return InferenceChunk(
chunk_id=fields[CHUNK_ID],
blurb=blurb,
content=fields[CONTENT],
content=content,
source_links=source_links_dict,
section_continuation=fields[SECTION_CONTINUATION],
document_id=fields[DOCUMENT_ID],
@ -725,6 +750,7 @@ class VespaIndex(DocumentIndex):
num_to_retrieve: int = NUM_RETURNED_HITS,
edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
) -> list[InferenceChunk]:
# IMPORTANT: THIS FUNCTION IS NOT UP TO DATE, DOES NOT WORK CORRECTLY
vespa_where_clauses = _build_vespa_filters(filters)
yql = (
VespaIndex.yql_base
@ -759,6 +785,7 @@ class VespaIndex(DocumentIndex):
distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF,
edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
) -> list[InferenceChunk]:
# IMPORTANT: THIS FUNCTION IS NOT UP TO DATE, DOES NOT WORK CORRECTLY
vespa_where_clauses = _build_vespa_filters(filters)
yql = (
VespaIndex.yql_base
@ -798,6 +825,7 @@ class VespaIndex(DocumentIndex):
time_decay_multiplier: float,
num_to_retrieve: int,
hybrid_alpha: float | None = HYBRID_ALPHA,
title_content_ratio: float | None = TITLE_CONTENT_RATIO,
distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF,
edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
) -> list[InferenceChunk]:
@ -808,6 +836,7 @@ class VespaIndex(DocumentIndex):
VespaIndex.yql_base
+ vespa_where_clauses
+ f"(({{targetHits: {target_hits}}}nearestNeighbor(embeddings, query_embedding)) "
+ f"or ({{targetHits: {target_hits}}}nearestNeighbor(title_embedding, query_embedding)) "
+ 'or ({grammar: "weakAnd"}userInput(@query)) '
+ f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
)
@ -828,6 +857,9 @@ class VespaIndex(DocumentIndex):
"input.query(alpha)": hybrid_alpha
if hybrid_alpha is not None
else HYBRID_ALPHA,
"input.query(title_content_ratio)": title_content_ratio
if title_content_ratio is not None
else TITLE_CONTENT_RATIO,
"hits": num_to_retrieve,
"offset": 0,
"ranking.profile": "hybrid_search",

View File

@ -7,15 +7,15 @@ from transformers import AutoTokenizer # type:ignore
from danswer.configs.app_configs import BLURB_SIZE
from danswer.configs.app_configs import CHUNK_OVERLAP
from danswer.configs.app_configs import MINI_CHUNK_SIZE
from danswer.configs.constants import SECTION_SEPARATOR
from danswer.configs.constants import TITLE_SEPARATOR
from danswer.configs.model_configs import CHUNK_SIZE
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.indexing.models import DocAwareChunk
from danswer.search.search_nlp_models import get_default_tokenizer
from danswer.utils.text_processing import shared_precompare_cleanup
SECTION_SEPARATOR = "\n\n"
ChunkFunc = Callable[[Document], list[DocAwareChunk]]
@ -29,7 +29,8 @@ def extract_blurb(text: str, blurb_size: int) -> str:
def chunk_large_section(
section: Section,
section_text: str,
section_link_text: str,
document: Document,
start_chunk_id: int,
tokenizer: AutoTokenizer,
@ -37,8 +38,6 @@ def chunk_large_section(
chunk_overlap: int = CHUNK_OVERLAP,
blurb_size: int = BLURB_SIZE,
) -> list[DocAwareChunk]:
section_text = section.text
section_link_text = section.link or ""
blurb = extract_blurb(section_text, blurb_size)
sentence_aware_splitter = SentenceSplitter(
@ -67,14 +66,18 @@ def chunk_document(
subsection_overlap: int = CHUNK_OVERLAP,
blurb_size: int = BLURB_SIZE,
) -> list[DocAwareChunk]:
title = document.get_title_for_document_index()
title_prefix = title.replace("\n", " ") + TITLE_SEPARATOR if title else ""
tokenizer = get_default_tokenizer()
chunks: list[DocAwareChunk] = []
link_offsets: dict[int, str] = {}
chunk_text = ""
for section in document.sections:
for ind, section in enumerate(document.sections):
section_text = title_prefix + section.text if ind == 0 else section.text
section_link_text = section.link or ""
section_tok_length = len(tokenizer.tokenize(section.text))
section_tok_length = len(tokenizer.tokenize(section_text))
current_tok_length = len(tokenizer.tokenize(chunk_text))
curr_offset_len = len(shared_precompare_cleanup(chunk_text))
@ -96,7 +99,8 @@ def chunk_document(
chunk_text = ""
large_section_chunks = chunk_large_section(
section=section,
section_text=section_text,
section_link_text=section_link_text,
document=document,
start_chunk_id=len(chunks),
tokenizer=tokenizer,
@ -115,7 +119,7 @@ def chunk_document(
<= chunk_tok_size
):
chunk_text += (
SECTION_SEPARATOR + section.text if chunk_text else section.text
SECTION_SEPARATOR + section_text if chunk_text else section_text
)
link_offsets[curr_offset_len] = section_link_text
else:
@ -130,7 +134,7 @@ def chunk_document(
)
)
link_offsets = {0: section_link_text}
chunk_text = section.text
chunk_text = section_text
# Once we hit the end, if we're still in the process of building a chunk, add what we have
if chunk_text:
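To make the title handling concrete, here is a small sketch of what chunk 0's text looks like after chunking and how the Vespa read path shown earlier strips the prefix again at inference time (the title and section text are invented):

TITLE_SEPARATOR = "\n\r\n"  # as defined in danswer.configs.constants

title = "Onboarding Guide"
first_section_text = "Welcome to the team..."

# chunk_document() prefixes the title onto the first section only
chunk_0_text = title + TITLE_SEPARATOR + first_section_text

# _vespa_hit_to_inference_chunk() later removes it for chunk_id == 0
parts = chunk_0_text.split(TITLE_SEPARATOR, maxsplit=1)
content = parts[1] if len(parts) > 1 and "\n" not in parts[0] else chunk_0_text

assert content == first_section_text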

View File

@ -21,6 +21,9 @@ def embed_chunks(
enable_mini_chunk: bool = ENABLE_MINI_CHUNK,
passage_prefix: str = ASYM_PASSAGE_PREFIX,
) -> list[IndexChunk]:
# Cache the Title embeddings so each distinct title only has to be embedded once
title_embed_dict: dict[str, list[float]] = {}
embedded_chunks: list[IndexChunk] = []
if embedding_model is None:
embedding_model = EmbeddingModel()
@ -58,12 +61,24 @@ def embed_chunks(
chunk_embeddings = embeddings[
embedding_ind_start : embedding_ind_start + num_embeddings
]
title = chunk.source_document.get_title_for_document_index()
title_embedding = None
if title:
if title in title_embed_dict:
title_embedding = title_embed_dict[title]
else:
title_embedding = embedding_model.encode([title])[0]
title_embed_dict[title] = title_embedding
new_embedded_chunk = IndexChunk(
**{k: getattr(chunk, k) for k in chunk.__dataclass_fields__},
embeddings=ChunkEmbedding(
full_embedding=chunk_embeddings[0],
mini_chunk_embeddings=chunk_embeddings[1:],
),
title_embedding=title_embedding,
)
embedded_chunks.append(new_embedded_chunk)
embedding_ind_start += num_embeddings

View File

@ -17,6 +17,8 @@ from danswer.db.document import update_docs_updated_at
from danswer.db.document import upsert_documents_complete
from danswer.db.document_set import fetch_document_sets_for_documents
from danswer.db.engine import get_sqlalchemy_engine
from danswer.db.tag import create_or_add_document_tag
from danswer.db.tag import create_or_add_document_tag_list
from danswer.document_index.factory import get_default_document_index
from danswer.document_index.interfaces import DocumentIndex
from danswer.document_index.interfaces import DocumentMetadata
@ -44,6 +46,7 @@ def upsert_documents_in_db(
index_attempt_metadata: IndexAttemptMetadata,
db_session: Session,
) -> None:
# Metadata here refers to basic document info, not metadata about the actual content
doc_m_batch: list[DocumentMetadata] = []
for doc in documents:
first_link = next(
@ -66,6 +69,26 @@ def upsert_documents_in_db(
document_metadata_batch=doc_m_batch,
)
# Insert document content metadata
for doc in documents:
for k, v in doc.metadata.items():
if isinstance(v, list):
create_or_add_document_tag_list(
tag_key=k,
tag_values=v,
source=doc.source,
document_id=doc.id,
db_session=db_session,
)
else:
create_or_add_document_tag(
tag_key=k,
tag_value=v,
source=doc.source,
document_id=doc.id,
db_session=db_session,
)
@log_function_time()
def index_doc_batch(
@ -121,6 +144,8 @@ def index_doc_batch(
)
logger.debug("Starting chunking")
# The first chunk additionally contains the Title of the Document
chunks: list[DocAwareChunk] = list(
chain(*[chunker.chunk(document=document) for document in updatable_docs])
)

View File

@ -1,7 +1,6 @@
from dataclasses import dataclass
from dataclasses import fields
from datetime import datetime
from typing import Any
from danswer.access.models import DocumentAccess
from danswer.configs.constants import DocumentSource
@ -48,6 +47,7 @@ class DocAwareChunk(BaseChunk):
@dataclass
class IndexChunk(DocAwareChunk):
embeddings: ChunkEmbedding
title_embedding: Embedding | None
@dataclass
@ -95,7 +95,7 @@ class InferenceChunk(BaseChunk):
recency_bias: float
score: float | None
hidden: bool
metadata: dict[str, Any]
metadata: dict[str, str | list[str]]
# Matched sections in the chunk. Uses Vespa syntax e.g. <hi>TEXT</hi>
# to specify that a set of words should be highlighted. For example:
# ["<hi>the</hi> <hi>answer</hi> is 42", "he couldn't find an <hi>answer</hi>"]

View File

@ -48,10 +48,16 @@ class Embedder:
raise NotImplementedError
class Tag(BaseModel):
tag_key: str
tag_value: str
class BaseFilters(BaseModel):
source_type: list[DocumentSource] | None = None
document_set: list[str] | None = None
time_cutoff: datetime | None = None
tags: list[Tag] | None = None
class IndexFilters(BaseFilters):
@ -110,6 +116,7 @@ class SearchDoc(BaseModel):
# since a standard search will never find a hidden doc, this can only ever
# be `True` when doing an admin search
hidden: bool
metadata: dict[str, str | list[str]]
score: float | None
# Matched sections in the doc. Uses Vespa syntax e.g. <hi>TEXT</hi>
# to specify that a set of words should be highlighted. For example:
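A small sketch of how a caller expresses a tag filter with the new models (field values are illustrative):

from danswer.configs.constants import DocumentSource
from danswer.search.models import BaseFilters, Tag

filters = BaseFilters(
    source_type=[DocumentSource.BOOKSTACK],
    tags=[Tag(tag_key="type", tag_value="book")],
)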

View File

@ -121,6 +121,7 @@ def retrieval_preprocessing(
source_type=preset_filters.source_type or predicted_source_filters,
document_set=preset_filters.document_set,
time_cutoff=preset_filters.time_cutoff or predicted_time_cutoff,
tags=preset_filters.tags, # Tags are never auto-extracted
access_control_list=user_acl_filters,
)

View File

@ -96,6 +96,7 @@ def chunks_to_search_docs(chunks: list[InferenceChunk] | None) -> list[SearchDoc
source_type=chunk.source_type,
boost=chunk.boost,
hidden=chunk.hidden,
metadata=chunk.metadata,
score=chunk.score,
match_highlights=chunk.match_highlights,
updated_at=chunk.updated_at,

View File

@ -5,12 +5,29 @@ from pydantic import BaseModel
from pydantic import root_validator
from danswer.chat.models import RetrievalDocs
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import MessageType
from danswer.configs.constants import SearchFeedbackType
from danswer.search.models import BaseFilters
from danswer.search.models import RetrievalDetails
from danswer.search.models import SearchDoc
from danswer.search.models import SearchType
from danswer.search.models import Tag
class TagRequest(BaseModel):
match_pattern: str | None
# If this is empty or None, then tags for all sources are considered
sources: list[DocumentSource] | None
allow_prefix: bool = True # This is currently the only option
class SourceTag(Tag):
source: DocumentSource
class TagResponse(BaseModel):
tags: list[SourceTag]
class SimpleQueryRequest(BaseModel):

View File

@ -9,6 +9,7 @@ from danswer.auth.users import current_user
from danswer.configs.chat_configs import DISABLE_LLM_CHUNK_FILTER
from danswer.db.engine import get_session
from danswer.db.models import User
from danswer.db.tag import get_tags_by_value_prefix_for_source_types
from danswer.document_index.factory import get_default_document_index
from danswer.document_index.vespa.index import VespaIndex
from danswer.one_shot_answer.answer_question import stream_search_answer
@ -30,6 +31,9 @@ from danswer.server.query_and_chat.models import DocumentSearchRequest
from danswer.server.query_and_chat.models import HelperResponse
from danswer.server.query_and_chat.models import QueryValidationResponse
from danswer.server.query_and_chat.models import SimpleQueryRequest
from danswer.server.query_and_chat.models import SourceTag
from danswer.server.query_and_chat.models import TagRequest
from danswer.server.query_and_chat.models import TagResponse
from danswer.utils.logger import setup_logger
logger = setup_logger()
@ -75,6 +79,29 @@ def admin_search(
return AdminSearchResponse(documents=deduplicated_documents)
@basic_router.post("/valid-tags")
def get_tags(
tag_request: TagRequest,
_: User = Depends(current_user),
db_session: Session = Depends(get_session),
) -> TagResponse:
if not tag_request.allow_prefix:
raise NotImplementedError("Cannot disable prefix match for now")
db_tags = get_tags_by_value_prefix_for_source_types(
tag_value_prefix=tag_request.match_pattern,
sources=tag_request.sources,
db_session=db_session,
)
server_tags = [
SourceTag(
tag_key=db_tag.tag_key, tag_value=db_tag.tag_value, source=db_tag.source
)
for db_tag in db_tags
]
return TagResponse(tags=server_tags)
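For illustration, a hedged example of calling the new endpoint with the requests library (the base URL and router prefix are guesses about a local deployment, and the source/tag values are invented; the response shape follows TagResponse/SourceTag above):

import requests

resp = requests.post(
    "http://localhost:8080/query/valid-tags",  # URL and "/query" prefix are assumptions; adjust to your deployment
    json={"match_pattern": "en", "sources": ["bookstack"], "allow_prefix": True},
)
print(resp.json())
# e.g. {"tags": [{"tag_key": "labels", "tag_value": "eng", "source": "bookstack"}, ...]}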
@basic_router.post("/search-intent")
def get_search_type(
simple_query: SimpleQueryRequest, _: User = Depends(current_user)

View File

@ -30,7 +30,11 @@ def send_chat_message(
"chat_session_id": chat_session_id,
"parent_message_id": parent_message,
"prompt_id": 0, # Global default Prompt
"retrieval_options": {"run_search": "always", "real_time": True},
"retrieval_options": {
"run_search": "always",
"real_time": True,
"filters": {"tags": []},
},
}
docs: list[dict] | None = None
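To actually restrict a chat search by tag, the retrieval_options payload above would carry tag objects in its filters, for example (the key/value pair is invented):

retrieval_options_with_tag_filter = {
    "run_search": "always",
    "real_time": True,
    # Tag key/value must match tags that were stored at indexing time
    "filters": {"tags": [{"tag_key": "type", "tag_value": "book"}]},
}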