Mirror of https://github.com/danswer-ai/danswer.git

Metadata and Title Search (#903)
parent 615bb7b095
commit d7141df5fc
61 backend/alembic/versions/904e5138fffb_tags.py Normal file
@@ -0,0 +1,61 @@
"""Tags

Revision ID: 904e5138fffb
Revises: 891cd83c87a8
Create Date: 2024-01-01 10:44:43.733974

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "904e5138fffb"
down_revision = "891cd83c87a8"
branch_labels = None
depends_on = None


def upgrade() -> None:
    op.create_table(
        "tag",
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("tag_key", sa.String(), nullable=False),
        sa.Column("tag_value", sa.String(), nullable=False),
        sa.Column("source", sa.String(), nullable=False),
        sa.PrimaryKeyConstraint("id"),
        sa.UniqueConstraint(
            "tag_key", "tag_value", "source", name="_tag_key_value_source_uc"
        ),
    )
    op.create_table(
        "document__tag",
        sa.Column("document_id", sa.String(), nullable=False),
        sa.Column("tag_id", sa.Integer(), nullable=False),
        sa.ForeignKeyConstraint(
            ["document_id"],
            ["document.id"],
        ),
        sa.ForeignKeyConstraint(
            ["tag_id"],
            ["tag.id"],
        ),
        sa.PrimaryKeyConstraint("document_id", "tag_id"),
    )

    op.add_column(
        "search_doc",
        sa.Column(
            "doc_metadata",
            postgresql.JSONB(astext_type=sa.Text()),
            nullable=True,
        ),
    )
    op.execute("UPDATE search_doc SET doc_metadata = '{}' WHERE doc_metadata IS NULL")
    op.alter_column("search_doc", "doc_metadata", nullable=False)


def downgrade() -> None:
    op.drop_table("document__tag")
    op.drop_table("tag")
    op.drop_column("search_doc", "doc_metadata")
@@ -59,7 +59,14 @@ if os.environ.get("EDIT_KEYWORD_QUERY"):
else:
    EDIT_KEYWORD_QUERY = not os.environ.get("DOCUMENT_ENCODER_MODEL")
# Weighting factor between Vector and Keyword Search, 1 for completely vector search
HYBRID_ALPHA = max(0, min(1, float(os.environ.get("HYBRID_ALPHA") or 0.6)))
HYBRID_ALPHA = max(0, min(1, float(os.environ.get("HYBRID_ALPHA") or 0.66)))
# Weighting factor between Title and Content of documents during search, 1 for completely
# Title based. The default heavily favors Content because the Title is also included at the
# top of the Content. This is to avoid cases where the Content is very relevant but it may not
# be clear if the title is separated out. Title acts more as a "boost" than a separate field.
TITLE_CONTENT_RATIO = max(
    0, min(1, float(os.environ.get("TITLE_CONTENT_RATIO") or 0.20))
)
# A list of languages passed to the LLM to rephrase the query
# For example "English,French,Spanish", be sure to use the "," separator
MULTILINGUAL_QUERY_EXPANSION = os.environ.get("MULTILINGUAL_QUERY_EXPANSION") or None
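For reference, how these two knobs interact can be sketched in plain Python. This is an illustrative sketch only (not part of the commit): it mirrors the hybrid_search rank profile added to the Vespa schema later in this diff, leaving out the normalize_linear step, the document boost, and the recency decay that Vespa applies on top.

def hybrid_score(
    title_vector_sim: float,
    content_vector_sim: float,
    title_bm25: float,
    content_bm25: float,
    alpha: float = 0.66,  # HYBRID_ALPHA: weight of vector vs. keyword search
    title_content_ratio: float = 0.20,  # TITLE_CONTENT_RATIO: weight of title vs. content
) -> float:
    # Each component is itself a title/content weighted average
    vector_score = (
        title_content_ratio * title_vector_sim
        + (1 - title_content_ratio) * content_vector_sim
    )
    keyword_score = (
        title_content_ratio * title_bm25
        + (1 - title_content_ratio) * content_bm25
    )
    # Final blend between vector similarity and BM-25 keyword score
    return alpha * vector_score + (1 - alpha) * keyword_score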
@@ -11,11 +11,13 @@ SEMANTIC_IDENTIFIER = "semantic_identifier"
TITLE = "title"
SECTION_CONTINUATION = "section_continuation"
EMBEDDINGS = "embeddings"
TITLE_EMBEDDING = "title_embedding"
ALLOWED_USERS = "allowed_users"
ACCESS_CONTROL_LIST = "access_control_list"
DOCUMENT_SETS = "document_sets"
TIME_FILTER = "time_filter"
METADATA = "metadata"
METADATA_LIST = "metadata_list"
MATCH_HIGHLIGHTS = "match_highlights"
# stored in the `metadata` of a chunk. Used to signify that this chunk should
# not be used for QA. For example, Google Drive file types which can't be parsed
@@ -38,6 +40,12 @@ SESSION_KEY = "session"
QUERY_EVENT_ID = "query_event_id"
LLM_CHUNKS = "llm_chunks"

# For chunking/processing chunks
TITLE_SEPARATOR = "\n\r\n"
SECTION_SEPARATOR = "\n\n"
# For combining attributes, doesn't have to be unique/perfect to work
INDEX_SEPARATOR = "==="


class DocumentSource(str, Enum):
    # Special case, document passed in via Danswer APIs without specifying a source type
@@ -8,6 +8,7 @@ from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.bookstack.client import BookStackApiClient
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
@@ -72,13 +73,21 @@ class BookstackConnector(LoadConnector, PollConnector):
        bookstack_client: BookStackApiClient, book: dict[str, Any]
    ) -> Document:
        url = bookstack_client.build_app_url("/books/" + str(book.get("slug")))
        title = str(book.get("name", ""))
        text = book.get("name", "") + "\n" + book.get("description", "")
        updated_at_str = (
            str(book.get("updated_at")) if book.get("updated_at") is not None else None
        )
        return Document(
            id="book:" + str(book.get("id")),
            id="book__" + str(book.get("id")),
            sections=[Section(link=url, text=text)],
            source=DocumentSource.BOOKSTACK,
            semantic_identifier="Book: " + str(book.get("name")),
            metadata={"type": "book", "updated_at": str(book.get("updated_at"))},
            semantic_identifier="Book: " + title,
            title=title,
            doc_updated_at=time_str_to_utc(updated_at_str)
            if updated_at_str is not None
            else None,
            metadata={"type": "book"},
        )

    @staticmethod
@@ -91,13 +100,23 @@ class BookstackConnector(LoadConnector, PollConnector):
            + "/chapter/"
            + str(chapter.get("slug"))
        )
        title = str(chapter.get("name", ""))
        text = chapter.get("name", "") + "\n" + chapter.get("description", "")
        updated_at_str = (
            str(chapter.get("updated_at"))
            if chapter.get("updated_at") is not None
            else None
        )
        return Document(
            id="chapter:" + str(chapter.get("id")),
            id="chapter__" + str(chapter.get("id")),
            sections=[Section(link=url, text=text)],
            source=DocumentSource.BOOKSTACK,
            semantic_identifier="Chapter: " + str(chapter.get("name")),
            metadata={"type": "chapter", "updated_at": str(chapter.get("updated_at"))},
            semantic_identifier="Chapter: " + title,
            title=title,
            doc_updated_at=time_str_to_utc(updated_at_str)
            if updated_at_str is not None
            else None,
            metadata={"type": "chapter"},
        )

    @staticmethod
@@ -105,13 +124,23 @@ class BookstackConnector(LoadConnector, PollConnector):
        bookstack_client: BookStackApiClient, shelf: dict[str, Any]
    ) -> Document:
        url = bookstack_client.build_app_url("/shelves/" + str(shelf.get("slug")))
        title = str(shelf.get("name", ""))
        text = shelf.get("name", "") + "\n" + shelf.get("description", "")
        updated_at_str = (
            str(shelf.get("updated_at"))
            if shelf.get("updated_at") is not None
            else None
        )
        return Document(
            id="shelf:" + str(shelf.get("id")),
            sections=[Section(link=url, text=text)],
            source=DocumentSource.BOOKSTACK,
            semantic_identifier="Shelf: " + str(shelf.get("name")),
            metadata={"type": "shelf", "updated_at": shelf.get("updated_at")},
            semantic_identifier="Shelf: " + title,
            title=title,
            doc_updated_at=time_str_to_utc(updated_at_str)
            if updated_at_str is not None
            else None,
            metadata={"type": "shelf"},
        )

    @staticmethod
@@ -119,7 +148,7 @@ class BookstackConnector(LoadConnector, PollConnector):
        bookstack_client: BookStackApiClient, page: dict[str, Any]
    ) -> Document:
        page_id = str(page.get("id"))
        page_name = str(page.get("name"))
        title = str(page.get("name", ""))
        page_data = bookstack_client.get("/pages/" + page_id, {})
        url = bookstack_client.build_app_url(
            "/books/"
@@ -127,17 +156,24 @@ class BookstackConnector(LoadConnector, PollConnector):
            + "/page/"
            + str(page_data.get("slug"))
        )
        page_html = (
            "<h1>" + html.escape(page_name) + "</h1>" + str(page_data.get("html"))
        )
        page_html = "<h1>" + html.escape(title) + "</h1>" + str(page_data.get("html"))
        text = parse_html_page_basic(page_html)
        updated_at_str = (
            str(page_data.get("updated_at"))
            if page_data.get("updated_at") is not None
            else None
        )
        time.sleep(0.1)
        return Document(
            id="page:" + page_id,
            sections=[Section(link=url, text=text)],
            source=DocumentSource.BOOKSTACK,
            semantic_identifier="Page: " + str(page_name),
            metadata={"type": "page", "updated_at": page_data.get("updated_at")},
            semantic_identifier="Page: " + str(title),
            title=str(title),
            doc_updated_at=time_str_to_utc(updated_at_str)
            if updated_at_str is not None
            else None,
            metadata={"type": "page"},
        )

    def load_from_state(self) -> GenerateDocumentsOutput:
@@ -333,11 +333,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
            if not page_html:
                logger.debug("Page is empty, skipping: %s", page_url)
                continue
            page_text = (
                page.get("title", "")
                + "\n"
                + parse_html_page(page_html, self.confluence_client)
            )
            page_text = parse_html_page(page_html, self.confluence_client)
            comments_text = self._fetch_comments(self.confluence_client, page_id)
            page_text += comments_text
@@ -3,16 +3,17 @@ from datetime import timezone
from typing import Any
from urllib.parse import urlparse

from dateutil.parser import parse
from jira import JIRA
from jira.resources import Issue

from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
@@ -60,26 +61,32 @@ def fetch_jira_issues_batch(
            logger.warning(f"Found Jira object not of type Issue {jira}")
            continue

        ticket_updated_time = parse(jira.fields.updated)

        semantic_rep = (
            f"Jira Ticket Summary: {jira.fields.summary}\n"
            f"Description: {jira.fields.description}\n"
            + "\n".join(
                [f"Comment: {comment.body}" for comment in jira.fields.comment.comments]
            )
        semantic_rep = f"{jira.fields.description}\n" + "\n".join(
            [f"Comment: {comment.body}" for comment in jira.fields.comment.comments]
        )

        page_url = f"{jira_client.client_info()}/browse/{jira.key}"

        author = None
        try:
            author = BasicExpertInfo(
                display_name=jira.fields.creator.displayName,
                email=jira.fields.creator.emailAddress,
            )
        except Exception:
            # Author should exist but if not, doesn't matter
            pass

        doc_batch.append(
            Document(
                id=page_url,
                sections=[Section(link=page_url, text=semantic_rep)],
                source=DocumentSource.JIRA,
                semantic_identifier=jira.fields.summary,
                doc_updated_at=ticket_updated_time.astimezone(timezone.utc),
                metadata={},
                doc_updated_at=time_str_to_utc(jira.fields.updated),
                primary_owners=[author] if author is not None else None,
                # TODO add secondary_owners if needed
                metadata={"label": jira.fields.labels} if jira.fields.labels else {},
            )
        )
    return doc_batch, len(batch)
@@ -140,11 +140,7 @@ class Document360Connector(LoadConnector, PollConnector):
            html_content = article_details["html_content"]
            article_content = parse_html_page_basic(html_content)
            doc_text = (
                f"workspace: {self.workspace}\n"
                f"category: {article['category_name']}\n"
                f"article: {article_details['title']} - "
                f"{article_details.get('description', '')}\n"
                f"{article_content}"
                f"{article_details.get('description', '')}\n{article_content}".strip()
            )

            document = Document(
@@ -154,7 +150,10 @@ class Document360Connector(LoadConnector, PollConnector):
                semantic_identifier=article_details["title"],
                doc_updated_at=updated_at,
                primary_owners=authors,
                metadata={},
                metadata={
                    "workspace": self.workspace,
                    "category": article["category_name"],
                },
            )

            doc_batch.append(document)
0 backend/danswer/connectors/file/__init__.py Normal file
@@ -37,10 +37,9 @@ def _batch_github_objects(


def _convert_pr_to_document(pull_request: PullRequest) -> Document:
    full_context = f"Pull-Request {pull_request.title}\n{pull_request.body}"
    return Document(
        id=pull_request.html_url,
        sections=[Section(link=pull_request.html_url, text=full_context)],
        sections=[Section(link=pull_request.html_url, text=pull_request.body or "")],
        source=DocumentSource.GITHUB,
        semantic_identifier=pull_request.title,
        # updated_at is UTC time but is timezone unaware, explicitly add UTC
@@ -48,7 +47,7 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document:
        # due to local time discrepancies with UTC
        doc_updated_at=pull_request.updated_at.replace(tzinfo=timezone.utc),
        metadata={
            "merged": pull_request.merged,
            "merged": str(pull_request.merged),
            "state": pull_request.state,
        },
    )
@@ -60,10 +59,9 @@ def _fetch_issue_comments(issue: Issue) -> str:


def _convert_issue_to_document(issue: Issue) -> Document:
    full_context = f"Issue {issue.title}\n{issue.body}"
    return Document(
        id=issue.html_url,
        sections=[Section(link=issue.html_url, text=full_context)],
        sections=[Section(link=issue.html_url, text=issue.body or "")],
        source=DocumentSource.GITHUB,
        semantic_identifier=issue.title,
        # updated_at is UTC time but is timezone unaware
@@ -206,9 +206,6 @@ class GongConnector(LoadConnector, PollConnector):
            speaker_to_name: dict[str, str] = {}

            transcript_text = ""
            if call_title:
                transcript_text += f"Call Title: {call_title}\n\n"

            call_purpose = call_metadata["purpose"]
            if call_purpose:
                transcript_text += f"Call Description: {call_purpose}\n\n"
@@ -234,6 +231,11 @@ class GongConnector(LoadConnector, PollConnector):
                )
                transcript_text += f"{speaker_name}: {monolog}\n\n"

            metadata = {}
            if call_metadata.get("system"):
                metadata["client"] = call_metadata.get("system")
            # TODO calls have a clientUniqueId field, can pull that in later

            doc_batch.append(
                Document(
                    id=call_id,
@@ -246,7 +248,7 @@ class GongConnector(LoadConnector, PollConnector):
                    doc_updated_at=datetime.fromisoformat(call_time_str).astimezone(
                        timezone.utc
                    ),
                    metadata={},
                    metadata={"client": call_metadata.get("system")},
                )
            )
            yield doc_batch
@@ -466,24 +466,20 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
            doc_batch = []
            for file in files_batch:
                try:
                    text_contents = extract_text(file, service)
                    if text_contents:
                        full_context = file["name"] + " - " + text_contents
                    else:
                        full_context = file["name"]
                    text_contents = extract_text(file, service) or ""

                    doc_batch.append(
                        Document(
                            id=file["webViewLink"],
                            sections=[
                                Section(link=file["webViewLink"], text=full_context)
                                Section(link=file["webViewLink"], text=text_contents)
                            ],
                            source=DocumentSource.GOOGLE_DRIVE,
                            semantic_identifier=file["name"],
                            doc_updated_at=datetime.fromisoformat(
                                file["modifiedTime"]
                            ).astimezone(timezone.utc),
                            metadata={} if text_contents else {IGNORE_FOR_QA: True},
                            metadata={} if text_contents else {IGNORE_FOR_QA: "True"},
                        )
                    )
                except Exception as e:
0 backend/danswer/connectors/google_site/__init__.py Normal file
@@ -77,7 +77,7 @@ class GuruConnector(LoadConnector, PollConnector):
        for card in cards:
            title = card["preferredPhrase"]
            link = GURU_CARDS_URL + card["slug"]
            content_text = title + "\n" + parse_html_page_basic(card["content"])
            content_text = parse_html_page_basic(card["content"])
            last_updated = time_str_to_utc(card["lastModified"])
            last_verified = (
                time_str_to_utc(card.get("lastVerified"))
@@ -73,7 +73,7 @@ class HubSpotConnector(LoadConnector, PollConnector):

            title = ticket.properties["subject"]
            link = self.ticket_base_url + ticket.id
            content_text = title + "\n" + ticket.properties["content"]
            content_text = ticket.properties["content"]

            associated_emails: list[str] = []
            associated_notes: list[str] = []
0 backend/danswer/connectors/linear/__init__.py Normal file
@@ -8,6 +8,7 @@ import requests

from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
@@ -30,7 +31,6 @@ def _make_query(request_body: dict[str, Any], api_key: str) -> requests.Response
        "Content-Type": "application/json",
    }

    response: requests.Response | None = None
    for i in range(_NUM_RETRIES):
        try:
            response = requests.post(
@@ -187,8 +187,8 @@ class LinearConnector(LoadConnector, PollConnector):
                    ],
                    source=DocumentSource.LINEAR,
                    semantic_identifier=node["identifier"],
                    doc_updated_at=time_str_to_utc(node["updatedAt"]),
                    metadata={
                        "updated_at": node["updatedAt"],
                        "team": node["team"]["name"],
                    },
                )
@@ -1,10 +1,10 @@
from datetime import datetime
from enum import Enum
from typing import Any

from pydantic import BaseModel

from danswer.configs.constants import DocumentSource
from danswer.configs.constants import INDEX_SEPARATOR
from danswer.utils.text_processing import make_url_compatible


@@ -50,21 +50,38 @@ class DocumentBase(BaseModel):
    sections: list[Section]
    source: DocumentSource | None = None
    semantic_identifier: str  # displayed in the UI as the main identifier for the doc
    metadata: dict[str, Any]
    metadata: dict[str, str | list[str]]
    # UTC time
    doc_updated_at: datetime | None = None
    # Owner, creator, etc.
    primary_owners: list[BasicExpertInfo] | None = None
    # Assignee, space owner, etc.
    secondary_owners: list[BasicExpertInfo] | None = None
    # `title` is used when computing best matches for a query
    # if `None`, then we will use the `semantic_identifier` as the title in Vespa
    # `title` is used for search whereas `semantic_identifier` is used for displaying in the UI.
    # They differ because a Slack message may display as #general, but "general" should not be
    # part of the search the way a document title would be for something like a Confluence page.
    # The default title is the semantic_identifier unless otherwise specified.
    title: str | None = None
    from_ingestion_api: bool = False

    def get_title_for_document_index(self) -> str:
    def get_title_for_document_index(self) -> str | None:
        # If title is explicitly empty, return a None here for embedding purposes
        if self.title == "":
            return None
        return self.semantic_identifier if self.title is None else self.title

    def get_metadata_str_attributes(self) -> list[str] | None:
        if not self.metadata:
            return None
        # Combined string for the key/value for easy filtering
        attributes: list[str] = []
        for k, v in self.metadata.items():
            if isinstance(v, list):
                attributes.extend([k + INDEX_SEPARATOR + vi for vi in v])
            else:
                attributes.append(k + INDEX_SEPARATOR + v)
        return attributes


class Document(DocumentBase):
    id: str  # This must be unique or during indexing/reindexing, chunks will be overwritten
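To illustrate the two new helpers (illustrative only, not part of the commit): with INDEX_SEPARATOR set to "===" in constants.py above, a connector document carrying simple metadata produces the following title and filter attributes. The document values shown are hypothetical.

from danswer.connectors.models import DocumentBase

doc = DocumentBase(
    sections=[],
    semantic_identifier="Book: My Handbook",
    title="My Handbook",
    metadata={"type": "book", "label": ["internal", "draft"]},
)

assert doc.get_title_for_document_index() == "My Handbook"
# List-valued metadata fans out into one attribute per value
assert doc.get_metadata_str_attributes() == [
    "type===book",
    "label===internal",
    "label===draft",
]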
@@ -267,7 +267,8 @@ class NotionConnector(LoadConnector, PollConnector):
            yield (
                Document(
                    id=page.id,
                    sections=[Section(link=page.url, text=f"{page_title}\n")]
                    # Will add title to the first section later in processing
                    sections=[Section(link=page.url, text="")]
                    + [
                        Section(
                            link=f"{page.url}#{block_id.replace('-', '')}",
@@ -14,6 +14,7 @@ from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_st
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger
@@ -94,26 +95,24 @@ class ProductboardConnector(PollConnector):
        for feature in self._fetch_documents(
            initial_link=f"{_PRODUCT_BOARD_BASE_URL}/features"
        ):
            owner = self._get_owner_email(feature)
            experts = [BasicExpertInfo(email=owner)] if owner else None

            yield Document(
                id=feature["id"],
                sections=[
                    Section(
                        link=feature["links"]["html"],
                        text=" - ".join(
                            (
                                feature["name"],
                                self._parse_description_html(feature["description"]),
                            )
                        ),
                        text=self._parse_description_html(feature["description"]),
                    )
                ],
                semantic_identifier=feature["name"],
                source=DocumentSource.PRODUCTBOARD,
                doc_updated_at=time_str_to_utc(feature["updatedAt"]),
                primary_owners=experts,
                metadata={
                    "productboard_entity_type": feature["type"],
                    "entity_type": feature["type"],
                    "status": feature["status"]["name"],
                    "owner": self._get_owner_email(feature),
                },
            )

@@ -122,25 +121,23 @@ class ProductboardConnector(PollConnector):
        for component in self._fetch_documents(
            initial_link=f"{_PRODUCT_BOARD_BASE_URL}/components"
        ):
            owner = self._get_owner_email(component)
            experts = [BasicExpertInfo(email=owner)] if owner else None

            yield Document(
                id=component["id"],
                sections=[
                    Section(
                        link=component["links"]["html"],
                        text=" - ".join(
                            (
                                component["name"],
                                self._parse_description_html(component["description"]),
                            )
                        ),
                        text=self._parse_description_html(component["description"]),
                    )
                ],
                semantic_identifier=component["name"],
                source=DocumentSource.PRODUCTBOARD,
                doc_updated_at=time_str_to_utc(component["updatedAt"]),
                primary_owners=experts,
                metadata={
                    "productboard_entity_type": "component",
                    "owner": self._get_owner_email(component),
                    "entity_type": "component",
                },
            )

@@ -150,25 +147,23 @@ class ProductboardConnector(PollConnector):
        for product in self._fetch_documents(
            initial_link=f"{_PRODUCT_BOARD_BASE_URL}/products"
        ):
            owner = self._get_owner_email(product)
            experts = [BasicExpertInfo(email=owner)] if owner else None

            yield Document(
                id=product["id"],
                sections=[
                    Section(
                        link=product["links"]["html"],
                        text=" - ".join(
                            (
                                product["name"],
                                self._parse_description_html(product["description"]),
                            )
                        ),
                        text=self._parse_description_html(product["description"]),
                    )
                ],
                semantic_identifier=product["name"],
                source=DocumentSource.PRODUCTBOARD,
                doc_updated_at=time_str_to_utc(product["updatedAt"]),
                primary_owners=experts,
                metadata={
                    "productboard_entity_type": "product",
                    "owner": self._get_owner_email(product),
                    "entity_type": "product",
                },
            )

@@ -176,26 +171,24 @@ class ProductboardConnector(PollConnector):
        for objective in self._fetch_documents(
            initial_link=f"{_PRODUCT_BOARD_BASE_URL}/objectives"
        ):
            owner = self._get_owner_email(objective)
            experts = [BasicExpertInfo(email=owner)] if owner else None

            yield Document(
                id=objective["id"],
                sections=[
                    Section(
                        link=objective["links"]["html"],
                        text=" - ".join(
                            (
                                objective["name"],
                                self._parse_description_html(objective["description"]),
                            )
                        ),
                        text=self._parse_description_html(objective["description"]),
                    )
                ],
                semantic_identifier=objective["name"],
                source=DocumentSource.PRODUCTBOARD,
                doc_updated_at=time_str_to_utc(objective["updatedAt"]),
                primary_owners=experts,
                metadata={
                    "productboard_entity_type": "release",
                    "entity_type": "release",
                    "state": objective["state"],
                    "owner": self._get_owner_email(objective),
                },
            )
@@ -97,7 +97,8 @@ class RequestTrackerConnector(PollConnector):
            logger.info(f"Processing ticket {tid}")
            doc = Document(
                id=ticket["id"],
                sections=[Section(link=ticketLink, text=f"{ticket['Subject']}\n")]
                # Will add title to the first section later in processing
                sections=[Section(link=ticketLink, text="")]
                + self.build_doc_sections_from_txn(Rt0, tid),
                source=DocumentSource.REQUESTTRACKER,
                semantic_identifier=ticket["Subject"],
0 backend/danswer/connectors/zulip/__init__.py Normal file
@@ -642,6 +642,7 @@ def create_db_search_doc(
        source_type=server_search_doc.source_type,
        boost=server_search_doc.boost,
        hidden=server_search_doc.hidden,
        doc_metadata=server_search_doc.metadata,
        score=server_search_doc.score,
        match_highlights=server_search_doc.match_highlights,
        updated_at=server_search_doc.updated_at,
@@ -674,6 +675,7 @@ def translate_db_search_doc_to_server_search_doc(
        source_type=db_search_doc.source_type,
        boost=db_search_doc.boost,
        hidden=db_search_doc.hidden,
        metadata=db_search_doc.doc_metadata,
        score=db_search_doc.score,
        match_highlights=db_search_doc.match_highlights,
        updated_at=db_search_doc.updated_at,
@@ -17,6 +17,7 @@ from danswer.db.models import ConnectorCredentialPair
from danswer.db.models import Credential
from danswer.db.models import Document as DbDocument
from danswer.db.models import DocumentByConnectorCredentialPair
from danswer.db.tag import delete_document_tags_for_documents
from danswer.db.utils import model_to_dict
from danswer.document_index.interfaces import DocumentMetadata
from danswer.server.documents.models import ConnectorCredentialPairIdentifier
@@ -272,6 +273,7 @@ def delete_documents_complete(db_session: Session, document_ids: list[str]) -> N
    delete_document_feedback_for_documents(
        document_ids=document_ids, db_session=db_session
    )
    delete_document_tags_for_documents(document_ids=document_ids, db_session=db_session)
    delete_documents(db_session, document_ids)
    db_session.commit()
@@ -22,6 +22,7 @@ from sqlalchemy import Integer
from sqlalchemy import Sequence
from sqlalchemy import String
from sqlalchemy import Text
from sqlalchemy import UniqueConstraint
from sqlalchemy.dialects import postgresql
from sqlalchemy.orm import DeclarativeBase
from sqlalchemy.orm import Mapped
@@ -153,6 +154,15 @@ class ChatMessage__SearchDoc(Base):
    )


class Document__Tag(Base):
    __tablename__ = "document__tag"

    document_id: Mapped[str] = mapped_column(
        ForeignKey("document.id"), primary_key=True
    )
    tag_id: Mapped[int] = mapped_column(ForeignKey("tag.id"), primary_key=True)


"""
Documents/Indexing Tables
"""
@@ -247,6 +257,32 @@ class Document(Base):
    retrieval_feedbacks: Mapped[List["DocumentRetrievalFeedback"]] = relationship(
        "DocumentRetrievalFeedback", back_populates="document"
    )
    tags = relationship(
        "Tag",
        secondary="document__tag",
        back_populates="documents",
    )


class Tag(Base):
    __tablename__ = "tag"

    id: Mapped[int] = mapped_column(primary_key=True)
    tag_key: Mapped[str] = mapped_column(String)
    tag_value: Mapped[str] = mapped_column(String)
    source: Mapped[DocumentSource] = mapped_column(Enum(DocumentSource))

    documents = relationship(
        "Document",
        secondary="document__tag",
        back_populates="tags",
    )

    __table_args__ = (
        UniqueConstraint(
            "tag_key", "tag_value", "source", name="_tag_key_value_source_uc"
        ),
    )


class Connector(Base):
@@ -424,6 +460,7 @@ class SearchDoc(Base):
    boost: Mapped[int] = mapped_column(Integer)
    source_type: Mapped[DocumentSource] = mapped_column(Enum(DocumentSource))
    hidden: Mapped[bool] = mapped_column(Boolean)
    doc_metadata: Mapped[dict[str, str | list[str]]] = mapped_column(postgresql.JSONB())
    score: Mapped[float] = mapped_column(Float)
    match_highlights: Mapped[list[str]] = mapped_column(postgresql.ARRAY(String))
    # This is for the document, not this row in the table
116 backend/danswer/db/tag.py Normal file
@@ -0,0 +1,116 @@
from sqlalchemy import delete
from sqlalchemy import func
from sqlalchemy import select
from sqlalchemy.orm import Session

from danswer.configs.constants import DocumentSource
from danswer.db.models import Document
from danswer.db.models import Document__Tag
from danswer.db.models import Tag
from danswer.utils.logger import setup_logger

logger = setup_logger()


def create_or_add_document_tag(
    tag_key: str,
    tag_value: str,
    source: DocumentSource,
    document_id: str,
    db_session: Session,
) -> Tag:
    document = db_session.get(Document, document_id)
    if not document:
        raise ValueError("Invalid Document, cannot attach Tags")

    tag_stmt = select(Tag).where(
        Tag.tag_key == tag_key,
        Tag.tag_value == tag_value,
        Tag.source == source,
    )
    tag = db_session.execute(tag_stmt).scalar_one_or_none()

    if not tag:
        tag = Tag(tag_key=tag_key, tag_value=tag_value, source=source)
        db_session.add(tag)

    if tag not in document.tags:
        document.tags.append(tag)

    db_session.commit()
    return tag


def create_or_add_document_tag_list(
    tag_key: str,
    tag_values: list[str],
    source: DocumentSource,
    document_id: str,
    db_session: Session,
) -> list[Tag]:
    document = db_session.get(Document, document_id)
    if not document:
        raise ValueError("Invalid Document, cannot attach Tags")

    existing_tags_stmt = select(Tag).where(
        Tag.tag_key == tag_key, Tag.tag_value.in_(tag_values), Tag.source == source
    )
    existing_tags = list(db_session.execute(existing_tags_stmt).scalars().all())
    existing_tag_values = {tag.tag_value for tag in existing_tags}

    new_tags = []
    for tag_value in tag_values:
        if tag_value not in existing_tag_values:
            new_tag = Tag(tag_key=tag_key, tag_value=tag_value, source=source)
            db_session.add(new_tag)
            new_tags.append(new_tag)

    all_tags = existing_tags + new_tags

    for tag in all_tags:
        if tag not in document.tags:
            document.tags.append(tag)

    db_session.commit()
    return all_tags


def get_tags_by_value_prefix_for_source_types(
    tag_value_prefix: str | None,
    sources: list[DocumentSource] | None,
    db_session: Session,
) -> list[Tag]:
    query = select(Tag)

    if tag_value_prefix:
        query = query.where(Tag.tag_value.startswith(tag_value_prefix))

    if sources:
        query = query.where(Tag.source.in_(sources))

    result = db_session.execute(query)

    tags = result.scalars().all()
    return list(tags)


def delete_document_tags_for_documents(
    document_ids: list[str], db_session: Session
) -> None:
    stmt = delete(Document__Tag).where(Document__Tag.document_id.in_(document_ids))
    db_session.execute(stmt)
    db_session.commit()

    orphan_tags_query = (
        select(Tag.id)
        .outerjoin(Document__Tag, Tag.id == Document__Tag.tag_id)
        .group_by(Tag.id)
        .having(func.count(Document__Tag.document_id) == 0)
    )

    orphan_tags = db_session.execute(orphan_tags_query).scalars().all()

    if orphan_tags:
        delete_orphan_tags_stmt = delete(Tag).where(Tag.id.in_(orphan_tags))
        db_session.execute(delete_orphan_tags_stmt)
        db_session.commit()
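A minimal usage sketch for these helpers (illustrative only, not part of the commit). It mirrors how the indexing pipeline further down in this diff attaches connector metadata as tags, and assumes the document row has already been upserted; the document id shown is hypothetical.

from sqlalchemy.orm import Session

from danswer.configs.constants import DocumentSource
from danswer.db.engine import get_sqlalchemy_engine
from danswer.db.tag import create_or_add_document_tag
from danswer.db.tag import create_or_add_document_tag_list

with Session(get_sqlalchemy_engine()) as db_session:
    # Single-valued metadata entry -> one tag
    create_or_add_document_tag(
        tag_key="type",
        tag_value="book",
        source=DocumentSource.BOOKSTACK,
        document_id="book__42",  # hypothetical id; the `document` row must already exist
        db_session=db_session,
    )
    # List-valued metadata entry -> one tag per value
    create_or_add_document_tag_list(
        tag_key="label",
        tag_values=["internal", "draft"],
        source=DocumentSource.BOOKSTACK,
        document_id="book__42",
        db_session=db_session,
    )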
@@ -7,12 +7,20 @@ schema danswer_chunk {
        field chunk_id type int {
            indexing: summary | attribute
        }
        field blurb type string {
        # Displayed in the UI as the main identifier for the doc
        field semantic_identifier type string {
            indexing: summary | attribute
        }
        # Can separate out title in the future and give heavier bm-25 weighting
        # Need to consider that not every doc has a separable title (i.e. a Slack message)
        # Set summary options to enable bolding
        # May not always match the `semantic_identifier` e.g. for Slack docs the
        # `semantic_identifier` will be the channel name, but the `title` will be empty
        field title type string {
            indexing: summary | index
            match {
                gram
                gram-size: 3
            }
            index: enable-bm25
        }
        field content type string {
            indexing: summary | index
            match {
@@ -28,6 +36,25 @@ schema danswer_chunk {
            indexing: summary | index
            summary: dynamic
        }
        # Title embedding (x1)
        field title_embedding type tensor<float>(x[384]) {
            indexing: attribute
            attribute {
                distance-metric: angular
            }
        }
        # Content embeddings (chunk + optional mini chunks embeddings)
        # "t" and "x" are arbitrary names, not special keywords
        field embeddings type tensor<float>(t{},x[384]) {
            indexing: attribute
            attribute {
                distance-metric: angular
            }
        }
        # Starting section of the doc, currently unused as it has been replaced by match highlighting
        field blurb type string {
            indexing: summary | attribute
        }
        # https://docs.vespa.ai/en/attributes.html potential enum store for speed, but probably not worth it
        field source_type type string {
            indexing: summary | attribute
@@ -39,21 +66,6 @@ schema danswer_chunk {
        field source_links type string {
            indexing: summary | attribute
        }
        # displayed in the UI as the main identifier for the doc
        field semantic_identifier type string {
            indexing: summary | attribute
        }
        # this is used when computing best matches based on the title of the document
        # may not always match the `semantic_identifier` e.g. for Slack docs the
        # `semantic_identifier` will be the channel name, but the `title` will be empty
        field title type string {
            indexing: summary | index
            match {
                gram
                gram-size: 3
            }
            index: enable-bm25
        }
        field section_continuation type bool {
            indexing: summary | attribute
        }
@@ -65,15 +77,15 @@ schema danswer_chunk {
            indexing: summary | attribute
            rank: filter
        }
        # Needs to have a separate Attribute list for efficient filtering
        field metadata_list type array<string> {
            indexing: summary | attribute
            rank: filter
            attribute: fast-search
        }
        field metadata type string {
            indexing: summary | attribute
        }
        field embeddings type tensor<float>(t{},x[384]) {
            indexing: attribute
            attribute {
                distance-metric: angular
            }
        }
        field doc_updated_at type int {
            indexing: summary | attribute
        }
@@ -95,6 +107,11 @@ schema danswer_chunk {
        }
    }

    # If using different tokenization settings, the fieldset has to be removed, and the field must
    # be specified in the yql like:
    # + 'or ({grammar: "weakAnd", defaultIndex:"title"}userInput(@query)) '
    # + 'or ({grammar: "weakAnd", defaultIndex:"content"}userInput(@query)) '
    # Note: for BM-25, the ngram size (and whether ngrams are used) changes the range of the scores
    fieldset default {
        fields: content, title
    }
@@ -124,6 +141,79 @@ schema danswer_chunk {
        match-features: recency_bias
    }

    rank-profile hybrid_search inherits default, default_rank {
        inputs {
            query(query_embedding) tensor<float>(x[384])
        }

        # This must be a separate function for normalize_linear to work
        function vector_score() {
            expression {
                (query(title_content_ratio) * closeness(field, title_embedding)) +
                ((1 - query(title_content_ratio)) * closeness(field, embeddings))
            }
        }

        # This must be a separate function for normalize_linear to work
        function keyword_score() {
            expression {
                (query(title_content_ratio) * bm25(title)) +
                ((1 - query(title_content_ratio)) * bm25(content))
            }
        }

        first-phase {
            expression: vector_score
        }

        # Weighted average between Vector Search and BM-25
        # Each is a weighted average between the Title and Content fields
        # Finally, each doc is boosted by its user-feedback-based boost and its recency
        # If any embedding or index field is missing, it just receives a score of 0
        # Assumptions:
        # - For a given query + corpus, the BM-25 scores will be relatively similar in distribution,
        #   therefore not normalizing before combining.
        # - Documents without a title get a score of 0 for the title components; this is fine since
        #   documents without any title match should be penalized.
        global-phase {
            expression {
                (
                    # Weighted Vector Similarity Score
                    (query(alpha) * normalize_linear(vector_score)) +
                    # Weighted Keyword Similarity Score
                    ((1 - query(alpha)) * normalize_linear(keyword_score))
                )
                # Boost based on user feedback
                * document_boost
                # Decay factor based on time document was last updated
                * recency_bias
            }
            rerank-count: 1000
        }

        match-features {
            bm25(title)
            bm25(content)
            closeness(field, title_embedding)
            closeness(field, embeddings)
            keyword_score
            vector_score
            document_boost
            recency_bias
            closest(embeddings)
        }
    }

    # Used when searching from the admin UI for a specific doc to hide / boost
    # Very heavily prioritize title
    rank-profile admin_search inherits default, default_rank {
        first-phase {
            expression: bm25(content) + (5 * bm25(title))
        }
    }

    # THE ONES BELOW ARE OUT OF DATE, DO NOT USE
    # THEY MIGHT NOT EVEN WORK AT ALL
    rank-profile keyword_search inherits default, default_rank {
        first-phase {
            expression: bm25(content) * document_boost * recency_bias
@@ -145,29 +235,4 @@ schema danswer_chunk {

        match-features: recency_bias document_boost closest(embeddings)
    }

    rank-profile hybrid_search inherits default, default_rank {
        inputs {
            query(query_embedding) tensor<float>(x[384])
        }

        first-phase {
            expression: closeness(field, embeddings)
        }

        global-phase {
            expression: ((query(alpha) * normalize_linear(closeness(field, embeddings))) + ((1 - query(alpha)) * normalize_linear(bm25(content)))) * document_boost * recency_bias
            rerank-count: 1000
        }

        # Cannot pass normalize_linear features in match-features
        match-features: recency_bias document_boost closest(embeddings)
    }

    # used when searching from the admin UI for a specific doc to hide / boost
    rank-profile admin_search inherits default, default_rank {
        first-phase {
            expression: bm25(content) + (5 * bm25(title))
        }
    }
}
@@ -25,6 +25,7 @@ from danswer.configs.chat_configs import DOC_TIME_DECAY
from danswer.configs.chat_configs import EDIT_KEYWORD_QUERY
from danswer.configs.chat_configs import HYBRID_ALPHA
from danswer.configs.chat_configs import NUM_RETURNED_HITS
from danswer.configs.chat_configs import TITLE_CONTENT_RATIO
from danswer.configs.constants import ACCESS_CONTROL_LIST
from danswer.configs.constants import BLURB
from danswer.configs.constants import BOOST
@@ -35,7 +36,9 @@ from danswer.configs.constants import DOCUMENT_ID
from danswer.configs.constants import DOCUMENT_SETS
from danswer.configs.constants import EMBEDDINGS
from danswer.configs.constants import HIDDEN
from danswer.configs.constants import INDEX_SEPARATOR
from danswer.configs.constants import METADATA
from danswer.configs.constants import METADATA_LIST
from danswer.configs.constants import PRIMARY_OWNERS
from danswer.configs.constants import RECENCY_BIAS
from danswer.configs.constants import SECONDARY_OWNERS
@@ -44,6 +47,8 @@ from danswer.configs.constants import SEMANTIC_IDENTIFIER
from danswer.configs.constants import SOURCE_LINKS
from danswer.configs.constants import SOURCE_TYPE
from danswer.configs.constants import TITLE
from danswer.configs.constants import TITLE_EMBEDDING
from danswer.configs.constants import TITLE_SEPARATOR
from danswer.configs.model_configs import SEARCH_DISTANCE_CUTOFF
from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
    get_experts_stores_representations,
@@ -239,20 +244,25 @@ def _index_vespa_chunk(
    for ind, m_c_embed in enumerate(embeddings.mini_chunk_embeddings):
        embeddings_name_vector_map[f"mini_chunk_{ind}"] = m_c_embed

    title = document.get_title_for_document_index()

    vespa_document_fields = {
        DOCUMENT_ID: document.id,
        CHUNK_ID: chunk.chunk_id,
        BLURB: remove_invalid_unicode_chars(chunk.blurb),
        # this duplication of `content` is needed for keyword highlighting :(
        TITLE: remove_invalid_unicode_chars(title) if title else None,
        CONTENT: remove_invalid_unicode_chars(chunk.content),
        # This duplication of `content` is needed for keyword highlighting :(
        CONTENT_SUMMARY: remove_invalid_unicode_chars(chunk.content),
        SOURCE_TYPE: str(document.source.value),
        SOURCE_LINKS: json.dumps(chunk.source_links),
        SEMANTIC_IDENTIFIER: remove_invalid_unicode_chars(document.semantic_identifier),
        TITLE: remove_invalid_unicode_chars(document.get_title_for_document_index()),
        SECTION_CONTINUATION: chunk.section_continuation,
        METADATA: json.dumps(document.metadata),
        # Save as a list for efficient extraction as an Attribute
        METADATA_LIST: chunk.source_document.get_metadata_str_attributes(),
        EMBEDDINGS: embeddings_name_vector_map,
        TITLE_EMBEDDING: chunk.title_embedding,
        BOOST: chunk.boost,
        DOC_UPDATED_AT: _vespa_get_updated_at_attribute(document.doc_updated_at),
        PRIMARY_OWNERS: get_experts_stores_representations(document.primary_owners),
@@ -394,6 +404,12 @@ def _build_vespa_filters(filters: IndexFilters, include_hidden: bool = False) ->
        )
    filter_str += _build_or_filters(SOURCE_TYPE, source_strs)

    tag_attributes = None
    tags = filters.tags
    if tags:
        tag_attributes = [tag.tag_key + INDEX_SEPARATOR + tag.tag_value for tag in tags]
    filter_str += _build_or_filters(METADATA_LIST, tag_attributes)

    filter_str += _build_or_filters(DOCUMENT_SETS, filters.document_set)

    filter_str += _build_time_filter(filters.time_cutoff)
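Illustrative only (not part of the commit): what the new tag filter contributes before being OR-ed into the Vespa where-clause over the metadata_list attribute.

from danswer.configs.constants import INDEX_SEPARATOR
from danswer.search.models import Tag

tags = [Tag(tag_key="type", tag_value="book"), Tag(tag_key="label", tag_value="internal")]
tag_attributes = [tag.tag_key + INDEX_SEPARATOR + tag.tag_value for tag in tags]
# tag_attributes == ["type===book", "label===internal"]; these match the strings
# written to `metadata_list` by get_metadata_str_attributes() at indexing time.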
@@ -448,6 +464,8 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk:
        if DOC_UPDATED_AT in fields
        else None
    )

    # The highlights might include the title but this is the best way we have so far to show the highlighting
    match_highlights = _process_dynamic_summary(
        # fallback to regular `content` if the `content_summary` field
        # isn't present
@@ -459,6 +477,13 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk:
            f"Chunk with blurb: {fields.get(BLURB, 'Unknown')[:50]}... has no Semantic Identifier"
        )

    # Remove the title from the first chunk as every chunk already included
    # its semantic identifier for LLM
    content = fields[CONTENT]
    if fields[CHUNK_ID] == 0:
        parts = content.split(TITLE_SEPARATOR, maxsplit=1)
        content = parts[1] if len(parts) > 1 and "\n" not in parts[0] else content

    # User ran into this, not sure why this could happen, error checking here
    blurb = fields.get(BLURB)
    if not blurb:
@@ -477,7 +502,7 @@ def _vespa_hit_to_inference_chunk(hit: dict[str, Any]) -> InferenceChunk:
    return InferenceChunk(
        chunk_id=fields[CHUNK_ID],
        blurb=blurb,
        content=fields[CONTENT],
        content=content,
        source_links=source_links_dict,
        section_continuation=fields[SECTION_CONTINUATION],
        document_id=fields[DOCUMENT_ID],
@@ -725,6 +750,7 @@ class VespaIndex(DocumentIndex):
        num_to_retrieve: int = NUM_RETURNED_HITS,
        edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
    ) -> list[InferenceChunk]:
        # IMPORTANT: THIS FUNCTION IS NOT UP TO DATE, DOES NOT WORK CORRECTLY
        vespa_where_clauses = _build_vespa_filters(filters)
        yql = (
            VespaIndex.yql_base
@@ -759,6 +785,7 @@ class VespaIndex(DocumentIndex):
        distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF,
        edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
    ) -> list[InferenceChunk]:
        # IMPORTANT: THIS FUNCTION IS NOT UP TO DATE, DOES NOT WORK CORRECTLY
        vespa_where_clauses = _build_vespa_filters(filters)
        yql = (
            VespaIndex.yql_base
@@ -798,6 +825,7 @@ class VespaIndex(DocumentIndex):
        time_decay_multiplier: float,
        num_to_retrieve: int,
        hybrid_alpha: float | None = HYBRID_ALPHA,
        title_content_ratio: float | None = TITLE_CONTENT_RATIO,
        distance_cutoff: float | None = SEARCH_DISTANCE_CUTOFF,
        edit_keyword_query: bool = EDIT_KEYWORD_QUERY,
    ) -> list[InferenceChunk]:
@@ -808,6 +836,7 @@ class VespaIndex(DocumentIndex):
            VespaIndex.yql_base
            + vespa_where_clauses
            + f"(({{targetHits: {target_hits}}}nearestNeighbor(embeddings, query_embedding)) "
            + f"or ({{targetHits: {target_hits}}}nearestNeighbor(title_embedding, query_embedding)) "
            + 'or ({grammar: "weakAnd"}userInput(@query)) '
            + f'or ({{defaultIndex: "{CONTENT_SUMMARY}"}}userInput(@query)))'
        )
@@ -828,6 +857,9 @@ class VespaIndex(DocumentIndex):
            "input.query(alpha)": hybrid_alpha
            if hybrid_alpha is not None
            else HYBRID_ALPHA,
            "input.query(title_content_ratio)": title_content_ratio
            if title_content_ratio is not None
            else TITLE_CONTENT_RATIO,
            "hits": num_to_retrieve,
            "offset": 0,
            "ranking.profile": "hybrid_search",
@@ -7,15 +7,15 @@ from transformers import AutoTokenizer  # type:ignore
from danswer.configs.app_configs import BLURB_SIZE
from danswer.configs.app_configs import CHUNK_OVERLAP
from danswer.configs.app_configs import MINI_CHUNK_SIZE
from danswer.configs.constants import SECTION_SEPARATOR
from danswer.configs.constants import TITLE_SEPARATOR
from danswer.configs.model_configs import CHUNK_SIZE
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.indexing.models import DocAwareChunk
from danswer.search.search_nlp_models import get_default_tokenizer
from danswer.utils.text_processing import shared_precompare_cleanup


SECTION_SEPARATOR = "\n\n"
ChunkFunc = Callable[[Document], list[DocAwareChunk]]


@@ -29,7 +29,8 @@ def extract_blurb(text: str, blurb_size: int) -> str:


def chunk_large_section(
    section: Section,
    section_text: str,
    section_link_text: str,
    document: Document,
    start_chunk_id: int,
    tokenizer: AutoTokenizer,
@@ -37,8 +38,6 @@
    chunk_overlap: int = CHUNK_OVERLAP,
    blurb_size: int = BLURB_SIZE,
) -> list[DocAwareChunk]:
    section_text = section.text
    section_link_text = section.link or ""
    blurb = extract_blurb(section_text, blurb_size)

    sentence_aware_splitter = SentenceSplitter(
@@ -67,14 +66,18 @@ def chunk_document(
    subsection_overlap: int = CHUNK_OVERLAP,
    blurb_size: int = BLURB_SIZE,
) -> list[DocAwareChunk]:
    title = document.get_title_for_document_index()
    title_prefix = title.replace("\n", " ") + TITLE_SEPARATOR if title else ""
    tokenizer = get_default_tokenizer()

    chunks: list[DocAwareChunk] = []
    link_offsets: dict[int, str] = {}
    chunk_text = ""
    for section in document.sections:
    for ind, section in enumerate(document.sections):
        section_text = title_prefix + section.text if ind == 0 else section.text
        section_link_text = section.link or ""
        section_tok_length = len(tokenizer.tokenize(section.text))

        section_tok_length = len(tokenizer.tokenize(section_text))
        current_tok_length = len(tokenizer.tokenize(chunk_text))
        curr_offset_len = len(shared_precompare_cleanup(chunk_text))
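Illustrative only (not part of the commit): what the first section of a document looks like once the title prefix is applied, given TITLE_SEPARATOR = "\n\r\n" from constants.py above. The title and body strings are hypothetical.

TITLE_SEPARATOR = "\n\r\n"

title = "My Handbook"
first_section_body = "Chapter one covers onboarding..."

title_prefix = title.replace("\n", " ") + TITLE_SEPARATOR if title else ""
section_text = title_prefix + first_section_body
# section_text == "My Handbook\n\r\nChapter one covers onboarding..."
# _vespa_hit_to_inference_chunk() later strips this prefix from chunk 0 by
# splitting on TITLE_SEPARATOR before handing the content to the LLM.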
@@ -96,7 +99,8 @@ def chunk_document(
            chunk_text = ""

        large_section_chunks = chunk_large_section(
            section=section,
            section_text=section_text,
            section_link_text=section_link_text,
            document=document,
            start_chunk_id=len(chunks),
            tokenizer=tokenizer,
@@ -115,7 +119,7 @@ def chunk_document(
            <= chunk_tok_size
        ):
            chunk_text += (
                SECTION_SEPARATOR + section.text if chunk_text else section.text
                SECTION_SEPARATOR + section_text if chunk_text else section_text
            )
            link_offsets[curr_offset_len] = section_link_text
        else:
@@ -130,7 +134,7 @@ def chunk_document(
                )
            )
            link_offsets = {0: section_link_text}
            chunk_text = section.text
            chunk_text = section_text

    # Once we hit the end, if we're still in the process of building a chunk, add what we have
    if chunk_text:
@@ -21,6 +21,9 @@ def embed_chunks(
    enable_mini_chunk: bool = ENABLE_MINI_CHUNK,
    passage_prefix: str = ASYM_PASSAGE_PREFIX,
) -> list[IndexChunk]:
    # Cache the Title embeddings to only have to do it once
    title_embed_dict: dict[str, list[float]] = {}

    embedded_chunks: list[IndexChunk] = []
    if embedding_model is None:
        embedding_model = EmbeddingModel()
@@ -58,12 +61,24 @@ def embed_chunks(
        chunk_embeddings = embeddings[
            embedding_ind_start : embedding_ind_start + num_embeddings
        ]

        title = chunk.source_document.get_title_for_document_index()

        title_embedding = None
        if title:
            if title in title_embed_dict:
                title_embedding = title_embed_dict[title]
            else:
                title_embedding = embedding_model.encode([title])[0]
                title_embed_dict[title] = title_embedding

        new_embedded_chunk = IndexChunk(
            **{k: getattr(chunk, k) for k in chunk.__dataclass_fields__},
            embeddings=ChunkEmbedding(
                full_embedding=chunk_embeddings[0],
                mini_chunk_embeddings=chunk_embeddings[1:],
            ),
            title_embedding=title_embedding,
        )
        embedded_chunks.append(new_embedded_chunk)
        embedding_ind_start += num_embeddings
@@ -17,6 +17,8 @@ from danswer.db.document import update_docs_updated_at
from danswer.db.document import upsert_documents_complete
from danswer.db.document_set import fetch_document_sets_for_documents
from danswer.db.engine import get_sqlalchemy_engine
from danswer.db.tag import create_or_add_document_tag
from danswer.db.tag import create_or_add_document_tag_list
from danswer.document_index.factory import get_default_document_index
from danswer.document_index.interfaces import DocumentIndex
from danswer.document_index.interfaces import DocumentMetadata
@@ -44,6 +46,7 @@ def upsert_documents_in_db(
    index_attempt_metadata: IndexAttemptMetadata,
    db_session: Session,
) -> None:
    # Metadata here refers to basic document info, not metadata about the actual content
    doc_m_batch: list[DocumentMetadata] = []
    for doc in documents:
        first_link = next(
@@ -66,6 +69,26 @@ def upsert_documents_in_db(
        document_metadata_batch=doc_m_batch,
    )

    # Insert document content metadata
    for doc in documents:
        for k, v in doc.metadata.items():
            if isinstance(v, list):
                create_or_add_document_tag_list(
                    tag_key=k,
                    tag_values=v,
                    source=doc.source,
                    document_id=doc.id,
                    db_session=db_session,
                )
            else:
                create_or_add_document_tag(
                    tag_key=k,
                    tag_value=v,
                    source=doc.source,
                    document_id=doc.id,
                    db_session=db_session,
                )


@log_function_time()
def index_doc_batch(
@@ -121,6 +144,8 @@ def index_doc_batch(
    )

    logger.debug("Starting chunking")

    # The first chunk additionally contains the Title of the Document
    chunks: list[DocAwareChunk] = list(
        chain(*[chunker.chunk(document=document) for document in updatable_docs])
    )
@@ -1,7 +1,6 @@
from dataclasses import dataclass
from dataclasses import fields
from datetime import datetime
from typing import Any

from danswer.access.models import DocumentAccess
from danswer.configs.constants import DocumentSource
@@ -48,6 +47,7 @@ class DocAwareChunk(BaseChunk):
@dataclass
class IndexChunk(DocAwareChunk):
    embeddings: ChunkEmbedding
    title_embedding: Embedding | None


@dataclass
@@ -95,7 +95,7 @@ class InferenceChunk(BaseChunk):
    recency_bias: float
    score: float | None
    hidden: bool
    metadata: dict[str, Any]
    metadata: dict[str, str | list[str]]
    # Matched sections in the chunk. Uses Vespa syntax e.g. <hi>TEXT</hi>
    # to specify that a set of words should be highlighted. For example:
    # ["<hi>the</hi> <hi>answer</hi> is 42", "he couldn't find an <hi>answer</hi>"]
@@ -48,10 +48,16 @@ class Embedder:
        raise NotImplementedError


class Tag(BaseModel):
    tag_key: str
    tag_value: str


class BaseFilters(BaseModel):
    source_type: list[DocumentSource] | None = None
    document_set: list[str] | None = None
    time_cutoff: datetime | None = None
    tags: list[Tag] | None = None


class IndexFilters(BaseFilters):
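Illustrative only (not part of the commit): how a caller-supplied filter carrying the new tags field might be constructed.

from danswer.configs.constants import DocumentSource
from danswer.search.models import BaseFilters
from danswer.search.models import Tag

filters = BaseFilters(
    source_type=[DocumentSource.BOOKSTACK],
    tags=[Tag(tag_key="type", tag_value="book")],
)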
@@ -110,6 +116,7 @@ class SearchDoc(BaseModel):
    # since a standard search will never find a hidden doc, this can only ever
    # be `True` when doing an admin search
    hidden: bool
    metadata: dict[str, str | list[str]]
    score: float | None
    # Matched sections in the doc. Uses Vespa syntax e.g. <hi>TEXT</hi>
    # to specify that a set of words should be highlighted. For example:
@@ -121,6 +121,7 @@ def retrieval_preprocessing(
        source_type=preset_filters.source_type or predicted_source_filters,
        document_set=preset_filters.document_set,
        time_cutoff=preset_filters.time_cutoff or predicted_time_cutoff,
        tags=preset_filters.tags,  # Tags are never auto-extracted
        access_control_list=user_acl_filters,
    )
@@ -96,6 +96,7 @@ def chunks_to_search_docs(chunks: list[InferenceChunk] | None) -> list[SearchDoc
                source_type=chunk.source_type,
                boost=chunk.boost,
                hidden=chunk.hidden,
                metadata=chunk.metadata,
                score=chunk.score,
                match_highlights=chunk.match_highlights,
                updated_at=chunk.updated_at,
@@ -5,12 +5,29 @@ from pydantic import BaseModel
from pydantic import root_validator

from danswer.chat.models import RetrievalDocs
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import MessageType
from danswer.configs.constants import SearchFeedbackType
from danswer.search.models import BaseFilters
from danswer.search.models import RetrievalDetails
from danswer.search.models import SearchDoc
from danswer.search.models import SearchType
from danswer.search.models import Tag


class TagRequest(BaseModel):
    match_pattern: str | None
    # If this is empty or None, then tags for all sources are considered
    sources: list[DocumentSource] | None
    allow_prefix: bool = True  # This is currently the only option


class SourceTag(Tag):
    source: DocumentSource


class TagResponse(BaseModel):
    tags: list[SourceTag]


class SimpleQueryRequest(BaseModel):
@@ -9,6 +9,7 @@ from danswer.auth.users import current_user
from danswer.configs.chat_configs import DISABLE_LLM_CHUNK_FILTER
from danswer.db.engine import get_session
from danswer.db.models import User
from danswer.db.tag import get_tags_by_value_prefix_for_source_types
from danswer.document_index.factory import get_default_document_index
from danswer.document_index.vespa.index import VespaIndex
from danswer.one_shot_answer.answer_question import stream_search_answer
@@ -30,6 +31,9 @@ from danswer.server.query_and_chat.models import DocumentSearchRequest
from danswer.server.query_and_chat.models import HelperResponse
from danswer.server.query_and_chat.models import QueryValidationResponse
from danswer.server.query_and_chat.models import SimpleQueryRequest
from danswer.server.query_and_chat.models import SourceTag
from danswer.server.query_and_chat.models import TagRequest
from danswer.server.query_and_chat.models import TagResponse
from danswer.utils.logger import setup_logger

logger = setup_logger()
@@ -75,6 +79,29 @@ def admin_search(
    return AdminSearchResponse(documents=deduplicated_documents)


@basic_router.post("/valid-tags")
def get_tags(
    tag_request: TagRequest,
    _: User = Depends(current_user),
    db_session: Session = Depends(get_session),
) -> TagResponse:
    if not tag_request.allow_prefix:
        raise NotImplementedError("Cannot disable prefix match for now")

    db_tags = get_tags_by_value_prefix_for_source_types(
        tag_value_prefix=tag_request.match_pattern,
        sources=tag_request.sources,
        db_session=db_session,
    )
    server_tags = [
        SourceTag(
            tag_key=db_tag.tag_key, tag_value=db_tag.tag_value, source=db_tag.source
        )
        for db_tag in db_tags
    ]
    return TagResponse(tags=server_tags)


@basic_router.post("/search-intent")
def get_search_type(
    simple_query: SimpleQueryRequest, _: User = Depends(current_user)
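Illustrative request and response shapes for the new endpoint (not part of the commit). The URL prefix depends on where basic_router is mounted, and the lowercase source strings assume DocumentSource serializes to its enum value.

# POST <basic_router prefix>/valid-tags
request_body = {
    "match_pattern": "bo",     # prefix of tag_value to match; None returns all tags
    "sources": ["bookstack"],  # None or empty -> tags from all sources
    "allow_prefix": True,      # currently the only supported mode
}

# Example TagResponse body
response_body = {
    "tags": [
        {"tag_key": "type", "tag_value": "book", "source": "bookstack"},
    ]
}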
@@ -30,7 +30,11 @@ def send_chat_message(
        "chat_session_id": chat_session_id,
        "parent_message_id": parent_message,
        "prompt_id": 0,  # Global default Prompt
        "retrieval_options": {"run_search": "always", "real_time": True},
        "retrieval_options": {
            "run_search": "always",
            "real_time": True,
            "filters": {"tags": []},
        },
    }

    docs: list[dict] | None = None