2024-12-13 09:56:10 -08:00

578 lines
19 KiB
Python

import re
from datetime import datetime
from re import Match
import pytz
import timeago # type: ignore
from slack_sdk.models.blocks import ActionsBlock
from slack_sdk.models.blocks import Block
from slack_sdk.models.blocks import ButtonElement
from slack_sdk.models.blocks import ContextBlock
from slack_sdk.models.blocks import DividerBlock
from slack_sdk.models.blocks import HeaderBlock
from slack_sdk.models.blocks import Option
from slack_sdk.models.blocks import RadioButtonsElement
from slack_sdk.models.blocks import SectionBlock
from slack_sdk.models.blocks.basic_components import MarkdownTextObject
from slack_sdk.models.blocks.block_elements import ImageElement
from onyx.chat.models import ChatOnyxBotResponse
from onyx.configs.app_configs import DISABLE_GENERATIVE_AI
from onyx.configs.app_configs import WEB_DOMAIN
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import SearchFeedbackType
from onyx.configs.onyxbot_configs import DANSWER_BOT_NUM_DOCS_TO_DISPLAY
from onyx.context.search.models import SavedSearchDoc
from onyx.db.chat import get_chat_session_by_message_id
from onyx.db.engine import get_session_with_tenant
from onyx.db.models import ChannelConfig
from onyx.onyxbot.slack.constants import CONTINUE_IN_WEB_UI_ACTION_ID
from onyx.onyxbot.slack.constants import DISLIKE_BLOCK_ACTION_ID
from onyx.onyxbot.slack.constants import FEEDBACK_DOC_BUTTON_BLOCK_ACTION_ID
from onyx.onyxbot.slack.constants import FOLLOWUP_BUTTON_ACTION_ID
from onyx.onyxbot.slack.constants import FOLLOWUP_BUTTON_RESOLVED_ACTION_ID
from onyx.onyxbot.slack.constants import IMMEDIATE_RESOLVED_BUTTON_ACTION_ID
from onyx.onyxbot.slack.constants import LIKE_BLOCK_ACTION_ID
from onyx.onyxbot.slack.formatting import format_slack_message
from onyx.onyxbot.slack.icons import source_to_github_img_link
from onyx.onyxbot.slack.models import SlackMessageInfo
from onyx.onyxbot.slack.utils import build_continue_in_web_ui_id
from onyx.onyxbot.slack.utils import build_feedback_id
from onyx.onyxbot.slack.utils import remove_slack_text_interactions
from onyx.onyxbot.slack.utils import translate_vespa_highlight_to_slack
from onyx.utils.text_processing import decode_escapes
_MAX_BLURB_LEN = 45
def get_feedback_reminder_blocks(thread_link: str, include_followup: bool) -> Block:
text = (
f"Please provide feedback on <{thread_link}|this answer>. "
"This is essential to help us to improve the quality of the answers. "
"Please rate it by clicking the `Helpful` or `Not helpful` button. "
)
if include_followup:
text += "\n\nIf you need more help, click the `I need more help from a human!` button. "
text += "\n\nThanks!"
return SectionBlock(text=text)
def _process_citations_for_slack(text: str) -> str:
"""
Converts instances of [[x]](LINK) in the input text to Slack's link format <LINK|[x]>.
Args:
- text (str): The input string containing markdown links.
Returns:
- str: The string with markdown links converted to Slack format.
"""
# Regular expression to find all instances of [[x]](LINK)
pattern = r"\[\[(.*?)\]\]\((.*?)\)"
# Function to replace each found instance with Slack's format
def slack_link_format(match: Match) -> str:
link_text = match.group(1)
link_url = match.group(2)
# Account for empty link citations
if link_url == "":
return f"[{link_text}]"
return f"<{link_url}|[{link_text}]>"
# Substitute all matches in the input text
return re.sub(pattern, slack_link_format, text)
def _split_text(text: str, limit: int = 3000) -> list[str]:
if len(text) <= limit:
return [text]
chunks = []
while text:
if len(text) <= limit:
chunks.append(text)
break
# Find the nearest space before the limit to avoid splitting a word
split_at = text.rfind(" ", 0, limit)
if split_at == -1: # No spaces found, force split
split_at = limit
chunk = text[:split_at]
chunks.append(chunk)
text = text[split_at:].lstrip() # Remove leading spaces from the next chunk
return chunks
def _clean_markdown_link_text(text: str) -> str:
# Remove any newlines within the text
return text.replace("\n", " ").strip()
def _build_qa_feedback_block(
message_id: int, feedback_reminder_id: str | None = None
) -> Block:
return ActionsBlock(
block_id=build_feedback_id(message_id),
elements=[
ButtonElement(
action_id=LIKE_BLOCK_ACTION_ID,
text="👍 Helpful",
value=feedback_reminder_id,
),
ButtonElement(
action_id=DISLIKE_BLOCK_ACTION_ID,
text="👎 Not helpful",
value=feedback_reminder_id,
),
],
)
def get_document_feedback_blocks() -> Block:
return SectionBlock(
text=(
"- 'Up-Boost' if this document is a good source of information and should be "
"shown more often.\n"
"- 'Down-boost' if this document is a poor source of information and should be "
"shown less often.\n"
"- 'Hide' if this document is deprecated and should never be shown anymore."
),
accessory=RadioButtonsElement(
options=[
Option(
text=":thumbsup: Up-Boost",
value=SearchFeedbackType.ENDORSE.value,
),
Option(
text=":thumbsdown: Down-Boost",
value=SearchFeedbackType.REJECT.value,
),
Option(
text=":x: Hide",
value=SearchFeedbackType.HIDE.value,
),
]
),
)
def _build_doc_feedback_block(
message_id: int,
document_id: str,
document_rank: int,
) -> ButtonElement:
feedback_id = build_feedback_id(message_id, document_id, document_rank)
return ButtonElement(
action_id=FEEDBACK_DOC_BUTTON_BLOCK_ACTION_ID,
value=feedback_id,
text="Give Feedback",
)
def get_restate_blocks(
msg: str,
is_bot_msg: bool,
) -> list[Block]:
# Only the slash command needs this context because the user doesn't see their own input
if not is_bot_msg:
return []
return [
HeaderBlock(text="Responding to the Query"),
SectionBlock(text=f"```{msg}```"),
]
def _build_documents_blocks(
documents: list[SavedSearchDoc],
message_id: int | None,
num_docs_to_display: int = DANSWER_BOT_NUM_DOCS_TO_DISPLAY,
) -> list[Block]:
header_text = (
"Retrieved Documents" if DISABLE_GENERATIVE_AI else "Reference Documents"
)
seen_docs_identifiers = set()
section_blocks: list[Block] = [HeaderBlock(text=header_text)]
included_docs = 0
for rank, d in enumerate(documents):
if d.document_id in seen_docs_identifiers:
continue
seen_docs_identifiers.add(d.document_id)
# Strip newlines from the semantic identifier for Slackbot formatting
doc_sem_id = d.semantic_identifier.replace("\n", " ")
if d.source_type == DocumentSource.SLACK.value:
doc_sem_id = "#" + doc_sem_id
used_chars = len(doc_sem_id) + 3
match_str = translate_vespa_highlight_to_slack(d.match_highlights, used_chars)
included_docs += 1
header_line = f"{doc_sem_id}\n"
if d.link:
header_line = f"<{d.link}|{doc_sem_id}>\n"
updated_at_line = ""
if d.updated_at is not None:
updated_at_line = (
f"_Updated {timeago.format(d.updated_at, datetime.now(pytz.utc))}_\n"
)
body_text = f">{remove_slack_text_interactions(match_str)}"
block_text = header_line + updated_at_line + body_text
feedback: ButtonElement | dict = {}
if message_id is not None:
feedback = _build_doc_feedback_block(
message_id=message_id,
document_id=d.document_id,
document_rank=rank,
)
section_blocks.append(
SectionBlock(text=block_text, accessory=feedback),
)
section_blocks.append(DividerBlock())
if included_docs >= num_docs_to_display:
break
return section_blocks
def _build_sources_blocks(
cited_documents: list[tuple[int, SavedSearchDoc]],
num_docs_to_display: int = DANSWER_BOT_NUM_DOCS_TO_DISPLAY,
) -> list[Block]:
if not cited_documents:
return [
SectionBlock(
text="*Warning*: no sources were cited for this answer, so it may be unreliable 😔"
)
]
seen_docs_identifiers = set()
section_blocks: list[Block] = [SectionBlock(text="*Sources:*")]
included_docs = 0
for citation_num, d in cited_documents:
if d.document_id in seen_docs_identifiers:
continue
seen_docs_identifiers.add(d.document_id)
doc_sem_id = d.semantic_identifier
if d.source_type == DocumentSource.SLACK.value:
# for legacy reasons, before the switch to how Slack semantic identifiers are constructed
if "#" not in doc_sem_id:
doc_sem_id = "#" + doc_sem_id
# this is needed to try and prevent the line from overflowing
# if it does overflow, the image gets placed above the title and it
# looks bad
doc_sem_id = (
doc_sem_id[:_MAX_BLURB_LEN] + "..."
if len(doc_sem_id) > _MAX_BLURB_LEN
else doc_sem_id
)
owner_str = f"By {d.primary_owners[0]}" if d.primary_owners else None
days_ago_str = (
timeago.format(d.updated_at, datetime.now(pytz.utc))
if d.updated_at
else None
)
final_metadata_str = " | ".join(
([owner_str] if owner_str else [])
+ ([days_ago_str] if days_ago_str else [])
)
document_title = _clean_markdown_link_text(doc_sem_id)
img_link = source_to_github_img_link(d.source_type)
section_blocks.append(
ContextBlock(
elements=(
[
ImageElement(
image_url=img_link,
alt_text=f"{d.source_type.value} logo",
)
]
if img_link
else []
)
+ [
MarkdownTextObject(text=f"{document_title}")
if d.link == ""
else MarkdownTextObject(
text=f"*<{d.link}|[{citation_num}] {document_title}>*\n{final_metadata_str}"
),
]
)
)
if included_docs >= num_docs_to_display:
break
return section_blocks
def _priority_ordered_documents_blocks(
answer: ChatOnyxBotResponse,
) -> list[Block]:
docs_response = answer.docs if answer.docs else None
top_docs = docs_response.top_documents if docs_response else []
llm_doc_inds = answer.llm_selected_doc_indices or []
llm_docs = [top_docs[i] for i in llm_doc_inds]
remaining_docs = [
doc for idx, doc in enumerate(top_docs) if idx not in llm_doc_inds
]
priority_ordered_docs = llm_docs + remaining_docs
if not priority_ordered_docs:
return []
document_blocks = _build_documents_blocks(
documents=priority_ordered_docs,
message_id=answer.chat_message_id,
)
if document_blocks:
document_blocks = [DividerBlock()] + document_blocks
return document_blocks
def _build_citations_blocks(
answer: ChatOnyxBotResponse,
) -> list[Block]:
docs_response = answer.docs if answer.docs else None
top_docs = docs_response.top_documents if docs_response else []
citations = answer.citations or []
cited_docs = []
for citation in citations:
matching_doc = next(
(d for d in top_docs if d.document_id == citation.document_id),
None,
)
if matching_doc:
cited_docs.append((citation.citation_num, matching_doc))
cited_docs.sort()
citations_block = _build_sources_blocks(cited_documents=cited_docs)
return citations_block
def _build_qa_response_blocks(
answer: ChatOnyxBotResponse,
process_message_for_citations: bool = False,
) -> list[Block]:
retrieval_info = answer.docs
if not retrieval_info:
# This should not happen, even with no docs retrieved, there is still info returned
raise RuntimeError("Failed to retrieve docs, cannot answer question.")
formatted_answer = format_slack_message(answer.answer) if answer.answer else None
if DISABLE_GENERATIVE_AI:
return []
filter_block: Block | None = None
if (
retrieval_info.applied_time_cutoff
or retrieval_info.recency_bias_multiplier > 1
or retrieval_info.applied_source_filters
):
filter_text = "Filters: "
if retrieval_info.applied_source_filters:
sources_str = ", ".join(
[s.value for s in retrieval_info.applied_source_filters]
)
filter_text += f"`Sources in [{sources_str}]`"
if (
retrieval_info.applied_time_cutoff
or retrieval_info.recency_bias_multiplier > 1
):
filter_text += " and "
if retrieval_info.applied_time_cutoff is not None:
time_str = retrieval_info.applied_time_cutoff.strftime("%b %d, %Y")
filter_text += f"`Docs Updated >= {time_str}` "
if retrieval_info.recency_bias_multiplier > 1:
if retrieval_info.applied_time_cutoff is not None:
filter_text += "+ "
filter_text += "`Prioritize Recently Updated Docs`"
filter_block = SectionBlock(text=f"_{filter_text}_")
if not formatted_answer:
answer_blocks = [
SectionBlock(
text="Sorry, I was unable to find an answer, but I did find some potentially relevant docs 🤓"
)
]
else:
answer_processed = decode_escapes(
remove_slack_text_interactions(formatted_answer)
)
if process_message_for_citations:
answer_processed = _process_citations_for_slack(answer_processed)
answer_blocks = [
SectionBlock(text=text) for text in _split_text(answer_processed)
]
response_blocks: list[Block] = []
if filter_block is not None:
response_blocks.append(filter_block)
response_blocks.extend(answer_blocks)
return response_blocks
def _build_continue_in_web_ui_block(
tenant_id: str | None,
message_id: int | None,
) -> Block:
if message_id is None:
raise ValueError("No message id provided to build continue in web ui block")
with get_session_with_tenant(tenant_id) as db_session:
chat_session = get_chat_session_by_message_id(
db_session=db_session,
message_id=message_id,
)
return ActionsBlock(
block_id=build_continue_in_web_ui_id(message_id),
elements=[
ButtonElement(
action_id=CONTINUE_IN_WEB_UI_ACTION_ID,
text="Continue Chat in Onyx!",
style="primary",
url=f"{WEB_DOMAIN}/chat?slackChatId={chat_session.id}",
),
],
)
def _build_follow_up_block(message_id: int | None) -> ActionsBlock:
return ActionsBlock(
block_id=build_feedback_id(message_id) if message_id is not None else None,
elements=[
ButtonElement(
action_id=IMMEDIATE_RESOLVED_BUTTON_ACTION_ID,
style="primary",
text="I'm all set!",
),
ButtonElement(
action_id=FOLLOWUP_BUTTON_ACTION_ID,
style="danger",
text="I need more help from a human!",
),
],
)
def build_follow_up_resolved_blocks(
tag_ids: list[str], group_ids: list[str]
) -> list[Block]:
tag_str = " ".join([f"<@{tag}>" for tag in tag_ids])
if tag_str:
tag_str += " "
group_str = " ".join([f"<!subteam^{group_id}|>" for group_id in group_ids])
if group_str:
group_str += " "
text = (
tag_str
+ group_str
+ "Someone has requested more help.\n\n:point_down:Please mark this resolved after answering!"
)
text_block = SectionBlock(text=text)
button_block = ActionsBlock(
elements=[
ButtonElement(
action_id=FOLLOWUP_BUTTON_RESOLVED_ACTION_ID,
style="primary",
text="Mark Resolved",
)
]
)
return [text_block, button_block]
def build_slack_response_blocks(
answer: ChatOnyxBotResponse,
tenant_id: str | None,
message_info: SlackMessageInfo,
channel_conf: ChannelConfig | None,
use_citations: bool,
feedback_reminder_id: str | None,
skip_ai_feedback: bool = False,
) -> list[Block]:
"""
This function is a top level function that builds all the blocks for the Slack response.
It also handles combining all the blocks together.
"""
# If called with the OnyxBot slash command, the question is lost so we have to reshow it
restate_question_block = get_restate_blocks(
message_info.thread_messages[-1].message, message_info.is_bot_msg
)
answer_blocks = _build_qa_response_blocks(
answer=answer,
process_message_for_citations=use_citations,
)
web_follow_up_block = []
if channel_conf and channel_conf.get("show_continue_in_web_ui"):
web_follow_up_block.append(
_build_continue_in_web_ui_block(
tenant_id=tenant_id,
message_id=answer.chat_message_id,
)
)
follow_up_block = []
if channel_conf and channel_conf.get("follow_up_tags") is not None:
follow_up_block.append(
_build_follow_up_block(message_id=answer.chat_message_id)
)
ai_feedback_block = []
if answer.chat_message_id is not None and not skip_ai_feedback:
ai_feedback_block.append(
_build_qa_feedback_block(
message_id=answer.chat_message_id,
feedback_reminder_id=feedback_reminder_id,
)
)
citations_blocks = []
document_blocks = []
if use_citations and answer.citations:
citations_blocks = _build_citations_blocks(answer)
else:
document_blocks = _priority_ordered_documents_blocks(answer)
citations_divider = [DividerBlock()] if citations_blocks else []
buttons_divider = [DividerBlock()] if web_follow_up_block or follow_up_block else []
all_blocks = (
restate_question_block
+ answer_blocks
+ ai_feedback_block
+ citations_divider
+ citations_blocks
+ document_blocks
+ buttons_divider
+ web_follow_up_block
+ follow_up_block
)
return all_blocks