From 3494d6a13a1c4865d4c2bfc1fb70054ffa2d0765 Mon Sep 17 00:00:00 2001
From: Weves
Date: Fri, 7 Jul 2023 18:03:55 -0700
Subject: [PATCH] Replace IDs with names in Slack connector

---
 backend/danswer/connectors/slack/connector.py | 17 +++++-
 backend/danswer/connectors/slack/utils.py     | 55 +++++++++++++++++++
 backend/danswer/listeners/slack_listener.py   |  8 +--
 3 files changed, 73 insertions(+), 7 deletions(-)

diff --git a/backend/danswer/connectors/slack/connector.py b/backend/danswer/connectors/slack/connector.py
index 83d82538b..fda9754cb 100644
--- a/backend/danswer/connectors/slack/connector.py
+++ b/backend/danswer/connectors/slack/connector.py
@@ -17,6 +17,7 @@ from danswer.connectors.models import Section
 from danswer.connectors.slack.utils import get_message_link
 from danswer.connectors.slack.utils import make_slack_api_call_paginated
 from danswer.connectors.slack.utils import make_slack_api_rate_limited
+from danswer.connectors.slack.utils import UserIdReplacer
 from danswer.utils.logging import setup_logger
 from slack_sdk import WebClient
 from slack_sdk.web import SlackResponse
@@ -83,7 +84,12 @@ def get_thread(client: WebClient, channel_id: str, thread_id: str) -> ThreadType
     return threads
 
 
-def thread_to_doc(workspace: str, channel: ChannelType, thread: ThreadType) -> Document:
+def thread_to_doc(
+    workspace: str,
+    channel: ChannelType,
+    thread: ThreadType,
+    user_id_replacer: UserIdReplacer,
+) -> Document:
     channel_id = channel["id"]
     return Document(
         id=f"{channel_id}__{thread[0]['ts']}",
@@ -92,7 +98,7 @@ def thread_to_doc(workspace: str, channel: ChannelType, thread: ThreadType) -> D
                 link=get_message_link(
                     event=m, workspace=workspace, channel_id=channel_id
                 ),
-                text=cast(str, m["text"]),
+                text=user_id_replacer.replace_user_ids_with_names(cast(str, m["text"])),
             )
             for m in thread
         ],
@@ -131,6 +137,8 @@ def get_all_docs(
     msg_filter_func: Callable[[MessageType], bool] = _default_msg_filter,
 ) -> Generator[Document, None, None]:
     """Get all documents in the workspace, channel by channel"""
+    user_id_replacer = UserIdReplacer(client=client)
+
     channels = get_channels(client)
 
     for channel in channels:
@@ -156,7 +164,10 @@ def get_all_docs(
                 if filtered_thread:
                     channel_docs += 1
                     yield thread_to_doc(
-                        workspace=workspace, channel=channel, thread=filtered_thread
+                        workspace=workspace,
+                        channel=channel,
+                        thread=filtered_thread,
+                        user_id_replacer=user_id_replacer,
                     )
 
         logger.info(
diff --git a/backend/danswer/connectors/slack/utils.py b/backend/danswer/connectors/slack/utils.py
index 430858b46..1164d3e59 100644
--- a/backend/danswer/connectors/slack/utils.py
+++ b/backend/danswer/connectors/slack/utils.py
@@ -1,11 +1,16 @@
+import re
 import time
 from collections.abc import Callable
 from typing import Any
 from typing import cast
 
+from danswer.utils.logging import setup_logger
+from slack_sdk import WebClient
 from slack_sdk.errors import SlackApiError
 from slack_sdk.web import SlackResponse
 
+logger = setup_logger()
+
 # number of messages we request per page when fetching paginated slack messages
 _SLACK_LIMIT = 900
 
@@ -72,3 +77,53 @@ def make_slack_api_rate_limited(
         raise Exception(f"Max retries ({max_retries}) exceeded")
 
     return rate_limited_call
+
+
+class UserIdReplacer:
+    """Utility class to replace user IDs with usernames in a message.
+    Handles caching, so the same request is not made multiple times
+    for the same user ID"""
+
+    def __init__(self, client: WebClient) -> None:
+        self._client = client
+        self._user_id_to_name_map: dict[str, str] = {}
+
+    def _get_slack_user_name(self, user_id: str) -> str:
+        if user_id not in self._user_id_to_name_map:
+            try:
+                response = make_slack_api_rate_limited(self._client.users_info)(
+                    user=user_id
+                )
+                # prefer display name if set, since that is what is shown in Slack
+                self._user_id_to_name_map[user_id] = (
+                    response["user"]["profile"]["display_name"]
+                    or response["user"]["profile"]["real_name"]
+                )
+            except SlackApiError as e:
+                logger.exception(
+                    f"Error fetching data for user {user_id}: {e.response['error']}"
+                )
+                raise
+
+        return self._user_id_to_name_map[user_id]
+
+    def replace_user_ids_with_names(self, message: str) -> str:
+        # Find user IDs in the message
+        user_ids = re.findall(r"<@(.*?)>", message)
+
+        # Best effort: a failed lookup is logged and that mention is left as-is
+        for user_id in user_ids:
+            try:
+                if user_id in self._user_id_to_name_map:
+                    user_name = self._user_id_to_name_map[user_id]
+                else:
+                    user_name = self._get_slack_user_name(user_id)
+
+                # Replace the user ID with the username in the message
+                message = message.replace(f"<@{user_id}>", f"@{user_name}")
+            except Exception:
+                logger.exception(
+                    f"Unable to replace user ID with username for user_id '{user_id}'"
+                )
+
+        return message
diff --git a/backend/danswer/listeners/slack_listener.py b/backend/danswer/listeners/slack_listener.py
index 7a940f4d6..5663cf0c3 100644
--- a/backend/danswer/listeners/slack_listener.py
+++ b/backend/danswer/listeners/slack_listener.py
@@ -68,13 +68,13 @@ def _process_quotes(
     for quote_dict in quotes.values():
         doc_id = str(quote_dict.get("document_id", ""))
         doc_link = quote_dict.get("link")
-        doc_name = quote_dict.get("semantic_identifier")
+        doc_name = str(quote_dict.get("semantic_identifier", ""))
         if doc_link and doc_name and doc_id and doc_id not in doc_identifiers:
-            doc_identifiers.append(str(doc_id))
+            doc_identifiers.append(doc_id)
             custom_semantic_identifier = _build_custom_semantic_identifier(
                 semantic_identifier=doc_name,
-                blurb=quote_dict.get("blurb", ""),
-                source=quote_dict.get("source_type", ""),
+                blurb=str(quote_dict.get("blurb", "")),
+                source=str(quote_dict.get("source_type", "")),
             )
             quote_lines.append(f"- <{doc_link}|{custom_semantic_identifier}>")
 