Replace IDs with names in Slack connector

This commit is contained in:
Weves
2023-07-07 18:03:55 -07:00
committed by Chris Weaver
parent 79013ac9fd
commit 3494d6a13a
3 changed files with 73 additions and 7 deletions

View File

@@ -17,6 +17,7 @@ from danswer.connectors.models import Section
from danswer.connectors.slack.utils import get_message_link from danswer.connectors.slack.utils import get_message_link
from danswer.connectors.slack.utils import make_slack_api_call_paginated from danswer.connectors.slack.utils import make_slack_api_call_paginated
from danswer.connectors.slack.utils import make_slack_api_rate_limited from danswer.connectors.slack.utils import make_slack_api_rate_limited
from danswer.connectors.slack.utils import UserIdReplacer
from danswer.utils.logging import setup_logger from danswer.utils.logging import setup_logger
from slack_sdk import WebClient from slack_sdk import WebClient
from slack_sdk.web import SlackResponse from slack_sdk.web import SlackResponse
@@ -83,7 +84,12 @@ def get_thread(client: WebClient, channel_id: str, thread_id: str) -> ThreadType
return threads return threads
def thread_to_doc(workspace: str, channel: ChannelType, thread: ThreadType) -> Document: def thread_to_doc(
workspace: str,
channel: ChannelType,
thread: ThreadType,
user_id_replacer: UserIdReplacer,
) -> Document:
channel_id = channel["id"] channel_id = channel["id"]
return Document( return Document(
id=f"{channel_id}__{thread[0]['ts']}", id=f"{channel_id}__{thread[0]['ts']}",
@@ -92,7 +98,7 @@ def thread_to_doc(workspace: str, channel: ChannelType, thread: ThreadType) -> D
link=get_message_link( link=get_message_link(
event=m, workspace=workspace, channel_id=channel_id event=m, workspace=workspace, channel_id=channel_id
), ),
text=cast(str, m["text"]), text=user_id_replacer.replace_user_ids_with_names(cast(str, m["text"])),
) )
for m in thread for m in thread
], ],
@@ -131,6 +137,8 @@ def get_all_docs(
msg_filter_func: Callable[[MessageType], bool] = _default_msg_filter, msg_filter_func: Callable[[MessageType], bool] = _default_msg_filter,
) -> Generator[Document, None, None]: ) -> Generator[Document, None, None]:
"""Get all documents in the workspace, channel by channel""" """Get all documents in the workspace, channel by channel"""
user_id_replacer = UserIdReplacer(client=client)
channels = get_channels(client) channels = get_channels(client)
for channel in channels: for channel in channels:
@@ -156,7 +164,10 @@ def get_all_docs(
if filtered_thread: if filtered_thread:
channel_docs += 1 channel_docs += 1
yield thread_to_doc( yield thread_to_doc(
workspace=workspace, channel=channel, thread=filtered_thread workspace=workspace,
channel=channel,
thread=filtered_thread,
user_id_replacer=user_id_replacer,
) )
logger.info( logger.info(

View File

@@ -1,11 +1,16 @@
import re
import time import time
from collections.abc import Callable from collections.abc import Callable
from typing import Any from typing import Any
from typing import cast from typing import cast
from danswer.utils.logging import setup_logger
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError from slack_sdk.errors import SlackApiError
from slack_sdk.web import SlackResponse from slack_sdk.web import SlackResponse
logger = setup_logger()
# number of messages we request per page when fetching paginated slack messages # number of messages we request per page when fetching paginated slack messages
_SLACK_LIMIT = 900 _SLACK_LIMIT = 900
@@ -72,3 +77,53 @@ def make_slack_api_rate_limited(
raise Exception(f"Max retries ({max_retries}) exceeded") raise Exception(f"Max retries ({max_retries}) exceeded")
return rate_limited_call return rate_limited_call
class UserIdReplacer:
"""Utility class to replace user IDs with usernames in a message.
Handles caching, so the same request is not made multiple times
for the same user ID"""
def __init__(self, client: WebClient) -> None:
self._client = client
self._user_id_to_name_map: dict[str, str] = {}
def _get_slack_user_name(self, user_id: str) -> str:
if user_id not in self._user_id_to_name_map:
try:
response = make_slack_api_rate_limited(self._client.users_info)(
user=user_id
)
# prefer display name if set, since that is what is shown in Slack
self._user_id_to_name_map[user_id] = (
response["user"]["profile"]["display_name"]
or response["user"]["profile"]["real_name"]
)
except SlackApiError as e:
logger.exception(
f"Error fetching data for user {user_id}: {e.response['error']}"
)
raise
return self._user_id_to_name_map[user_id]
def replace_user_ids_with_names(self, message: str) -> str:
# Find user IDs in the message
user_ids = re.findall("<@(.*?)>", message)
# Iterate over each user ID found
for user_id in user_ids:
try:
if user_id in self._user_id_to_name_map:
user_name = self._user_id_to_name_map[user_id]
else:
user_name = self._get_slack_user_name(user_id)
# Replace the user ID with the username in the message
message = message.replace(f"<@{user_id}>", f"@{user_name}")
except Exception:
logger.exception(
f"Unable to replace user ID with username for user_id '{user_id}"
)
return message

View File

@@ -68,13 +68,13 @@ def _process_quotes(
for quote_dict in quotes.values(): for quote_dict in quotes.values():
doc_id = str(quote_dict.get("document_id", "")) doc_id = str(quote_dict.get("document_id", ""))
doc_link = quote_dict.get("link") doc_link = quote_dict.get("link")
doc_name = quote_dict.get("semantic_identifier") doc_name = str(quote_dict.get("semantic_identifier", ""))
if doc_link and doc_name and doc_id and doc_id not in doc_identifiers: if doc_link and doc_name and doc_id and doc_id not in doc_identifiers:
doc_identifiers.append(str(doc_id)) doc_identifiers.append(doc_id)
custom_semantic_identifier = _build_custom_semantic_identifier( custom_semantic_identifier = _build_custom_semantic_identifier(
semantic_identifier=doc_name, semantic_identifier=doc_name,
blurb=quote_dict.get("blurb", ""), blurb=str(quote_dict.get("blurb", "")),
source=quote_dict.get("source_type", ""), source=str(quote_dict.get("source_type", "")),
) )
quote_lines.append(f"- <{doc_link}|{custom_semantic_identifier}>") quote_lines.append(f"- <{doc_link}|{custom_semantic_identifier}>")