Replace IDs with names in Slack connector

This commit is contained in:
Weves 2023-07-07 18:03:55 -07:00 committed by Chris Weaver
parent 79013ac9fd
commit 3494d6a13a
3 changed files with 73 additions and 7 deletions

View File

@ -17,6 +17,7 @@ from danswer.connectors.models import Section
from danswer.connectors.slack.utils import get_message_link
from danswer.connectors.slack.utils import make_slack_api_call_paginated
from danswer.connectors.slack.utils import make_slack_api_rate_limited
from danswer.connectors.slack.utils import UserIdReplacer
from danswer.utils.logging import setup_logger
from slack_sdk import WebClient
from slack_sdk.web import SlackResponse
@ -83,7 +84,12 @@ def get_thread(client: WebClient, channel_id: str, thread_id: str) -> ThreadType
return threads
def thread_to_doc(workspace: str, channel: ChannelType, thread: ThreadType) -> Document:
def thread_to_doc(
workspace: str,
channel: ChannelType,
thread: ThreadType,
user_id_replacer: UserIdReplacer,
) -> Document:
channel_id = channel["id"]
return Document(
id=f"{channel_id}__{thread[0]['ts']}",
@ -92,7 +98,7 @@ def thread_to_doc(workspace: str, channel: ChannelType, thread: ThreadType) -> D
link=get_message_link(
event=m, workspace=workspace, channel_id=channel_id
),
text=cast(str, m["text"]),
text=user_id_replacer.replace_user_ids_with_names(cast(str, m["text"])),
)
for m in thread
],
@ -131,6 +137,8 @@ def get_all_docs(
msg_filter_func: Callable[[MessageType], bool] = _default_msg_filter,
) -> Generator[Document, None, None]:
"""Get all documents in the workspace, channel by channel"""
user_id_replacer = UserIdReplacer(client=client)
channels = get_channels(client)
for channel in channels:
@ -156,7 +164,10 @@ def get_all_docs(
if filtered_thread:
channel_docs += 1
yield thread_to_doc(
workspace=workspace, channel=channel, thread=filtered_thread
workspace=workspace,
channel=channel,
thread=filtered_thread,
user_id_replacer=user_id_replacer,
)
logger.info(

View File

@ -1,11 +1,16 @@
import re
import time
from collections.abc import Callable
from typing import Any
from typing import cast
from danswer.utils.logging import setup_logger
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
from slack_sdk.web import SlackResponse
logger = setup_logger()
# number of messages we request per page when fetching paginated slack messages
_SLACK_LIMIT = 900
@ -72,3 +77,53 @@ def make_slack_api_rate_limited(
raise Exception(f"Max retries ({max_retries}) exceeded")
return rate_limited_call
class UserIdReplacer:
"""Utility class to replace user IDs with usernames in a message.
Handles caching, so the same request is not made multiple times
for the same user ID"""
def __init__(self, client: WebClient) -> None:
self._client = client
self._user_id_to_name_map: dict[str, str] = {}
def _get_slack_user_name(self, user_id: str) -> str:
if user_id not in self._user_id_to_name_map:
try:
response = make_slack_api_rate_limited(self._client.users_info)(
user=user_id
)
# prefer display name if set, since that is what is shown in Slack
self._user_id_to_name_map[user_id] = (
response["user"]["profile"]["display_name"]
or response["user"]["profile"]["real_name"]
)
except SlackApiError as e:
logger.exception(
f"Error fetching data for user {user_id}: {e.response['error']}"
)
raise
return self._user_id_to_name_map[user_id]
def replace_user_ids_with_names(self, message: str) -> str:
# Find user IDs in the message
user_ids = re.findall("<@(.*?)>", message)
# Iterate over each user ID found
for user_id in user_ids:
try:
if user_id in self._user_id_to_name_map:
user_name = self._user_id_to_name_map[user_id]
else:
user_name = self._get_slack_user_name(user_id)
# Replace the user ID with the username in the message
message = message.replace(f"<@{user_id}>", f"@{user_name}")
except Exception:
logger.exception(
f"Unable to replace user ID with username for user_id '{user_id}"
)
return message

View File

@ -68,13 +68,13 @@ def _process_quotes(
for quote_dict in quotes.values():
doc_id = str(quote_dict.get("document_id", ""))
doc_link = quote_dict.get("link")
doc_name = quote_dict.get("semantic_identifier")
doc_name = str(quote_dict.get("semantic_identifier", ""))
if doc_link and doc_name and doc_id and doc_id not in doc_identifiers:
doc_identifiers.append(str(doc_id))
doc_identifiers.append(doc_id)
custom_semantic_identifier = _build_custom_semantic_identifier(
semantic_identifier=doc_name,
blurb=quote_dict.get("blurb", ""),
source=quote_dict.get("source_type", ""),
blurb=str(quote_dict.get("blurb", "")),
source=str(quote_dict.get("source_type", "")),
)
quote_lines.append(f"- <{doc_link}|{custom_semantic_identifier}>")