remove slack workspace (#3394)

* remove slack workspace

* update client tokens

* fix up

* clean up docs

* fix up tests
This commit is contained in:
pablonyx 2024-12-11 17:01:43 -08:00 committed by GitHub
parent 748b79b0ef
commit 0770a587f1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 16 additions and 175 deletions

View File

@ -41,7 +41,6 @@ from danswer.connectors.salesforce.connector import SalesforceConnector
from danswer.connectors.sharepoint.connector import SharepointConnector from danswer.connectors.sharepoint.connector import SharepointConnector
from danswer.connectors.slab.connector import SlabConnector from danswer.connectors.slab.connector import SlabConnector
from danswer.connectors.slack.connector import SlackPollConnector from danswer.connectors.slack.connector import SlackPollConnector
from danswer.connectors.slack.load_connector import SlackLoadConnector
from danswer.connectors.teams.connector import TeamsConnector from danswer.connectors.teams.connector import TeamsConnector
from danswer.connectors.web.connector import WebConnector from danswer.connectors.web.connector import WebConnector
from danswer.connectors.wikipedia.connector import WikipediaConnector from danswer.connectors.wikipedia.connector import WikipediaConnector
@ -64,7 +63,6 @@ def identify_connector_class(
DocumentSource.WEB: WebConnector, DocumentSource.WEB: WebConnector,
DocumentSource.FILE: LocalFileConnector, DocumentSource.FILE: LocalFileConnector,
DocumentSource.SLACK: { DocumentSource.SLACK: {
InputType.LOAD_STATE: SlackLoadConnector,
InputType.POLL: SlackPollConnector, InputType.POLL: SlackPollConnector,
InputType.SLIM_RETRIEVAL: SlackPollConnector, InputType.SLIM_RETRIEVAL: SlackPollConnector,
}, },

View File

@ -134,7 +134,6 @@ def get_latest_message_time(thread: ThreadType) -> datetime:
def thread_to_doc( def thread_to_doc(
workspace: str,
channel: ChannelType, channel: ChannelType,
thread: ThreadType, thread: ThreadType,
slack_cleaner: SlackTextCleaner, slack_cleaner: SlackTextCleaner,
@ -179,9 +178,7 @@ def thread_to_doc(
id=f"{channel_id}__{thread[0]['ts']}", id=f"{channel_id}__{thread[0]['ts']}",
sections=[ sections=[
Section( Section(
link=get_message_link( link=get_message_link(event=m, client=client, channel_id=channel_id),
event=m, workspace=workspace, channel_id=channel_id
),
text=slack_cleaner.index_clean(cast(str, m["text"])), text=slack_cleaner.index_clean(cast(str, m["text"])),
) )
for m in thread for m in thread
@ -265,7 +262,6 @@ def filter_channels(
def _get_all_docs( def _get_all_docs(
client: WebClient, client: WebClient,
workspace: str,
channels: list[str] | None = None, channels: list[str] | None = None,
channel_name_regex_enabled: bool = False, channel_name_regex_enabled: bool = False,
oldest: str | None = None, oldest: str | None = None,
@ -312,7 +308,6 @@ def _get_all_docs(
if filtered_thread: if filtered_thread:
channel_docs += 1 channel_docs += 1
yield thread_to_doc( yield thread_to_doc(
workspace=workspace,
channel=channel, channel=channel,
thread=filtered_thread, thread=filtered_thread,
slack_cleaner=slack_cleaner, slack_cleaner=slack_cleaner,
@ -375,14 +370,12 @@ def _get_all_doc_ids(
class SlackPollConnector(PollConnector, SlimConnector): class SlackPollConnector(PollConnector, SlimConnector):
def __init__( def __init__(
self, self,
workspace: str,
channels: list[str] | None = None, channels: list[str] | None = None,
# if specified, will treat the specified channel strings as # if specified, will treat the specified channel strings as
# regexes, and will only index channels that fully match the regexes # regexes, and will only index channels that fully match the regexes
channel_regex_enabled: bool = False, channel_regex_enabled: bool = False,
batch_size: int = INDEX_BATCH_SIZE, batch_size: int = INDEX_BATCH_SIZE,
) -> None: ) -> None:
self.workspace = workspace
self.channels = channels self.channels = channels
self.channel_regex_enabled = channel_regex_enabled self.channel_regex_enabled = channel_regex_enabled
self.batch_size = batch_size self.batch_size = batch_size
@ -416,7 +409,6 @@ class SlackPollConnector(PollConnector, SlimConnector):
documents: list[Document] = [] documents: list[Document] = []
for document in _get_all_docs( for document in _get_all_docs(
client=self.client, client=self.client,
workspace=self.workspace,
channels=self.channels, channels=self.channels,
channel_name_regex_enabled=self.channel_regex_enabled, channel_name_regex_enabled=self.channel_regex_enabled,
# NOTE: need to impute to `None` instead of using 0.0, since Slack will # NOTE: need to impute to `None` instead of using 0.0, since Slack will
@ -440,7 +432,6 @@ if __name__ == "__main__":
slack_channel = os.environ.get("SLACK_CHANNEL") slack_channel = os.environ.get("SLACK_CHANNEL")
connector = SlackPollConnector( connector = SlackPollConnector(
workspace=os.environ["SLACK_WORKSPACE"],
channels=[slack_channel] if slack_channel else None, channels=[slack_channel] if slack_channel else None,
) )
connector.load_credentials({"slack_bot_token": os.environ["SLACK_BOT_TOKEN"]}) connector.load_credentials({"slack_bot_token": os.environ["SLACK_BOT_TOKEN"]})

View File

@ -1,140 +0,0 @@
import json
import os
from datetime import datetime
from datetime import timezone
from pathlib import Path
from typing import Any
from typing import cast
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.connectors.slack.connector import filter_channels
from danswer.connectors.slack.utils import get_message_link
from danswer.utils.logger import setup_logger
logger = setup_logger()
def get_event_time(event: dict[str, Any]) -> datetime | None:
ts = event.get("ts")
if not ts:
return None
return datetime.fromtimestamp(float(ts), tz=timezone.utc)
class SlackLoadConnector(LoadConnector):
    """Build documents from a Slack export directory on local disk.

    WARNING: DEPRECATED, DO NOT USE

    The export directory is expected to contain a ``channels.json`` file plus
    one sub-directory per channel (named after the channel), each holding JSON
    files that decode to a list of Slack event dictionaries.
    """

    def __init__(
        self,
        workspace: str,
        export_path_str: str,
        channels: list[str] | None = None,
        # if specified, will treat the specified channel strings as
        # regexes, and will only index channels that fully match the regexes
        channel_regex_enabled: bool = False,
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        """Store the connector configuration; no I/O happens here.

        Args:
            workspace: Workspace identifier passed to ``get_message_link``.
            export_path_str: Path of the Slack export directory.
            channels: Channel selection passed to ``filter_channels``.
            channel_regex_enabled: Passed to ``filter_channels`` alongside
                ``channels``.
            batch_size: Maximum number of documents per yielded batch.
        """
        self.workspace = workspace
        self.channels = channels
        self.channel_regex_enabled = channel_regex_enabled
        self.export_path_str = export_path_str
        self.batch_size = batch_size

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """Accept (and ignore) credentials; warn if any were supplied."""
        if credentials:
            logger.warning("Unexpected credentials provided for Slack Load Connector")
        return None

    @staticmethod
    def _process_batch_event(
        slack_event: dict[str, Any],
        channel: dict[str, Any],
        matching_doc: Document | None,
        workspace: str,
    ) -> Document | None:
        """Convert one Slack event into a new or extended ``Document``.

        Args:
            slack_event: The raw event dictionary from the export.
            channel: The channel dictionary the event belongs to.
            matching_doc: The document already built for this event's thread,
                if any. When given, the event is appended to its sections.
            workspace: Workspace identifier passed to ``get_message_link``.

        Returns:
            The resulting document, or ``None`` when the event is not a
            message or is a ``channel_join`` notification.
        """
        is_indexable_message = (
            slack_event["type"] == "message"
            and slack_event.get("subtype") != "channel_join"
        )
        if not is_indexable_message:
            return None

        section = Section(
            link=get_message_link(
                event=slack_event,
                workspace=workspace,
                channel_id=channel["id"],
            ),
            text=slack_event["text"],
        )

        if matching_doc:
            # Thread reply: keep the identity of the thread's document and
            # append this message as an additional section.
            return Document(
                id=matching_doc.id,
                sections=matching_doc.sections + [section],
                source=matching_doc.source,
                semantic_identifier=matching_doc.semantic_identifier,
                title="",  # slack docs don't really have a "title"
                doc_updated_at=get_event_time(slack_event),
                metadata=matching_doc.metadata,
            )

        return Document(
            id=slack_event["ts"],
            sections=[section],
            source=DocumentSource.SLACK,
            semantic_identifier=channel["name"],
            title="",  # slack docs don't really have a "title"
            doc_updated_at=get_event_time(slack_event),
            metadata={},
        )

    def load_from_state(self) -> GenerateDocumentsOutput:
        """Read the export and yield its documents in batches.

        Every document is yielded exactly once, in batches of at most
        ``batch_size`` documents. Documents are only emitted after the whole
        export has been read, because a later event may still append a section
        to an earlier thread document.

        Yields:
            Lists of ``Document`` objects. Nothing is yielded when the export
            produces no documents.
        """
        export_path = Path(self.export_path_str)

        with open(export_path / "channels.json", encoding="utf-8") as f:
            all_channels = json.load(f)

        filtered_channels = filter_channels(
            all_channels, self.channels, self.channel_regex_enabled
        )

        documents_by_id: dict[str, Document] = {}
        for channel_info in filtered_channels:
            channel_dir_path = export_path / cast(str, channel_info["name"])
            # os.listdir returns entries in arbitrary order. Sort them so runs
            # are deterministic and (presumably, for date-named export files --
            # TODO confirm) thread parents are seen before their replies.
            for file_name in sorted(os.listdir(channel_dir_path)):
                with open(channel_dir_path / file_name, encoding="utf-8") as f:
                    events = cast(list[dict[str, Any]], json.load(f))

                for slack_event in events:
                    doc = self._process_batch_event(
                        slack_event=slack_event,
                        channel=channel_info,
                        matching_doc=documents_by_id.get(
                            slack_event.get("thread_ts", "")
                        ),
                        workspace=self.workspace,
                    )
                    if doc:
                        documents_by_id[doc.id] = doc

        all_documents = list(documents_by_id.values())
        # Guard against a non-positive batch size, which would make range()
        # raise (step 0) or yield nothing (negative step).
        step = max(self.batch_size, 1)
        for start in range(0, len(all_documents), step):
            yield all_documents[start : start + step]

View File

@ -2,6 +2,7 @@ import re
import time import time
from collections.abc import Callable from collections.abc import Callable
from collections.abc import Generator from collections.abc import Generator
from functools import lru_cache
from functools import wraps from functools import wraps
from typing import Any from typing import Any
from typing import cast from typing import cast
@ -21,19 +22,21 @@ basic_retry_wrapper = retry_builder()
_SLACK_LIMIT = 900 _SLACK_LIMIT = 900
@lru_cache()
def get_base_url(token: str) -> str:
    """Return the ``url`` reported by Slack's auth test for this token.

    A new ``WebClient`` is built from ``token`` and its ``auth_test()``
    response is read; results are memoized per token by ``lru_cache``.
    """
    auth_info = WebClient(token=token).auth_test()
    return auth_info["url"]
def get_message_link( def get_message_link(
event: dict[str, Any], workspace: str, channel_id: str | None = None event: dict[str, Any], client: WebClient, channel_id: str | None = None
) -> str: ) -> str:
channel_id = channel_id or cast( channel_id = channel_id or event["channel"]
str, event["channel"] message_ts = event["ts"]
) # channel must either be present in the event or passed in response = client.chat_getPermalink(channel=channel_id, message_ts=message_ts)
message_ts = cast(str, event["ts"]) permalink = response["permalink"]
message_ts_without_dot = message_ts.replace(".", "") return permalink
thread_ts = cast(str | None, event.get("thread_ts"))
return (
f"https://{workspace}.slack.com/archives/{channel_id}/p{message_ts_without_dot}"
+ (f"?thread_ts={thread_ts}" if thread_ts else "")
)
def _make_slack_api_call_logged( def _make_slack_api_call_logged(

View File

@ -67,7 +67,6 @@ def test_slack_permission_sync(
input_type=InputType.POLL, input_type=InputType.POLL,
source=DocumentSource.SLACK, source=DocumentSource.SLACK,
connector_specific_config={ connector_specific_config={
"workspace": "onyx-test-workspace",
"channels": [public_channel["name"], private_channel["name"]], "channels": [public_channel["name"], private_channel["name"]],
}, },
access_type=AccessType.SYNC, access_type=AccessType.SYNC,
@ -281,7 +280,6 @@ def test_slack_group_permission_sync(
input_type=InputType.POLL, input_type=InputType.POLL,
source=DocumentSource.SLACK, source=DocumentSource.SLACK,
connector_specific_config={ connector_specific_config={
"workspace": "onyx-test-workspace",
"channels": [private_channel["name"]], "channels": [private_channel["name"]],
}, },
access_type=AccessType.SYNC, access_type=AccessType.SYNC,

View File

@ -61,7 +61,6 @@ def test_slack_prune(
input_type=InputType.POLL, input_type=InputType.POLL,
source=DocumentSource.SLACK, source=DocumentSource.SLACK,
connector_specific_config={ connector_specific_config={
"workspace": "onyx-test-workspace",
"channels": [public_channel["name"], private_channel["name"]], "channels": [public_channel["name"], private_channel["name"]],
}, },
access_type=AccessType.PUBLIC, access_type=AccessType.PUBLIC,

View File

@ -546,15 +546,7 @@ Hint: Use the singular form of the object name (e.g., 'Opportunity' instead of '
}, },
slack: { slack: {
description: "Configure Slack connector", description: "Configure Slack connector",
values: [ values: [],
{
type: "text",
query: "Enter the Slack workspace:",
label: "Workspace",
name: "workspace",
optional: false,
},
],
advanced_values: [ advanced_values: [
{ {
type: "list", type: "list",