mirror of https://github.com/danswer-ai/danswer.git
synced 2025-06-25 15:30:59 +02:00
parent 2f54795631
commit 675a5aec9e
@@ -22,5 +22,6 @@ class DocumentSource(str, Enum):
     GOOGLE_DRIVE = "google_drive"
     GITHUB = "github"
     CONFLUENCE = "confluence"
+    SLAB = "slab"
     JIRA = "jira"
     FILE = "file"
@@ -12,6 +12,7 @@ from danswer.connectors.interfaces import EventConnector
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector
 from danswer.connectors.models import InputType
+from danswer.connectors.slab.connector import SlabConnector
 from danswer.connectors.slack.connector import SlackLoadConnector
 from danswer.connectors.slack.connector import SlackPollConnector
 from danswer.connectors.web.connector import WebConnector
@@ -38,6 +39,7 @@ def identify_connector_class(
         DocumentSource.GOOGLE_DRIVE: GoogleDriveConnector,
         DocumentSource.CONFLUENCE: ConfluenceConnector,
         DocumentSource.JIRA: JiraConnector,
+        DocumentSource.SLAB: SlabConnector,
     }
     connector_by_source = connector_map.get(source, {})
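For illustration, a minimal sketch of how a source-to-connector map like the one above can be resolved into a connector instance. This is not the repository's identify_connector_class implementation; the build_connector helper and the example base_url are assumptions added for this sketch.

from danswer.configs.constants import DocumentSource
from danswer.connectors.slab.connector import SlabConnector

# Illustrative map from document source to connector class, mirroring the
# connector_map entries in the hunk above (only the new Slab entry is shown).
connector_map = {
    DocumentSource.SLAB: SlabConnector,
}


def build_connector(source: DocumentSource, **connector_kwargs):
    # Hypothetical helper, not part of the danswer codebase.
    connector_cls = connector_map.get(source)
    if connector_cls is None:
        raise ValueError(f"No connector registered for source: {source}")
    return connector_cls(**connector_kwargs)


# Example: build_connector(DocumentSource.SLAB, base_url="https://myteam.slab.com/")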
0    backend/danswer/connectors/slab/__init__.py   Normal file
223  backend/danswer/connectors/slab/connector.py  Normal file
@@ -0,0 +1,223 @@
import json
from collections.abc import Callable
from collections.abc import Generator
from datetime import datetime
from datetime import timezone
from typing import Any
from urllib.parse import urljoin

import requests
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logging import setup_logger
from dateutil import parser

# Fairly generous retry because it's not understood why occasionally GraphQL requests fail even with timeout > 1 min
SLAB_GRAPHQL_MAX_TRIES = 10
SLAB_API_URL = "https://api.slab.com/v1/graphql"
logger = setup_logger()


class SlabBotTokenNotFoundError(PermissionError):
    def __init__(self) -> None:
        super().__init__("Slab Bot Token not found, was load_credentials called?")


def run_graphql_request(
    graphql_query: dict, bot_token: str, max_tries: int = SLAB_GRAPHQL_MAX_TRIES
) -> str:
    headers = {"Authorization": bot_token, "Content-Type": "application/json"}

    for try_count in range(max_tries):
        try:
            response = requests.post(
                SLAB_API_URL, headers=headers, json=graphql_query, timeout=60
            )
            response.raise_for_status()

            if response.status_code != 200:
                raise ValueError(f"GraphQL query failed: {graphql_query}")

            return response.text

        except (requests.exceptions.Timeout, ValueError) as e:
            if try_count < max_tries - 1:
                logger.warning("A Slab GraphQL error occurred. Retrying...")
                continue

            if isinstance(e, requests.exceptions.Timeout):
                raise TimeoutError(f"Slab API timed out after {max_tries} attempts")
            else:
                raise ValueError(f"Slab GraphQL query failed after {max_tries} attempts")

    raise RuntimeError(
        "Unexpected execution from Slab Connector. This should not happen."
    )  # for static checker


def get_all_post_ids(bot_token: str) -> list[str]:
    query = """
        query GetAllPostIds {
            organization {
                posts {
                    id
                }
            }
        }
    """

    graphql_query = {"query": query}

    results = json.loads(run_graphql_request(graphql_query, bot_token))
    posts = results["data"]["organization"]["posts"]
    return [post["id"] for post in posts]


def get_post_by_id(post_id: str, bot_token: str) -> dict[str, str]:
    query = """
        query GetPostById($postId: ID!) {
            post(id: $postId) {
                title
                content
                linkAccess
                updatedAt
            }
        }
    """
    graphql_query = {"query": query, "variables": {"postId": post_id}}
    results = json.loads(run_graphql_request(graphql_query, bot_token))
    return results["data"]["post"]


def iterate_post_batches(
    batch_size: int, bot_token: str
) -> Generator[list[dict[str, str]], None, None]:
    """This may not be safe to use, not sure if page edits will change the order of results"""
    query = """
        query IteratePostBatches($query: String!, $first: Int, $types: [SearchType], $after: String) {
            search(query: $query, first: $first, types: $types, after: $after) {
                edges {
                    node {
                        ... on PostSearchResult {
                            post {
                                id
                                title
                                content
                                updatedAt
                            }
                        }
                    }
                }
                pageInfo {
                    endCursor
                    hasNextPage
                }
            }
        }
    """
    pagination_start = None
    exists_more_pages = True
    while exists_more_pages:
        graphql_query = {
            "query": query,
            "variables": {
                "query": "",
                "first": batch_size,
                "types": ["POST"],
                "after": pagination_start,
            },
        }
        results = json.loads(run_graphql_request(graphql_query, bot_token))
        pagination_start = results["data"]["search"]["pageInfo"]["endCursor"]
        hits = results["data"]["search"]["edges"]

        posts = [hit["node"] for hit in hits]
        if posts:
            yield posts

        exists_more_pages = results["data"]["search"]["pageInfo"]["hasNextPage"]


def get_slab_url_from_title_id(base_url: str, title: str, page_id: str) -> str:
    """This is not a documented approach but seems to be the way it works currently
    May be subject to change without notification"""
    title = title.replace("[", "").replace("]", "").replace(" ", "-").lower()
    url_id = title + "-" + page_id
    return urljoin(urljoin(base_url, "posts/"), url_id)


class SlabConnector(LoadConnector, PollConnector):
    def __init__(
        self,
        base_url: str,
        batch_size: int = INDEX_BATCH_SIZE,
        slab_bot_token: str | None = None,
    ) -> None:
        self.base_url = base_url
        self.batch_size = batch_size
        self.slab_bot_token = slab_bot_token

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        self.slab_bot_token = credentials["slab_bot_token"]
        return None

    def _iterate_posts(
        self, time_filter: Callable[[datetime], bool] | None = None
    ) -> GenerateDocumentsOutput:
        doc_batch: list[Document] = []

        if self.slab_bot_token is None:
            raise SlabBotTokenNotFoundError()

        all_post_ids: list[str] = get_all_post_ids(self.slab_bot_token)

        for post_id in all_post_ids:
            post = get_post_by_id(post_id, self.slab_bot_token)
            last_modified = parser.parse(post["updatedAt"])
            if time_filter is not None and not time_filter(last_modified):
                continue

            page_url = get_slab_url_from_title_id(self.base_url, post["title"], post_id)

            # Post content comes back as a JSON list of delta-style ops;
            # keep only the plain-text "insert" segments.
            content_text = ""
            contents = json.loads(post["content"])
            for content_segment in contents:
                insert = content_segment.get("insert")
                if insert and isinstance(insert, str):
                    content_text += insert

            doc_batch.append(
                Document(
                    id=post_id,  # can't be url as this changes with the post title
                    sections=[Section(link=page_url, text=content_text)],
                    source=DocumentSource.SLAB,
                    semantic_identifier=post["title"],
                    metadata={},
                )
            )

            if len(doc_batch) >= self.batch_size:
                yield doc_batch
                doc_batch = []

        if doc_batch:
            yield doc_batch

    def load_from_state(self) -> GenerateDocumentsOutput:
        yield from self._iterate_posts()

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        start_time = datetime.fromtimestamp(start, tz=timezone.utc)
        end_time = datetime.fromtimestamp(end, tz=timezone.utc)

        yield from self._iterate_posts(
            time_filter=lambda t: start_time <= t <= end_time
        )
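A minimal usage sketch for the new connector, assuming a valid Slab bot token and workspace URL (both values below are placeholders). It follows the load_credentials and load_from_state flow defined above; poll_source works the same way but only yields posts whose updatedAt falls between the given epoch timestamps.

from danswer.connectors.slab.connector import SlabConnector

# Placeholder workspace URL and token, for illustration only.
connector = SlabConnector(base_url="https://myteam.slab.com/")
connector.load_credentials({"slab_bot_token": "<SLAB_BOT_TOKEN>"})

# load_from_state yields batches of Document objects (up to batch_size per batch).
for doc_batch in connector.load_from_state():
    for doc in doc_batch:
        print(doc.semantic_identifier, doc.sections[0].link)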
@@ -6,6 +6,7 @@ reorder-python-imports==3.9.0
 types-beautifulsoup4==4.12.0.3
 types-html5lib==1.1.11.13
 types-psycopg2==2.9.21.10
+types-python-dateutil==2.8.19.13
 types-regex==2023.3.23.1
 types-requests==2.28.11.17
 types-urllib3==1.26.25.11