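# Document360 connector: pulls HTML articles from a Document360 workspace via the
# Document360 API. Implements both a full load (LoadConnector) and time-windowed
# polling (PollConnector).
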
from datetime import datetime
from datetime import timezone
from typing import Any
from typing import List
from typing import Optional

import requests

from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.constants import DocumentSource
from onyx.connectors.cross_connector_utils.rate_limit_wrapper import (
    rate_limit_builder,
)
from onyx.connectors.document360.utils import flatten_child_categories
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import TextSection
from onyx.file_processing.html_utils import parse_html_page_basic
from onyx.utils.retry_wrapper import retry_builder

# Limitations and Potential Improvements
# 1. Categories themselves contain potentially relevant information, but they are not pulled in
# 2. Only HTML articles are supported; Document360 also has Markdown and "Block" formats
# 3. The contents are not cleaned up as thoroughly as in other HTML connectors

DOCUMENT360_BASE_URL = "https://portal.document360.io"
DOCUMENT360_API_BASE_URL = "https://apihub.document360.io/v2"

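# Endpoints used below (via _make_request, which prefixes DOCUMENT360_API_BASE_URL):
#   ProjectVersions                           -> workspaces (project versions)
#   ProjectVersions/{workspace_id}/categories -> category tree with article ids
#   Articles/{article_id}                     -> full article details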


class Document360Connector(LoadConnector, PollConnector):
    def __init__(
        self,
        workspace: str,
        categories: List[str] | None = None,
        batch_size: int = INDEX_BATCH_SIZE,
        portal_id: Optional[str] = None,
        api_token: Optional[str] = None,
    ) -> None:
        self.portal_id = portal_id
        self.workspace = workspace
        self.categories = categories
        self.batch_size = batch_size
        self.api_token = api_token

    def load_credentials(self, credentials: dict[str, Any]) -> Optional[dict[str, Any]]:
        self.api_token = credentials.get("document360_api_token")
        self.portal_id = credentials.get("portal_id")
        return None

    # Rate limits are set based on the enterprise plan: https://apidocs.document360.com/apidocs/rate-limiting
    # NOTE: retry will handle cases where the user is not on the enterprise plan - we
    # will just hit the rate limit and then retry after a period
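    # Note on decorator order: retry_builder wraps the rate-limited call, so each retry
    # attempt also passes back through the local rate limiter before being re-issued.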
    @retry_builder()
    @rate_limit_builder(max_calls=100, period=60)
    def _make_request(self, endpoint: str, params: Optional[dict] = None) -> Any:
        if not self.api_token:
            raise ConnectorMissingCredentialError("Document360")

        headers = {"accept": "application/json", "api_token": self.api_token}

        response = requests.get(
            f"{DOCUMENT360_API_BASE_URL}/{endpoint}", headers=headers, params=params
        )
        response.raise_for_status()

        return response.json()["data"]

    def _get_workspace_id_by_name(self) -> str:
        projects = self._make_request("ProjectVersions")
        workspace_id = next(
            (
                project["id"]
                for project in projects
                if project["version_code_name"] == self.workspace
            ),
            None,
        )
        if workspace_id is None:
            raise ValueError("Unable to find a workspace ID for the provided workspace name")

        return workspace_id

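    # Note: flatten_child_categories is relied on to expand a child category into a flat
    # list of itself plus all nested descendants; this is inferred from its usage below.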
    def _get_articles_with_category(self, workspace_id: str) -> Any:
        all_categories = self._make_request(
            f"ProjectVersions/{workspace_id}/categories"
        )
        articles_with_category = []

        for category in all_categories:
            if not self.categories or category["name"] in self.categories:
                for article in category["articles"]:
                    articles_with_category.append(
                        {"id": article["id"], "category_name": category["name"]}
                    )
                for child_category in category["child_categories"]:
                    all_nested_categories = flatten_child_categories(child_category)
                    for nested_category in all_nested_categories:
                        for article in nested_category["articles"]:
                            articles_with_category.append(
                                {
                                    "id": article["id"],
                                    "category_name": nested_category["name"],
                                }
                            )

        return articles_with_category

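    # Fetches each article's details, filters on modified_at against the optional
    # [start, end] window, and yields Documents in batches of batch_size.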
    def _process_articles(
        self, start: datetime | None = None, end: datetime | None = None
    ) -> GenerateDocumentsOutput:
        if self.api_token is None:
            raise ConnectorMissingCredentialError("Document360")

        workspace_id = self._get_workspace_id_by_name()
        articles = self._get_articles_with_category(workspace_id)

        doc_batch: List[Document] = []

        for article in articles:
            article_details = self._make_request(
                f"Articles/{article['id']}", {"langCode": "en"}
            )

            updated_at = datetime.strptime(
                article_details["modified_at"], "%Y-%m-%dT%H:%M:%S.%fZ"
            ).replace(tzinfo=timezone.utc)
            if start is not None and updated_at < start:
                continue
            if end is not None and updated_at > end:
                continue

            authors = [
                BasicExpertInfo(
                    display_name=author.get("name"), email=author["email_id"]
                )
                for author in article_details.get("authors", [])
                if author["email_id"]
            ]

            doc_link = (
                article_details["url"]
                if article_details.get("url")
                else f"{DOCUMENT360_BASE_URL}/{self.portal_id}/document/v1/view/{article['id']}"
            )

            html_content = article_details["html_content"]
            article_content = (
                parse_html_page_basic(html_content) if html_content is not None else ""
            )
            doc_text = (
                f"{article_details.get('description', '')}\n{article_content}".strip()
            )

            document = Document(
                id=article_details["id"],
                sections=[TextSection(link=doc_link, text=doc_text)],
                source=DocumentSource.DOCUMENT360,
                semantic_identifier=article_details["title"],
                doc_updated_at=updated_at,
                primary_owners=authors,
                metadata={
                    "workspace": self.workspace,
                    "category": article["category_name"],
                },
            )

            doc_batch.append(document)

            if len(doc_batch) >= self.batch_size:
                yield doc_batch
                doc_batch = []

        if doc_batch:
            yield doc_batch

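    # Public connector entry points: load_from_state does a full, unfiltered pass over
    # all articles; poll_source converts the epoch-seconds window to UTC datetimes first.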
    def load_from_state(self) -> GenerateDocumentsOutput:
        return self._process_articles()

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        start_datetime = datetime.fromtimestamp(start, tz=timezone.utc)
        end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)
        return self._process_articles(start_datetime, end_datetime)


if __name__ == "__main__":
    import os
    import time

    document360_connector = Document360Connector(os.environ["DOCUMENT360_WORKSPACE"])
    document360_connector.load_credentials(
        {
            "portal_id": os.environ["DOCUMENT360_PORTAL_ID"],
            "document360_api_token": os.environ["DOCUMENT360_API_TOKEN"],
        }
    )

    current = time.time()
    one_year_ago = current - 365 * 24 * 60 * 60
    latest_docs = document360_connector.poll_source(one_year_ago, current)

    for doc in latest_docs:
        print(doc)