danswer/backend/onyx/connectors/document360/connector.py

from datetime import datetime
from datetime import timezone
from typing import Any
from typing import List
from typing import Optional

import requests

from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.constants import DocumentSource
from onyx.connectors.cross_connector_utils.rate_limit_wrapper import (
    rate_limit_builder,
)
from onyx.connectors.document360.utils import flatten_child_categories
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import TextSection
from onyx.file_processing.html_utils import parse_html_page_basic
from onyx.utils.retry_wrapper import retry_builder

# Limitations and Potential Improvements
# 1. Categories themselves contain potentially relevant information, but they are not pulled in
# 2. Only the HTML Articles are supported, Document360 also has a Markdown and "Block" format
# 3. The contents are not as cleaned up as other HTML connectors

# Portal base URL; used to build a fallback article link when the API response
# for an article carries no "url" field.
DOCUMENT360_BASE_URL = "https://portal.document360.io"
# Root of the Document360 REST API (v2); every request endpoint is appended to it.
DOCUMENT360_API_BASE_URL = "https://apihub.document360.io/v2"


class Document360Connector(LoadConnector, PollConnector):
    """Connector that pulls articles from a single Document360 workspace.

    Articles are discovered by walking the workspace's category tree, fetched
    one at a time from the Document360 API, converted to plain text and yielded
    in batches of ``batch_size`` documents.
    """

    # Upper bound (in seconds) for a single API call, so that a stalled
    # connection raises instead of blocking the indexing run forever.
    _REQUEST_TIMEOUT_SECONDS = 60

    def __init__(
        self,
        workspace: str,
        categories: List[str] | None = None,
        batch_size: int = INDEX_BATCH_SIZE,
        portal_id: Optional[str] = None,
        api_token: Optional[str] = None,
    ) -> None:
        """Store the connector configuration.

        Args:
            workspace: Name matched against ``version_code_name`` of the
                project versions returned by the API.
            categories: Names of top-level categories to index. ``None`` or an
                empty list means every category is indexed.
            batch_size: Maximum number of documents per yielded batch.
            portal_id: Portal identifier, used only to build fallback links.
            api_token: Document360 API token; normally supplied later through
                ``load_credentials``.
        """
        self.portal_id = portal_id
        self.workspace = workspace
        self.categories = categories
        self.batch_size = batch_size
        self.api_token = api_token

    def load_credentials(self, credentials: dict[str, Any]) -> Optional[dict[str, Any]]:
        """Read the API token and portal id from ``credentials``.

        Missing keys leave the corresponding attribute set to ``None``.
        Always returns ``None``.
        """
        self.api_token = credentials.get("document360_api_token")
        self.portal_id = credentials.get("portal_id")
        return None

    # rate limiting set based on the enterprise plan: https://apidocs.document360.com/apidocs/rate-limiting
    # NOTE: retry will handle cases where user is not on enterprise plan - we will just hit the rate limit
    # and then retry after a period
    @retry_builder()
    @rate_limit_builder(max_calls=100, period=60)
    def _make_request(self, endpoint: str, params: Optional[dict] = None) -> Any:
        """GET ``endpoint`` from the Document360 API and return its ``data`` payload.

        Args:
            endpoint: Path relative to ``DOCUMENT360_API_BASE_URL``.
            params: Optional query-string parameters.

        Raises:
            ConnectorMissingCredentialError: If no API token has been set.
            requests.HTTPError: If the API answers with an error status.
            requests.Timeout: If the API does not answer in time.
        """
        if not self.api_token:
            raise ConnectorMissingCredentialError("Document360")

        headers = {"accept": "application/json", "api_token": self.api_token}

        response = requests.get(
            f"{DOCUMENT360_API_BASE_URL}/{endpoint}",
            headers=headers,
            params=params,
            # Without a timeout, requests waits indefinitely on a hung socket.
            timeout=self._REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()

        return response.json()["data"]

    def _get_workspace_id_by_name(self) -> str:
        """Return the id of the project version whose code name is ``self.workspace``.

        Raises:
            ValueError: If no project version matches the configured name.
        """
        projects = self._make_request("ProjectVersions")
        workspace_id = next(
            (
                project["id"]
                for project in projects
                if project["version_code_name"] == self.workspace
            ),
            None,
        )
        if workspace_id is None:
            raise ValueError("Not able to find Workspace ID by the user provided name")

        return workspace_id

    def _get_articles_with_category(self, workspace_id: str) -> list[dict[str, Any]]:
        """List the articles of the workspace together with their category name.

        The ``categories`` filter is applied to top-level categories only; when
        a top-level category is selected, the articles of all of its nested
        categories are included as well, each labelled with the name of the
        category that directly contains it.

        Returns:
            A list of ``{"id": ..., "category_name": ...}`` dicts.
        """
        all_categories = self._make_request(
            f"ProjectVersions/{workspace_id}/categories"
        )
        articles_with_category: list[dict[str, Any]] = []

        for category in all_categories:
            # An empty/None filter selects every top-level category.
            if self.categories and category["name"] not in self.categories:
                continue

            articles_with_category.extend(
                {"id": article["id"], "category_name": category["name"]}
                for article in category["articles"]
            )
            for child_category in category["child_categories"]:
                for nested_category in flatten_child_categories(child_category):
                    articles_with_category.extend(
                        {
                            "id": article["id"],
                            "category_name": nested_category["name"],
                        }
                        for article in nested_category["articles"]
                    )

        return articles_with_category

    @staticmethod
    def _parse_modified_at(raw_timestamp: str) -> datetime:
        """Parse an API ``modified_at`` value into a UTC-aware datetime.

        The format with fractional seconds is tried first; timestamps that fall
        on a whole second and therefore carry no fractional part are accepted
        as well.

        Raises:
            ValueError: If the value matches neither format.
        """
        try:
            parsed = datetime.strptime(raw_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")
        except ValueError:
            parsed = datetime.strptime(raw_timestamp, "%Y-%m-%dT%H:%M:%SZ")
        return parsed.replace(tzinfo=timezone.utc)

    def _process_articles(
        self, start: datetime | None = None, end: datetime | None = None
    ) -> GenerateDocumentsOutput:
        """Fetch every selected article and yield them as batches of Documents.

        This is a generator, so nothing (including the credential check) runs
        until the result is iterated.

        Args:
            start: If given, articles modified before this instant are skipped.
            end: If given, articles modified after this instant are skipped.

        Raises:
            ConnectorMissingCredentialError: If no API token has been set.
        """
        # Same truthiness check as _make_request, so an empty token is rejected
        # here, before any request is attempted.
        if not self.api_token:
            raise ConnectorMissingCredentialError("Document360")

        workspace_id = self._get_workspace_id_by_name()
        articles = self._get_articles_with_category(workspace_id)

        doc_batch: List[Document] = []

        for article in articles:
            article_details = self._make_request(
                f"Articles/{article['id']}", {"langCode": "en"}
            )

            updated_at = self._parse_modified_at(article_details["modified_at"])
            if start is not None and updated_at < start:
                continue
            if end is not None and updated_at > end:
                continue

            # "authors" may be absent or null; authors without an email are dropped.
            authors = [
                BasicExpertInfo(
                    display_name=author.get("name"), email=author["email_id"]
                )
                for author in article_details.get("authors") or []
                if author.get("email_id")
            ]

            # Prefer the URL reported by the API; otherwise build a portal link.
            doc_link = (
                article_details["url"]
                if article_details.get("url")
                else f"{DOCUMENT360_BASE_URL}/{self.portal_id}/document/v1/view/{article['id']}"
            )

            html_content = article_details.get("html_content")
            article_content = (
                parse_html_page_basic(html_content) if html_content is not None else ""
            )
            # A null description must not end up as the literal text "None".
            description = article_details.get("description") or ""
            doc_text = f"{description}\n{article_content}".strip()

            document = Document(
                id=article_details["id"],
                sections=[TextSection(link=doc_link, text=doc_text)],
                source=DocumentSource.DOCUMENT360,
                semantic_identifier=article_details["title"],
                doc_updated_at=updated_at,
                primary_owners=authors,
                metadata={
                    "workspace": self.workspace,
                    "category": article["category_name"],
                },
            )

            doc_batch.append(document)

            if len(doc_batch) >= self.batch_size:
                yield doc_batch
                doc_batch = []

        # Flush the final, partially filled batch.
        if doc_batch:
            yield doc_batch

    def load_from_state(self) -> GenerateDocumentsOutput:
        """Yield every article of the workspace, without any time filtering."""
        return self._process_articles()

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        """Yield the articles modified between ``start`` and ``end`` (inclusive).

        Both bounds are seconds since the Unix epoch and are interpreted as UTC.
        """
        start_datetime = datetime.fromtimestamp(start, tz=timezone.utc)
        end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)
        return self._process_articles(start_datetime, end_datetime)


if __name__ == "__main__":
    # Manual smoke test: polls the workspace named by the environment for the
    # articles modified during the last year and prints each batch.
    import os
    import time

    document360_connector = Document360Connector(os.environ["DOCUMENT360_WORKSPACE"])
    document360_connector.load_credentials(
        {
            "portal_id": os.environ["DOCUMENT360_PORTAL_ID"],
            "document360_api_token": os.environ["DOCUMENT360_API_TOKEN"],
        }
    )

    current = time.time()
    # 365 days, so the window really spans the year the variable name promises.
    one_year_ago = current - 24 * 60 * 60 * 365
    latest_docs = document360_connector.poll_source(one_year_ago, current)

    # Each yielded item is a batch (list) of documents, not a single document.
    for doc_batch in latest_docs:
        print(doc_batch)