pablonyx f87e559cc4
Separate out indexing-time image analysis into new phase (#4228)
* Separate out indexing-time image analysis into new phase

* looking good

* k

* k
2025-03-12 22:26:05 +00:00

210 lines
7.7 KiB
Python

from datetime import datetime
from datetime import timezone
from typing import Any
from typing import List
from typing import Optional
import requests
from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.constants import DocumentSource
from onyx.connectors.cross_connector_utils.rate_limit_wrapper import (
rate_limit_builder,
)
from onyx.connectors.document360.utils import flatten_child_categories
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import TextSection
from onyx.file_processing.html_utils import parse_html_page_basic
from onyx.utils.retry_wrapper import retry_builder
# Limitations and Potential Improvements
# 1. Categories themselves contain potentially relevant information, but they're not pulled in
# 2. Only the HTML Articles are supported, Document360 also has a Markdown and "Block" format
# 3. The contents are not as cleaned up as other HTML connectors
# Base URL of the Document360 portal; used to build a fallback article link
# when the API response for an article does not carry a usable "url" value.
DOCUMENT360_BASE_URL = "https://portal.document360.io"
# Root of the Document360 API (v2); every request endpoint is appended to it.
DOCUMENT360_API_BASE_URL = "https://apihub.document360.io/v2"
class Document360Connector(LoadConnector, PollConnector):
    """Connector that turns Document360 articles into ``Document`` objects.

    Articles are discovered through the categories of one project version
    (called a "workspace" here), fetched one at a time from the Document360
    API, and yielded in batches of at most ``batch_size`` documents.
    """

    # Upper bound (seconds) on a single API call so that a stalled connection
    # cannot block an indexing run indefinitely.
    _REQUEST_TIMEOUT_SECONDS = 60

    # Layouts accepted for an article's "modified_at" value, tried in order.
    # The second one covers timestamps serialized without fractional seconds.
    _MODIFIED_AT_FORMATS = ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ")

    def __init__(
        self,
        workspace: str,
        categories: List[str] | None = None,
        batch_size: int = INDEX_BATCH_SIZE,
        portal_id: Optional[str] = None,
        api_token: Optional[str] = None,
    ) -> None:
        """Store the connector configuration.

        Args:
            workspace: ``version_code_name`` of the project version to index.
            categories: Names of top-level categories to restrict indexing to;
                ``None`` or an empty list means every category.
            batch_size: Maximum number of documents per yielded batch.
            portal_id: Portal identifier used when building fallback links.
            api_token: API token; normally supplied via ``load_credentials``.
        """
        self.portal_id = portal_id
        self.workspace = workspace
        self.categories = categories
        self.batch_size = batch_size
        self.api_token = api_token

    def load_credentials(self, credentials: dict[str, Any]) -> Optional[dict[str, Any]]:
        """Read the API token and portal id from ``credentials``.

        Always returns ``None``.
        """
        self.api_token = credentials.get("document360_api_token")
        self.portal_id = credentials.get("portal_id")
        return None

    # rate limiting set based on the enterprise plan: https://apidocs.document360.com/apidocs/rate-limiting
    # NOTE: retry will handle cases where user is not on enterprise plan - we will just hit the rate limit
    # and then retry after a period
    @retry_builder()
    @rate_limit_builder(max_calls=100, period=60)
    def _make_request(self, endpoint: str, params: Optional[dict] = None) -> Any:
        """GET ``endpoint`` from the Document360 API and return its ``data`` payload.

        Raises:
            ConnectorMissingCredentialError: If no API token has been set.
            requests.HTTPError: If the API answers with an error status.
        """
        if not self.api_token:
            raise ConnectorMissingCredentialError("Document360")

        headers = {"accept": "application/json", "api_token": self.api_token}
        response = requests.get(
            f"{DOCUMENT360_API_BASE_URL}/{endpoint}",
            headers=headers,
            params=params,
            # Without a timeout, requests waits forever on an unresponsive server.
            timeout=self._REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        return response.json()["data"]

    def _get_workspace_id_by_name(self) -> str:
        """Return the id of the project version whose code name is ``self.workspace``.

        Raises:
            ValueError: If no project version has a matching ``version_code_name``.
        """
        projects = self._make_request("ProjectVersions")
        workspace_id = next(
            (
                project["id"]
                for project in projects
                if project["version_code_name"] == self.workspace
            ),
            None,
        )
        if workspace_id is None:
            raise ValueError("Not able to find Workspace ID by the user provided name")
        return workspace_id

    def _get_articles_with_category(self, workspace_id: str) -> List[dict[str, Any]]:
        """List the articles of a workspace together with their category name.

        The ``self.categories`` filter is applied to top-level categories only;
        once a top-level category is selected, the articles of all of its
        nested categories are included as well.

        Returns:
            A list of ``{"id": ..., "category_name": ...}`` dictionaries.
        """
        all_categories = self._make_request(
            f"ProjectVersions/{workspace_id}/categories"
        )

        articles_with_category: List[dict[str, Any]] = []
        for category in all_categories:
            if self.categories and category["name"] not in self.categories:
                continue

            articles_with_category.extend(
                {"id": article["id"], "category_name": category["name"]}
                for article in category["articles"]
            )

            for child_category in category["child_categories"]:
                # NOTE(review): flatten_child_categories presumably returns the
                # child category plus all of its descendants - confirm in utils.
                for nested_category in flatten_child_categories(child_category):
                    articles_with_category.extend(
                        {
                            "id": article["id"],
                            "category_name": nested_category["name"],
                        }
                        for article in nested_category["articles"]
                    )
        return articles_with_category

    @staticmethod
    def _parse_modified_at(raw_timestamp: str) -> datetime:
        """Parse a ``modified_at`` string into a timezone-aware UTC datetime.

        Raises:
            ValueError: If the string matches none of the supported layouts.
        """
        for timestamp_format in Document360Connector._MODIFIED_AT_FORMATS:
            try:
                parsed = datetime.strptime(raw_timestamp, timestamp_format)
            except ValueError:
                continue
            return parsed.replace(tzinfo=timezone.utc)
        raise ValueError(
            f"Unrecognized Document360 timestamp format: {raw_timestamp!r}"
        )

    def _process_articles(
        self, start: datetime | None = None, end: datetime | None = None
    ) -> GenerateDocumentsOutput:
        """Yield batches of documents, optionally restricted to a time window.

        Args:
            start: If given, articles modified before this instant are skipped.
            end: If given, articles modified after this instant are skipped.

        Raises:
            ConnectorMissingCredentialError: If no API token has been set.
        """
        # Same truthiness check as _make_request, so an empty token fails here
        # instead of only after the first request attempt.
        if not self.api_token:
            raise ConnectorMissingCredentialError("Document360")

        workspace_id = self._get_workspace_id_by_name()
        articles = self._get_articles_with_category(workspace_id)

        doc_batch: List[Document] = []
        for article in articles:
            article_details = self._make_request(
                f"Articles/{article['id']}", {"langCode": "en"}
            )

            updated_at = self._parse_modified_at(article_details["modified_at"])
            if start is not None and updated_at < start:
                continue
            if end is not None and updated_at > end:
                continue

            # Authors without an e-mail address are dropped; a null or missing
            # "authors" field is treated as "no authors".
            authors = [
                BasicExpertInfo(
                    display_name=author.get("name"), email=author["email_id"]
                )
                for author in article_details.get("authors") or []
                if author.get("email_id")
            ]

            # Prefer the URL reported by the API; otherwise build a portal link.
            doc_link = (
                article_details.get("url")
                or f"{DOCUMENT360_BASE_URL}/{self.portal_id}/document/v1/view/{article['id']}"
            )

            html_content = article_details.get("html_content")
            article_content = (
                parse_html_page_basic(html_content) if html_content is not None else ""
            )
            # `or ""` keeps a null description from being rendered as "None".
            description = article_details.get("description") or ""
            doc_text = f"{description}\n{article_content}".strip()

            doc_batch.append(
                Document(
                    id=article_details["id"],
                    sections=[TextSection(link=doc_link, text=doc_text)],
                    source=DocumentSource.DOCUMENT360,
                    semantic_identifier=article_details["title"],
                    doc_updated_at=updated_at,
                    primary_owners=authors,
                    metadata={
                        "workspace": self.workspace,
                        "category": article["category_name"],
                    },
                )
            )

            if len(doc_batch) >= self.batch_size:
                yield doc_batch
                doc_batch = []

        # Flush the final, partially filled batch.
        if doc_batch:
            yield doc_batch

    def load_from_state(self) -> GenerateDocumentsOutput:
        """Yield every article of the workspace, without any time filter."""
        return self._process_articles()

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        """Yield articles modified between ``start`` and ``end`` (epoch seconds)."""
        start_datetime = datetime.fromtimestamp(start, tz=timezone.utc)
        end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)
        return self._process_articles(start_datetime, end_datetime)
# Manual smoke test: polls the last year of articles using credentials taken
# from the environment and prints each yielded batch.
if __name__ == "__main__":
    import os
    import time

    document360_connector = Document360Connector(os.environ["DOCUMENT360_WORKSPACE"])
    document360_connector.load_credentials(
        {
            "portal_id": os.environ["DOCUMENT360_PORTAL_ID"],
            "document360_api_token": os.environ["DOCUMENT360_API_TOKEN"],
        }
    )

    current = time.time()
    # 365 days back (the previous factor of 360 fell five days short of a year).
    one_year_ago = current - 24 * 60 * 60 * 365
    latest_docs = document360_connector.poll_source(one_year_ago, current)
    # poll_source yields batches (lists) of documents, not single documents.
    for doc_batch in latest_docs:
        print(doc_batch)