Zendesk Connector Metadata and small batch fix (#866)

This commit is contained in:
Yuhong Sun 2023-12-23 16:34:48 -08:00 committed by GitHub
parent dca4f7a72b
commit a122510cee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,3 +1,5 @@
from datetime import datetime
from datetime import timezone
from typing import Any
from zenpy import Zenpy # type: ignore
@ -5,14 +7,34 @@ from zenpy.lib.api_objects.help_centre_objects import Article # type: ignore
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import Document
from danswer.connectors.models import Section
def _article_to_document(article: Article) -> Document:
    """Convert a Zendesk help-centre article into a Document.

    The article body is passed through ``parse_html_page_basic`` and becomes
    the single section of the document, linked to the article's ``html_url``.
    The article author is recorded as the primary owner, and ``updated_at``
    is stored as a timezone-aware UTC datetime.

    Args:
        article: Article whose ``id``, ``title``, ``body``, ``html_url``,
            ``updated_at`` and ``author`` attributes are read.

    Returns:
        A Document with id ``article:<article.id>`` and source
        ``DocumentSource.ZENDESK``.

    Raises:
        ValueError: If ``article.updated_at`` is not a timestamp that
            ``datetime.fromisoformat`` can parse.
    """
    author = BasicExpertInfo(
        display_name=article.author.name, email=article.author.email
    )

    # datetime.fromisoformat only accepts a trailing "Z" (UTC designator) from
    # Python 3.11 on. Rewrite it as an explicit offset so the same timestamp
    # also parses on older interpreters.
    # NOTE(review): Zendesk is believed to return "Z"-suffixed timestamps -
    # confirm against the API reference.
    raw_updated_at = article.updated_at
    if isinstance(raw_updated_at, str) and raw_updated_at.endswith("Z"):
        raw_updated_at = raw_updated_at[:-1] + "+00:00"
    update_time = datetime.fromisoformat(raw_updated_at).astimezone(timezone.utc)

    return Document(
        id=f"article:{article.id}",
        sections=[
            Section(link=article.html_url, text=parse_html_page_basic(article.body))
        ],
        source=DocumentSource.ZENDESK,
        semantic_identifier=article.title,
        doc_updated_at=update_time,
        primary_owners=[author],
        metadata={"type": "article"},
    )
class ZendeskClientNotSetUpError(PermissionError):
    """Signals that the Zendesk client has not been set up."""

    def __init__(self) -> None:
        # The message is fixed; callers construct this error without arguments.
        message = "Zendesk Client is not set up, was load_credentials called?"
        super().__init__(message)
@ -34,18 +56,6 @@ class ZendeskConnector(LoadConnector, PollConnector):
def load_from_state(self) -> GenerateDocumentsOutput:
    """Delegate to ``poll_source`` with both ``start`` and ``end`` set to None."""
    unbounded_start = None
    unbounded_end = None
    return self.poll_source(unbounded_start, unbounded_end)
def _article_to_document(self, article: Article) -> Document:
    """Build a Document from a Zendesk article, using the raw article body.

    The body is stored as-is in a single section linked to the article's
    ``html_url``; the title is prefixed with "Article: " and ``updated_at``
    is kept in the metadata dictionary.
    """
    document_id = f"article:{article.id}"
    body_section = Section(link=article.html_url, text=article.body)
    display_title = "Article: " + article.title
    article_metadata = {
        "type": "article",
        "updated_at": article.updated_at,
    }
    return Document(
        id=document_id,
        sections=[body_section],
        source=DocumentSource.ZENDESK,
        semantic_identifier=display_title,
        metadata=article_metadata,
    )
def poll_source(
self, start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None
) -> GenerateDocumentsOutput:
# Generator that yields lists (batches) of Documents built from articles.
# Part of the body is elided at the hunk marker below, so how `start` and
# `end` are applied cannot be seen from this excerpt.
@ -64,7 +74,10 @@ class ZendeskConnector(LoadConnector, PollConnector):
# Articles without a body are skipped.
if article.body is None:
continue
# NOTE(review): the next two lines look like the before/after forms of a
# single statement in a diff (method call vs. module-level function call);
# presumably only one of them is meant to run - confirm.
doc_batch.append(self._article_to_document(article))
doc_batch.append(_article_to_document(article))
# Hand the batch to the consumer once it holds at least self.batch_size documents.
if len(doc_batch) >= self.batch_size:
yield doc_batch
# NOTE(review): clear() empties the same list object that was just yielded,
# so a consumer that keeps a reference to a yielded batch will see it
# emptied after resuming the generator. Rebinding to a new list would
# avoid that - confirm how callers consume batches.
doc_batch.clear()
# Yield whatever is left over, if anything.
if doc_batch:
yield doc_batch