Zendesk Connector Metadata and small batch fix (#866)

This commit is contained in:
Yuhong Sun 2023-12-23 16:34:48 -08:00 committed by GitHub
parent dca4f7a72b
commit a122510cee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,3 +1,5 @@
from datetime import datetime
from datetime import timezone
from typing import Any from typing import Any
from zenpy import Zenpy # type: ignore from zenpy import Zenpy # type: ignore
@ -5,14 +7,34 @@ from zenpy.lib.api_objects.help_centre_objects import Article # type: ignore
from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import Document from danswer.connectors.models import Document
from danswer.connectors.models import Section from danswer.connectors.models import Section
def _article_to_document(article: Article) -> Document:
    """Convert a Zendesk Help Centre article into an indexable ``Document``.

    Args:
        article: The Zenpy ``Article`` to convert. Its ``body`` is passed to
            the HTML parser, so it must not be ``None``; ``updated_at`` must
            be an ISO 8601 timestamp string.

    Returns:
        A ``Document`` with a single section holding the article text parsed
        from its HTML body, the article author as primary owner, and the
        last-update time expressed in UTC.

    Raises:
        ValueError: If ``article.updated_at`` is not a valid ISO 8601 string.
    """
    author = BasicExpertInfo(
        display_name=article.author.name, email=article.author.email
    )

    # ``datetime.fromisoformat`` only accepts the "Z" (Zulu/UTC) designator
    # from Python 3.11 onwards, so rewrite it as an explicit offset to keep
    # the parse working on older interpreters as well.
    raw_updated_at = article.updated_at
    if raw_updated_at.endswith(("Z", "z")):
        raw_updated_at = raw_updated_at[:-1] + "+00:00"
    update_time = datetime.fromisoformat(raw_updated_at)

    # A timestamp without an offset is taken as UTC rather than the host's
    # local time zone (which is what ``astimezone`` assumes for naive values),
    # so the result does not depend on where the connector runs.
    if update_time.tzinfo is None:
        update_time = update_time.replace(tzinfo=timezone.utc)
    else:
        update_time = update_time.astimezone(timezone.utc)

    return Document(
        id=f"article:{article.id}",
        sections=[
            Section(link=article.html_url, text=parse_html_page_basic(article.body))
        ],
        source=DocumentSource.ZENDESK,
        semantic_identifier=article.title,
        doc_updated_at=update_time,
        primary_owners=[author],
        metadata={"type": "article"},
    )
class ZendeskClientNotSetUpError(PermissionError): class ZendeskClientNotSetUpError(PermissionError):
def __init__(self) -> None: def __init__(self) -> None:
super().__init__("Zendesk Client is not set up, was load_credentials called?") super().__init__("Zendesk Client is not set up, was load_credentials called?")
@ -34,18 +56,6 @@ class ZendeskConnector(LoadConnector, PollConnector):
def load_from_state(self) -> GenerateDocumentsOutput: def load_from_state(self) -> GenerateDocumentsOutput:
return self.poll_source(None, None) return self.poll_source(None, None)
def _article_to_document(self, article: Article) -> Document:
return Document(
id=f"article:{article.id}",
sections=[Section(link=article.html_url, text=article.body)],
source=DocumentSource.ZENDESK,
semantic_identifier="Article: " + article.title,
metadata={
"type": "article",
"updated_at": article.updated_at,
},
)
def poll_source( def poll_source(
self, start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None self, start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None
) -> GenerateDocumentsOutput: ) -> GenerateDocumentsOutput:
@ -64,7 +74,10 @@ class ZendeskConnector(LoadConnector, PollConnector):
if article.body is None: if article.body is None:
continue continue
doc_batch.append(self._article_to_document(article)) doc_batch.append(_article_to_document(article))
if len(doc_batch) >= self.batch_size: if len(doc_batch) >= self.batch_size:
yield doc_batch yield doc_batch
doc_batch.clear() doc_batch.clear()
if doc_batch:
yield doc_batch