diff --git a/.github/workflows/pr-python-connector-tests.yml b/.github/workflows/pr-python-connector-tests.yml index 88c54b848..c3947b233 100644 --- a/.github/workflows/pr-python-connector-tests.yml +++ b/.github/workflows/pr-python-connector-tests.yml @@ -26,6 +26,10 @@ env: GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR }} # Slab SLAB_BOT_TOKEN: ${{ secrets.SLAB_BOT_TOKEN }} + # Zendesk + ZENDESK_SUBDOMAIN: ${{ secrets.ZENDESK_SUBDOMAIN }} + ZENDESK_EMAIL: ${{ secrets.ZENDESK_EMAIL }} + ZENDESK_TOKEN: ${{ secrets.ZENDESK_TOKEN }} # Salesforce SF_USERNAME: ${{ secrets.SF_USERNAME }} SF_PASSWORD: ${{ secrets.SF_PASSWORD }} diff --git a/backend/onyx/connectors/zendesk/connector.py b/backend/onyx/connectors/zendesk/connector.py index 17df88e76..d5ecb6498 100644 --- a/backend/onyx/connectors/zendesk/connector.py +++ b/backend/onyx/connectors/zendesk/connector.py @@ -10,17 +10,21 @@ from onyx.connectors.cross_connector_utils.miscellaneous_utils import ( time_str_to_utc, ) from onyx.connectors.interfaces import GenerateDocumentsOutput +from onyx.connectors.interfaces import GenerateSlimDocumentOutput from onyx.connectors.interfaces import LoadConnector from onyx.connectors.interfaces import PollConnector from onyx.connectors.interfaces import SecondsSinceUnixEpoch +from onyx.connectors.interfaces import SlimConnector from onyx.connectors.models import BasicExpertInfo from onyx.connectors.models import Document from onyx.connectors.models import Section +from onyx.connectors.models import SlimDocument from onyx.file_processing.html_utils import parse_html_page_basic from onyx.utils.retry_wrapper import retry_builder MAX_PAGE_SIZE = 30 # Zendesk API maximum +_SLIM_BATCH_SIZE = 1000 class ZendeskCredentialsNotSetUpError(PermissionError): @@ -272,7 +276,7 @@ def _ticket_to_document( ) -class ZendeskConnector(LoadConnector, PollConnector): +class ZendeskConnector(LoadConnector, PollConnector, SlimConnector): def __init__( self, 
def retrieve_all_slim_documents(
    self,
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
) -> GenerateSlimDocumentOutput:
    """Yield batches of ID-only ``SlimDocument`` objects for this connector.

    The records are fetched with ``_get_articles`` or ``_get_tickets``
    depending on ``self.content_type``. Article IDs are emitted as
    ``article:<id>`` and ticket IDs as ``zendesk_ticket_<id>``. A batch is
    yielded as soon as it holds ``_SLIM_BATCH_SIZE`` entries, and any
    non-empty remainder is yielded at the end.

    Args:
        start: Lower time bound. When truthy it is truncated with ``int()``
            and forwarded as ``start_time``; otherwise ``None`` is forwarded.
        end: Not used by this method.

    Raises:
        ValueError: If ``self.content_type`` is neither ``"articles"`` nor
            ``"tickets"``. Because this is a generator, the error surfaces
            on first iteration rather than at call time.
    """
    # Resolve the fetcher and the ID prefix once so a single loop can serve
    # both content types.
    content_type = self.content_type
    if content_type == "articles":
        fetch_records, id_prefix = _get_articles, "article:"
    elif content_type == "tickets":
        fetch_records, id_prefix = _get_tickets, "zendesk_ticket_"
    else:
        raise ValueError(f"Unsupported content_type: {self.content_type}")

    records = fetch_records(
        self.client, start_time=None if not start else int(start)
    )

    pending: list[SlimDocument] = []
    for record in records:
        pending.append(SlimDocument(id=f"{id_prefix}{record['id']}"))
        if len(pending) < _SLIM_BATCH_SIZE:
            continue
        yield pending
        # Rebind instead of clearing: the consumer may still hold the
        # list that was just yielded.
        pending = []

    if pending:
        yield pending
@pytest.fixture
def zendesk_ticket_connector() -> ZendeskConnector:
    """Return a ``ZendeskConnector`` in "tickets" mode with credentials loaded."""
    ticket_connector = ZendeskConnector(content_type="tickets")
    ticket_connector.load_credentials(get_credentials())
    return ticket_connector


def get_credentials() -> dict[str, str]:
    """Build the Zendesk credential dict from environment variables.

    Raises:
        KeyError: If ``ZENDESK_SUBDOMAIN``, ``ZENDESK_EMAIL`` or
            ``ZENDESK_TOKEN`` is not set.
    """
    env_var_by_field = {
        "zendesk_subdomain": "ZENDESK_SUBDOMAIN",
        "zendesk_email": "ZENDESK_EMAIL",
        "zendesk_token": "ZENDESK_TOKEN",
    }
    return {field: os.environ[var] for field, var in env_var_by_field.items()}


@pytest.mark.parametrize(
    "connector_fixture", ["zendesk_article_connector", "zendesk_ticket_connector"]
)
def test_zendesk_connector_basic(
    request: pytest.FixtureRequest, connector_fixture: str
) -> None:
    """Poll the connector and check the known fixture document is returned.

    The expected article/ticket ID and article title come from
    ``load_test_data()``. The test runs once per connector fixture named in
    the parametrization.
    """
    connector = request.getfixturevalue(connector_fixture)
    test_data = load_test_data()
    is_article_run = connector.content_type == "articles"

    target_test_doc_id = (
        f"article:{test_data['article']['id']}"
        if is_article_run
        else f"zendesk_ticket_{test_data['ticket']['id']}"
    )

    # Drain every batch from the epoch up to now before inspecting results.
    all_docs: list[Document] = [
        doc
        for doc_batch in connector.poll_source(0, time.time())
        for doc in doc_batch
    ]

    # Keep the last document whose ID matches the target.
    target_doc: Document | None = None
    for doc in all_docs:
        if doc.id == target_test_doc_id:
            target_doc = doc

    assert len(all_docs) > 0, "No documents were retrieved from the connector"
    assert (
        target_doc is not None
    ), "Target document was not found in the retrieved documents"
    assert target_doc.source == DocumentSource.ZENDESK, "Document source is not ZENDESK"

    if is_article_run:
        print(f"target_doc.semantic_identifier {target_doc.semantic_identifier}")
        expected_title = test_data["article"]["semantic_identifier"]
        assert (
            target_doc.semantic_identifier == expected_title
        ), "Article title does not match"
    else:
        expected_prefix = f"Ticket #{test_data['ticket']['id']}"
        assert target_doc.semantic_identifier.startswith(
            expected_prefix
        ), "Ticket ID does not match"


def test_zendesk_connector_slim(zendesk_article_connector: ZendeskConnector) -> None:
    """Check every fully loaded article ID also appears in the slim listing."""
    # IDs produced by the full load path.
    full_ids = {
        doc.id
        for doc_batch in zendesk_article_connector.load_from_state()
        for doc in doc_batch
    }

    # IDs produced by the slim (ID-only) path.
    slim_ids = {
        slim_doc.id
        for slim_doc_batch in zendesk_article_connector.retrieve_all_slim_documents()
        for slim_doc in slim_doc_batch
    }

    # The slim listing may contain extra IDs, but must not miss any full doc.
    assert full_ids.issubset(
        slim_ids
    ), f"Full doc IDs {full_ids} not subset of slim doc IDs {slim_ids}"