Replace HTML processing library with danswer util

2025-09-18 11:34:12 +02:00 · 2024-10-24 11:49:11 -04:00
parent 87b5975091
commit cc1e1c178b
1 changed files with 2 additions and 4 deletions
--- a/backend/danswer/connectors/freshdesk/connector.py
+++ b/backend/danswer/connectors/freshdesk/connector.py
@@ -2,7 +2,7 @@ import requests
 import json
 from datetime import datetime, timezone
 from typing import Any, List, Optional
-from bs4 import BeautifulSoup  # Add this import for HTML parsing
+from danswer.file_processing.html_utils import parse_html_page_basic
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
 from danswer.connectors.interfaces import GenerateDocumentsOutput, PollConnector, LoadConnector
@@ -33,8 +33,7 @@ class FreshdeskConnector(PollConnector, LoadConnector):
        ]

    def strip_html_tags(self, html: str) -> str:
-        soup = BeautifulSoup(html, 'html.parser')
-        return soup.get_text()
+        return parse_html_page_basic(html)

    def load_credentials(self, credentials: dict[str, Any]) -> Optional[dict[str, Any]]:
        self.api_key = credentials.get("freshdesk_api_key")
@@ -43,7 +42,6 @@ class FreshdeskConnector(PollConnector, LoadConnector):
        return None

    def _process_tickets(self, start: datetime, end: datetime) -> GenerateDocumentsOutput:
-        logger.info("Processing tickets")
        if any([self.api_key, self.domain, self.password]) is None:
            raise ConnectorMissingCredentialError("freshdesk")