Replace HTML processing library with danswer util

This commit is contained in:
Skylar Kesselring
2024-10-24 11:49:11 -04:00
parent 87b5975091
commit cc1e1c178b

View File

@@ -2,7 +2,7 @@ import requests
import json import json
from datetime import datetime, timezone from datetime import datetime, timezone
from typing import Any, List, Optional from typing import Any, List, Optional
from bs4 import BeautifulSoup # Add this import for HTML parsing from danswer.file_processing.html_utils import parse_html_page_basic
from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource from danswer.configs.constants import DocumentSource
from danswer.connectors.interfaces import GenerateDocumentsOutput, PollConnector, LoadConnector from danswer.connectors.interfaces import GenerateDocumentsOutput, PollConnector, LoadConnector
@@ -33,8 +33,7 @@ class FreshdeskConnector(PollConnector, LoadConnector):
] ]
def strip_html_tags(self, html: str) -> str: def strip_html_tags(self, html: str) -> str:
soup = BeautifulSoup(html, 'html.parser') return parse_html_page_basic(html)
return soup.get_text()
def load_credentials(self, credentials: dict[str, Any]) -> Optional[dict[str, Any]]: def load_credentials(self, credentials: dict[str, Any]) -> Optional[dict[str, Any]]:
self.api_key = credentials.get("freshdesk_api_key") self.api_key = credentials.get("freshdesk_api_key")
@@ -43,7 +42,6 @@ class FreshdeskConnector(PollConnector, LoadConnector):
return None return None
def _process_tickets(self, start: datetime, end: datetime) -> GenerateDocumentsOutput: def _process_tickets(self, start: datetime, end: datetime) -> GenerateDocumentsOutput:
logger.info("Processing tickets")
if any([self.api_key, self.domain, self.password]) is None: if any([self.api_key, self.domain, self.password]) is None:
raise ConnectorMissingCredentialError("freshdesk") raise ConnectorMissingCredentialError("freshdesk")