Enhance Highspot connector with error handling and add unit tests (#4454)

* Enhance Highspot connector with error handling and add unit tests for poll_source functionality

* Fix file extension validation logic to accept either plain-text or document file formats
Authored by SubashMohan on 2025-04-04 22:23:16 +05:30, committed by GitHub
parent 842a73a242
commit 9dd56a5c80
3 changed files with 271 additions and 164 deletions


@@ -20,7 +20,8 @@ from onyx.connectors.models import ConnectorMissingCredentialError
 from onyx.connectors.models import Document
 from onyx.connectors.models import SlimDocument
 from onyx.connectors.models import TextSection
-from onyx.file_processing.extract_file_text import ALL_ACCEPTED_FILE_EXTENSIONS
+from onyx.file_processing.extract_file_text import ACCEPTED_DOCUMENT_FILE_EXTENSIONS
+from onyx.file_processing.extract_file_text import ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS
 from onyx.file_processing.extract_file_text import extract_file_text
 from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
 from onyx.utils.logger import setup_logger
@@ -84,14 +85,21 @@ class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
         Populate the spot ID map with all available spots.
         Keys are stored as lowercase for case-insensitive lookups.
         """
-        spots = self.client.get_spots()
-        for spot in spots:
-            if "title" in spot and "id" in spot:
-                spot_name = spot["title"]
-                self._spot_id_map[spot_name.lower()] = spot["id"]
+        try:
+            spots = self.client.get_spots()
+            for spot in spots:
+                if "title" in spot and "id" in spot:
+                    spot_name = spot["title"]
+                    self._spot_id_map[spot_name.lower()] = spot["id"]
 
-        self._all_spots_fetched = True
-        logger.info(f"Retrieved {len(self._spot_id_map)} spots from Highspot")
+            self._all_spots_fetched = True
+            logger.info(f"Retrieved {len(self._spot_id_map)} spots from Highspot")
+        except HighspotClientError as e:
+            logger.error(f"Error retrieving spots from Highspot: {str(e)}")
+            raise
+        except Exception as e:
+            logger.error(f"Unexpected error retrieving spots from Highspot: {str(e)}")
+            raise
 
     def _get_all_spot_names(self) -> List[str]:
         """
@@ -151,116 +159,142 @@ class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
             Batches of Document objects
         """
         doc_batch: list[Document] = []
-
-        # If no spots specified, get all spots
-        spot_names_to_process = self.spot_names
-        if not spot_names_to_process:
-            spot_names_to_process = self._get_all_spot_names()
-            logger.info(
-                f"No spots specified, using all {len(spot_names_to_process)} available spots"
-            )
-
-        for spot_name in spot_names_to_process:
-            try:
-                spot_id = self._get_spot_id_from_name(spot_name)
-                if spot_id is None:
-                    logger.warning(f"Spot ID not found for spot {spot_name}")
-                    continue
-                offset = 0
-                has_more = True
-
-                while has_more:
-                    logger.info(
-                        f"Retrieving items from spot {spot_name}, offset {offset}"
-                    )
-                    response = self.client.get_spot_items(
-                        spot_id=spot_id, offset=offset, page_size=self.batch_size
-                    )
-                    items = response.get("collection", [])
-                    logger.info(f"Received Items: {items}")
-                    if not items:
-                        has_more = False
-                        continue
-
-                    for item in items:
-                        try:
-                            item_id = item.get("id")
-                            if not item_id:
-                                logger.warning("Item without ID found, skipping")
-                                continue
-
-                            item_details = self.client.get_item(item_id)
-                            if not item_details:
-                                logger.warning(
-                                    f"Item {item_id} details not found, skipping"
-                                )
-                                continue
-                            # Apply time filter if specified
-                            if start or end:
-                                updated_at = item_details.get("date_updated")
-                                if updated_at:
-                                    # Convert to datetime for comparison
-                                    try:
-                                        updated_time = datetime.fromisoformat(
-                                            updated_at.replace("Z", "+00:00")
-                                        )
-                                        if (
-                                            start and updated_time.timestamp() < start
-                                        ) or (end and updated_time.timestamp() > end):
-                                            continue
-                                    except (ValueError, TypeError):
-                                        # Skip if date cannot be parsed
-                                        logger.warning(
-                                            f"Invalid date format for item {item_id}: {updated_at}"
-                                        )
-                                        continue
-
-                            content = self._get_item_content(item_details)
-                            title = item_details.get("title", "")
-
-                            doc_batch.append(
-                                Document(
-                                    id=f"HIGHSPOT_{item_id}",
-                                    sections=[
-                                        TextSection(
-                                            link=item_details.get(
-                                                "url",
-                                                f"https://www.highspot.com/items/{item_id}",
-                                            ),
-                                            text=content,
-                                        )
-                                    ],
-                                    source=DocumentSource.HIGHSPOT,
-                                    semantic_identifier=title,
-                                    metadata={
-                                        "spot_name": spot_name,
-                                        "type": item_details.get("content_type", ""),
-                                        "created_at": item_details.get(
-                                            "date_added", ""
-                                        ),
-                                        "author": item_details.get("author", ""),
-                                        "language": item_details.get("language", ""),
-                                        "can_download": str(
-                                            item_details.get("can_download", False)
-                                        ),
-                                    },
-                                    doc_updated_at=item_details.get("date_updated"),
-                                )
-                            )
-
-                            if len(doc_batch) >= self.batch_size:
-                                yield doc_batch
-                                doc_batch = []
-                        except HighspotClientError as e:
-                            item_id = "ID" if not item_id else item_id
-                            logger.error(f"Error retrieving item {item_id}: {str(e)}")
-
-                    has_more = len(items) >= self.batch_size
-                    offset += self.batch_size
-            except (HighspotClientError, ValueError) as e:
-                logger.error(f"Error processing spot {spot_name}: {str(e)}")
+        try:
+            # If no spots specified, get all spots
+            spot_names_to_process = self.spot_names
+            if not spot_names_to_process:
+                spot_names_to_process = self._get_all_spot_names()
+                if not spot_names_to_process:
+                    logger.warning("No spots found in Highspot")
+                    raise ValueError("No spots found in Highspot")
+                logger.info(
+                    f"No spots specified, using all {len(spot_names_to_process)} available spots"
+                )
+
+            for spot_name in spot_names_to_process:
+                try:
+                    spot_id = self._get_spot_id_from_name(spot_name)
+                    if spot_id is None:
+                        logger.warning(f"Spot ID not found for spot {spot_name}")
+                        continue
+                    offset = 0
+                    has_more = True
+
+                    while has_more:
+                        logger.info(
+                            f"Retrieving items from spot {spot_name}, offset {offset}"
+                        )
+                        response = self.client.get_spot_items(
+                            spot_id=spot_id, offset=offset, page_size=self.batch_size
+                        )
+                        items = response.get("collection", [])
+                        logger.info(f"Received Items: {items}")
+                        if not items:
+                            has_more = False
+                            continue
+
+                        for item in items:
+                            try:
+                                item_id = item.get("id")
+                                if not item_id:
+                                    logger.warning("Item without ID found, skipping")
+                                    continue
+
+                                item_details = self.client.get_item(item_id)
+                                if not item_details:
+                                    logger.warning(
+                                        f"Item {item_id} details not found, skipping"
+                                    )
+                                    continue
+                                # Apply time filter if specified
+                                if start or end:
+                                    updated_at = item_details.get("date_updated")
+                                    if updated_at:
+                                        # Convert to datetime for comparison
+                                        try:
+                                            updated_time = datetime.fromisoformat(
+                                                updated_at.replace("Z", "+00:00")
+                                            )
+                                            if (
+                                                start
+                                                and updated_time.timestamp() < start
+                                            ) or (
+                                                end and updated_time.timestamp() > end
+                                            ):
+                                                continue
+                                        except (ValueError, TypeError):
+                                            # Skip if date cannot be parsed
+                                            logger.warning(
+                                                f"Invalid date format for item {item_id}: {updated_at}"
+                                            )
+                                            continue
+
+                                content = self._get_item_content(item_details)
+
+                                title = item_details.get("title", "")
+
+                                doc_batch.append(
+                                    Document(
+                                        id=f"HIGHSPOT_{item_id}",
+                                        sections=[
+                                            TextSection(
+                                                link=item_details.get(
+                                                    "url",
+                                                    f"https://www.highspot.com/items/{item_id}",
+                                                ),
+                                                text=content,
+                                            )
+                                        ],
+                                        source=DocumentSource.HIGHSPOT,
+                                        semantic_identifier=title,
+                                        metadata={
+                                            "spot_name": spot_name,
+                                            "type": item_details.get(
+                                                "content_type", ""
+                                            ),
+                                            "created_at": item_details.get(
+                                                "date_added", ""
+                                            ),
+                                            "author": item_details.get("author", ""),
+                                            "language": item_details.get(
+                                                "language", ""
+                                            ),
+                                            "can_download": str(
+                                                item_details.get("can_download", False)
+                                            ),
+                                        },
+                                        doc_updated_at=item_details.get("date_updated"),
+                                    )
+                                )
+
+                                if len(doc_batch) >= self.batch_size:
+                                    yield doc_batch
+                                    doc_batch = []
+                            except HighspotClientError as e:
+                                item_id = "ID" if not item_id else item_id
+                                logger.error(
+                                    f"Error retrieving item {item_id}: {str(e)}"
+                                )
+                            except Exception as e:
+                                item_id = "ID" if not item_id else item_id
+                                logger.error(
+                                    f"Unexpected error for item {item_id}: {str(e)}"
+                                )
+
+                        has_more = len(items) >= self.batch_size
+                        offset += self.batch_size
+                except (HighspotClientError, ValueError) as e:
+                    logger.error(f"Error processing spot {spot_name}: {str(e)}")
+                except Exception as e:
+                    logger.error(
+                        f"Unexpected error processing spot {spot_name}: {str(e)}"
+                    )
+        except Exception as e:
+            logger.error(f"Error in Highspot connector: {str(e)}")
+            raise
 
         if doc_batch:
             yield doc_batch
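
The time filter in the hunk above compares Highspot's ISO-8601 "date_updated" strings against the epoch-second start/end bounds that poll_source receives. A minimal standalone sketch of that comparison (the within_window helper is illustrative, not part of the connector):

    from datetime import datetime

    def within_window(updated_at: str, start: float | None, end: float | None) -> bool:
        """True if an ISO-8601 timestamp falls inside the [start, end] epoch-second window."""
        try:
            # Highspot dates look like "2025-04-03T10:15:00Z"; fromisoformat wants "+00:00"
            ts = datetime.fromisoformat(updated_at.replace("Z", "+00:00")).timestamp()
        except (ValueError, TypeError):
            return False  # unparseable dates are skipped, mirroring the connector
        if start and ts < start:
            return False
        if end and ts > end:
            return False
        return True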
@@ -286,7 +320,9 @@ class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
         # Extract title and description once at the beginning
         title, description = self._extract_title_and_description(item_details)
         default_content = f"{title}\n{description}"
-        logger.info(f"Processing item {item_id} with extension {file_extension}")
+        logger.info(
+            f"Processing item {item_id} with extension {file_extension} and file name {content_name}"
+        )
 
         try:
             if content_type == "WebLink":
@@ -298,30 +334,39 @@ class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
             elif (
                 is_valid_format
-                and file_extension in ALL_ACCEPTED_FILE_EXTENSIONS
+                and (
+                    file_extension in ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS
+                    or file_extension in ACCEPTED_DOCUMENT_FILE_EXTENSIONS
+                )
                 and can_download
             ):
+                # For documents, try to get the text content
+                if not item_id:  # Ensure item_id is defined
+                    return default_content
                 content_response = self.client.get_item_content(item_id)
 
                 # Process and extract text from binary content based on type
                 if content_response:
                     text_content = extract_file_text(
-                        BytesIO(content_response), content_name
+                        BytesIO(content_response), content_name, False
                     )
-                    return text_content
+                    return text_content if text_content else default_content
                 return default_content
             else:
                 return default_content
 
         except HighspotClientError as e:
-            # Use item_id safely in the warning message
-            error_context = f"item {item_id}" if item_id else "item"
+            error_context = f"item {item_id}" if item_id else "(item id not found)"
             logger.warning(f"Could not retrieve content for {error_context}: {str(e)}")
-            return ""
+            return default_content
+        except ValueError as e:
+            error_context = f"item {item_id}" if item_id else "(item id not found)"
+            logger.error(f"Value error for {error_context}: {str(e)}")
+            return default_content
+        except Exception as e:
+            error_context = f"item {item_id}" if item_id else "(item id not found)"
+            logger.error(
+                f"Unexpected error retrieving content for {error_context}: {str(e)}"
+            )
+            return default_content
 
     def _extract_title_and_description(
         self, item_details: Dict[str, Any]
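
The second commit bullet shows up in the hunk above: instead of one combined ALL_ACCEPTED_FILE_EXTENSIONS list, the gate now passes if the extension belongs to either accepted family. A minimal sketch of the check in isolation (is_extractable is an illustrative name, not a function in the connector; the two lists are the ones imported at the top of the file):

    from onyx.file_processing.extract_file_text import ACCEPTED_DOCUMENT_FILE_EXTENSIONS
    from onyx.file_processing.extract_file_text import ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS

    def is_extractable(file_extension: str) -> bool:
        # e.g. ".txt" satisfies the plain-text list, ".pdf" the document list
        return (
            file_extension in ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS
            or file_extension in ACCEPTED_DOCUMENT_FILE_EXTENSIONS
        )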
@@ -358,55 +403,63 @@ class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
             Batches of SlimDocument objects
         """
         slim_doc_batch: list[SlimDocument] = []
-
-        # If no spots specified, get all spots
-        spot_names_to_process = self.spot_names
-        if not spot_names_to_process:
-            spot_names_to_process = self._get_all_spot_names()
-            logger.info(
-                f"No spots specified, using all {len(spot_names_to_process)} available spots for slim documents"
-            )
-
-        for spot_name in spot_names_to_process:
-            try:
-                spot_id = self._get_spot_id_from_name(spot_name)
-                offset = 0
-                has_more = True
-
-                while has_more:
-                    logger.info(
-                        f"Retrieving slim documents from spot {spot_name}, offset {offset}"
-                    )
-                    response = self.client.get_spot_items(
-                        spot_id=spot_id, offset=offset, page_size=self.batch_size
-                    )
-                    items = response.get("collection", [])
-                    if not items:
-                        has_more = False
-                        continue
-
-                    for item in items:
-                        item_id = item.get("id")
-                        if not item_id:
-                            continue
-                        slim_doc_batch.append(SlimDocument(id=f"HIGHSPOT_{item_id}"))
-                        if len(slim_doc_batch) >= _SLIM_BATCH_SIZE:
-                            yield slim_doc_batch
-                            slim_doc_batch = []
-
-                    has_more = len(items) >= self.batch_size
-                    offset += self.batch_size
-            except (HighspotClientError, ValueError) as e:
-                logger.error(
-                    f"Error retrieving slim documents from spot {spot_name}: {str(e)}"
-                )
-
-        if slim_doc_batch:
-            yield slim_doc_batch
+        try:
+            # If no spots specified, get all spots
+            spot_names_to_process = self.spot_names
+            if not spot_names_to_process:
+                spot_names_to_process = self._get_all_spot_names()
+                if not spot_names_to_process:
+                    logger.warning("No spots found in Highspot")
+                    raise ValueError("No spots found in Highspot")
+                logger.info(
+                    f"No spots specified, using all {len(spot_names_to_process)} available spots for slim documents"
+                )
+
+            for spot_name in spot_names_to_process:
+                try:
+                    spot_id = self._get_spot_id_from_name(spot_name)
+                    offset = 0
+                    has_more = True
+
+                    while has_more:
+                        logger.info(
+                            f"Retrieving slim documents from spot {spot_name}, offset {offset}"
+                        )
+                        response = self.client.get_spot_items(
+                            spot_id=spot_id, offset=offset, page_size=self.batch_size
+                        )
+                        items = response.get("collection", [])
+                        if not items:
+                            has_more = False
+                            continue
+
+                        for item in items:
+                            item_id = item.get("id")
+                            if not item_id:
+                                continue
+                            slim_doc_batch.append(
+                                SlimDocument(id=f"HIGHSPOT_{item_id}")
+                            )
+                            if len(slim_doc_batch) >= _SLIM_BATCH_SIZE:
+                                yield slim_doc_batch
+                                slim_doc_batch = []
+
+                        has_more = len(items) >= self.batch_size
+                        offset += self.batch_size
+                except (HighspotClientError, ValueError) as e:
+                    logger.error(
+                        f"Error retrieving slim documents from spot {spot_name}: {str(e)}"
+                    )
+
+            if slim_doc_batch:
+                yield slim_doc_batch
+        except Exception as e:
+            logger.error(f"Error in Highspot Slim Connector: {str(e)}")
+            raise
 
     def validate_credentials(self) -> bool:
         """


@@ -1,6 +1,7 @@
 import json
 import os
 import time
+from datetime import datetime
 from pathlib import Path
 from unittest.mock import MagicMock
 from unittest.mock import patch
@@ -105,6 +106,54 @@ def test_highspot_connector_slim(
     assert len(all_slim_doc_ids) > 0
 
 
+@patch(
+    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
+    return_value=None,
+)
+def test_highspot_connector_poll_source(
+    mock_get_api_key: MagicMock, highspot_connector: HighspotConnector
+) -> None:
+    """Test poll_source functionality with date range filtering."""
+    # Define date range: April 3, 2025 to April 4, 2025
+    start_date = datetime(2025, 4, 3, 0, 0, 0)
+    end_date = datetime(2025, 4, 4, 23, 59, 59)
+
+    # Convert to seconds since Unix epoch
+    start_time = int(time.mktime(start_date.timetuple()))
+    end_time = int(time.mktime(end_date.timetuple()))
+
+    # Load test data for assertions
+    test_data = load_test_data()
+    poll_source_data = test_data.get("poll_source", {})
+    target_doc_id = poll_source_data.get("target_doc_id")
+
+    # Call poll_source with date range
+    all_docs: list[Document] = []
+    target_doc: Document | None = None
+
+    for doc_batch in highspot_connector.poll_source(start_time, end_time):
+        for doc in doc_batch:
+            all_docs.append(doc)
+            if doc.id == f"HIGHSPOT_{target_doc_id}":
+                target_doc = doc
+
+    # Verify documents were loaded
+    assert len(all_docs) > 0
+
+    # Verify the specific test document was found and has correct properties
+    assert target_doc is not None
+    assert target_doc.semantic_identifier == poll_source_data.get("semantic_identifier")
+    assert target_doc.source == DocumentSource.HIGHSPOT
+    assert target_doc.metadata is not None
+
+    # Verify sections
+    assert len(target_doc.sections) == 1
+    section = target_doc.sections[0]
+    assert section.link == poll_source_data.get("link")
+    assert section.text is not None
+    assert len(section.text) > 0
+
+
 def test_highspot_connector_validate_credentials(
     highspot_connector: HighspotConnector,
 ) -> None:
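
Note that poll_source takes plain epoch seconds, so the test converts its datetime bounds with time.mktime before calling it. A minimal usage sketch along the same lines (assumes connector is an already-credentialed HighspotConnector):

    import time
    from datetime import datetime

    start = int(time.mktime(datetime(2025, 4, 3).timetuple()))
    end = int(time.mktime(datetime(2025, 4, 4, 23, 59, 59).timetuple()))

    for batch in connector.poll_source(start, end):  # connector: assumed pre-configured
        for doc in batch:
            print(doc.id, doc.semantic_identifier)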


@@ -1,5 +1,10 @@
 {
     "target_doc_id": "67cd8eb35d3ee0487de2e704",
     "semantic_identifier": "Highspot in Action _ Salesforce Integration",
-    "link": "https://www.highspot.com/items/67cd8eb35d3ee0487de2e704"
+    "link": "https://www.highspot.com/items/67cd8eb35d3ee0487de2e704",
+    "poll_source": {
+        "target_doc_id":"67ef9edcc3f40b2bf3d816a8",
+        "semantic_identifier":"A Brief Introduction To AI",
+        "link":"https://www.highspot.com/items/67ef9edcc3f40b2bf3d816a8"
+    }
 }