mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-04-04 09:58:32 +02:00
Highspot connector (#4277)
This commit is contained in:
parent
f45798b5dd
commit
ba514aaaa2
@ -50,6 +50,9 @@ env:
|
||||
GITBOOK_API_KEY: ${{ secrets.GITBOOK_API_KEY }}
|
||||
# Notion
|
||||
NOTION_INTEGRATION_TOKEN: ${{ secrets.NOTION_INTEGRATION_TOKEN }}
|
||||
# Highspot
|
||||
HIGHSPOT_KEY: ${{ secrets.HIGHSPOT_KEY }}
|
||||
HIGHSPOT_SECRET: ${{ secrets.HIGHSPOT_SECRET }}
|
||||
|
||||
jobs:
|
||||
connectors-check:
|
||||
|
@ -174,6 +174,7 @@ class DocumentSource(str, Enum):
|
||||
FIREFLIES = "fireflies"
|
||||
EGNYTE = "egnyte"
|
||||
AIRTABLE = "airtable"
|
||||
HIGHSPOT = "highspot"
|
||||
|
||||
# Special case just for integration tests
|
||||
MOCK_CONNECTOR = "mock_connector"
|
||||
|
@ -30,6 +30,7 @@ from onyx.connectors.gong.connector import GongConnector
|
||||
from onyx.connectors.google_drive.connector import GoogleDriveConnector
|
||||
from onyx.connectors.google_site.connector import GoogleSitesConnector
|
||||
from onyx.connectors.guru.connector import GuruConnector
|
||||
from onyx.connectors.highspot.connector import HighspotConnector
|
||||
from onyx.connectors.hubspot.connector import HubSpotConnector
|
||||
from onyx.connectors.interfaces import BaseConnector
|
||||
from onyx.connectors.interfaces import CheckpointConnector
|
||||
@ -117,6 +118,7 @@ def identify_connector_class(
|
||||
DocumentSource.FIREFLIES: FirefliesConnector,
|
||||
DocumentSource.EGNYTE: EgnyteConnector,
|
||||
DocumentSource.AIRTABLE: AirtableConnector,
|
||||
DocumentSource.HIGHSPOT: HighspotConnector,
|
||||
# just for integration tests
|
||||
DocumentSource.MOCK_CONNECTOR: MockConnector,
|
||||
}
|
||||
|
4
backend/onyx/connectors/highspot/__init__.py
Normal file
4
backend/onyx/connectors/highspot/__init__.py
Normal file
@ -0,0 +1,4 @@
|
||||
"""
|
||||
Highspot connector package for Onyx.
|
||||
Enables integration with Highspot's knowledge base.
|
||||
"""
|
280
backend/onyx/connectors/highspot/client.py
Normal file
280
backend/onyx/connectors/highspot/client.py
Normal file
@ -0,0 +1,280 @@
|
||||
import base64
|
||||
from typing import Any
|
||||
from typing import Dict
|
||||
from typing import List
|
||||
from typing import Optional
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
from requests.exceptions import HTTPError
|
||||
from requests.exceptions import RequestException
|
||||
from requests.exceptions import Timeout
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
class HighspotClientError(Exception):
    """Root of the Highspot API client exception hierarchy.

    Attributes:
        message: Human-readable description of the failure; also used as the
            exception's string representation.
        status_code: HTTP status code associated with the failure, or ``None``
            when the caller did not provide one.
    """

    def __init__(self, message: str, status_code: Optional[int] = None):
        # Delegate first so ``str(exc)`` and ``exc.args`` carry the message.
        super().__init__(message)
        self.status_code = status_code
        self.message = message
|
||||
|
||||
|
||||
class HighspotAuthenticationError(HighspotClientError):
    """Signals that the Highspot API rejected the supplied credentials."""
|
||||
|
||||
|
||||
class HighspotRateLimitError(HighspotClientError):
    """Exception raised when the Highspot API rate limit is exceeded.

    Attributes:
        retry_after: Raw value of the ``Retry-After`` response header as passed
            in by the caller, or ``None`` when it was not available.
        status_code: Always 429, so handlers that inspect ``status_code`` on
            any ``HighspotClientError`` see a populated value here as well.
    """

    def __init__(self, message: str, retry_after: Optional[str] = None):
        self.retry_after = retry_after
        # Rate limiting is signalled by HTTP 429; record it instead of leaving
        # the inherited ``status_code`` attribute as None.
        super().__init__(message, status_code=429)
|
||||
|
||||
|
||||
class HighspotClient:
    """
    Client for interacting with the Highspot API.

    Uses basic authentication with provided key (username) and secret (password).
    All requests go through one ``requests.Session`` whose HTTP adapter is
    configured with a urllib3 ``Retry`` strategy. Transport and HTTP failures
    are surfaced as ``HighspotClientError`` (or a subclass).
    """

    BASE_URL = "https://api-su2.highspot.com/v1.0/"

    def __init__(
        self,
        key: str,
        secret: str,
        base_url: str = BASE_URL,
        timeout: int = 30,
        max_retries: int = 3,
        backoff_factor: float = 0.5,
        status_forcelist: Optional[List[int]] = None,
    ):
        """
        Initialize the Highspot API client.

        Args:
            key: API key (used as username)
            secret: API secret (used as password)
            base_url: Base URL for the Highspot API (a trailing slash is optional)
            timeout: Request timeout in seconds
            max_retries: Maximum number of retries for failed requests
            backoff_factor: Backoff factor for retries
            status_forcelist: HTTP status codes to retry on
                (defaults to 429, 500, 502, 503, 504)

        Raises:
            ValueError: If ``key`` or ``secret`` is empty
        """
        if not key or not secret:
            raise ValueError("API key and secret are required")

        self.key = key
        self.secret = secret
        self.base_url = base_url
        self.timeout = timeout

        # Set up session with retry logic
        self.session = requests.Session()
        retry_strategy = Retry(
            total=max_retries,
            backoff_factor=backoff_factor,
            status_forcelist=status_forcelist or [429, 500, 502, 503, 504],
            allowed_methods=["GET", "POST", "PUT", "DELETE"],
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)

        # Set up authentication
        self._setup_auth()

    def _setup_auth(self) -> None:
        """Install the Basic-auth and JSON content headers on the session."""
        auth = f"{self.key}:{self.secret}"
        encoded_auth = base64.b64encode(auth.encode()).decode()
        self.session.headers.update(
            {
                "Authorization": f"Basic {encoded_auth}",
                "Content-Type": "application/json",
                "Accept": "application/json",
            }
        )

    def _build_url(self, endpoint: str) -> str:
        """
        Join an endpoint onto the configured base URL.

        ``urljoin`` replaces the last path segment of a base that has no
        trailing slash (``.../v1.0`` + ``spots`` -> ``.../spots``) and discards
        the base path entirely for an endpoint with a leading slash. Both
        sides are normalized here so a user-supplied base URL works with or
        without the trailing slash.

        Args:
            endpoint: API endpoint relative to the base URL

        Returns:
            Absolute request URL
        """
        return urljoin(self.base_url.rstrip("/") + "/", endpoint.lstrip("/"))

    def _make_request(
        self,
        method: str,
        endpoint: str,
        params: Optional[Dict[str, Any]] = None,
        data: Optional[Dict[str, Any]] = None,
        json_data: Optional[Dict[str, Any]] = None,
        headers: Optional[Dict[str, str]] = None,
    ) -> Dict[str, Any]:
        """
        Make a request to the Highspot API.

        Args:
            method: HTTP method (GET, POST, etc.)
            endpoint: API endpoint
            params: URL parameters
            data: Form data
            json_data: JSON data
            headers: Additional headers

        Returns:
            API response as a dictionary; empty dict when the body is empty

        Raises:
            HighspotAuthenticationError: On HTTP 401
            HighspotRateLimitError: On HTTP 429
            HighspotClientError: On any other HTTP error status, on timeout,
                and on any other ``requests`` failure (the original exception
                is attached as ``__cause__``)
        """
        url = self._build_url(endpoint)
        request_headers: Dict[str, str] = dict(headers) if headers else {}

        try:
            logger.debug(f"Making {method} request to {url}")
            response = self.session.request(
                method=method,
                url=url,
                params=params,
                data=data,
                json=json_data,
                headers=request_headers,
                timeout=self.timeout,
            )
            response.raise_for_status()

            if response.content and response.content.strip():
                return response.json()
            return {}

        except HTTPError as e:
            status_code = e.response.status_code
            error_msg = str(e)

            # Prefer the API-provided message when the error body is a JSON
            # object; otherwise keep the generic HTTPError text.
            try:
                error_data = e.response.json()
                if isinstance(error_data, dict):
                    error_msg = error_data.get("message", str(e))
            except (ValueError, KeyError):
                pass

            if status_code == 401:
                raise HighspotAuthenticationError(
                    f"Authentication failed: {error_msg}", status_code=status_code
                ) from e
            elif status_code == 429:
                retry_after = e.response.headers.get("Retry-After")
                raise HighspotRateLimitError(
                    f"Rate limit exceeded: {error_msg}", retry_after=retry_after
                ) from e
            else:
                raise HighspotClientError(
                    f"API error {status_code}: {error_msg}", status_code=status_code
                ) from e

        except Timeout as e:
            raise HighspotClientError("Request timed out") from e
        except RequestException as e:
            raise HighspotClientError(f"Request failed: {str(e)}") from e

    def get_spots(self) -> List[Dict[str, Any]]:
        """
        Get all available spots.

        Returns:
            The ``collection`` list from the response when ``counts_total`` is
            a positive number, otherwise an empty list
        """
        params = {"right": "view"}
        response = self._make_request("GET", "spots", params=params)
        total_counts = response.get("counts_total")
        # Log the count only; the full payload is noisy and may be large.
        logger.info(f"Received spots response with counts_total={total_counts}")
        # counts_total may be absent, so guard against comparing None
        if total_counts is not None and total_counts > 0:
            return response.get("collection", [])
        return []

    def get_spot(self, spot_id: str) -> Dict[str, Any]:
        """
        Get details for a specific spot.

        Args:
            spot_id: ID of the spot

        Returns:
            Spot details

        Raises:
            ValueError: If ``spot_id`` is empty
        """
        if not spot_id:
            raise ValueError("spot_id is required")
        return self._make_request("GET", f"spots/{spot_id}")

    def get_spot_items(
        self, spot_id: str, offset: int = 0, page_size: int = 100
    ) -> Dict[str, Any]:
        """
        Get items in a specific spot.

        Args:
            spot_id: ID of the spot
            offset: Index of the first item to return (sent as ``start``)
            page_size: Number of items per page (sent as ``limit``)

        Returns:
            Items in the spot

        Raises:
            ValueError: If ``spot_id`` is empty
        """
        if not spot_id:
            raise ValueError("spot_id is required")

        params = {"spot": spot_id, "start": offset, "limit": page_size}
        return self._make_request("GET", "items", params=params)

    def get_item(self, item_id: str) -> Dict[str, Any]:
        """
        Get details for a specific item.

        Args:
            item_id: ID of the item

        Returns:
            Item details

        Raises:
            ValueError: If ``item_id`` is empty
        """
        if not item_id:
            raise ValueError("item_id is required")
        return self._make_request("GET", f"items/{item_id}")

    def get_item_content(self, item_id: str) -> bytes:
        """
        Get the raw content of an item.

        Args:
            item_id: ID of the item

        Returns:
            Raw content bytes

        Raises:
            ValueError: If ``item_id`` is empty
            HighspotClientError: If the download fails. Raw ``requests``
                exceptions are wrapped so callers that handle
                ``HighspotClientError`` also cover content downloads.
        """
        if not item_id:
            raise ValueError("item_id is required")

        url = self._build_url(f"items/{item_id}/content")
        try:
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
        except HTTPError as e:
            status_code = e.response.status_code if e.response is not None else None
            raise HighspotClientError(
                f"Failed to download content for item {item_id}: {str(e)}",
                status_code=status_code,
            ) from e
        except RequestException as e:
            raise HighspotClientError(
                f"Failed to download content for item {item_id}: {str(e)}"
            ) from e
        return response.content

    def health_check(self) -> bool:
        """
        Check if the API is accessible and credentials are valid.

        Returns:
            True if a minimal ``spots`` request succeeds, False if it raises
            any ``HighspotClientError``
        """
        try:
            self._make_request("GET", "spots", params={"limit": 1})
            return True
        except HighspotClientError:
            # Covers HighspotAuthenticationError and HighspotRateLimitError
            # too, since both derive from HighspotClientError.
            return False
|
431
backend/onyx/connectors/highspot/connector.py
Normal file
431
backend/onyx/connectors/highspot/connector.py
Normal file
@ -0,0 +1,431 @@
|
||||
from datetime import datetime
|
||||
from io import BytesIO
|
||||
from typing import Any
|
||||
from typing import Dict
|
||||
from typing import List
|
||||
from typing import Optional
|
||||
|
||||
from onyx.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.highspot.client import HighspotClient
|
||||
from onyx.connectors.highspot.client import HighspotClientError
|
||||
from onyx.connectors.highspot.utils import scrape_url_content
|
||||
from onyx.connectors.interfaces import GenerateDocumentsOutput
|
||||
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
|
||||
from onyx.connectors.interfaces import LoadConnector
|
||||
from onyx.connectors.interfaces import PollConnector
|
||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.interfaces import SlimConnector
|
||||
from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import SlimDocument
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.file_processing.extract_file_text import extract_file_text
|
||||
from onyx.file_processing.extract_file_text import VALID_FILE_EXTENSIONS
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
_SLIM_BATCH_SIZE = 1000
|
||||
|
||||
|
||||
class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
    """
    Connector for loading data from Highspot.

    Retrieves content from specified spots using the Highspot API.
    If no spots are specified, retrieves content from all available spots.
    Spot names are matched case-insensitively.
    """

    def __init__(
        self,
        spot_names: Optional[List[str]] = None,
        batch_size: int = INDEX_BATCH_SIZE,
    ):
        """
        Initialize the Highspot connector.

        Args:
            spot_names: List of spot names to retrieve content from
                (if empty or None, gets all spots)
            batch_size: Number of items to request per page and to yield per
                document batch
        """
        # Copy into a fresh list: avoids the shared-mutable-default pitfall and
        # keeps the connector independent of later changes to the caller's list.
        self.spot_names: List[str] = list(spot_names) if spot_names else []
        self.batch_size = batch_size
        self._client: Optional[HighspotClient] = None
        self._spot_id_map: Dict[str, str] = {}  # Maps lowercased spot names to spot IDs
        self._all_spots_fetched = False
        self.highspot_url: Optional[str] = None
        self.key: Optional[str] = None
        self.secret: Optional[str] = None

    @property
    def client(self) -> HighspotClient:
        """
        Lazily build and cache the API client.

        Raises:
            ConnectorMissingCredentialError: If key or secret has not been loaded
        """
        if self._client is None:
            if not self.key or not self.secret:
                raise ConnectorMissingCredentialError("Highspot")
            # Ensure highspot_url is a string, use default if None
            base_url = (
                self.highspot_url
                if self.highspot_url is not None
                else HighspotClient.BASE_URL
            )
            self._client = HighspotClient(self.key, self.secret, base_url=base_url)
        return self._client

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """
        Store the Highspot URL, key and secret from the credentials dict.

        Reads the ``highspot_url``, ``highspot_key`` and ``highspot_secret``
        keys; missing keys are stored as None. Always returns None.
        """
        logger.info("Loading Highspot credentials")
        self.highspot_url = credentials.get("highspot_url")
        self.key = credentials.get("highspot_key")
        self.secret = credentials.get("highspot_secret")
        return None

    def _populate_spot_id_map(self) -> None:
        """
        Populate the spot ID map with all available spots.
        Keys are stored as lowercase for case-insensitive lookups.
        """
        spots = self.client.get_spots()
        for spot in spots:
            if "title" in spot and "id" in spot:
                spot_name = spot["title"]
                self._spot_id_map[spot_name.lower()] = spot["id"]

        self._all_spots_fetched = True
        logger.info(f"Retrieved {len(self._spot_id_map)} spots from Highspot")

    def _get_all_spot_names(self) -> List[str]:
        """
        Retrieve all available spot names.

        Returns:
            List of all spot names (lowercased, as stored in the ID map)
        """
        if not self._all_spots_fetched:
            self._populate_spot_id_map()

        return list(self._spot_id_map)

    def _get_spot_id_from_name(self, spot_name: str) -> str:
        """
        Get spot ID from a spot name.

        Args:
            spot_name: Name of the spot (case-insensitive)

        Returns:
            ID of the spot

        Raises:
            ValueError: If spot name is not found
        """
        if not self._all_spots_fetched:
            self._populate_spot_id_map()

        spot_name_lower = spot_name.lower()
        if spot_name_lower not in self._spot_id_map:
            raise ValueError(f"Spot '{spot_name}' not found")

        return self._spot_id_map[spot_name_lower]

    def load_from_state(self) -> GenerateDocumentsOutput:
        """
        Load content from configured spots in Highspot.
        If no spots are configured, loads from all spots.

        Yields:
            Batches of Document objects
        """
        return self.poll_source(None, None)

    def _is_outside_time_range(
        self,
        item_id: str,
        item_details: Dict[str, Any],
        start: SecondsSinceUnixEpoch | None,
        end: SecondsSinceUnixEpoch | None,
    ) -> bool:
        """
        Decide whether an item should be skipped by the poll time filter.

        Items with no ``date_updated`` value are never filtered out. Items
        whose timestamp cannot be parsed are skipped with a warning.
        """
        if not (start or end):
            return False

        updated_at = item_details.get("date_updated")
        if not updated_at:
            return False

        try:
            # fromisoformat does not accept a trailing "Z" on older Pythons
            updated_ts = datetime.fromisoformat(
                updated_at.replace("Z", "+00:00")
            ).timestamp()
        except (ValueError, TypeError, AttributeError):
            # AttributeError covers a non-string value lacking .replace
            logger.warning(f"Invalid date format for item {item_id}: {updated_at}")
            return True

        return bool((start and updated_ts < start) or (end and updated_ts > end))

    def _build_document(
        self, item_id: str, item_details: Dict[str, Any], spot_name: str
    ) -> Document:
        """Convert one item's detail payload into a Document with one text section."""
        content = self._get_item_content(item_details)
        title = item_details.get("title", "")

        return Document(
            id=f"HIGHSPOT_{item_id}",
            sections=[
                TextSection(
                    link=item_details.get(
                        "url",
                        f"https://www.highspot.com/items/{item_id}",
                    ),
                    text=content,
                )
            ],
            source=DocumentSource.HIGHSPOT,
            semantic_identifier=title,
            metadata={
                "spot_name": spot_name,
                "type": item_details.get("content_type", ""),
                "created_at": item_details.get("date_added", ""),
                "author": item_details.get("author", ""),
                "language": item_details.get("language", ""),
                "can_download": str(item_details.get("can_download", False)),
            },
            doc_updated_at=item_details.get("date_updated"),
        )

    def poll_source(
        self, start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None
    ) -> GenerateDocumentsOutput:
        """
        Poll Highspot for content updated within the given time window.

        Errors for a single item are logged and that item is skipped; errors
        for a whole spot (including an unknown spot name) are logged and
        processing moves on to the next spot.

        Args:
            start: Start time as seconds since Unix epoch
            end: End time as seconds since Unix epoch

        Yields:
            Batches of Document objects
        """
        doc_batch: list[Document] = []

        # If no spots specified, get all spots
        spot_names_to_process = self.spot_names
        if not spot_names_to_process:
            spot_names_to_process = self._get_all_spot_names()
            logger.info(
                f"No spots specified, using all {len(spot_names_to_process)} available spots"
            )

        for spot_name in spot_names_to_process:
            try:
                spot_id = self._get_spot_id_from_name(spot_name)
                if spot_id is None:
                    logger.warning(f"Spot ID not found for spot {spot_name}")
                    continue
                offset = 0
                has_more = True

                while has_more:
                    logger.info(
                        f"Retrieving items from spot {spot_name}, offset {offset}"
                    )
                    response = self.client.get_spot_items(
                        spot_id=spot_id, offset=offset, page_size=self.batch_size
                    )
                    items = response.get("collection", [])
                    # Log the count only; dumping every item payload at INFO
                    # floods the logs.
                    logger.info(f"Received {len(items)} items from spot {spot_name}")
                    if not items:
                        has_more = False
                        continue

                    for item in items:
                        item_id = item.get("id")
                        if not item_id:
                            logger.warning("Item without ID found, skipping")
                            continue

                        try:
                            item_details = self.client.get_item(item_id)
                            if not item_details:
                                logger.warning(
                                    f"Item {item_id} details not found, skipping"
                                )
                                continue

                            # Apply time filter if specified
                            if self._is_outside_time_range(
                                item_id, item_details, start, end
                            ):
                                continue

                            doc_batch.append(
                                self._build_document(item_id, item_details, spot_name)
                            )

                            if len(doc_batch) >= self.batch_size:
                                yield doc_batch
                                doc_batch = []

                        except HighspotClientError as e:
                            logger.error(f"Error retrieving item {item_id}: {str(e)}")

                    # A short page means the spot is exhausted
                    has_more = len(items) >= self.batch_size
                    offset += self.batch_size

            except (HighspotClientError, ValueError) as e:
                logger.error(f"Error processing spot {spot_name}: {str(e)}")

        if doc_batch:
            yield doc_batch

    def _get_item_content(self, item_details: Dict[str, Any]) -> str:
        """
        Get the text content of an item.

        WebLink items are scraped from their URL; downloadable items whose
        file extension is in VALID_FILE_EXTENSIONS are downloaded and run
        through text extraction. In every other case, and whenever retrieval
        fails or yields nothing, the item's "title\\ndescription" is returned.

        Args:
            item_details: Item details from the API

        Returns:
            Text content of the item
        """
        item_id = item_details.get("id", "")
        content_name = item_details.get("content_name", "")
        is_valid_format = content_name and "." in content_name
        file_extension = content_name.split(".")[-1].lower() if is_valid_format else ""
        file_extension = "." + file_extension if file_extension else ""
        can_download = item_details.get("can_download", False)
        content_type = item_details.get("content_type", "")

        # Extract title and description once at the beginning
        title, description = self._extract_title_and_description(item_details)
        default_content = f"{title}\n{description}"
        logger.info(f"Processing item {item_id} with extension {file_extension}")

        try:
            if content_type == "WebLink":
                url = item_details.get("url")
                if not url:
                    return default_content
                content = scrape_url_content(url, True)
                return content if content else default_content

            elif (
                is_valid_format
                and file_extension in VALID_FILE_EXTENSIONS
                and can_download
            ):
                # For documents, try to get the text content
                if not item_id:  # Ensure item_id is defined
                    return default_content

                content_response = self.client.get_item_content(item_id)
                # Process and extract text from binary content based on type
                if content_response:
                    text_content = extract_file_text(
                        BytesIO(content_response), content_name
                    )
                    # Same fallback as the WebLink branch when nothing was extracted
                    return text_content if text_content else default_content
                return default_content

            else:
                return default_content

        except HighspotClientError as e:
            # Use item_id safely in the warning message
            error_context = f"item {item_id}" if item_id else "item"
            logger.warning(f"Could not retrieve content for {error_context}: {str(e)}")
            # Fall back to title/description like every other failure path
            # instead of emitting an empty section.
            return default_content

    def _extract_title_and_description(
        self, item_details: Dict[str, Any]
    ) -> tuple[str, str]:
        """
        Extract the title and description from item details.

        Missing or null values are returned as empty strings so the fallback
        content never contains the literal text "None".

        Args:
            item_details: Item details from the API

        Returns:
            Tuple of title and description
        """
        title = item_details.get("title") or ""
        description = item_details.get("description") or ""
        return title, description

    def retrieve_all_slim_documents(
        self,
        start: SecondsSinceUnixEpoch | None = None,
        end: SecondsSinceUnixEpoch | None = None,
        callback: IndexingHeartbeatInterface | None = None,
    ) -> GenerateSlimDocumentOutput:
        """
        Retrieve all document IDs from the configured spots.
        If no spots are configured, retrieves from all spots.

        Args:
            start: Optional start time filter (not applied by this implementation)
            end: Optional end time filter (not applied by this implementation)
            callback: Optional indexing heartbeat callback (not invoked by this
                implementation)

        Yields:
            Batches of SlimDocument objects
        """
        slim_doc_batch: list[SlimDocument] = []

        # If no spots specified, get all spots
        spot_names_to_process = self.spot_names
        if not spot_names_to_process:
            spot_names_to_process = self._get_all_spot_names()
            logger.info(
                f"No spots specified, using all {len(spot_names_to_process)} available spots for slim documents"
            )

        for spot_name in spot_names_to_process:
            try:
                spot_id = self._get_spot_id_from_name(spot_name)
                offset = 0
                has_more = True

                while has_more:
                    logger.info(
                        f"Retrieving slim documents from spot {spot_name}, offset {offset}"
                    )
                    response = self.client.get_spot_items(
                        spot_id=spot_id, offset=offset, page_size=self.batch_size
                    )

                    items = response.get("collection", [])
                    if not items:
                        has_more = False
                        continue

                    for item in items:
                        item_id = item.get("id")
                        if not item_id:
                            continue

                        slim_doc_batch.append(SlimDocument(id=f"HIGHSPOT_{item_id}"))

                        if len(slim_doc_batch) >= _SLIM_BATCH_SIZE:
                            yield slim_doc_batch
                            slim_doc_batch = []

                    # A short page means the spot is exhausted
                    has_more = len(items) >= self.batch_size
                    offset += self.batch_size

            except (HighspotClientError, ValueError) as e:
                logger.error(
                    f"Error retrieving slim documents from spot {spot_name}: {str(e)}"
                )

        if slim_doc_batch:
            yield slim_doc_batch

    def validate_credentials(self) -> bool:
        """
        Validate that the provided credentials can access the Highspot API.

        Returns:
            True if credentials are valid, False otherwise (including when
            credentials are missing or the client cannot be constructed)
        """
        try:
            return self.client.health_check()
        except Exception as e:
            logger.error(f"Failed to validate credentials: {str(e)}")
            return False
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke run. Fill in real values below first: with blank key/secret
    # the connector's client property raises ConnectorMissingCredentialError
    # on the first API access.
    credentials = {"highspot_key": "", "highspot_secret": ""}
    spot_names: List[str] = []  # empty -> index every available spot

    connector = HighspotConnector(spot_names)
    connector.load_credentials(credentials=credentials)

    # load_from_state yields batches (lists) of documents
    for doc_batch in connector.load_from_state():
        print(doc_batch)
|
122
backend/onyx/connectors/highspot/utils.py
Normal file
122
backend/onyx/connectors/highspot/utils.py
Normal file
@ -0,0 +1,122 @@
|
||||
from typing import Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
from onyx.file_processing.html_utils import web_html_cleanup
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
# Constants
|
||||
WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS = 20
|
||||
JAVASCRIPT_DISABLED_MESSAGE = "You have JavaScript disabled in your browser"
|
||||
DEFAULT_TIMEOUT = 60000 # 60 seconds
|
||||
|
||||
|
||||
def scrape_url_content(
    url: str, scroll_before_scraping: bool = False, timeout_ms: int = DEFAULT_TIMEOUT
) -> Optional[str]:
    """
    Scrapes content from a given URL and returns the cleaned text.

    Launches a headless Chromium instance through Playwright, loads the page,
    optionally scrolls to trigger lazy loading, and passes the resulting HTML
    through ``web_html_cleanup``. The browser and Playwright driver are always
    shut down before returning.

    Args:
        url: The URL to scrape
        scroll_before_scraping: Whether to scroll through the page to load lazy content
        timeout_ms: Timeout in milliseconds for page navigation and loading

    Returns:
        The cleaned text content of the page or None if scraping fails
        (this includes an invalid URL, since the ValueError from
        ``validate_url`` is caught by the outer handler below)
    """
    # Initialised up front so the finally block can tell what was started.
    playwright = None
    browser = None
    try:
        validate_url(url)
        playwright = sync_playwright().start()
        browser = playwright.chromium.launch(headless=True)
        context = browser.new_context()
        page = context.new_page()

        logger.info(f"Navigating to URL: {url}")
        try:
            page.goto(url, timeout=timeout_ms)
        except Exception as e:
            logger.error(f"Failed to navigate to {url}: {str(e)}")
            return None

        if scroll_before_scraping:
            logger.debug("Scrolling page to load lazy content")
            scroll_attempts = 0
            previous_height = page.evaluate("document.body.scrollHeight")
            # Scroll to the bottom repeatedly until the page stops growing or
            # the attempt cap is reached.
            while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                try:
                    page.wait_for_load_state("networkidle", timeout=timeout_ms)
                except Exception as e:
                    # Stop scrolling but still scrape whatever has loaded.
                    logger.warning(f"Network idle wait timed out: {str(e)}")
                    break

                new_height = page.evaluate("document.body.scrollHeight")
                if new_height == previous_height:
                    # No new content appeared after the last scroll.
                    break
                previous_height = new_height
                scroll_attempts += 1

        content = page.content()
        soup = BeautifulSoup(content, "html.parser")

        parsed_html = web_html_cleanup(soup)

        # When the page text contains the "JavaScript disabled" notice, look
        # for text inside iframes as well.
        if JAVASCRIPT_DISABLED_MESSAGE in parsed_html.cleaned_text:
            logger.debug("JavaScript disabled message detected, checking iframes")
            try:
                iframe_count = page.frame_locator("iframe").locator("html").count()
                if iframe_count > 0:
                    iframe_texts = (
                        page.frame_locator("iframe").locator("html").all_inner_texts()
                    )
                    iframe_content = "\n".join(iframe_texts)

                    # Short main text (< 700 chars) is replaced by the iframe
                    # text; longer text has the iframe text appended.
                    # NOTE(review): 700 looks like a heuristic threshold -
                    # its origin is not visible here.
                    if len(parsed_html.cleaned_text) < 700:
                        parsed_html.cleaned_text = iframe_content
                    else:
                        parsed_html.cleaned_text += "\n" + iframe_content
            except Exception as e:
                # Best effort: keep the main-page text if iframe access fails.
                logger.warning(f"Error processing iframes: {str(e)}")

        return parsed_html.cleaned_text

    except Exception as e:
        logger.error(f"Error scraping URL {url}: {str(e)}")
        return None

    finally:
        # Cleanup errors are logged at debug level and otherwise ignored so
        # they cannot mask the function's result.
        if browser:
            try:
                browser.close()
            except Exception as e:
                logger.debug(f"Error closing browser: {str(e)}")
        if playwright:
            try:
                playwright.stop()
            except Exception as e:
                logger.debug(f"Error stopping playwright: {str(e)}")
|
||||
|
||||
|
||||
def validate_url(url: str) -> None:
    """
    Check that a URL uses http(s) and names a host.

    Args:
        url: The URL to validate

    Raises:
        ValueError: If the scheme is not http/https or the hostname is missing
    """
    parsed = urlparse(url)

    if parsed.scheme not in ("http", "https"):
        raise ValueError("URL must be of scheme https?://")

    if parsed.hostname:
        return
    raise ValueError("URL must include a hostname")
|
@ -0,0 +1,98 @@
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.highspot.connector import HighspotConnector
|
||||
from onyx.connectors.models import Document
|
||||
|
||||
|
||||
def load_test_data(file_name: str = "test_highspot_data.json") -> dict:
    """Parse a JSON fixture file located next to this test module."""
    data_path = Path(__file__).parent / file_name
    with data_path.open("r") as handle:
        return json.loads(handle.read())
|
||||
|
||||
|
||||
@pytest.fixture
def highspot_connector() -> HighspotConnector:
    """Build a HighspotConnector authenticated from environment variables.

    Fails the requesting test when HIGHSPOT_KEY or HIGHSPOT_SECRET is unset
    or empty. HIGHSPOT_URL is optional and falls back to the default API URL.
    """
    required_vars = ("HIGHSPOT_KEY", "HIGHSPOT_SECRET")
    if not all(os.environ.get(name) for name in required_vars):
        pytest.fail("HIGHSPOT_KEY or HIGHSPOT_SECRET environment variables not set")

    credentials = {
        "highspot_key": os.environ["HIGHSPOT_KEY"],
        "highspot_secret": os.environ["HIGHSPOT_SECRET"],
        "highspot_url": os.environ.get(
            "HIGHSPOT_URL", "https://api-su2.highspot.com/v1.0/"
        ),
    }

    # A named spot and a small batch size keep the live test run bounded.
    connector = HighspotConnector(spot_names=["Test content"], batch_size=10)
    connector.load_credentials(credentials)
    return connector
|
||||
|
||||
|
||||
def test_highspot_connector_basic(highspot_connector: HighspotConnector) -> None:
    """Poll the connector and sanity-check the known reference document."""
    test_data = load_test_data()
    target_test_doc_id = test_data.get("target_doc_id")
    wanted_id = f"HIGHSPOT_{target_test_doc_id}"

    # Flatten every polled batch into a single list of documents
    all_docs: list[Document] = [
        doc
        for doc_batch in highspot_connector.poll_source(0, time.time())
        for doc in doc_batch
    ]

    # Keep the last document whose ID matches the reference document
    target_test_doc: Document | None = None
    for candidate in all_docs:
        if candidate.id == wanted_id:
            target_test_doc = candidate

    # Verify documents were loaded
    assert len(all_docs) > 0

    # The detailed checks only run when the reference document was returned
    if not target_test_doc_id or target_test_doc is None:
        return

    assert target_test_doc.semantic_identifier == test_data.get(
        "semantic_identifier"
    )
    assert target_test_doc.source == DocumentSource.HIGHSPOT
    assert target_test_doc.metadata is not None

    assert len(target_test_doc.sections) == 1
    section = target_test_doc.sections[0]
    assert section.link is not None
    # Exact content may change over time, so only require that some exists
    assert section.text is not None
    assert len(section.text) > 0
|
||||
|
||||
|
||||
def test_highspot_connector_slim(highspot_connector: HighspotConnector) -> None:
    """Check that slim retrieval covers every fully loaded document.

    Collects the IDs yielded by ``load_from_state`` and by
    ``retrieve_all_slim_documents`` and verifies that the former is contained
    in the latter, and that the slim listing is not empty.
    """
    # IDs of every document produced by the full load
    full_ids = {
        document.id
        for batch in highspot_connector.load_from_state()
        for document in batch
    }

    # IDs of every document produced by the slim listing
    slim_ids = {
        slim_document.id
        for slim_batch in highspot_connector.retrieve_all_slim_documents()
        for slim_document in slim_batch
    }

    # Every fully indexed document must also appear in the slim listing
    assert full_ids <= slim_ids
    # Guard against both sets being empty, which would pass trivially
    assert len(slim_ids) > 0
|
||||
|
||||
|
||||
def test_highspot_connector_validate_credentials(
    highspot_connector: HighspotConnector,
) -> None:
    """Verify that the connector reports its loaded credentials as valid."""
    credentials_ok = highspot_connector.validate_credentials()
    assert credentials_ok is True
|
@ -0,0 +1,5 @@
|
||||
{
|
||||
"target_doc_id": "67cd8eb35d3ee0487de2e704",
|
||||
"semantic_identifier": "Highspot in Action _ Salesforce Integration",
|
||||
"link": "https://www.highspot.com/items/67cd8eb35d3ee0487de2e704"
|
||||
}
|
BIN
web/public/Highspot.png
Normal file
BIN
web/public/Highspot.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 17 KiB |
@ -89,6 +89,7 @@ import cohereIcon from "../../../public/Cohere.svg";
|
||||
import voyageIcon from "../../../public/Voyage.png";
|
||||
import googleIcon from "../../../public/Google.webp";
|
||||
import xenforoIcon from "../../../public/Xenforo.svg";
|
||||
import highspotIcon from "../../../public/Highspot.png";
|
||||
import { FaGithub, FaRobot } from "react-icons/fa";
|
||||
|
||||
import { cn } from "@/lib/utils";
|
||||
@ -2912,6 +2913,13 @@ export const GitbookIcon = ({
|
||||
</div>
|
||||
);
|
||||
|
||||
/** Highspot logo, rendered through the shared LogoIcon wrapper. */
export const HighspotIcon = ({
  size = 16,
  className = defaultTailwindCSS,
}: IconProps) => (
  <LogoIcon size={size} className={className} src={highspotIcon} />
);
|
||||
|
||||
export const PinnedIcon = ({
|
||||
size = 16,
|
||||
className = defaultTailwindCSS,
|
||||
|
@ -1249,6 +1249,47 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
|
||||
],
|
||||
overrideDefaultFreq: 60 * 60 * 24,
|
||||
},
|
||||
// Connector form configuration for Highspot: a single tabbed choice between
// indexing specific spots by name and indexing everything.
highspot: {
  description: "Configure Highspot connector",
  values: [
    {
      type: "tab",
      name: "highspot_scope",
      label: "What should we index from Highspot?",
      optional: true,
      tabs: [
        {
          // Tab for restricting indexing to the spots the user lists by name.
          value: "spots",
          label: "Specific Spots",
          fields: [
            {
              type: "list",
              query: "Enter the spot name(s):",
              label: "Spot Name(s)",
              // NOTE(review): presumably maps to the connector's `spot_names`
              // argument on the backend; confirm against HighspotConnector.
              name: "spot_names",
              optional: false,
              description: "For multiple spots, enter your spot one by one.",
            },
          ],
        },
        {
          // Tab with no real input; its only field carries an explanatory
          // description.
          value: "everything",
          label: "Everything",
          fields: [
            {
              type: "string_tab",
              label: "Everything",
              name: "everything",
              description:
                "This connector will index all spots the provided credentials have access to!",
            },
          ],
        },
      ],
    },
  ],
  advanced_values: [],
},
|
||||
};
|
||||
export function createConnectorInitialValues(
|
||||
connector: ConfigurableSources
|
||||
|
@ -226,6 +226,12 @@ export interface AirtableCredentialJson {
|
||||
airtable_access_token: string;
|
||||
}
|
||||
|
||||
/** Shape of the credential JSON collected for the Highspot connector. */
export interface HighspotCredentialJson {
  // NOTE(review): presumably the base URL of the Highspot API; confirm the
  // expected format against the backend connector.
  highspot_url: string;
  // Key/secret pair supplied alongside the URL as the connector's credentials.
  highspot_key: string;
  highspot_secret: string;
}
|
||||
|
||||
export const credentialTemplates: Record<ValidSources, any> = {
|
||||
github: { github_access_token: "" } as GithubCredentialJson,
|
||||
gitlab: {
|
||||
@ -353,6 +359,11 @@ export const credentialTemplates: Record<ValidSources, any> = {
|
||||
gitbook: {
|
||||
gitbook_api_key: "",
|
||||
} as GitbookCredentialJson,
|
||||
highspot: {
|
||||
highspot_url: "",
|
||||
highspot_key: "",
|
||||
highspot_secret: "",
|
||||
} as HighspotCredentialJson,
|
||||
};
|
||||
|
||||
export const credentialDisplayNames: Record<string, string> = {
|
||||
@ -488,6 +499,11 @@ export const credentialDisplayNames: Record<string, string> = {
|
||||
// GitBook
|
||||
gitbook_space_id: "GitBook Space ID",
|
||||
gitbook_api_key: "GitBook API Key",
|
||||
|
||||
//Highspot
|
||||
highspot_url: "Highspot URL",
|
||||
highspot_key: "Highspot Key",
|
||||
highspot_secret: "Highspot Secret",
|
||||
};
|
||||
|
||||
export function getDisplayNameForCredentialKey(key: string): string {
|
||||
|
@ -44,6 +44,7 @@ import {
|
||||
GlobeIcon2,
|
||||
FileIcon2,
|
||||
GitbookIcon,
|
||||
HighspotIcon,
|
||||
} from "@/components/icons/icons";
|
||||
import { ValidSources } from "./types";
|
||||
import {
|
||||
@ -329,6 +330,12 @@ export const SOURCE_METADATA_MAP: SourceMap = {
|
||||
category: SourceCategory.Wiki,
|
||||
docs: "https://docs.onyx.app/connectors/gitbook",
|
||||
},
|
||||
highspot: {
|
||||
icon: HighspotIcon,
|
||||
displayName: "Highspot",
|
||||
category: SourceCategory.Wiki,
|
||||
docs: "https://docs.onyx.app/connectors/highspot",
|
||||
},
|
||||
// currently used for the Internet Search tool docs, which is why
|
||||
// a globe is used
|
||||
not_applicable: {
|
||||
|
@ -390,6 +390,7 @@ export enum ValidSources {
|
||||
Egnyte = "egnyte",
|
||||
Airtable = "airtable",
|
||||
Gitbook = "gitbook",
|
||||
Highspot = "highspot",
|
||||
}
|
||||
|
||||
export const validAutoSyncSources = [
|
||||
|
Loading…
x
Reference in New Issue
Block a user