diff --git a/.github/workflows/pr-python-connector-tests.yml b/.github/workflows/pr-python-connector-tests.yml
index f7f3972b3..aa740aa8d 100644
--- a/.github/workflows/pr-python-connector-tests.yml
+++ b/.github/workflows/pr-python-connector-tests.yml
@@ -50,6 +50,9 @@ env:
GITBOOK_API_KEY: ${{ secrets.GITBOOK_API_KEY }}
# Notion
NOTION_INTEGRATION_TOKEN: ${{ secrets.NOTION_INTEGRATION_TOKEN }}
+ # Highspot
+ HIGHSPOT_KEY: ${{ secrets.HIGHSPOT_KEY }}
+ HIGHSPOT_SECRET: ${{ secrets.HIGHSPOT_SECRET }}
jobs:
connectors-check:
diff --git a/backend/onyx/configs/constants.py b/backend/onyx/configs/constants.py
index 2d7445362..cbcc71245 100644
--- a/backend/onyx/configs/constants.py
+++ b/backend/onyx/configs/constants.py
@@ -174,6 +174,7 @@ class DocumentSource(str, Enum):
FIREFLIES = "fireflies"
EGNYTE = "egnyte"
AIRTABLE = "airtable"
+ HIGHSPOT = "highspot"
# Special case just for integration tests
MOCK_CONNECTOR = "mock_connector"
diff --git a/backend/onyx/connectors/factory.py b/backend/onyx/connectors/factory.py
index 73593cc60..2f0b10743 100644
--- a/backend/onyx/connectors/factory.py
+++ b/backend/onyx/connectors/factory.py
@@ -30,6 +30,7 @@ from onyx.connectors.gong.connector import GongConnector
from onyx.connectors.google_drive.connector import GoogleDriveConnector
from onyx.connectors.google_site.connector import GoogleSitesConnector
from onyx.connectors.guru.connector import GuruConnector
+from onyx.connectors.highspot.connector import HighspotConnector
from onyx.connectors.hubspot.connector import HubSpotConnector
from onyx.connectors.interfaces import BaseConnector
from onyx.connectors.interfaces import CheckpointConnector
@@ -117,6 +118,7 @@ def identify_connector_class(
DocumentSource.FIREFLIES: FirefliesConnector,
DocumentSource.EGNYTE: EgnyteConnector,
DocumentSource.AIRTABLE: AirtableConnector,
+ DocumentSource.HIGHSPOT: HighspotConnector,
# just for integration tests
DocumentSource.MOCK_CONNECTOR: MockConnector,
}
diff --git a/backend/onyx/connectors/highspot/__init__.py b/backend/onyx/connectors/highspot/__init__.py
new file mode 100644
index 000000000..df94b5875
--- /dev/null
+++ b/backend/onyx/connectors/highspot/__init__.py
@@ -0,0 +1,4 @@
+"""
+Highspot connector package for Onyx.
+Enables integration with Highspot's knowledge base.
+"""
diff --git a/backend/onyx/connectors/highspot/client.py b/backend/onyx/connectors/highspot/client.py
new file mode 100644
index 000000000..7879e6e79
--- /dev/null
+++ b/backend/onyx/connectors/highspot/client.py
@@ -0,0 +1,280 @@
+import base64
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Optional
+from urllib.parse import urljoin
+
+import requests
+from requests.adapters import HTTPAdapter
+from requests.exceptions import HTTPError
+from requests.exceptions import RequestException
+from requests.exceptions import Timeout
+from urllib3.util.retry import Retry
+
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+class HighspotClientError(Exception):
+ """Base exception for Highspot API client errors."""
+
+ def __init__(self, message: str, status_code: Optional[int] = None):
+ self.message = message
+ self.status_code = status_code
+ super().__init__(self.message)
+
+
+class HighspotAuthenticationError(HighspotClientError):
+ """Exception raised for authentication errors."""
+
+
+class HighspotRateLimitError(HighspotClientError):
+ """Exception raised when rate limit is exceeded."""
+
+ def __init__(self, message: str, retry_after: Optional[str] = None):
+ self.retry_after = retry_after
+ super().__init__(message)
+
+
+class HighspotClient:
+ """
+ Client for interacting with the Highspot API.
+
+ Uses basic authentication with provided key (username) and secret (password).
+ Implements retry logic, error handling, and connection pooling.
+ """
+
+ BASE_URL = "https://api-su2.highspot.com/v1.0/"
+
+ def __init__(
+ self,
+ key: str,
+ secret: str,
+ base_url: str = BASE_URL,
+ timeout: int = 30,
+ max_retries: int = 3,
+ backoff_factor: float = 0.5,
+ status_forcelist: Optional[List[int]] = None,
+ ):
+ """
+ Initialize the Highspot API client.
+
+ Args:
+ key: API key (used as username)
+ secret: API secret (used as password)
+ base_url: Base URL for the Highspot API
+ timeout: Request timeout in seconds
+ max_retries: Maximum number of retries for failed requests
+ backoff_factor: Backoff factor for retries
+ status_forcelist: HTTP status codes to retry on
+ """
+ if not key or not secret:
+ raise ValueError("API key and secret are required")
+
+ self.key = key
+ self.secret = secret
+ self.base_url = base_url
+ self.timeout = timeout
+
+ # Set up session with retry logic
+ self.session = requests.Session()
+ retry_strategy = Retry(
+ total=max_retries,
+ backoff_factor=backoff_factor,
+ status_forcelist=status_forcelist or [429, 500, 502, 503, 504],
+ allowed_methods=["GET", "POST", "PUT", "DELETE"],
+ )
+ adapter = HTTPAdapter(max_retries=retry_strategy)
+ self.session.mount("http://", adapter)
+ self.session.mount("https://", adapter)
+
+ # Set up authentication
+ self._setup_auth()
+
+ def _setup_auth(self) -> None:
+ """Set up basic authentication for the session."""
+ auth = f"{self.key}:{self.secret}"
+ encoded_auth = base64.b64encode(auth.encode()).decode()
+ self.session.headers.update(
+ {
+ "Authorization": f"Basic {encoded_auth}",
+ "Content-Type": "application/json",
+ "Accept": "application/json",
+ }
+ )
+
+ def _make_request(
+ self,
+ method: str,
+ endpoint: str,
+ params: Optional[Dict[str, Any]] = None,
+ data: Optional[Dict[str, Any]] = None,
+ json_data: Optional[Dict[str, Any]] = None,
+ headers: Optional[Dict[str, str]] = None,
+ ) -> Dict[str, Any]:
+ """
+ Make a request to the Highspot API.
+
+ Args:
+ method: HTTP method (GET, POST, etc.)
+ endpoint: API endpoint
+ params: URL parameters
+ data: Form data
+ json_data: JSON data
+ headers: Additional headers
+
+ Returns:
+ API response as a dictionary
+
+ Raises:
+ HighspotClientError: On API errors
+ HighspotAuthenticationError: On authentication errors
+ HighspotRateLimitError: On rate limiting
+ requests.exceptions.RequestException: On request failures
+ """
+ url = urljoin(self.base_url, endpoint)
+ request_headers = {}
+ if headers:
+ request_headers.update(headers)
+
+ try:
+ logger.debug(f"Making {method} request to {url}")
+ response = self.session.request(
+ method=method,
+ url=url,
+ params=params,
+ data=data,
+ json=json_data,
+ headers=request_headers,
+ timeout=self.timeout,
+ )
+ response.raise_for_status()
+
+ if response.content and response.content.strip():
+ return response.json()
+ return {}
+
+ except HTTPError as e:
+ status_code = e.response.status_code
+ error_msg = str(e)
+
+ try:
+ error_data = e.response.json()
+ if isinstance(error_data, dict):
+ error_msg = error_data.get("message", str(e))
+ except (ValueError, KeyError):
+ pass
+
+ if status_code == 401:
+ raise HighspotAuthenticationError(f"Authentication failed: {error_msg}")
+ elif status_code == 429:
+ retry_after = e.response.headers.get("Retry-After")
+ raise HighspotRateLimitError(
+ f"Rate limit exceeded: {error_msg}", retry_after=retry_after
+ )
+ else:
+ raise HighspotClientError(
+ f"API error {status_code}: {error_msg}", status_code=status_code
+ )
+
+ except Timeout:
+ raise HighspotClientError("Request timed out")
+ except RequestException as e:
+ raise HighspotClientError(f"Request failed: {str(e)}")
+
+ def get_spots(self) -> List[Dict[str, Any]]:
+ """
+ Get all available spots.
+
+ Returns:
+ List of spots with their names and IDs
+ """
+ params = {"right": "view"}
+ response = self._make_request("GET", "spots", params=params)
+ logger.info(f"Received {response} spots")
+ total_counts = response.get("counts_total")
+ # Fix comparison to handle None value
+ if total_counts is not None and total_counts > 0:
+ return response.get("collection", [])
+ return []
+
+ def get_spot(self, spot_id: str) -> Dict[str, Any]:
+ """
+ Get details for a specific spot.
+
+ Args:
+ spot_id: ID of the spot
+
+ Returns:
+ Spot details
+ """
+ if not spot_id:
+ raise ValueError("spot_id is required")
+ return self._make_request("GET", f"spots/{spot_id}")
+
+ def get_spot_items(
+ self, spot_id: str, offset: int = 0, page_size: int = 100
+ ) -> Dict[str, Any]:
+ """
+ Get items in a specific spot.
+
+ Args:
+ spot_id: ID of the spot
+            offset: Starting offset for pagination
+ page_size: Number of items per page
+
+ Returns:
+ Items in the spot
+ """
+ if not spot_id:
+ raise ValueError("spot_id is required")
+
+ params = {"spot": spot_id, "start": offset, "limit": page_size}
+ return self._make_request("GET", "items", params=params)
+
+ def get_item(self, item_id: str) -> Dict[str, Any]:
+ """
+ Get details for a specific item.
+
+ Args:
+ item_id: ID of the item
+
+ Returns:
+ Item details
+ """
+ if not item_id:
+ raise ValueError("item_id is required")
+ return self._make_request("GET", f"items/{item_id}")
+
+ def get_item_content(self, item_id: str) -> bytes:
+ """
+ Get the raw content of an item.
+
+ Args:
+ item_id: ID of the item
+
+ Returns:
+ Raw content bytes
+ """
+ if not item_id:
+ raise ValueError("item_id is required")
+
+ url = urljoin(self.base_url, f"items/{item_id}/content")
+ response = self.session.get(url, timeout=self.timeout)
+ response.raise_for_status()
+ return response.content
+
+ def health_check(self) -> bool:
+ """
+ Check if the API is accessible and credentials are valid.
+
+ Returns:
+ True if API is accessible, False otherwise
+ """
+ try:
+ self._make_request("GET", "spots", params={"limit": 1})
+ return True
+ except (HighspotClientError, HighspotAuthenticationError):
+ return False
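A rough usage sketch of the client surface added above — the credentials are placeholders, and the 100-item page size simply mirrors the get_spot_items default rather than any Highspot requirement:

    from onyx.connectors.highspot.client import HighspotClient, HighspotClientError

    client = HighspotClient(key="YOUR_KEY", secret="YOUR_SECRET")  # placeholder credentials
    try:
        for spot in client.get_spots():
            offset, page_size = 0, 100
            while True:
                # "collection" holds one page of items; an empty page ends the loop
                page = client.get_spot_items(spot["id"], offset=offset, page_size=page_size)
                items = page.get("collection", [])
                if not items:
                    break
                for item in items:
                    print(item.get("id"), item.get("title"))
                offset += page_size
    except HighspotClientError as e:
        print(f"Highspot API error: {e.message}")
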
diff --git a/backend/onyx/connectors/highspot/connector.py b/backend/onyx/connectors/highspot/connector.py
new file mode 100644
index 000000000..380d878a5
--- /dev/null
+++ b/backend/onyx/connectors/highspot/connector.py
@@ -0,0 +1,431 @@
+from datetime import datetime
+from io import BytesIO
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Optional
+
+from onyx.configs.app_configs import INDEX_BATCH_SIZE
+from onyx.configs.constants import DocumentSource
+from onyx.connectors.highspot.client import HighspotClient
+from onyx.connectors.highspot.client import HighspotClientError
+from onyx.connectors.highspot.utils import scrape_url_content
+from onyx.connectors.interfaces import GenerateDocumentsOutput
+from onyx.connectors.interfaces import GenerateSlimDocumentOutput
+from onyx.connectors.interfaces import LoadConnector
+from onyx.connectors.interfaces import PollConnector
+from onyx.connectors.interfaces import SecondsSinceUnixEpoch
+from onyx.connectors.interfaces import SlimConnector
+from onyx.connectors.models import ConnectorMissingCredentialError
+from onyx.connectors.models import Document
+from onyx.connectors.models import SlimDocument
+from onyx.connectors.models import TextSection
+from onyx.file_processing.extract_file_text import extract_file_text
+from onyx.file_processing.extract_file_text import VALID_FILE_EXTENSIONS
+from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+_SLIM_BATCH_SIZE = 1000
+
+
+class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
+ """
+ Connector for loading data from Highspot.
+
+ Retrieves content from specified spots using the Highspot API.
+ If no spots are specified, retrieves content from all available spots.
+ """
+
+ def __init__(
+ self,
+        spot_names: Optional[List[str]] = None,
+ batch_size: int = INDEX_BATCH_SIZE,
+ ):
+ """
+ Initialize the Highspot connector.
+
+ Args:
+ spot_names: List of spot names to retrieve content from (if empty, gets all spots)
+ batch_size: Number of items to retrieve in each batch
+ """
+        self.spot_names = spot_names or []
+ self.batch_size = batch_size
+ self._client: Optional[HighspotClient] = None
+ self._spot_id_map: Dict[str, str] = {} # Maps spot names to spot IDs
+ self._all_spots_fetched = False
+ self.highspot_url: Optional[str] = None
+ self.key: Optional[str] = None
+ self.secret: Optional[str] = None
+
+ @property
+ def client(self) -> HighspotClient:
+ if self._client is None:
+ if not self.key or not self.secret:
+ raise ConnectorMissingCredentialError("Highspot")
+ # Ensure highspot_url is a string, use default if None
+ base_url = (
+ self.highspot_url
+ if self.highspot_url is not None
+ else HighspotClient.BASE_URL
+ )
+ self._client = HighspotClient(self.key, self.secret, base_url=base_url)
+ return self._client
+
+ def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
+ logger.info("Loading Highspot credentials")
+ self.highspot_url = credentials.get("highspot_url")
+ self.key = credentials.get("highspot_key")
+ self.secret = credentials.get("highspot_secret")
+ return None
+
+ def _populate_spot_id_map(self) -> None:
+ """
+ Populate the spot ID map with all available spots.
+ Keys are stored as lowercase for case-insensitive lookups.
+ """
+ spots = self.client.get_spots()
+ for spot in spots:
+ if "title" in spot and "id" in spot:
+ spot_name = spot["title"]
+ self._spot_id_map[spot_name.lower()] = spot["id"]
+
+ self._all_spots_fetched = True
+ logger.info(f"Retrieved {len(self._spot_id_map)} spots from Highspot")
+
+ def _get_all_spot_names(self) -> List[str]:
+ """
+ Retrieve all available spot names.
+
+ Returns:
+ List of all spot names
+ """
+ if not self._all_spots_fetched:
+ self._populate_spot_id_map()
+
+ return [spot_name for spot_name in self._spot_id_map.keys()]
+
+ def _get_spot_id_from_name(self, spot_name: str) -> str:
+ """
+ Get spot ID from a spot name.
+
+ Args:
+ spot_name: Name of the spot
+
+ Returns:
+ ID of the spot
+
+ Raises:
+ ValueError: If spot name is not found
+ """
+ if not self._all_spots_fetched:
+ self._populate_spot_id_map()
+
+ spot_name_lower = spot_name.lower()
+ if spot_name_lower not in self._spot_id_map:
+ raise ValueError(f"Spot '{spot_name}' not found")
+
+ return self._spot_id_map[spot_name_lower]
+
+ def load_from_state(self) -> GenerateDocumentsOutput:
+ """
+ Load content from configured spots in Highspot.
+ If no spots are configured, loads from all spots.
+
+ Yields:
+ Batches of Document objects
+ """
+ return self.poll_source(None, None)
+
+ def poll_source(
+ self, start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None
+ ) -> GenerateDocumentsOutput:
+ """
+ Poll Highspot for content updated since the start time.
+
+ Args:
+ start: Start time as seconds since Unix epoch
+ end: End time as seconds since Unix epoch
+
+ Yields:
+ Batches of Document objects
+ """
+ doc_batch: list[Document] = []
+
+ # If no spots specified, get all spots
+ spot_names_to_process = self.spot_names
+ if not spot_names_to_process:
+ spot_names_to_process = self._get_all_spot_names()
+ logger.info(
+ f"No spots specified, using all {len(spot_names_to_process)} available spots"
+ )
+
+ for spot_name in spot_names_to_process:
+ try:
+ spot_id = self._get_spot_id_from_name(spot_name)
+ if spot_id is None:
+ logger.warning(f"Spot ID not found for spot {spot_name}")
+ continue
+ offset = 0
+ has_more = True
+
+ while has_more:
+ logger.info(
+ f"Retrieving items from spot {spot_name}, offset {offset}"
+ )
+ response = self.client.get_spot_items(
+ spot_id=spot_id, offset=offset, page_size=self.batch_size
+ )
+ items = response.get("collection", [])
+ logger.info(f"Received Items: {items}")
+ if not items:
+ has_more = False
+ continue
+
+ for item in items:
+ try:
+ item_id = item.get("id")
+ if not item_id:
+ logger.warning("Item without ID found, skipping")
+ continue
+
+ item_details = self.client.get_item(item_id)
+ if not item_details:
+ logger.warning(
+ f"Item {item_id} details not found, skipping"
+ )
+ continue
+ # Apply time filter if specified
+ if start or end:
+ updated_at = item_details.get("date_updated")
+ if updated_at:
+ # Convert to datetime for comparison
+ try:
+ updated_time = datetime.fromisoformat(
+ updated_at.replace("Z", "+00:00")
+ )
+ if (
+ start and updated_time.timestamp() < start
+ ) or (end and updated_time.timestamp() > end):
+ continue
+ except (ValueError, TypeError):
+ # Skip if date cannot be parsed
+ logger.warning(
+ f"Invalid date format for item {item_id}: {updated_at}"
+ )
+ continue
+
+ content = self._get_item_content(item_details)
+ title = item_details.get("title", "")
+
+ doc_batch.append(
+ Document(
+ id=f"HIGHSPOT_{item_id}",
+ sections=[
+ TextSection(
+ link=item_details.get(
+ "url",
+ f"https://www.highspot.com/items/{item_id}",
+ ),
+ text=content,
+ )
+ ],
+ source=DocumentSource.HIGHSPOT,
+ semantic_identifier=title,
+ metadata={
+ "spot_name": spot_name,
+ "type": item_details.get("content_type", ""),
+ "created_at": item_details.get(
+ "date_added", ""
+ ),
+ "author": item_details.get("author", ""),
+ "language": item_details.get("language", ""),
+ "can_download": str(
+ item_details.get("can_download", False)
+ ),
+ },
+ doc_updated_at=item_details.get("date_updated"),
+ )
+ )
+
+ if len(doc_batch) >= self.batch_size:
+ yield doc_batch
+ doc_batch = []
+
+ except HighspotClientError as e:
+ item_id = "ID" if not item_id else item_id
+ logger.error(f"Error retrieving item {item_id}: {str(e)}")
+
+ has_more = len(items) >= self.batch_size
+ offset += self.batch_size
+
+ except (HighspotClientError, ValueError) as e:
+ logger.error(f"Error processing spot {spot_name}: {str(e)}")
+
+ if doc_batch:
+ yield doc_batch
+
+ def _get_item_content(self, item_details: Dict[str, Any]) -> str:
+ """
+ Get the text content of an item.
+
+ Args:
+ item_details: Item details from the API
+
+ Returns:
+ Text content of the item
+ """
+ item_id = item_details.get("id", "")
+ content_name = item_details.get("content_name", "")
+ is_valid_format = content_name and "." in content_name
+ file_extension = content_name.split(".")[-1].lower() if is_valid_format else ""
+ file_extension = "." + file_extension if file_extension else ""
+ can_download = item_details.get("can_download", False)
+ content_type = item_details.get("content_type", "")
+
+ # Extract title and description once at the beginning
+ title, description = self._extract_title_and_description(item_details)
+ default_content = f"{title}\n{description}"
+ logger.info(f"Processing item {item_id} with extension {file_extension}")
+
+ try:
+ if content_type == "WebLink":
+ url = item_details.get("url")
+ if not url:
+ return default_content
+ content = scrape_url_content(url, True)
+ return content if content else default_content
+
+ elif (
+ is_valid_format
+ and file_extension in VALID_FILE_EXTENSIONS
+ and can_download
+ ):
+ # For documents, try to get the text content
+ if not item_id: # Ensure item_id is defined
+ return default_content
+
+ content_response = self.client.get_item_content(item_id)
+ # Process and extract text from binary content based on type
+ if content_response:
+ text_content = extract_file_text(
+ BytesIO(content_response), content_name
+ )
+ return text_content
+ return default_content
+
+ else:
+ return default_content
+
+ except HighspotClientError as e:
+ # Use item_id safely in the warning message
+ error_context = f"item {item_id}" if item_id else "item"
+ logger.warning(f"Could not retrieve content for {error_context}: {str(e)}")
+ return ""
+
+ def _extract_title_and_description(
+ self, item_details: Dict[str, Any]
+ ) -> tuple[str, str]:
+ """
+ Extract the title and description from item details.
+
+ Args:
+ item_details: Item details from the API
+
+ Returns:
+ Tuple of title and description
+ """
+ title = item_details.get("title", "")
+ description = item_details.get("description", "")
+ return title, description
+
+ def retrieve_all_slim_documents(
+ self,
+ start: SecondsSinceUnixEpoch | None = None,
+ end: SecondsSinceUnixEpoch | None = None,
+ callback: IndexingHeartbeatInterface | None = None,
+ ) -> GenerateSlimDocumentOutput:
+ """
+ Retrieve all document IDs from the configured spots.
+ If no spots are configured, retrieves from all spots.
+
+ Args:
+ start: Optional start time filter
+ end: Optional end time filter
+ callback: Optional indexing heartbeat callback
+
+ Yields:
+ Batches of SlimDocument objects
+ """
+ slim_doc_batch: list[SlimDocument] = []
+
+ # If no spots specified, get all spots
+ spot_names_to_process = self.spot_names
+ if not spot_names_to_process:
+ spot_names_to_process = self._get_all_spot_names()
+ logger.info(
+ f"No spots specified, using all {len(spot_names_to_process)} available spots for slim documents"
+ )
+
+ for spot_name in spot_names_to_process:
+ try:
+ spot_id = self._get_spot_id_from_name(spot_name)
+ offset = 0
+ has_more = True
+
+ while has_more:
+ logger.info(
+ f"Retrieving slim documents from spot {spot_name}, offset {offset}"
+ )
+ response = self.client.get_spot_items(
+ spot_id=spot_id, offset=offset, page_size=self.batch_size
+ )
+
+ items = response.get("collection", [])
+ if not items:
+ has_more = False
+ continue
+
+ for item in items:
+ item_id = item.get("id")
+ if not item_id:
+ continue
+
+ slim_doc_batch.append(SlimDocument(id=f"HIGHSPOT_{item_id}"))
+
+ if len(slim_doc_batch) >= _SLIM_BATCH_SIZE:
+ yield slim_doc_batch
+ slim_doc_batch = []
+
+ has_more = len(items) >= self.batch_size
+ offset += self.batch_size
+
+ except (HighspotClientError, ValueError) as e:
+ logger.error(
+ f"Error retrieving slim documents from spot {spot_name}: {str(e)}"
+ )
+
+ if slim_doc_batch:
+ yield slim_doc_batch
+
+ def validate_credentials(self) -> bool:
+ """
+ Validate that the provided credentials can access the Highspot API.
+
+ Returns:
+ True if credentials are valid, False otherwise
+ """
+ try:
+ return self.client.health_check()
+ except Exception as e:
+ logger.error(f"Failed to validate credentials: {str(e)}")
+ return False
+
+
+if __name__ == "__main__":
+ spot_names: List[str] = []
+ connector = HighspotConnector(spot_names)
+ credentials = {"highspot_key": "", "highspot_secret": ""}
+ connector.load_credentials(credentials=credentials)
+ for doc in connector.load_from_state():
+ print(doc)
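Beyond the __main__ block above, a sketch of how the epoch-seconds polling window might be exercised — the spot name, credentials, and one-day window are illustrative only:

    import time

    from onyx.connectors.highspot.connector import HighspotConnector

    connector = HighspotConnector(spot_names=["Test content"])  # illustrative spot name
    connector.load_credentials(
        {"highspot_key": "YOUR_KEY", "highspot_secret": "YOUR_SECRET"}  # placeholders
    )
    one_day_ago = time.time() - 24 * 60 * 60
    # poll_source skips items whose date_updated falls outside the [start, end] window
    for batch in connector.poll_source(one_day_ago, time.time()):
        for doc in batch:
            print(doc.id, doc.semantic_identifier)
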
diff --git a/backend/onyx/connectors/highspot/utils.py b/backend/onyx/connectors/highspot/utils.py
new file mode 100644
index 000000000..efc00dac1
--- /dev/null
+++ b/backend/onyx/connectors/highspot/utils.py
@@ -0,0 +1,122 @@
+from typing import Optional
+from urllib.parse import urlparse
+
+from bs4 import BeautifulSoup
+from playwright.sync_api import sync_playwright
+
+from onyx.file_processing.html_utils import web_html_cleanup
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+# Constants
+WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS = 20
+JAVASCRIPT_DISABLED_MESSAGE = "You have JavaScript disabled in your browser"
+DEFAULT_TIMEOUT = 60000 # 60 seconds
+
+
+def scrape_url_content(
+ url: str, scroll_before_scraping: bool = False, timeout_ms: int = DEFAULT_TIMEOUT
+) -> Optional[str]:
+ """
+ Scrapes content from a given URL and returns the cleaned text.
+
+ Args:
+ url: The URL to scrape
+ scroll_before_scraping: Whether to scroll through the page to load lazy content
+ timeout_ms: Timeout in milliseconds for page navigation and loading
+
+ Returns:
+ The cleaned text content of the page or None if scraping fails
+ """
+ playwright = None
+ browser = None
+ try:
+ validate_url(url)
+ playwright = sync_playwright().start()
+ browser = playwright.chromium.launch(headless=True)
+ context = browser.new_context()
+ page = context.new_page()
+
+ logger.info(f"Navigating to URL: {url}")
+ try:
+ page.goto(url, timeout=timeout_ms)
+ except Exception as e:
+ logger.error(f"Failed to navigate to {url}: {str(e)}")
+ return None
+
+ if scroll_before_scraping:
+ logger.debug("Scrolling page to load lazy content")
+ scroll_attempts = 0
+ previous_height = page.evaluate("document.body.scrollHeight")
+ while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
+ page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+ try:
+ page.wait_for_load_state("networkidle", timeout=timeout_ms)
+ except Exception as e:
+ logger.warning(f"Network idle wait timed out: {str(e)}")
+ break
+
+ new_height = page.evaluate("document.body.scrollHeight")
+ if new_height == previous_height:
+ break
+ previous_height = new_height
+ scroll_attempts += 1
+
+ content = page.content()
+ soup = BeautifulSoup(content, "html.parser")
+
+ parsed_html = web_html_cleanup(soup)
+
+ if JAVASCRIPT_DISABLED_MESSAGE in parsed_html.cleaned_text:
+ logger.debug("JavaScript disabled message detected, checking iframes")
+ try:
+ iframe_count = page.frame_locator("iframe").locator("html").count()
+ if iframe_count > 0:
+ iframe_texts = (
+ page.frame_locator("iframe").locator("html").all_inner_texts()
+ )
+ iframe_content = "\n".join(iframe_texts)
+
+ if len(parsed_html.cleaned_text) < 700:
+ parsed_html.cleaned_text = iframe_content
+ else:
+ parsed_html.cleaned_text += "\n" + iframe_content
+ except Exception as e:
+ logger.warning(f"Error processing iframes: {str(e)}")
+
+ return parsed_html.cleaned_text
+
+ except Exception as e:
+ logger.error(f"Error scraping URL {url}: {str(e)}")
+ return None
+
+ finally:
+ if browser:
+ try:
+ browser.close()
+ except Exception as e:
+ logger.debug(f"Error closing browser: {str(e)}")
+ if playwright:
+ try:
+ playwright.stop()
+ except Exception as e:
+ logger.debug(f"Error stopping playwright: {str(e)}")
+
+
+def validate_url(url: str) -> None:
+ """
+ Validates that a URL is properly formatted.
+
+ Args:
+ url: The URL to validate
+
+ Raises:
+ ValueError: If URL is not valid
+ """
+ parse = urlparse(url)
+ if parse.scheme != "http" and parse.scheme != "https":
+ raise ValueError("URL must be of scheme https?://")
+
+ if not parse.hostname:
+ raise ValueError("URL must include a hostname")
diff --git a/backend/tests/daily/connectors/highspot/test_highspot_connector.py b/backend/tests/daily/connectors/highspot/test_highspot_connector.py
new file mode 100644
index 000000000..6da06b58c
--- /dev/null
+++ b/backend/tests/daily/connectors/highspot/test_highspot_connector.py
@@ -0,0 +1,98 @@
+import json
+import os
+import time
+from pathlib import Path
+
+import pytest
+
+from onyx.configs.constants import DocumentSource
+from onyx.connectors.highspot.connector import HighspotConnector
+from onyx.connectors.models import Document
+
+
+def load_test_data(file_name: str = "test_highspot_data.json") -> dict:
+ """Load test data from JSON file."""
+ current_dir = Path(__file__).parent
+ with open(current_dir / file_name, "r") as f:
+ return json.load(f)
+
+
+@pytest.fixture
+def highspot_connector() -> HighspotConnector:
+ """Create a Highspot connector with credentials from environment variables."""
+ # Check if required environment variables are set
+ if not os.environ.get("HIGHSPOT_KEY") or not os.environ.get("HIGHSPOT_SECRET"):
+ pytest.fail("HIGHSPOT_KEY or HIGHSPOT_SECRET environment variables not set")
+
+ connector = HighspotConnector(
+ spot_names=["Test content"], # Use specific spot name instead of empty list
+ batch_size=10, # Smaller batch size for testing
+ )
+ connector.load_credentials(
+ {
+ "highspot_key": os.environ["HIGHSPOT_KEY"],
+ "highspot_secret": os.environ["HIGHSPOT_SECRET"],
+ "highspot_url": os.environ.get(
+ "HIGHSPOT_URL", "https://api-su2.highspot.com/v1.0/"
+ ),
+ }
+ )
+ return connector
+
+
+def test_highspot_connector_basic(highspot_connector: HighspotConnector) -> None:
+ """Test basic functionality of the Highspot connector."""
+ all_docs: list[Document] = []
+ test_data = load_test_data()
+ target_test_doc_id = test_data.get("target_doc_id")
+ target_test_doc: Document | None = None
+
+ # Test loading documents
+ for doc_batch in highspot_connector.poll_source(0, time.time()):
+ for doc in doc_batch:
+ all_docs.append(doc)
+ if doc.id == f"HIGHSPOT_{target_test_doc_id}":
+ target_test_doc = doc
+
+ # Verify documents were loaded
+ assert len(all_docs) > 0
+
+ # If we have a specific test document ID, validate it
+ if target_test_doc_id and target_test_doc is not None:
+ assert target_test_doc.semantic_identifier == test_data.get(
+ "semantic_identifier"
+ )
+ assert target_test_doc.source == DocumentSource.HIGHSPOT
+ assert target_test_doc.metadata is not None
+
+ assert len(target_test_doc.sections) == 1
+ section = target_test_doc.sections[0]
+ assert section.link is not None
+ # Only check if content exists, as exact content might change
+ assert section.text is not None
+ assert len(section.text) > 0
+
+
+def test_highspot_connector_slim(highspot_connector: HighspotConnector) -> None:
+ """Test slim document retrieval."""
+ # Get all doc IDs from the full connector
+ all_full_doc_ids = set()
+ for doc_batch in highspot_connector.load_from_state():
+ all_full_doc_ids.update([doc.id for doc in doc_batch])
+
+ # Get all doc IDs from the slim connector
+ all_slim_doc_ids = set()
+ for slim_doc_batch in highspot_connector.retrieve_all_slim_documents():
+ all_slim_doc_ids.update([doc.id for doc in slim_doc_batch])
+
+ # The set of full doc IDs should be a subset of the slim doc IDs
+ assert all_full_doc_ids.issubset(all_slim_doc_ids)
+ # Make sure we actually got some documents
+ assert len(all_slim_doc_ids) > 0
+
+
+def test_highspot_connector_validate_credentials(
+ highspot_connector: HighspotConnector,
+) -> None:
+ """Test credential validation."""
+ assert highspot_connector.validate_credentials() is True
diff --git a/backend/tests/daily/connectors/highspot/test_highspot_data.json b/backend/tests/daily/connectors/highspot/test_highspot_data.json
new file mode 100644
index 000000000..d796b3d60
--- /dev/null
+++ b/backend/tests/daily/connectors/highspot/test_highspot_data.json
@@ -0,0 +1,5 @@
+{
+ "target_doc_id": "67cd8eb35d3ee0487de2e704",
+ "semantic_identifier": "Highspot in Action _ Salesforce Integration",
+ "link": "https://www.highspot.com/items/67cd8eb35d3ee0487de2e704"
+}
diff --git a/web/public/Highspot.png b/web/public/Highspot.png
new file mode 100644
index 000000000..7bf874f5b
Binary files /dev/null and b/web/public/Highspot.png differ
diff --git a/web/src/components/icons/icons.tsx b/web/src/components/icons/icons.tsx
index 55d2ea648..e7f0a7390 100644
--- a/web/src/components/icons/icons.tsx
+++ b/web/src/components/icons/icons.tsx
@@ -89,6 +89,7 @@ import cohereIcon from "../../../public/Cohere.svg";
import voyageIcon from "../../../public/Voyage.png";
import googleIcon from "../../../public/Google.webp";
import xenforoIcon from "../../../public/Xenforo.svg";
+import highspotIcon from "../../../public/Highspot.png";
import { FaGithub, FaRobot } from "react-icons/fa";
import { cn } from "@/lib/utils";
@@ -2912,6 +2913,13 @@ export const GitbookIcon = ({
);
+export const HighspotIcon = ({
+ size = 16,
+ className = defaultTailwindCSS,
+}: IconProps) => {
+  return <LogoIcon size={size} className={className} src={highspotIcon} />;
+};
+
export const PinnedIcon = ({
size = 16,
className = defaultTailwindCSS,
diff --git a/web/src/lib/connectors/connectors.tsx b/web/src/lib/connectors/connectors.tsx
index e26631864..f644bc6a4 100644
--- a/web/src/lib/connectors/connectors.tsx
+++ b/web/src/lib/connectors/connectors.tsx
@@ -1249,6 +1249,47 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
],
overrideDefaultFreq: 60 * 60 * 24,
},
+ highspot: {
+ description: "Configure Highspot connector",
+ values: [
+ {
+ type: "tab",
+ name: "highspot_scope",
+ label: "What should we index from Highspot?",
+ optional: true,
+ tabs: [
+ {
+ value: "spots",
+ label: "Specific Spots",
+ fields: [
+ {
+ type: "list",
+ query: "Enter the spot name(s):",
+ label: "Spot Name(s)",
+ name: "spot_names",
+ optional: false,
+                  description: "For multiple spots, enter the spot names one at a time.",
+ },
+ ],
+ },
+ {
+ value: "everything",
+ label: "Everything",
+ fields: [
+ {
+ type: "string_tab",
+ label: "Everything",
+ name: "everything",
+ description:
+ "This connector will index all spots the provided credentials have access to!",
+ },
+ ],
+ },
+ ],
+ },
+ ],
+ advanced_values: [],
+ },
};
export function createConnectorInitialValues(
connector: ConfigurableSources
diff --git a/web/src/lib/connectors/credentials.ts b/web/src/lib/connectors/credentials.ts
index ac35beb29..0b34d47cc 100644
--- a/web/src/lib/connectors/credentials.ts
+++ b/web/src/lib/connectors/credentials.ts
@@ -226,6 +226,12 @@ export interface AirtableCredentialJson {
airtable_access_token: string;
}
+export interface HighspotCredentialJson {
+ highspot_url: string;
+ highspot_key: string;
+ highspot_secret: string;
+}
+
export const credentialTemplates: Record<ValidSources, any> = {
github: { github_access_token: "" } as GithubCredentialJson,
gitlab: {
@@ -353,6 +359,11 @@ export const credentialTemplates: Record = {
gitbook: {
gitbook_api_key: "",
} as GitbookCredentialJson,
+ highspot: {
+ highspot_url: "",
+ highspot_key: "",
+ highspot_secret: "",
+ } as HighspotCredentialJson,
};
export const credentialDisplayNames: Record<string, string> = {
@@ -488,6 +499,11 @@ export const credentialDisplayNames: Record = {
// GitBook
gitbook_space_id: "GitBook Space ID",
gitbook_api_key: "GitBook API Key",
+
+  // Highspot
+ highspot_url: "Highspot URL",
+ highspot_key: "Highspot Key",
+ highspot_secret: "Highspot Secret",
};
export function getDisplayNameForCredentialKey(key: string): string {
diff --git a/web/src/lib/sources.ts b/web/src/lib/sources.ts
index 7a3341256..6a1bd0ce5 100644
--- a/web/src/lib/sources.ts
+++ b/web/src/lib/sources.ts
@@ -44,6 +44,7 @@ import {
GlobeIcon2,
FileIcon2,
GitbookIcon,
+ HighspotIcon,
} from "@/components/icons/icons";
import { ValidSources } from "./types";
import {
@@ -329,6 +330,12 @@ export const SOURCE_METADATA_MAP: SourceMap = {
category: SourceCategory.Wiki,
docs: "https://docs.onyx.app/connectors/gitbook",
},
+ highspot: {
+ icon: HighspotIcon,
+ displayName: "Highspot",
+ category: SourceCategory.Wiki,
+ docs: "https://docs.onyx.app/connectors/highspot",
+ },
// currently used for the Internet Search tool docs, which is why
// a globe is used
not_applicable: {
diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts
index 901aa85d7..c2007f7c2 100644
--- a/web/src/lib/types.ts
+++ b/web/src/lib/types.ts
@@ -390,6 +390,7 @@ export enum ValidSources {
Egnyte = "egnyte",
Airtable = "airtable",
Gitbook = "gitbook",
+ Highspot = "highspot",
}
export const validAutoSyncSources = [