Highspot connector (#4277)

2025-04-04 09:58:32 +02:00 · 2025-03-17 21:06:02 +05:30 · 2025-03-17 21:06:02 +05:30 · ba514aaaa2
commit ba514aaaa2
parent f45798b5dd
15 changed files with 1019 additions and 0 deletions
--- a/.github/workflows/pr-python-connector-tests.yml
+++ b/.github/workflows/pr-python-connector-tests.yml
@ -50,6 +50,9 @@ env:
  GITBOOK_API_KEY: ${{ secrets.GITBOOK_API_KEY }}
  # Notion
  NOTION_INTEGRATION_TOKEN: ${{ secrets.NOTION_INTEGRATION_TOKEN }}
+  # Highspot
+  HIGHSPOT_KEY: ${{ secrets.HIGHSPOT_KEY }}
+  HIGHSPOT_SECRET: ${{ secrets.HIGHSPOT_SECRET }}

 jobs:
  connectors-check:
--- a/backend/onyx/configs/constants.py
+++ b/backend/onyx/configs/constants.py
@ -174,6 +174,7 @@ class DocumentSource(str, Enum):
    FIREFLIES = "fireflies"
    EGNYTE = "egnyte"
    AIRTABLE = "airtable"
+    HIGHSPOT = "highspot"

    # Special case just for integration tests
    MOCK_CONNECTOR = "mock_connector"
--- a/backend/onyx/connectors/factory.py
+++ b/backend/onyx/connectors/factory.py
@ -30,6 +30,7 @@ from onyx.connectors.gong.connector import GongConnector
 from onyx.connectors.google_drive.connector import GoogleDriveConnector
 from onyx.connectors.google_site.connector import GoogleSitesConnector
 from onyx.connectors.guru.connector import GuruConnector
+from onyx.connectors.highspot.connector import HighspotConnector
 from onyx.connectors.hubspot.connector import HubSpotConnector
 from onyx.connectors.interfaces import BaseConnector
 from onyx.connectors.interfaces import CheckpointConnector
@ -117,6 +118,7 @@ def identify_connector_class(
        DocumentSource.FIREFLIES: FirefliesConnector,
        DocumentSource.EGNYTE: EgnyteConnector,
        DocumentSource.AIRTABLE: AirtableConnector,
+        DocumentSource.HIGHSPOT: HighspotConnector,
        # just for integration tests
        DocumentSource.MOCK_CONNECTOR: MockConnector,
    }
--- a/backend/onyx/connectors/highspot/__init__.py
+++ b/backend/onyx/connectors/highspot/__init__.py
@ -0,0 +1,4 @@
+"""
+Highspot connector package for Onyx.
+Enables integration with Highspot's knowledge base.
+"""
--- a/backend/onyx/connectors/highspot/client.py
+++ b/backend/onyx/connectors/highspot/client.py
@ -0,0 +1,280 @@
+import base64
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Optional
+from urllib.parse import urljoin
+
+import requests
+from requests.adapters import HTTPAdapter
+from requests.exceptions import HTTPError
+from requests.exceptions import RequestException
+from requests.exceptions import Timeout
+from urllib3.util.retry import Retry
+
+from onyx.utils.logger import setup_logger
+
+# Module-level logger used by HighspotClient for request and response logging.
+logger = setup_logger()
+
+
+class HighspotClientError(Exception):
+    """
+    Root of the Highspot API client exception hierarchy.
+
+    Attributes:
+        message: Human-readable description of the failure
+        status_code: HTTP status attached to the failure, or None when the
+            caller did not supply one
+    """
+
+    def __init__(self, message: str, status_code: Optional[int] = None):
+        super().__init__(message)
+        self.status_code = status_code
+        self.message = message
+
+
+class HighspotAuthenticationError(HighspotClientError):
+    """Exception raised for authentication errors (the client raises it on HTTP 401)."""
+
+
+class HighspotRateLimitError(HighspotClientError):
+    """
+    Exception raised when the Highspot API rate limit is exceeded (HTTP 429).
+
+    Attributes:
+        retry_after: Value the caller supplied for the retry delay (the client
+            passes the raw ``Retry-After`` response header), or None
+        status_code: Always 429
+    """
+
+    def __init__(self, message: str, retry_after: Optional[str] = None):
+        self.retry_after = retry_after
+        # Record the status on the base class so handlers that inspect
+        # ``status_code`` see 429 instead of None.
+        super().__init__(message, status_code=429)
+
+
+class HighspotClient:
+    """
+    Client for interacting with the Highspot API.
+
+    Uses basic authentication with provided key (username) and secret (password).
+    Implements retry logic, error handling, and connection pooling.
+
+    Every HTTP or transport failure raised by the request methods is reported
+    as HighspotClientError or one of its subclasses.
+    """
+
+    BASE_URL = "https://api-su2.highspot.com/v1.0/"
+
+    def __init__(
+        self,
+        key: str,
+        secret: str,
+        base_url: str = BASE_URL,
+        timeout: int = 30,
+        max_retries: int = 3,
+        backoff_factor: float = 0.5,
+        status_forcelist: Optional[List[int]] = None,
+    ):
+        """
+        Initialize the Highspot API client.
+
+        Args:
+            key: API key (used as username)
+            secret: API secret (used as password)
+            base_url: Base URL for the Highspot API; an empty value falls back
+                to BASE_URL and a missing trailing slash is added
+            timeout: Request timeout in seconds
+            max_retries: Maximum number of retries for failed requests
+            backoff_factor: Backoff factor for retries
+            status_forcelist: HTTP status codes to retry on
+
+        Raises:
+            ValueError: If key or secret is empty
+        """
+        if not key or not secret:
+            raise ValueError("API key and secret are required")
+
+        self.key = key
+        self.secret = secret
+        # urljoin() replaces the last path segment of a base URL that lacks a
+        # trailing slash (".../v1.0" + "spots" -> ".../spots"), so normalise
+        # the base once here instead of producing wrong URLs later.
+        base_url = base_url or self.BASE_URL
+        self.base_url = base_url if base_url.endswith("/") else f"{base_url}/"
+        self.timeout = timeout
+
+        # Set up session with retry logic
+        self.session = requests.Session()
+        retry_strategy = Retry(
+            total=max_retries,
+            backoff_factor=backoff_factor,
+            status_forcelist=status_forcelist or [429, 500, 502, 503, 504],
+            allowed_methods=["GET", "POST", "PUT", "DELETE"],
+            # Return the last response once retries are exhausted so _send()
+            # can map its status to a typed Highspot exception instead of
+            # receiving an opaque RetryError.
+            raise_on_status=False,
+        )
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        self.session.mount("http://", adapter)
+        self.session.mount("https://", adapter)
+
+        # Set up authentication
+        self._setup_auth()
+
+    def _setup_auth(self) -> None:
+        """Install the basic-auth and JSON content headers on the session."""
+        auth = f"{self.key}:{self.secret}"
+        encoded_auth = base64.b64encode(auth.encode()).decode()
+        self.session.headers.update(
+            {
+                "Authorization": f"Basic {encoded_auth}",
+                "Content-Type": "application/json",
+                "Accept": "application/json",
+            }
+        )
+
+    def _send(self, method: str, endpoint: str, **kwargs: Any) -> requests.Response:
+        """
+        Send one request and translate every failure into a Highspot exception.
+
+        Args:
+            method: HTTP method (GET, POST, etc.)
+            endpoint: API endpoint, joined onto the base URL
+            **kwargs: Extra arguments forwarded to ``Session.request``
+
+        Returns:
+            The successful response object
+
+        Raises:
+            HighspotAuthenticationError: On HTTP 401
+            HighspotRateLimitError: On HTTP 429
+            HighspotClientError: On any other HTTP error, timeout or
+                transport failure
+        """
+        url = urljoin(self.base_url, endpoint)
+        try:
+            logger.debug(f"Making {method} request to {url}")
+            response = self.session.request(
+                method=method, url=url, timeout=self.timeout, **kwargs
+            )
+            response.raise_for_status()
+            return response
+
+        except HTTPError as e:
+            # Compare against None explicitly: a Response is falsy whenever
+            # its status is an error, which is always the case here.
+            error_response = e.response
+            status_code = (
+                error_response.status_code if error_response is not None else None
+            )
+            error_msg = str(e)
+
+            # Prefer the API's own message when the error body is a JSON object.
+            if error_response is not None:
+                try:
+                    error_data = error_response.json()
+                    if isinstance(error_data, dict):
+                        error_msg = error_data.get("message", str(e))
+                except (ValueError, KeyError):
+                    pass
+
+            if status_code == 401:
+                raise HighspotAuthenticationError(
+                    f"Authentication failed: {error_msg}", status_code=status_code
+                ) from e
+            if status_code == 429:
+                retry_after = error_response.headers.get("Retry-After")
+                raise HighspotRateLimitError(
+                    f"Rate limit exceeded: {error_msg}", retry_after=retry_after
+                ) from e
+            raise HighspotClientError(
+                f"API error {status_code}: {error_msg}", status_code=status_code
+            ) from e
+
+        except Timeout as e:
+            raise HighspotClientError("Request timed out") from e
+        except RequestException as e:
+            raise HighspotClientError(f"Request failed: {str(e)}") from e
+
+    def _make_request(
+        self,
+        method: str,
+        endpoint: str,
+        params: Optional[Dict[str, Any]] = None,
+        data: Optional[Dict[str, Any]] = None,
+        json_data: Optional[Dict[str, Any]] = None,
+        headers: Optional[Dict[str, str]] = None,
+    ) -> Dict[str, Any]:
+        """
+        Make a request to the Highspot API and decode the JSON response.
+
+        Args:
+            method: HTTP method (GET, POST, etc.)
+            endpoint: API endpoint
+            params: URL parameters
+            data: Form data
+            json_data: JSON data
+            headers: Additional headers
+
+        Returns:
+            API response as a dictionary ({} when the body is empty)
+
+        Raises:
+            HighspotAuthenticationError: On authentication errors
+            HighspotRateLimitError: On rate limiting
+            HighspotClientError: On any other API error, request failure or
+                undecodable response body
+        """
+        response = self._send(
+            method,
+            endpoint,
+            params=params,
+            data=data,
+            json=json_data,
+            headers=dict(headers) if headers else {},
+        )
+
+        if response.content and response.content.strip():
+            try:
+                return response.json()
+            except ValueError as e:
+                # JSON decode errors are ValueError subclasses; report them
+                # through the client's own exception type like other failures.
+                raise HighspotClientError(f"Request failed: {str(e)}") from e
+        return {}
+
+    def get_spots(self) -> List[Dict[str, Any]]:
+        """
+        Get all available spots.
+
+        Returns:
+            List of spots with their names and IDs; empty when the response
+            reports no spots
+        """
+        params = {"right": "view"}
+        response = self._make_request("GET", "spots", params=params)
+        total_counts = response.get("counts_total")
+        spots: List[Dict[str, Any]] = []
+        # counts_total may be missing (None), so guard before comparing.
+        if total_counts is not None and total_counts > 0:
+            spots = response.get("collection") or []
+        # Log the count only, not the response payload.
+        logger.info(f"Received {len(spots)} spots")
+        return spots
+
+    def get_spot(self, spot_id: str) -> Dict[str, Any]:
+        """
+        Get details for a specific spot.
+
+        Args:
+            spot_id: ID of the spot
+
+        Returns:
+            Spot details
+
+        Raises:
+            ValueError: If spot_id is empty
+        """
+        if not spot_id:
+            raise ValueError("spot_id is required")
+        return self._make_request("GET", f"spots/{spot_id}")
+
+    def get_spot_items(
+        self, spot_id: str, offset: int = 0, page_size: int = 100
+    ) -> Dict[str, Any]:
+        """
+        Get items in a specific spot.
+
+        Args:
+            spot_id: ID of the spot
+            offset: offset number
+            page_size: Number of items per page
+
+        Returns:
+            Items in the spot
+
+        Raises:
+            ValueError: If spot_id is empty
+        """
+        if not spot_id:
+            raise ValueError("spot_id is required")
+
+        params = {"spot": spot_id, "start": offset, "limit": page_size}
+        return self._make_request("GET", "items", params=params)
+
+    def get_item(self, item_id: str) -> Dict[str, Any]:
+        """
+        Get details for a specific item.
+
+        Args:
+            item_id: ID of the item
+
+        Returns:
+            Item details
+
+        Raises:
+            ValueError: If item_id is empty
+        """
+        if not item_id:
+            raise ValueError("item_id is required")
+        return self._make_request("GET", f"items/{item_id}")
+
+    def get_item_content(self, item_id: str) -> bytes:
+        """
+        Get the raw content of an item.
+
+        Args:
+            item_id: ID of the item
+
+        Returns:
+            Raw content bytes
+
+        Raises:
+            ValueError: If item_id is empty
+            HighspotClientError: On HTTP errors, timeouts or transport
+                failures (same mapping as the JSON endpoints)
+        """
+        if not item_id:
+            raise ValueError("item_id is required")
+
+        # Go through _send() so download failures surface as Highspot
+        # exceptions, which is what callers of this client catch.
+        return self._send("GET", f"items/{item_id}/content").content
+
+    def health_check(self) -> bool:
+        """
+        Check if the API is accessible and credentials are valid.
+
+        Returns:
+            True if API is accessible, False otherwise
+        """
+        try:
+            self._make_request("GET", "spots", params={"limit": 1})
+            return True
+        except HighspotClientError:
+            # Covers authentication and rate-limit errors too (subclasses).
+            return False
--- a/backend/onyx/connectors/highspot/connector.py
+++ b/backend/onyx/connectors/highspot/connector.py
@ -0,0 +1,431 @@
+from datetime import datetime
+from io import BytesIO
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Optional
+
+from onyx.configs.app_configs import INDEX_BATCH_SIZE
+from onyx.configs.constants import DocumentSource
+from onyx.connectors.highspot.client import HighspotClient
+from onyx.connectors.highspot.client import HighspotClientError
+from onyx.connectors.highspot.utils import scrape_url_content
+from onyx.connectors.interfaces import GenerateDocumentsOutput
+from onyx.connectors.interfaces import GenerateSlimDocumentOutput
+from onyx.connectors.interfaces import LoadConnector
+from onyx.connectors.interfaces import PollConnector
+from onyx.connectors.interfaces import SecondsSinceUnixEpoch
+from onyx.connectors.interfaces import SlimConnector
+from onyx.connectors.models import ConnectorMissingCredentialError
+from onyx.connectors.models import Document
+from onyx.connectors.models import SlimDocument
+from onyx.connectors.models import TextSection
+from onyx.file_processing.extract_file_text import extract_file_text
+from onyx.file_processing.extract_file_text import VALID_FILE_EXTENSIONS
+from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+# Number of SlimDocument entries accumulated before
+# HighspotConnector.retrieve_all_slim_documents yields a batch.
+_SLIM_BATCH_SIZE = 1000
+
+
+class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
+    """
+    Connector for loading data from Highspot.
+
+    Retrieves content from specified spots using the Highspot API.
+    If no spots are specified, retrieves content from all available spots.
+    """
+
+    def __init__(
+        self,
+        spot_names: List[str] = [],
+        batch_size: int = INDEX_BATCH_SIZE,
+    ):
+        """
+        Initialize the Highspot connector.
+
+        Args:
+            spot_names: List of spot names to retrieve content from (if empty, gets all spots)
+            batch_size: Number of items to retrieve in each batch
+        """
+        # Store a copy: the default list is one object shared by every call,
+        # so keeping a reference to it would let mutations leak between
+        # connector instances.
+        self.spot_names: List[str] = list(spot_names) if spot_names else []
+        self.batch_size = batch_size
+        self._client: Optional[HighspotClient] = None
+        self._spot_id_map: Dict[str, str] = {}  # Maps spot names to spot IDs
+        self._all_spots_fetched = False
+        self.highspot_url: Optional[str] = None
+        self.key: Optional[str] = None
+        self.secret: Optional[str] = None
+
+    @property
+    def client(self) -> HighspotClient:
+        """
+        Lazily build and cache the API client from the loaded credentials.
+
+        Raises:
+            ConnectorMissingCredentialError: If key or secret has not been loaded
+        """
+        if self._client is None:
+            if not self.key or not self.secret:
+                raise ConnectorMissingCredentialError("Highspot")
+            # Ensure highspot_url is a string, use default if None
+            base_url = (
+                self.highspot_url
+                if self.highspot_url is not None
+                else HighspotClient.BASE_URL
+            )
+            self._client = HighspotClient(self.key, self.secret, base_url=base_url)
+        return self._client
+
+    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
+        """
+        Read the Highspot URL, key and secret from the credentials mapping.
+
+        Args:
+            credentials: Mapping read for "highspot_url", "highspot_key"
+                and "highspot_secret"
+
+        Returns:
+            Always None
+        """
+        logger.info("Loading Highspot credentials")
+        self.highspot_url = credentials.get("highspot_url")
+        self.key = credentials.get("highspot_key")
+        self.secret = credentials.get("highspot_secret")
+        return None
+
+    def _populate_spot_id_map(self) -> None:
+        """
+        Populate the spot ID map with all available spots.
+        Keys are stored as lowercase for case-insensitive lookups.
+        """
+        spots = self.client.get_spots()
+        for spot in spots:
+            if "title" in spot and "id" in spot:
+                spot_name = spot["title"]
+                self._spot_id_map[spot_name.lower()] = spot["id"]
+
+        self._all_spots_fetched = True
+        logger.info(f"Retrieved {len(self._spot_id_map)} spots from Highspot")
+
+    def _get_all_spot_names(self) -> List[str]:
+        """
+        Retrieve all available spot names.
+
+        Returns:
+            List of all spot names, in the lower-cased form used as map keys
+        """
+        if not self._all_spots_fetched:
+            self._populate_spot_id_map()
+
+        return list(self._spot_id_map)
+
+    def _get_spot_id_from_name(self, spot_name: str) -> str:
+        """
+        Get spot ID from a spot name (case-insensitive).
+
+        Args:
+            spot_name: Name of the spot
+
+        Returns:
+            ID of the spot
+
+        Raises:
+            ValueError: If spot name is not found
+        """
+        if not self._all_spots_fetched:
+            self._populate_spot_id_map()
+
+        spot_name_lower = spot_name.lower()
+        if spot_name_lower not in self._spot_id_map:
+            raise ValueError(f"Spot '{spot_name}' not found")
+
+        return self._spot_id_map[spot_name_lower]
+
+    def _resolve_spot_names(self, log_suffix: str = "") -> List[str]:
+        """
+        Return the configured spot names, or every available spot when none
+        are configured.
+
+        Args:
+            log_suffix: Text appended to the "no spots specified" log line
+        """
+        if self.spot_names:
+            return self.spot_names
+
+        spot_names = self._get_all_spot_names()
+        logger.info(
+            f"No spots specified, using all {len(spot_names)} available spots{log_suffix}"
+        )
+        return spot_names
+
+    def _fetch_item_page(
+        self, spot_id: str, spot_name: str, offset: int, label: str
+    ) -> List[Dict[str, Any]]:
+        """
+        Fetch one page of at most batch_size items from a spot.
+
+        Args:
+            spot_id: ID of the spot
+            spot_name: Spot name, used for logging only
+            offset: Index of the first item to request
+            label: Noun used in the progress log line
+
+        Returns:
+            The page's item list (empty when the spot has no more items)
+        """
+        logger.info(f"Retrieving {label} from spot {spot_name}, offset {offset}")
+        response = self.client.get_spot_items(
+            spot_id=spot_id, offset=offset, page_size=self.batch_size
+        )
+        return response.get("collection") or []
+
+    def _is_outside_time_window(
+        self,
+        item_id: str,
+        item_details: Dict[str, Any],
+        start: SecondsSinceUnixEpoch | None,
+        end: SecondsSinceUnixEpoch | None,
+    ) -> bool:
+        """
+        Decide whether an item must be skipped because of the poll window.
+
+        Items without a "date_updated" value are never skipped; items whose
+        date cannot be parsed are skipped with a warning.
+        """
+        if start is None and end is None:
+            return False
+
+        updated_at = item_details.get("date_updated")
+        if not updated_at:
+            return False
+
+        try:
+            # fromisoformat() needs an explicit offset in place of "Z".
+            updated_ts = datetime.fromisoformat(
+                updated_at.replace("Z", "+00:00")
+            ).timestamp()
+        except (ValueError, TypeError, AttributeError):
+            # AttributeError covers a non-string date value.
+            logger.warning(f"Invalid date format for item {item_id}: {updated_at}")
+            return True
+
+        # Compare against None so that a bound of 0 is still treated as a bound.
+        return (start is not None and updated_ts < start) or (
+            end is not None and updated_ts > end
+        )
+
+    def _build_document(
+        self, item_id: str, item_details: Dict[str, Any], spot_name: str
+    ) -> Document:
+        """Convert one item's details (plus its extracted text) into a Document."""
+        content = self._get_item_content(item_details)
+        title = item_details.get("title", "")
+
+        return Document(
+            id=f"HIGHSPOT_{item_id}",
+            sections=[
+                TextSection(
+                    link=item_details.get(
+                        "url",
+                        f"https://www.highspot.com/items/{item_id}",
+                    ),
+                    text=content,
+                )
+            ],
+            source=DocumentSource.HIGHSPOT,
+            semantic_identifier=title,
+            metadata={
+                "spot_name": spot_name,
+                "type": item_details.get("content_type", ""),
+                "created_at": item_details.get("date_added", ""),
+                "author": item_details.get("author", ""),
+                "language": item_details.get("language", ""),
+                "can_download": str(item_details.get("can_download", False)),
+            },
+            doc_updated_at=item_details.get("date_updated"),
+        )
+
+    def load_from_state(self) -> GenerateDocumentsOutput:
+        """
+        Load content from configured spots in Highspot.
+        If no spots are configured, loads from all spots.
+
+        Yields:
+            Batches of Document objects
+        """
+        return self.poll_source(None, None)
+
+    def poll_source(
+        self, start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None
+    ) -> GenerateDocumentsOutput:
+        """
+        Poll Highspot for content updated within the given time window.
+
+        A failure on one item or one spot is logged and processing continues
+        with the next one.
+
+        Args:
+            start: Start time as seconds since Unix epoch
+            end: End time as seconds since Unix epoch
+
+        Yields:
+            Batches of Document objects
+        """
+        doc_batch: list[Document] = []
+
+        for spot_name in self._resolve_spot_names():
+            try:
+                # Raises ValueError (handled below) when the name is unknown.
+                spot_id = self._get_spot_id_from_name(spot_name)
+                offset = 0
+
+                while True:
+                    items = self._fetch_item_page(spot_id, spot_name, offset, "items")
+                    if not items:
+                        break
+                    # Log the page size only, not the item payloads.
+                    logger.info(f"Received {len(items)} items from spot {spot_name}")
+
+                    for item in items:
+                        item_id = item.get("id")
+                        if not item_id:
+                            logger.warning("Item without ID found, skipping")
+                            continue
+
+                        try:
+                            item_details = self.client.get_item(item_id)
+                            if not item_details:
+                                logger.warning(
+                                    f"Item {item_id} details not found, skipping"
+                                )
+                                continue
+
+                            if self._is_outside_time_window(
+                                item_id, item_details, start, end
+                            ):
+                                continue
+
+                            doc_batch.append(
+                                self._build_document(item_id, item_details, spot_name)
+                            )
+
+                            if len(doc_batch) >= self.batch_size:
+                                yield doc_batch
+                                doc_batch = []
+
+                        except HighspotClientError as e:
+                            logger.error(f"Error retrieving item {item_id}: {str(e)}")
+
+                    # A short page means the spot has no further items.
+                    if len(items) < self.batch_size:
+                        break
+                    offset += self.batch_size
+
+            except (HighspotClientError, ValueError) as e:
+                logger.error(f"Error processing spot {spot_name}: {str(e)}")
+
+        if doc_batch:
+            yield doc_batch
+
+    def _get_item_content(self, item_details: Dict[str, Any]) -> str:
+        """
+        Get the text content of an item.
+
+        WebLink items are scraped from their URL; downloadable files with a
+        supported extension are fetched and run through text extraction.
+        Whenever no text can be obtained, the item's title and description
+        are returned instead.
+
+        Args:
+            item_details: Item details from the API
+
+        Returns:
+            Text content of the item
+        """
+        item_id = item_details.get("id", "")
+        content_name = item_details.get("content_name", "")
+        is_valid_format = content_name and "." in content_name
+        file_extension = content_name.split(".")[-1].lower() if is_valid_format else ""
+        file_extension = "." + file_extension if file_extension else ""
+        can_download = item_details.get("can_download", False)
+        content_type = item_details.get("content_type", "")
+
+        # Extract title and description once at the beginning
+        title, description = self._extract_title_and_description(item_details)
+        default_content = f"{title}\n{description}"
+        logger.info(f"Processing item {item_id} with extension {file_extension}")
+
+        try:
+            if content_type == "WebLink":
+                url = item_details.get("url")
+                if not url:
+                    return default_content
+                content = scrape_url_content(url, True)
+                return content if content else default_content
+
+            if (
+                is_valid_format
+                and file_extension in VALID_FILE_EXTENSIONS
+                and can_download
+                and item_id
+            ):
+                # For documents, try to get the text content
+                content_response = self.client.get_item_content(item_id)
+                if content_response:
+                    text_content = extract_file_text(
+                        BytesIO(content_response), content_name
+                    )
+                    # Same fallback as the WebLink branch when nothing was extracted.
+                    return text_content or default_content
+
+            return default_content
+
+        except HighspotClientError as e:
+            error_context = f"item {item_id}" if item_id else "item"
+            logger.warning(f"Could not retrieve content for {error_context}: {str(e)}")
+            # Fall back to title/description like every other failure path,
+            # rather than producing a document with empty text.
+            return default_content
+
+    def _extract_title_and_description(
+        self, item_details: Dict[str, Any]
+    ) -> tuple[str, str]:
+        """
+        Extract the title and description from item details.
+
+        Args:
+            item_details: Item details from the API
+
+        Returns:
+            Tuple of title and description (empty strings when absent)
+        """
+        title = item_details.get("title", "")
+        description = item_details.get("description", "")
+        return title, description
+
+    def retrieve_all_slim_documents(
+        self,
+        start: SecondsSinceUnixEpoch | None = None,
+        end: SecondsSinceUnixEpoch | None = None,
+        callback: IndexingHeartbeatInterface | None = None,
+    ) -> GenerateSlimDocumentOutput:
+        """
+        Retrieve all document IDs from the configured spots.
+        If no spots are configured, retrieves from all spots.
+
+        Args:
+            start: Optional start time filter (currently not applied)
+            end: Optional end time filter (currently not applied)
+            callback: Optional indexing heartbeat callback (currently not used)
+
+        Yields:
+            Batches of SlimDocument objects
+        """
+        slim_doc_batch: list[SlimDocument] = []
+
+        for spot_name in self._resolve_spot_names(" for slim documents"):
+            try:
+                spot_id = self._get_spot_id_from_name(spot_name)
+                offset = 0
+
+                while True:
+                    items = self._fetch_item_page(
+                        spot_id, spot_name, offset, "slim documents"
+                    )
+                    if not items:
+                        break
+
+                    for item in items:
+                        item_id = item.get("id")
+                        if not item_id:
+                            continue
+
+                        slim_doc_batch.append(SlimDocument(id=f"HIGHSPOT_{item_id}"))
+
+                        if len(slim_doc_batch) >= _SLIM_BATCH_SIZE:
+                            yield slim_doc_batch
+                            slim_doc_batch = []
+
+                    # A short page means the spot has no further items.
+                    if len(items) < self.batch_size:
+                        break
+                    offset += self.batch_size
+
+            except (HighspotClientError, ValueError) as e:
+                logger.error(
+                    f"Error retrieving slim documents from spot {spot_name}: {str(e)}"
+                )
+
+        if slim_doc_batch:
+            yield slim_doc_batch
+
+    def validate_credentials(self) -> bool:
+        """
+        Validate that the provided credentials can access the Highspot API.
+
+        Returns:
+            True if credentials are valid, False otherwise
+        """
+        try:
+            return self.client.health_check()
+        except Exception as e:
+            logger.error(f"Failed to validate credentials: {str(e)}")
+            return False
+
+
+if __name__ == "__main__":
+    # Manual smoke test: loads every document from all spots and prints it.
+    # Credentials come from the HIGHSPOT_KEY / HIGHSPOT_SECRET environment
+    # variables so the script is usable without editing the source.
+    import os
+
+    spot_names: List[str] = []
+    connector = HighspotConnector(spot_names)
+    credentials = {
+        "highspot_key": os.environ.get("HIGHSPOT_KEY", ""),
+        "highspot_secret": os.environ.get("HIGHSPOT_SECRET", ""),
+    }
+    connector.load_credentials(credentials=credentials)
+    # load_from_state() yields batches (lists) of documents.
+    for doc_batch in connector.load_from_state():
+        for doc in doc_batch:
+            print(doc)
--- a/backend/onyx/connectors/highspot/utils.py
+++ b/backend/onyx/connectors/highspot/utils.py
@ -0,0 +1,122 @@
+from typing import Optional
+from urllib.parse import urlparse
+
+from bs4 import BeautifulSoup
+from playwright.sync_api import sync_playwright
+
+from onyx.file_processing.html_utils import web_html_cleanup
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+# Constants
+# Upper bound on the scroll-to-bottom passes scrape_url_content performs.
+WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS = 20
+# Marker text whose presence in the cleaned page text triggers the iframe fallback.
+JAVASCRIPT_DISABLED_MESSAGE = "You have JavaScript disabled in your browser"
+# Default timeout, in milliseconds, for page navigation and network-idle waits.
+DEFAULT_TIMEOUT = 60000  # 60 seconds
+
+
+def scrape_url_content(
+    url: str, scroll_before_scraping: bool = False, timeout_ms: int = DEFAULT_TIMEOUT
+) -> Optional[str]:
+    """
+    Render a URL in headless Chromium and return its cleaned text.
+
+    Args:
+        url: The URL to scrape
+        scroll_before_scraping: When True, scroll to the bottom repeatedly so
+            lazily loaded content is present before the page is read
+        timeout_ms: Timeout in milliseconds for navigation and for each
+            network-idle wait
+
+    Returns:
+        The cleaned page text, or None when validation, navigation or
+        scraping fails
+    """
+    pw = None
+    chromium = None
+    try:
+        validate_url(url)
+        pw = sync_playwright().start()
+        chromium = pw.chromium.launch(headless=True)
+        page = chromium.new_context().new_page()
+
+        logger.info(f"Navigating to URL: {url}")
+        try:
+            page.goto(url, timeout=timeout_ms)
+        except Exception as e:
+            logger.error(f"Failed to navigate to {url}: {str(e)}")
+            return None
+
+        if scroll_before_scraping:
+            logger.debug("Scrolling page to load lazy content")
+            last_height = page.evaluate("document.body.scrollHeight")
+            for _ in range(WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS):
+                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+                try:
+                    page.wait_for_load_state("networkidle", timeout=timeout_ms)
+                except Exception as e:
+                    logger.warning(f"Network idle wait timed out: {str(e)}")
+                    break
+
+                current_height = page.evaluate("document.body.scrollHeight")
+                # Stop once a scroll no longer makes the page any taller.
+                if current_height == last_height:
+                    break
+                last_height = current_height
+
+        parsed_html = web_html_cleanup(BeautifulSoup(page.content(), "html.parser"))
+
+        if JAVASCRIPT_DISABLED_MESSAGE not in parsed_html.cleaned_text:
+            return parsed_html.cleaned_text
+
+        logger.debug("JavaScript disabled message detected, checking iframes")
+        try:
+            if page.frame_locator("iframe").locator("html").count() > 0:
+                iframe_content = "\n".join(
+                    page.frame_locator("iframe").locator("html").all_inner_texts()
+                )
+
+                # Short page text is replaced by the iframe text; longer page
+                # text has the iframe text appended to it.
+                if len(parsed_html.cleaned_text) < 700:
+                    parsed_html.cleaned_text = iframe_content
+                else:
+                    parsed_html.cleaned_text += "\n" + iframe_content
+        except Exception as e:
+            logger.warning(f"Error processing iframes: {str(e)}")
+
+        return parsed_html.cleaned_text
+
+    except Exception as e:
+        logger.error(f"Error scraping URL {url}: {str(e)}")
+        return None
+
+    finally:
+        # Close the browser before stopping Playwright; failures during
+        # cleanup are logged at debug level and otherwise ignored.
+        if chromium:
+            try:
+                chromium.close()
+            except Exception as e:
+                logger.debug(f"Error closing browser: {str(e)}")
+        if pw:
+            try:
+                pw.stop()
+            except Exception as e:
+                logger.debug(f"Error stopping playwright: {str(e)}")
+
+
+def validate_url(url: str) -> None:
+    """
+    Check that a URL uses http(s) and names a host.
+
+    Args:
+        url: The URL to validate
+
+    Raises:
+        ValueError: If the scheme is not http/https or the hostname is missing
+    """
+    parts = urlparse(url)
+    if parts.scheme not in ("http", "https"):
+        raise ValueError("URL must be of scheme https?://")
+
+    if not parts.hostname:
+        raise ValueError("URL must include a hostname")
--- a/backend/tests/daily/connectors/highspot/test_highspot_connector.py
+++ b/backend/tests/daily/connectors/highspot/test_highspot_connector.py
@ -0,0 +1,98 @@
+import json
+import os
+import time
+from pathlib import Path
+
+import pytest
+
+from onyx.configs.constants import DocumentSource
+from onyx.connectors.highspot.connector import HighspotConnector
+from onyx.connectors.models import Document
+
+
+def load_test_data(file_name: str = "test_highspot_data.json") -> dict:
+    """
+    Load test data from a JSON file located next to this test module.
+
+    Args:
+        file_name: Name of the JSON file, resolved relative to this file's directory
+
+    Returns:
+        The parsed JSON content
+    """
+    data_path = Path(__file__).parent / file_name
+    # Decode explicitly as UTF-8 so the result does not depend on the platform locale
+    with open(data_path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+@pytest.fixture
+def highspot_connector() -> HighspotConnector:
+    """Build a Highspot connector whose credentials come from the environment."""
+    api_key = os.environ.get("HIGHSPOT_KEY")
+    api_secret = os.environ.get("HIGHSPOT_SECRET")
+    # Both values must be present and non-empty, otherwise the test fails outright
+    if not (api_key and api_secret):
+        pytest.fail("HIGHSPOT_KEY or HIGHSPOT_SECRET environment variables not set")
+
+    connector = HighspotConnector(
+        spot_names=["Test content"],  # Target one named spot rather than an empty list
+        batch_size=10,  # Reduced batch size for the test run
+    )
+    credentials = {
+        "highspot_key": api_key,
+        "highspot_secret": api_secret,
+        "highspot_url": os.environ.get(
+            "HIGHSPOT_URL", "https://api-su2.highspot.com/v1.0/"
+        ),
+    }
+    connector.load_credentials(credentials)
+    return connector
+
+
+def test_highspot_connector_basic(highspot_connector: HighspotConnector) -> None:
+    """Poll the connector and sanity-check the reference document when it is found."""
+    test_data = load_test_data()
+    target_test_doc_id = test_data.get("target_doc_id")
+    expected_doc_id = f"HIGHSPOT_{target_test_doc_id}"
+
+    # Flatten every batch yielded for the window [0, now]
+    all_docs: list[Document] = [
+        doc
+        for doc_batch in highspot_connector.poll_source(0, time.time())
+        for doc in doc_batch
+    ]
+
+    # If several documents carry the expected ID, the last one seen is used
+    matches = [doc for doc in all_docs if doc.id == expected_doc_id]
+    target_test_doc: Document | None = matches[-1] if matches else None
+
+    # At least one document must have been produced
+    assert len(all_docs) > 0
+
+    # NOTE(review): when no ID is configured or the document was not returned,
+    # the detailed checks below are skipped and the test still passes.
+    if not target_test_doc_id or target_test_doc is None:
+        return
+
+    assert target_test_doc.semantic_identifier == test_data.get("semantic_identifier")
+    assert target_test_doc.source == DocumentSource.HIGHSPOT
+    assert target_test_doc.metadata is not None
+
+    assert len(target_test_doc.sections) == 1
+    section = target_test_doc.sections[0]
+    assert section.link is not None
+    # Only presence is verified because the exact content might change
+    assert section.text is not None
+    assert len(section.text) > 0
+
+
+def test_highspot_connector_slim(highspot_connector: HighspotConnector) -> None:
+    """Check that slim retrieval covers every document returned by a full load."""
+    # Collect IDs from the full load first, then from the slim retrieval
+    full_batches = highspot_connector.load_from_state()
+    all_full_doc_ids = {doc.id for batch in full_batches for doc in batch}
+
+    slim_batches = highspot_connector.retrieve_all_slim_documents()
+    all_slim_doc_ids = {doc.id for batch in slim_batches for doc in batch}
+
+    # Every fully loaded ID must also appear in the slim listing
+    assert all_full_doc_ids.issubset(all_slim_doc_ids)
+    # The slim listing itself must not be empty
+    assert len(all_slim_doc_ids) > 0
+
+
+def test_highspot_connector_validate_credentials(
+    highspot_connector: HighspotConnector,
+) -> None:
+    """Verify that the connector reports its loaded credentials as valid."""
+    is_valid = highspot_connector.validate_credentials()
+    assert is_valid is True
--- a/backend/tests/daily/connectors/highspot/test_highspot_data.json
+++ b/backend/tests/daily/connectors/highspot/test_highspot_data.json
@ -0,0 +1,5 @@
+{
+    "target_doc_id": "67cd8eb35d3ee0487de2e704",
+    "semantic_identifier": "Highspot in Action _ Salesforce Integration",
+    "link": "https://www.highspot.com/items/67cd8eb35d3ee0487de2e704"
+}
--- a/web/public/Highspot.png
+++ b/web/public/Highspot.png
--- a/web/src/components/icons/icons.tsx
+++ b/web/src/components/icons/icons.tsx
@ -89,6 +89,7 @@ import cohereIcon from "../../../public/Cohere.svg";
 import voyageIcon from "../../../public/Voyage.png";
 import googleIcon from "../../../public/Google.webp";
 import xenforoIcon from "../../../public/Xenforo.svg";
+import highspotIcon from "../../../public/Highspot.png";
 import { FaGithub, FaRobot } from "react-icons/fa";

 import { cn } from "@/lib/utils";
@ -2912,6 +2913,13 @@ export const GitbookIcon = ({
  </div>
 );

+/**
+ * Renders the Highspot logo image through `LogoIcon`.
+ * Defaults to size 16 and the default Tailwind class string.
+ */
+export const HighspotIcon = ({
+  size = 16,
+  className = defaultTailwindCSS,
+}: IconProps) => (
+  <LogoIcon size={size} className={className} src={highspotIcon} />
+);
+
 export const PinnedIcon = ({
  size = 16,
  className = defaultTailwindCSS,
--- a/web/src/lib/connectors/connectors.tsx
+++ b/web/src/lib/connectors/connectors.tsx
@ -1249,6 +1249,47 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
    ],
    overrideDefaultFreq: 60 * 60 * 24,
  },
+  // Highspot: one optional tab selector ("highspot_scope") that switches between
+  // indexing explicitly named spots and indexing everything.
+  highspot: {
+    description: "Configure Highspot connector",
+    values: [
+      {
+        type: "tab",
+        name: "highspot_scope",
+        label: "What should we index from Highspot?",
+        optional: true,
+        tabs: [
+          {
+            // "Specific Spots" tab: a required list of spot names.
+            value: "spots",
+            label: "Specific Spots",
+            fields: [
+              {
+                type: "list",
+                query: "Enter the spot name(s):",
+                label: "Spot Name(s)",
+                name: "spot_names",
+                optional: false,
+                description: "For multiple spots, enter your spot one by one.",
+              },
+            ],
+          },
+          {
+            // "Everything" tab: a single "string_tab" field carrying only a
+            // description. NOTE(review): presumably display-only, leaving
+            // spot_names unset — confirm how "string_tab" is rendered/submitted.
+            value: "everything",
+            label: "Everything",
+            fields: [
+              {
+                type: "string_tab",
+                label: "Everything",
+                name: "everything",
+                description:
+                  "This connector will index all spots the provided credentials have access to!",
+              },
+            ],
+          },
+        ],
+      },
+    ],
+    // No advanced settings are defined for this connector.
+    advanced_values: [],
+  },
 };
 export function createConnectorInitialValues(
  connector: ConfigurableSources
--- a/web/src/lib/connectors/credentials.ts
+++ b/web/src/lib/connectors/credentials.ts
@ -226,6 +226,12 @@ export interface AirtableCredentialJson {
  airtable_access_token: string;
 }

+/** Credential fields collected for the Highspot connector. */
+export interface HighspotCredentialJson {
+  /**
+   * Highspot URL. NOTE(review): presumably the API base URL, e.g.
+   * "https://api-su2.highspot.com/v1.0/" — confirm the expected form.
+   */
+  highspot_url: string;
+  /** Highspot key. */
+  highspot_key: string;
+  /** Highspot secret. */
+  highspot_secret: string;
+}
+
 export const credentialTemplates: Record<ValidSources, any> = {
  github: { github_access_token: "" } as GithubCredentialJson,
  gitlab: {
@ -353,6 +359,11 @@ export const credentialTemplates: Record<ValidSources, any> = {
  gitbook: {
    gitbook_api_key: "",
  } as GitbookCredentialJson,
+  highspot: {
+    highspot_url: "",
+    highspot_key: "",
+    highspot_secret: "",
+  } as HighspotCredentialJson,
 };

 export const credentialDisplayNames: Record<string, string> = {
@ -488,6 +499,11 @@ export const credentialDisplayNames: Record<string, string> = {
  // GitBook
  gitbook_space_id: "GitBook Space ID",
  gitbook_api_key: "GitBook API Key",
+
+  // Highspot
+  highspot_url: "Highspot URL",
+  highspot_key: "Highspot Key",
+  highspot_secret: "Highspot Secret",
 };

 export function getDisplayNameForCredentialKey(key: string): string {
--- a/web/src/lib/sources.ts
+++ b/web/src/lib/sources.ts
@ -44,6 +44,7 @@ import {
  GlobeIcon2,
  FileIcon2,
  GitbookIcon,
+  HighspotIcon,
 } from "@/components/icons/icons";
 import { ValidSources } from "./types";
 import {
@ -329,6 +330,12 @@ export const SOURCE_METADATA_MAP: SourceMap = {
    category: SourceCategory.Wiki,
    docs: "https://docs.onyx.app/connectors/gitbook",
  },
+  highspot: {
+    icon: HighspotIcon,
+    displayName: "Highspot",
+    category: SourceCategory.Wiki,
+    docs: "https://docs.onyx.app/connectors/highspot",
+  },
  // currently used for the Internet Search tool docs, which is why
  // a globe is used
  not_applicable: {
--- a/web/src/lib/types.ts
+++ b/web/src/lib/types.ts
@ -390,6 +390,7 @@ export enum ValidSources {
  Egnyte = "egnyte",
  Airtable = "airtable",
  Gitbook = "gitbook",
+  Highspot = "highspot",
 }

 export const validAutoSyncSources = [