diff --git a/.github/workflows/pr-python-connector-tests.yml b/.github/workflows/pr-python-connector-tests.yml
index f7f3972b3..aa740aa8d 100644
--- a/.github/workflows/pr-python-connector-tests.yml
+++ b/.github/workflows/pr-python-connector-tests.yml
@@ -50,6 +50,9 @@ env:
   GITBOOK_API_KEY: ${{ secrets.GITBOOK_API_KEY }}
   # Notion
   NOTION_INTEGRATION_TOKEN: ${{ secrets.NOTION_INTEGRATION_TOKEN }}
+  # Highspot
+  HIGHSPOT_KEY: ${{ secrets.HIGHSPOT_KEY }}
+  HIGHSPOT_SECRET: ${{ secrets.HIGHSPOT_SECRET }}

 jobs:
   connectors-check:
diff --git a/backend/onyx/configs/constants.py b/backend/onyx/configs/constants.py
index 2d7445362..cbcc71245 100644
--- a/backend/onyx/configs/constants.py
+++ b/backend/onyx/configs/constants.py
@@ -174,6 +174,7 @@ class DocumentSource(str, Enum):
     FIREFLIES = "fireflies"
     EGNYTE = "egnyte"
     AIRTABLE = "airtable"
+    HIGHSPOT = "highspot"

     # Special case just for integration tests
     MOCK_CONNECTOR = "mock_connector"
diff --git a/backend/onyx/connectors/factory.py b/backend/onyx/connectors/factory.py
index 73593cc60..2f0b10743 100644
--- a/backend/onyx/connectors/factory.py
+++ b/backend/onyx/connectors/factory.py
@@ -30,6 +30,7 @@ from onyx.connectors.gong.connector import GongConnector
 from onyx.connectors.google_drive.connector import GoogleDriveConnector
 from onyx.connectors.google_site.connector import GoogleSitesConnector
 from onyx.connectors.guru.connector import GuruConnector
+from onyx.connectors.highspot.connector import HighspotConnector
 from onyx.connectors.hubspot.connector import HubSpotConnector
 from onyx.connectors.interfaces import BaseConnector
 from onyx.connectors.interfaces import CheckpointConnector
@@ -117,6 +118,7 @@ def identify_connector_class(
         DocumentSource.FIREFLIES: FirefliesConnector,
         DocumentSource.EGNYTE: EgnyteConnector,
         DocumentSource.AIRTABLE: AirtableConnector,
+        DocumentSource.HIGHSPOT: HighspotConnector,
         # just for integration tests
         DocumentSource.MOCK_CONNECTOR: MockConnector,
     }
diff --git a/backend/onyx/connectors/highspot/__init__.py b/backend/onyx/connectors/highspot/__init__.py
new file mode 100644
index 000000000..df94b5875
--- /dev/null
+++ b/backend/onyx/connectors/highspot/__init__.py
@@ -0,0 +1,4 @@
+"""
+Highspot connector package for Onyx.
+Enables integration with Highspot's knowledge base.
+""" diff --git a/backend/onyx/connectors/highspot/client.py b/backend/onyx/connectors/highspot/client.py new file mode 100644 index 000000000..7879e6e79 --- /dev/null +++ b/backend/onyx/connectors/highspot/client.py @@ -0,0 +1,280 @@ +import base64 +from typing import Any +from typing import Dict +from typing import List +from typing import Optional +from urllib.parse import urljoin + +import requests +from requests.adapters import HTTPAdapter +from requests.exceptions import HTTPError +from requests.exceptions import RequestException +from requests.exceptions import Timeout +from urllib3.util.retry import Retry + +from onyx.utils.logger import setup_logger + +logger = setup_logger() + + +class HighspotClientError(Exception): + """Base exception for Highspot API client errors.""" + + def __init__(self, message: str, status_code: Optional[int] = None): + self.message = message + self.status_code = status_code + super().__init__(self.message) + + +class HighspotAuthenticationError(HighspotClientError): + """Exception raised for authentication errors.""" + + +class HighspotRateLimitError(HighspotClientError): + """Exception raised when rate limit is exceeded.""" + + def __init__(self, message: str, retry_after: Optional[str] = None): + self.retry_after = retry_after + super().__init__(message) + + +class HighspotClient: + """ + Client for interacting with the Highspot API. + + Uses basic authentication with provided key (username) and secret (password). + Implements retry logic, error handling, and connection pooling. + """ + + BASE_URL = "https://api-su2.highspot.com/v1.0/" + + def __init__( + self, + key: str, + secret: str, + base_url: str = BASE_URL, + timeout: int = 30, + max_retries: int = 3, + backoff_factor: float = 0.5, + status_forcelist: Optional[List[int]] = None, + ): + """ + Initialize the Highspot API client. + + Args: + key: API key (used as username) + secret: API secret (used as password) + base_url: Base URL for the Highspot API + timeout: Request timeout in seconds + max_retries: Maximum number of retries for failed requests + backoff_factor: Backoff factor for retries + status_forcelist: HTTP status codes to retry on + """ + if not key or not secret: + raise ValueError("API key and secret are required") + + self.key = key + self.secret = secret + self.base_url = base_url + self.timeout = timeout + + # Set up session with retry logic + self.session = requests.Session() + retry_strategy = Retry( + total=max_retries, + backoff_factor=backoff_factor, + status_forcelist=status_forcelist or [429, 500, 502, 503, 504], + allowed_methods=["GET", "POST", "PUT", "DELETE"], + ) + adapter = HTTPAdapter(max_retries=retry_strategy) + self.session.mount("http://", adapter) + self.session.mount("https://", adapter) + + # Set up authentication + self._setup_auth() + + def _setup_auth(self) -> None: + """Set up basic authentication for the session.""" + auth = f"{self.key}:{self.secret}" + encoded_auth = base64.b64encode(auth.encode()).decode() + self.session.headers.update( + { + "Authorization": f"Basic {encoded_auth}", + "Content-Type": "application/json", + "Accept": "application/json", + } + ) + + def _make_request( + self, + method: str, + endpoint: str, + params: Optional[Dict[str, Any]] = None, + data: Optional[Dict[str, Any]] = None, + json_data: Optional[Dict[str, Any]] = None, + headers: Optional[Dict[str, str]] = None, + ) -> Dict[str, Any]: + """ + Make a request to the Highspot API. + + Args: + method: HTTP method (GET, POST, etc.) 
+            endpoint: API endpoint
+            params: URL parameters
+            data: Form data
+            json_data: JSON data
+            headers: Additional headers
+
+        Returns:
+            API response as a dictionary
+
+        Raises:
+            HighspotClientError: On API errors
+            HighspotAuthenticationError: On authentication errors
+            HighspotRateLimitError: On rate limiting
+            requests.exceptions.RequestException: On request failures
+        """
+        url = urljoin(self.base_url, endpoint)
+        request_headers = {}
+        if headers:
+            request_headers.update(headers)
+
+        try:
+            logger.debug(f"Making {method} request to {url}")
+            response = self.session.request(
+                method=method,
+                url=url,
+                params=params,
+                data=data,
+                json=json_data,
+                headers=request_headers,
+                timeout=self.timeout,
+            )
+            response.raise_for_status()
+
+            if response.content and response.content.strip():
+                return response.json()
+            return {}
+
+        except HTTPError as e:
+            status_code = e.response.status_code
+            error_msg = str(e)
+
+            try:
+                error_data = e.response.json()
+                if isinstance(error_data, dict):
+                    error_msg = error_data.get("message", str(e))
+            except (ValueError, KeyError):
+                pass
+
+            if status_code == 401:
+                raise HighspotAuthenticationError(f"Authentication failed: {error_msg}")
+            elif status_code == 429:
+                retry_after = e.response.headers.get("Retry-After")
+                raise HighspotRateLimitError(
+                    f"Rate limit exceeded: {error_msg}", retry_after=retry_after
+                )
+            else:
+                raise HighspotClientError(
+                    f"API error {status_code}: {error_msg}", status_code=status_code
+                )
+
+        except Timeout:
+            raise HighspotClientError("Request timed out")
+        except RequestException as e:
+            raise HighspotClientError(f"Request failed: {str(e)}")
+
+    def get_spots(self) -> List[Dict[str, Any]]:
+        """
+        Get all available spots.
+
+        Returns:
+            List of spots with their names and IDs
+        """
+        params = {"right": "view"}
+        response = self._make_request("GET", "spots", params=params)
+        total_counts = response.get("counts_total")
+        logger.info(f"Received {total_counts} spots")
+        # counts_total may be None, so check it explicitly before comparing
+        if total_counts is not None and total_counts > 0:
+            return response.get("collection", [])
+        return []
+
+    def get_spot(self, spot_id: str) -> Dict[str, Any]:
+        """
+        Get details for a specific spot.
+
+        Args:
+            spot_id: ID of the spot
+
+        Returns:
+            Spot details
+        """
+        if not spot_id:
+            raise ValueError("spot_id is required")
+        return self._make_request("GET", f"spots/{spot_id}")
+
+    def get_spot_items(
+        self, spot_id: str, offset: int = 0, page_size: int = 100
+    ) -> Dict[str, Any]:
+        """
+        Get items in a specific spot.
+
+        Args:
+            spot_id: ID of the spot
+            offset: Pagination offset
+            page_size: Number of items per page
+
+        Returns:
+            Items in the spot
+        """
+        if not spot_id:
+            raise ValueError("spot_id is required")
+
+        params = {"spot": spot_id, "start": offset, "limit": page_size}
+        return self._make_request("GET", "items", params=params)
+
+    def get_item(self, item_id: str) -> Dict[str, Any]:
+        """
+        Get details for a specific item.
+
+        Args:
+            item_id: ID of the item
+
+        Returns:
+            Item details
+        """
+        if not item_id:
+            raise ValueError("item_id is required")
+        return self._make_request("GET", f"items/{item_id}")
+
+    def get_item_content(self, item_id: str) -> bytes:
+        """
+        Get the raw content of an item.
+
+        Args:
+            item_id: ID of the item
+
+        Returns:
+            Raw content bytes
+        """
+        if not item_id:
+            raise ValueError("item_id is required")
+
+        url = urljoin(self.base_url, f"items/{item_id}/content")
+        response = self.session.get(url, timeout=self.timeout)
+        response.raise_for_status()
+        return response.content
+
+    def health_check(self) -> bool:
+        """
+        Check if the API is accessible and credentials are valid.
+
+        Returns:
+            True if API is accessible, False otherwise
+        """
+        try:
+            self._make_request("GET", "spots", params={"limit": 1})
+            return True
+        except (HighspotClientError, HighspotAuthenticationError):
+            return False
diff --git a/backend/onyx/connectors/highspot/connector.py b/backend/onyx/connectors/highspot/connector.py
new file mode 100644
index 000000000..380d878a5
--- /dev/null
+++ b/backend/onyx/connectors/highspot/connector.py
@@ -0,0 +1,431 @@
+from datetime import datetime
+from io import BytesIO
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Optional
+
+from onyx.configs.app_configs import INDEX_BATCH_SIZE
+from onyx.configs.constants import DocumentSource
+from onyx.connectors.highspot.client import HighspotClient
+from onyx.connectors.highspot.client import HighspotClientError
+from onyx.connectors.highspot.utils import scrape_url_content
+from onyx.connectors.interfaces import GenerateDocumentsOutput
+from onyx.connectors.interfaces import GenerateSlimDocumentOutput
+from onyx.connectors.interfaces import LoadConnector
+from onyx.connectors.interfaces import PollConnector
+from onyx.connectors.interfaces import SecondsSinceUnixEpoch
+from onyx.connectors.interfaces import SlimConnector
+from onyx.connectors.models import ConnectorMissingCredentialError
+from onyx.connectors.models import Document
+from onyx.connectors.models import SlimDocument
+from onyx.connectors.models import TextSection
+from onyx.file_processing.extract_file_text import extract_file_text
+from onyx.file_processing.extract_file_text import VALID_FILE_EXTENSIONS
+from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+_SLIM_BATCH_SIZE = 1000
+
+
+class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
+    """
+    Connector for loading data from Highspot.
+
+    Retrieves content from specified spots using the Highspot API.
+    If no spots are specified, retrieves content from all available spots.
+    """
+
+    def __init__(
+        self,
+        spot_names: List[str] = [],
+        batch_size: int = INDEX_BATCH_SIZE,
+    ):
+        """
+        Initialize the Highspot connector.
+
+        Args:
+            spot_names: List of spot names to retrieve content from (if empty, gets all spots)
+            batch_size: Number of items to retrieve in each batch
+        """
+        self.spot_names = spot_names
+        self.batch_size = batch_size
+        self._client: Optional[HighspotClient] = None
+        self._spot_id_map: Dict[str, str] = {}  # Maps spot names to spot IDs
+        self._all_spots_fetched = False
+        self.highspot_url: Optional[str] = None
+        self.key: Optional[str] = None
+        self.secret: Optional[str] = None
+
+    @property
+    def client(self) -> HighspotClient:
+        if self._client is None:
+            if not self.key or not self.secret:
+                raise ConnectorMissingCredentialError("Highspot")
+            # Ensure highspot_url is a string, use default if None
+            base_url = (
+                self.highspot_url
+                if self.highspot_url is not None
+                else HighspotClient.BASE_URL
+            )
+            self._client = HighspotClient(self.key, self.secret, base_url=base_url)
+        return self._client
+
+    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
+        logger.info("Loading Highspot credentials")
+        self.highspot_url = credentials.get("highspot_url")
+        self.key = credentials.get("highspot_key")
+        self.secret = credentials.get("highspot_secret")
+        return None
+
+    def _populate_spot_id_map(self) -> None:
+        """
+        Populate the spot ID map with all available spots.
+        Keys are stored as lowercase for case-insensitive lookups.
+        """
+        spots = self.client.get_spots()
+        for spot in spots:
+            if "title" in spot and "id" in spot:
+                spot_name = spot["title"]
+                self._spot_id_map[spot_name.lower()] = spot["id"]
+
+        self._all_spots_fetched = True
+        logger.info(f"Retrieved {len(self._spot_id_map)} spots from Highspot")
+
+    def _get_all_spot_names(self) -> List[str]:
+        """
+        Retrieve all available spot names.
+
+        Returns:
+            List of all spot names
+        """
+        if not self._all_spots_fetched:
+            self._populate_spot_id_map()
+
+        return [spot_name for spot_name in self._spot_id_map.keys()]
+
+    def _get_spot_id_from_name(self, spot_name: str) -> str:
+        """
+        Get spot ID from a spot name.
+
+        Args:
+            spot_name: Name of the spot
+
+        Returns:
+            ID of the spot
+
+        Raises:
+            ValueError: If spot name is not found
+        """
+        if not self._all_spots_fetched:
+            self._populate_spot_id_map()
+
+        spot_name_lower = spot_name.lower()
+        if spot_name_lower not in self._spot_id_map:
+            raise ValueError(f"Spot '{spot_name}' not found")
+
+        return self._spot_id_map[spot_name_lower]
+
+    def load_from_state(self) -> GenerateDocumentsOutput:
+        """
+        Load content from configured spots in Highspot.
+        If no spots are configured, loads from all spots.
+
+        Yields:
+            Batches of Document objects
+        """
+        return self.poll_source(None, None)
+
+    def poll_source(
+        self, start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None
+    ) -> GenerateDocumentsOutput:
+        """
+        Poll Highspot for content updated since the start time.
+
+        Args:
+            start: Start time as seconds since Unix epoch
+            end: End time as seconds since Unix epoch
+
+        Yields:
+            Batches of Document objects
+        """
+        doc_batch: list[Document] = []
+
+        # If no spots specified, get all spots
+        spot_names_to_process = self.spot_names
+        if not spot_names_to_process:
+            spot_names_to_process = self._get_all_spot_names()
+            logger.info(
+                f"No spots specified, using all {len(spot_names_to_process)} available spots"
+            )
+
+        for spot_name in spot_names_to_process:
+            try:
+                spot_id = self._get_spot_id_from_name(spot_name)
+                if spot_id is None:
+                    logger.warning(f"Spot ID not found for spot {spot_name}")
+                    continue
+                offset = 0
+                has_more = True
+
+                while has_more:
+                    logger.info(
+                        f"Retrieving items from spot {spot_name}, offset {offset}"
+                    )
+                    response = self.client.get_spot_items(
+                        spot_id=spot_id, offset=offset, page_size=self.batch_size
+                    )
+                    items = response.get("collection", [])
+                    logger.info(f"Received {len(items)} items from spot {spot_name}")
+                    if not items:
+                        has_more = False
+                        continue
+
+                    for item in items:
+                        try:
+                            item_id = item.get("id")
+                            if not item_id:
+                                logger.warning("Item without ID found, skipping")
+                                continue
+
+                            item_details = self.client.get_item(item_id)
+                            if not item_details:
+                                logger.warning(
+                                    f"Item {item_id} details not found, skipping"
+                                )
+                                continue
+                            # Apply time filter if specified
+                            if start or end:
+                                updated_at = item_details.get("date_updated")
+                                if updated_at:
+                                    # Convert to datetime for comparison
+                                    try:
+                                        updated_time = datetime.fromisoformat(
+                                            updated_at.replace("Z", "+00:00")
+                                        )
+                                        if (
+                                            start and updated_time.timestamp() < start
+                                        ) or (end and updated_time.timestamp() > end):
+                                            continue
+                                    except (ValueError, TypeError):
+                                        # Skip if date cannot be parsed
+                                        logger.warning(
+                                            f"Invalid date format for item {item_id}: {updated_at}"
+                                        )
+                                        continue
+
+                            content = self._get_item_content(item_details)
+                            title = item_details.get("title", "")
+
+                            doc_batch.append(
+                                Document(
+                                    id=f"HIGHSPOT_{item_id}",
+                                    sections=[
+                                        TextSection(
+                                            link=item_details.get(
+                                                "url",
+                                                f"https://www.highspot.com/items/{item_id}",
+                                            ),
+                                            text=content,
+                                        )
+                                    ],
+                                    source=DocumentSource.HIGHSPOT,
+                                    semantic_identifier=title,
+                                    metadata={
+                                        "spot_name": spot_name,
+                                        "type": item_details.get("content_type", ""),
+                                        "created_at": item_details.get(
+                                            "date_added", ""
+                                        ),
+                                        "author": item_details.get("author", ""),
+                                        "language": item_details.get("language", ""),
+                                        "can_download": str(
+                                            item_details.get("can_download", False)
+                                        ),
+                                    },
+                                    doc_updated_at=item_details.get("date_updated"),
+                                )
+                            )
+
+                            if len(doc_batch) >= self.batch_size:
+                                yield doc_batch
+                                doc_batch = []
+
+                        except HighspotClientError as e:
+                            logger.error(
+                                f"Error retrieving item {item_id or 'unknown'}: {str(e)}"
+                            )
+
+                    has_more = len(items) >= self.batch_size
+                    offset += self.batch_size
+
+            except (HighspotClientError, ValueError) as e:
+                logger.error(f"Error processing spot {spot_name}: {str(e)}")
+
+        if doc_batch:
+            yield doc_batch
+
+    def _get_item_content(self, item_details: Dict[str, Any]) -> str:
+        """
+        Get the text content of an item.
+
+        Args:
+            item_details: Item details from the API
+
+        Returns:
+            Text content of the item
+        """
+        item_id = item_details.get("id", "")
+        content_name = item_details.get("content_name", "")
+        is_valid_format = content_name and "." in content_name
+        file_extension = content_name.split(".")[-1].lower() if is_valid_format else ""
+        file_extension = f".{file_extension}" if file_extension else ""
+        can_download = item_details.get("can_download", False)
+        content_type = item_details.get("content_type", "")
+
+        # Extract title and description once at the beginning
+        title, description = self._extract_title_and_description(item_details)
+        default_content = f"{title}\n{description}"
+        logger.info(f"Processing item {item_id} with extension {file_extension}")
+
+        try:
+            if content_type == "WebLink":
+                url = item_details.get("url")
+                if not url:
+                    return default_content
+                content = scrape_url_content(url, True)
+                return content if content else default_content
+
+            elif (
+                is_valid_format
+                and file_extension in VALID_FILE_EXTENSIONS
+                and can_download
+            ):
+                # For documents, try to get the text content
+                if not item_id:  # Ensure item_id is defined
+                    return default_content
+
+                content_response = self.client.get_item_content(item_id)
+                # Process and extract text from binary content based on type
+                if content_response:
+                    text_content = extract_file_text(
+                        BytesIO(content_response), content_name
+                    )
+                    return text_content
+                return default_content
+
+            else:
+                return default_content
+
+        except HighspotClientError as e:
+            # Use item_id safely in the warning message
+            error_context = f"item {item_id}" if item_id else "item"
+            logger.warning(f"Could not retrieve content for {error_context}: {str(e)}")
+            return ""
+
+    def _extract_title_and_description(
+        self, item_details: Dict[str, Any]
+    ) -> tuple[str, str]:
+        """
+        Extract the title and description from item details.
+
+        Args:
+            item_details: Item details from the API
+
+        Returns:
+            Tuple of title and description
+        """
+        title = item_details.get("title", "")
+        description = item_details.get("description", "")
+        return title, description
+
+    def retrieve_all_slim_documents(
+        self,
+        start: SecondsSinceUnixEpoch | None = None,
+        end: SecondsSinceUnixEpoch | None = None,
+        callback: IndexingHeartbeatInterface | None = None,
+    ) -> GenerateSlimDocumentOutput:
+        """
+        Retrieve all document IDs from the configured spots.
+        If no spots are configured, retrieves from all spots.
+
+        Args:
+            start: Optional start time filter
+            end: Optional end time filter
+            callback: Optional indexing heartbeat callback
+
+        Yields:
+            Batches of SlimDocument objects
+        """
+        slim_doc_batch: list[SlimDocument] = []
+
+        # If no spots specified, get all spots
+        spot_names_to_process = self.spot_names
+        if not spot_names_to_process:
+            spot_names_to_process = self._get_all_spot_names()
+            logger.info(
+                f"No spots specified, using all {len(spot_names_to_process)} available spots for slim documents"
+            )
+
+        for spot_name in spot_names_to_process:
+            try:
+                spot_id = self._get_spot_id_from_name(spot_name)
+                offset = 0
+                has_more = True
+
+                while has_more:
+                    logger.info(
+                        f"Retrieving slim documents from spot {spot_name}, offset {offset}"
+                    )
+                    response = self.client.get_spot_items(
+                        spot_id=spot_id, offset=offset, page_size=self.batch_size
+                    )
+
+                    items = response.get("collection", [])
+                    if not items:
+                        has_more = False
+                        continue
+
+                    for item in items:
+                        item_id = item.get("id")
+                        if not item_id:
+                            continue
+
+                        slim_doc_batch.append(SlimDocument(id=f"HIGHSPOT_{item_id}"))
+
+                        if len(slim_doc_batch) >= _SLIM_BATCH_SIZE:
+                            yield slim_doc_batch
+                            slim_doc_batch = []
+
+                    has_more = len(items) >= self.batch_size
+                    offset += self.batch_size
+
+            except (HighspotClientError, ValueError) as e:
+                logger.error(
+                    f"Error retrieving slim documents from spot {spot_name}: {str(e)}"
+                )
+
+        if slim_doc_batch:
+            yield slim_doc_batch
+
+    def validate_credentials(self) -> bool:
+        """
+        Validate that the provided credentials can access the Highspot API.
+
+        Returns:
+            True if credentials are valid, False otherwise
+        """
+        try:
+            return self.client.health_check()
+        except Exception as e:
+            logger.error(f"Failed to validate credentials: {str(e)}")
+            return False
+
+
+if __name__ == "__main__":
+    spot_names: List[str] = []
+    connector = HighspotConnector(spot_names)
+    credentials = {"highspot_key": "", "highspot_secret": ""}
+    connector.load_credentials(credentials=credentials)
+    for doc in connector.load_from_state():
+        print(doc)
diff --git a/backend/onyx/connectors/highspot/utils.py b/backend/onyx/connectors/highspot/utils.py
new file mode 100644
index 000000000..efc00dac1
--- /dev/null
+++ b/backend/onyx/connectors/highspot/utils.py
@@ -0,0 +1,122 @@
+from typing import Optional
+from urllib.parse import urlparse
+
+from bs4 import BeautifulSoup
+from playwright.sync_api import sync_playwright
+
+from onyx.file_processing.html_utils import web_html_cleanup
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+# Constants
+WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS = 20
+JAVASCRIPT_DISABLED_MESSAGE = "You have JavaScript disabled in your browser"
+DEFAULT_TIMEOUT = 60000  # 60 seconds
+
+
+def scrape_url_content(
+    url: str, scroll_before_scraping: bool = False, timeout_ms: int = DEFAULT_TIMEOUT
+) -> Optional[str]:
+    """
+    Scrapes content from a given URL and returns the cleaned text.
+
+    Args:
+        url: The URL to scrape
+        scroll_before_scraping: Whether to scroll through the page to load lazy content
+        timeout_ms: Timeout in milliseconds for page navigation and loading
+
+    Returns:
+        The cleaned text content of the page, or None if scraping fails
+    """
+    playwright = None
+    browser = None
+    try:
+        validate_url(url)
+        playwright = sync_playwright().start()
+        browser = playwright.chromium.launch(headless=True)
+        context = browser.new_context()
+        page = context.new_page()
+
+        logger.info(f"Navigating to URL: {url}")
+        try:
+            page.goto(url, timeout=timeout_ms)
+        except Exception as e:
+            logger.error(f"Failed to navigate to {url}: {str(e)}")
+            return None
+
+        if scroll_before_scraping:
+            logger.debug("Scrolling page to load lazy content")
+            scroll_attempts = 0
+            previous_height = page.evaluate("document.body.scrollHeight")
+            while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
+                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+                try:
+                    page.wait_for_load_state("networkidle", timeout=timeout_ms)
+                except Exception as e:
+                    logger.warning(f"Network idle wait timed out: {str(e)}")
+                    break
+
+                new_height = page.evaluate("document.body.scrollHeight")
+                if new_height == previous_height:
+                    break
+                previous_height = new_height
+                scroll_attempts += 1
+
+        content = page.content()
+        soup = BeautifulSoup(content, "html.parser")
+
+        parsed_html = web_html_cleanup(soup)
+
+        if JAVASCRIPT_DISABLED_MESSAGE in parsed_html.cleaned_text:
+            logger.debug("JavaScript disabled message detected, checking iframes")
+            try:
+                iframe_count = page.frame_locator("iframe").locator("html").count()
+                if iframe_count > 0:
+                    iframe_texts = (
+                        page.frame_locator("iframe").locator("html").all_inner_texts()
+                    )
+                    iframe_content = "\n".join(iframe_texts)
+
+                    if len(parsed_html.cleaned_text) < 700:
+                        parsed_html.cleaned_text = iframe_content
+                    else:
+                        parsed_html.cleaned_text += "\n" + iframe_content
+            except Exception as e:
+                logger.warning(f"Error processing iframes: {str(e)}")
+
+        return parsed_html.cleaned_text
+
+    except Exception as e:
+        logger.error(f"Error scraping URL {url}: {str(e)}")
+        return None
+
+    finally:
+        if browser:
+            try:
+                browser.close()
+            except Exception as e:
+                logger.debug(f"Error closing browser: {str(e)}")
+        if playwright:
+            try:
+                playwright.stop()
+            except Exception as e:
+                logger.debug(f"Error stopping playwright: {str(e)}")
+
+
+def validate_url(url: str) -> None:
+    """
+    Validates that a URL is properly formatted.
+
+    Args:
+        url: The URL to validate
+
+    Raises:
+        ValueError: If URL is not valid
+    """
+    parse = urlparse(url)
+    if parse.scheme != "http" and parse.scheme != "https":
+        raise ValueError("URL scheme must be http or https")
+
+    if not parse.hostname:
+        raise ValueError("URL must include a hostname")
diff --git a/backend/tests/daily/connectors/highspot/test_highspot_connector.py b/backend/tests/daily/connectors/highspot/test_highspot_connector.py
new file mode 100644
index 000000000..6da06b58c
--- /dev/null
+++ b/backend/tests/daily/connectors/highspot/test_highspot_connector.py
@@ -0,0 +1,98 @@
+import json
+import os
+import time
+from pathlib import Path
+
+import pytest
+
+from onyx.configs.constants import DocumentSource
+from onyx.connectors.highspot.connector import HighspotConnector
+from onyx.connectors.models import Document
+
+
+def load_test_data(file_name: str = "test_highspot_data.json") -> dict:
+    """Load test data from JSON file."""
+    current_dir = Path(__file__).parent
+    with open(current_dir / file_name, "r") as f:
+        return json.load(f)
+
+
+@pytest.fixture
+def highspot_connector() -> HighspotConnector:
+    """Create a Highspot connector with credentials from environment variables."""
+    # Check if required environment variables are set
+    if not os.environ.get("HIGHSPOT_KEY") or not os.environ.get("HIGHSPOT_SECRET"):
+        pytest.fail("HIGHSPOT_KEY or HIGHSPOT_SECRET environment variables not set")
+
+    connector = HighspotConnector(
+        spot_names=["Test content"],  # Use specific spot name instead of empty list
+        batch_size=10,  # Smaller batch size for testing
+    )
+    connector.load_credentials(
+        {
+            "highspot_key": os.environ["HIGHSPOT_KEY"],
+            "highspot_secret": os.environ["HIGHSPOT_SECRET"],
+            "highspot_url": os.environ.get(
+                "HIGHSPOT_URL", "https://api-su2.highspot.com/v1.0/"
+            ),
+        }
+    )
+    return connector
+
+
+def test_highspot_connector_basic(highspot_connector: HighspotConnector) -> None:
+    """Test basic functionality of the Highspot connector."""
+    all_docs: list[Document] = []
+    test_data = load_test_data()
+    target_test_doc_id = test_data.get("target_doc_id")
+    target_test_doc: Document | None = None
+
+    # Test loading documents
+    for doc_batch in highspot_connector.poll_source(0, time.time()):
+        for doc in doc_batch:
+            all_docs.append(doc)
+            if doc.id == f"HIGHSPOT_{target_test_doc_id}":
+                target_test_doc = doc
+
+    # Verify documents were loaded
+    assert len(all_docs) > 0
+
+    # If we have a specific test document ID, validate it
+    if target_test_doc_id and target_test_doc is not None:
+        assert target_test_doc.semantic_identifier == test_data.get(
+            "semantic_identifier"
+        )
+        assert target_test_doc.source == DocumentSource.HIGHSPOT
+        assert target_test_doc.metadata is not None
+
+        assert len(target_test_doc.sections) == 1
+        section = target_test_doc.sections[0]
+        assert section.link is not None
+        # Only check that content exists, as the exact content might change
+        assert section.text is not None
+        assert len(section.text) > 0
+
+
+def test_highspot_connector_slim(highspot_connector: HighspotConnector) -> None:
+    """Test slim document retrieval."""
+    # Get all doc IDs from the full connector
+    all_full_doc_ids = set()
+    for doc_batch in highspot_connector.load_from_state():
+        all_full_doc_ids.update([doc.id for doc in doc_batch])
+
+    # Get all doc IDs from the slim connector
+    all_slim_doc_ids = set()
+    for slim_doc_batch in highspot_connector.retrieve_all_slim_documents():
+        all_slim_doc_ids.update([doc.id for doc in slim_doc_batch])
+
+    # The set of full doc IDs should be a subset of the slim doc IDs
+    assert all_full_doc_ids.issubset(all_slim_doc_ids)
+    # Make sure we actually got some documents
+    assert len(all_slim_doc_ids) > 0
+
+
+def test_highspot_connector_validate_credentials(
+    highspot_connector: HighspotConnector,
+) -> None:
+    """Test credential validation."""
+    assert highspot_connector.validate_credentials() is True
diff --git a/backend/tests/daily/connectors/highspot/test_highspot_data.json b/backend/tests/daily/connectors/highspot/test_highspot_data.json
new file mode 100644
index 000000000..d796b3d60
--- /dev/null
+++ b/backend/tests/daily/connectors/highspot/test_highspot_data.json
@@ -0,0 +1,5 @@
+{
+  "target_doc_id": "67cd8eb35d3ee0487de2e704",
+  "semantic_identifier": "Highspot in Action _ Salesforce Integration",
+  "link": "https://www.highspot.com/items/67cd8eb35d3ee0487de2e704"
+}
diff --git a/web/public/Highspot.png b/web/public/Highspot.png
new file mode 100644
index 000000000..7bf874f5b
Binary files /dev/null and b/web/public/Highspot.png differ
diff --git a/web/src/components/icons/icons.tsx b/web/src/components/icons/icons.tsx
index 55d2ea648..e7f0a7390 100644
--- a/web/src/components/icons/icons.tsx
+++ b/web/src/components/icons/icons.tsx
@@ -89,6 +89,7 @@ import cohereIcon from "../../../public/Cohere.svg";
 import voyageIcon from "../../../public/Voyage.png";
 import googleIcon from "../../../public/Google.webp";
 import xenforoIcon from "../../../public/Xenforo.svg";
+import highspotIcon from "../../../public/Highspot.png";
 import { FaGithub, FaRobot } from "react-icons/fa";
 import { cn } from "@/lib/utils";

@@ -2912,6 +2913,13 @@ export const GitbookIcon = ({
   );
 };

+export const HighspotIcon = ({
+  size = 16,
+  className = defaultTailwindCSS,
+}: IconProps) => {
+  return <LogoIcon size={size} className={className} src={highspotIcon} />;
+};
+
 export const PinnedIcon = ({
   size = 16,
   className = defaultTailwindCSS,
diff --git a/web/src/lib/connectors/connectors.tsx b/web/src/lib/connectors/connectors.tsx
index e26631864..f644bc6a4 100644
--- a/web/src/lib/connectors/connectors.tsx
+++ b/web/src/lib/connectors/connectors.tsx
@@ -1249,6 +1249,47 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
     ],
     overrideDefaultFreq: 60 * 60 * 24,
   },
+  highspot: {
+    description: "Configure Highspot connector",
+    values: [
+      {
+        type: "tab",
+        name: "highspot_scope",
+        label: "What should we index from Highspot?",
+        optional: true,
+        tabs: [
+          {
+            value: "spots",
+            label: "Specific Spots",
+            fields: [
+              {
+                type: "list",
+                query: "Enter the spot name(s):",
+                label: "Spot Name(s)",
+                name: "spot_names",
+                optional: false,
+                description: "For multiple spots, enter each spot name one by one.",
+              },
+            ],
+          },
+          {
+            value: "everything",
+            label: "Everything",
+            fields: [
+              {
+                type: "string_tab",
+                label: "Everything",
+                name: "everything",
+                description:
+                  "This connector will index all spots the provided credentials have access to!",
+              },
+            ],
+          },
+        ],
+      },
+    ],
+    advanced_values: [],
+  },
 };

 export function createConnectorInitialValues(
   connector: ConfigurableSources
diff --git a/web/src/lib/connectors/credentials.ts b/web/src/lib/connectors/credentials.ts
index ac35beb29..0b34d47cc 100644
--- a/web/src/lib/connectors/credentials.ts
+++ b/web/src/lib/connectors/credentials.ts
@@ -226,6 +226,12 @@ export interface AirtableCredentialJson {
   airtable_access_token: string;
 }

+export interface HighspotCredentialJson {
+  highspot_url: string;
+  highspot_key: string;
+  highspot_secret: string;
+}
+
 export const credentialTemplates: Record<ValidSources, any> = {
   github: { github_access_token: "" } as GithubCredentialJson,
   gitlab: {
@@ -353,6 +359,11 @@ export const credentialTemplates: Record<ValidSources, any> = {
   gitbook: {
     gitbook_api_key: "",
   } as GitbookCredentialJson,
+  highspot: {
+    highspot_url: "",
+    highspot_key: "",
+    highspot_secret: "",
+  } as HighspotCredentialJson,
 };

 export const credentialDisplayNames: Record<string, string> = {
@@ -488,6 +499,11 @@ export const credentialDisplayNames: Record<string, string> = {
   // GitBook
   gitbook_space_id: "GitBook Space ID",
   gitbook_api_key: "GitBook API Key",
+
+  // Highspot
+  highspot_url: "Highspot URL",
+  highspot_key: "Highspot Key",
+  highspot_secret: "Highspot Secret",
 };

 export function getDisplayNameForCredentialKey(key: string): string {
diff --git a/web/src/lib/sources.ts b/web/src/lib/sources.ts
index 7a3341256..6a1bd0ce5 100644
--- a/web/src/lib/sources.ts
+++ b/web/src/lib/sources.ts
@@ -44,6 +44,7 @@ import {
   GlobeIcon2,
   FileIcon2,
   GitbookIcon,
+  HighspotIcon,
 } from "@/components/icons/icons";
 import { ValidSources } from "./types";
 import {
@@ -329,6 +330,12 @@ export const SOURCE_METADATA_MAP: SourceMap = {
     category: SourceCategory.Wiki,
     docs: "https://docs.onyx.app/connectors/gitbook",
   },
+  highspot: {
+    icon: HighspotIcon,
+    displayName: "Highspot",
+    category: SourceCategory.Wiki,
+    docs: "https://docs.onyx.app/connectors/highspot",
+  },
   // currently used for the Internet Search tool docs, which is why
   // a globe is used
   not_applicable: {
diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts
index 901aa85d7..c2007f7c2 100644
--- a/web/src/lib/types.ts
+++ b/web/src/lib/types.ts
@@ -390,6 +390,7 @@ export enum ValidSources {
   Egnyte = "egnyte",
   Airtable = "airtable",
   Gitbook = "gitbook",
+  Highspot = "highspot",
 }

 export const validAutoSyncSources = [