diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py index e34b8b894d53..52314db920c8 100644 --- a/backend/danswer/configs/constants.py +++ b/backend/danswer/configs/constants.py @@ -107,6 +107,7 @@ class DocumentSource(str, Enum): R2 = "r2" GOOGLE_CLOUD_STORAGE = "google_cloud_storage" OCI_STORAGE = "oci_storage" + XENFORO = "xenforo" NOT_APPLICABLE = "not_applicable" diff --git a/backend/danswer/connectors/factory.py b/backend/danswer/connectors/factory.py index 6df16bea6411..7d6fc9c11678 100644 --- a/backend/danswer/connectors/factory.py +++ b/backend/danswer/connectors/factory.py @@ -42,6 +42,7 @@ from danswer.connectors.slack.load_connector import SlackLoadConnector from danswer.connectors.teams.connector import TeamsConnector from danswer.connectors.web.connector import WebConnector from danswer.connectors.wikipedia.connector import WikipediaConnector +from danswer.connectors.xenforo.connector import XenforoConnector from danswer.connectors.zendesk.connector import ZendeskConnector from danswer.connectors.zulip.connector import ZulipConnector from danswer.db.credentials import backend_update_credential_json @@ -97,6 +98,7 @@ def identify_connector_class( DocumentSource.R2: BlobStorageConnector, DocumentSource.GOOGLE_CLOUD_STORAGE: BlobStorageConnector, DocumentSource.OCI_STORAGE: BlobStorageConnector, + DocumentSource.XENFORO: XenforoConnector, } connector_by_source = connector_map.get(source, {}) diff --git a/backend/danswer/connectors/xenforo/__init__.py b/backend/danswer/connectors/xenforo/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/backend/danswer/connectors/xenforo/connector.py b/backend/danswer/connectors/xenforo/connector.py new file mode 100644 index 000000000000..7f5221543f1b --- /dev/null +++ b/backend/danswer/connectors/xenforo/connector.py @@ -0,0 +1,244 @@ +""" +This is the XenforoConnector class. 
def get_title(soup: BeautifulSoup) -> str:
    """Return the page's thread title with reserved punctuation replaced by ``_``.

    The title is the text of the first ``<h1 class="p-title-value">`` element.
    An empty string is returned when the page has no such element.
    """
    heading = soup.find("h1", "p-title-value")
    if not heading:
        return ""

    # Each of these characters is mapped to "_" in a single pass over the text.
    replaced_chars = ';:!*/\\?"<>|'
    return heading.text.translate(str.maketrans(dict.fromkeys(replaced_chars, "_")))
def parse_post_date(post_element: BeautifulSoup) -> datetime:
    """Return a post's timestamp in UTC, or the Unix epoch when it is unavailable.

    The value is read from the ``datetime`` attribute of the first ``<time>``
    tag inside ``post_element`` and must match ``%Y-%m-%dT%H:%M:%S%z``
    (for example ``2024-01-31T12:34:56+0000``).

    Args:
        post_element: Parsed markup of a single post.

    Returns:
        A timezone-aware ``datetime`` in UTC. The epoch
        (1970-01-01T00:00:00+00:00) is returned when there is no
        ``<time datetime=...>`` tag, the attribute is empty, or its value does
        not match the expected format.
    """
    # Built with an explicit tz instead of the deprecated utcfromtimestamp().
    epoch = datetime.fromtimestamp(0, tz=timezone.utc)

    el = post_element.find("time")
    if not isinstance(el, Tag) or "datetime" not in el.attrs:
        return epoch

    date_value = el["datetime"]

    # Ensure date_value is a string (if it's a list, take the first element).
    if isinstance(date_value, list):
        if not date_value:
            return epoch
        date_value = date_value[0]

    try:
        post_date = datetime.strptime(date_value, "%Y-%m-%dT%H:%M:%S%z")
    except ValueError:
        # One malformed timestamp should not abort the whole indexing run;
        # treat it the same way as a missing timestamp.
        logger.warning(f"Could not parse post timestamp: {date_value!r}")
        return epoch
    return datetime_to_utc(post_date)
class XenforoConnector(LoadConnector):
    """Load connector that scrapes posts from a XenForo thread or board.

    ``base_url`` may point at a single thread (``.../threads/<name>/``) or at a
    thread listing (``.../boards/<name>/`` or ``.../forums/<name>/``). For a
    listing, every thread linked from every listing page is scraped.
    """

    # Class variable to track if the connector has been run before.
    # NOTE(review): this flag is shared by every instance in the process,
    # whatever its base_url -- confirm that this is the intended scope.
    has_been_run_before = False

    def __init__(self, base_url: str) -> None:
        """Store the target URL and decide whether this is a full or partial run.

        Args:
            base_url: URL of the XenForo thread or board to index.
        """
        self.base_url = base_url
        # On the initial run every post is indexed; afterwards only posts newer
        # than ``self.start`` (24 hours before construction) are kept.
        self.initial_run = not XenforoConnector.has_been_run_before
        self.start = datetime.now(timezone.utc) - timedelta(days=1)
        self.cookies: dict[str, str] = {}
        # mimic user browser to avoid being blocked by the website (see: https://www.useragents.me/)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/121.0.0.0 Safari/537.36"
        }

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """Accept (and ignore) credentials; this connector does not use any.

        A warning is logged when a non-empty credentials dict is supplied.
        """
        if credentials:
            logger.warning("Unexpected credentials provided for Xenforo Connector")
        return None

    def load_from_state(self) -> GenerateDocumentsOutput:
        """Scrape the configured thread or board and yield documents per thread.

        ``self.base_url`` is normalized in place (trailing slash added, anything
        after the thread/board segment removed) before scraping starts.

        Yields:
            One list of documents for each thread that produced at least one
            document. Each list holds only that thread's documents.
        """
        # Standardize URL to always end in /. endswith() also copes with "".
        if not self.base_url.endswith("/"):
            self.base_url += "/"

        # Remove all extra parameters from the end such as page, post.
        for marker in ("threads/", "boards/", "forums/"):
            if marker not in self.base_url:
                continue
            segment_start = self.base_url.index(marker) + len(marker)
            try:
                segment_end = self.base_url.index("/", segment_start)
            except ValueError:
                # Nothing follows the marker, so there is nothing to cut off.
                continue
            self.base_url = self.base_url[: segment_end + 1]

        all_threads: list[str] = []

        # If the URL contains "boards/" or "forums/", find all threads.
        if "boards/" in self.base_url or "forums/" in self.base_url:
            pages = get_pages(self.requestsite(self.base_url), self.base_url)

            # Get all pages on thread_list_page
            for pre_count, thread_list_page in enumerate(pages, start=1):
                logger.info(
                    f"Getting pages from thread_list_page.. Current: {pre_count}/{len(pages)}\r"
                )
                all_threads += self.get_threads(thread_list_page)
        # If the URL contains "threads/", add the thread to the list.
        elif "threads/" in self.base_url:
            all_threads.append(self.base_url)

        # Process all threads
        for thread_count, thread_url in enumerate(all_threads, start=1):
            # requestsite() returns an empty document on failure (and logs the
            # error itself), so get_pages() then falls back to a single page.
            pages = get_pages(self.requestsite(thread_url), thread_url)

            # A fresh batch per thread keeps memory bounded and guarantees that
            # no document is handed to the caller more than once.
            thread_batch: list[Document] = []
            for page_index, page in enumerate(pages, start=1):
                logger.info(
                    f"Progress: Page {page_index}/{len(pages)} - Thread {thread_count}/{len(all_threads)}\r"
                )
                thread_batch.extend(
                    scrape_page_posts(
                        self.requestsite(page),
                        page_index,
                        thread_url,
                        self.initial_run,
                        self.start,
                    )
                )
            if thread_batch:
                yield thread_batch

        # Mark the initial run finished after all threads and pages have been processed
        XenforoConnector.has_been_run_before = True

    def get_threads(self, url: str) -> list[str]:
        """Return the unique thread URLs linked from one thread-listing page.

        Args:
            url: URL of the listing page to fetch.

        Returns:
            Absolute thread URLs in first-seen order, each cut off after its
            last ``/``.
        """
        # Imported locally: the module itself only imports ``urlparse``.
        from urllib.parse import urljoin

        soup = self.requestsite(url)
        threads: list[str] = []
        seen: set[str] = set()
        for title_tag in soup.find_all(class_="structItem-title"):
            for anchor in title_tag.find_all(href=True):
                link = anchor["href"]
                if "threads/" not in link:
                    continue
                # Keep everything up to and including the last "/". urljoin
                # resolves root-relative hrefs against the site and leaves
                # already-absolute hrefs untouched.
                thread_url = urljoin(url, link[: link.rfind("/") + 1])
                if thread_url not in seen:
                    seen.add(thread_url)
                    threads.append(thread_url)
        return threads

    def requestsite(self, url: str) -> BeautifulSoup:
        """Fetch ``url`` and return its parsed HTML.

        Best effort: any failure (timeout, connection error, non-200 status) is
        logged and an empty document is returned instead of raising.
        """
        try:
            response = requests.get(
                url, cookies=self.cookies, headers=self.headers, timeout=10
            )
            if response.status_code == 200:
                return BeautifulSoup(response.text, "html.parser")
            # Do not parse error pages as if they were forum content.
            logger.error(
                f"<{url}> Request Error: {response.status_code} - {response.reason}"
            )
        except requests.exceptions.Timeout:
            # requests raises its own Timeout type, not the builtin TimeoutError.
            logger.error(f"Timed out Error on {url}")
        except Exception as e:
            logger.error(f"Error on {url}")
            logger.exception(e)
        return BeautifulSoup("", "html.parser")
"load_state" : "poll", // single case name: name, source: connector, is_public: access_type == "public", diff --git a/web/src/components/icons/icons.tsx b/web/src/components/icons/icons.tsx index b14e532f4178..1f6f1f2e8d13 100644 --- a/web/src/components/icons/icons.tsx +++ b/web/src/components/icons/icons.tsx @@ -86,7 +86,7 @@ import clickupIcon from "../../../public/Clickup.svg"; import cohereIcon from "../../../public/Cohere.svg"; import voyageIcon from "../../../public/Voyage.png"; import googleIcon from "../../../public/Google.webp"; - +import xenforoIcon from "../../../public/Xenforo.svg"; import { FaRobot } from "react-icons/fa"; export interface IconProps { @@ -2811,6 +2811,21 @@ export const WindowsIcon = ({ ); }; + +export const XenforoIcon = ({ + size = 16, + className = defaultTailwindCSS, +}: IconProps) => { + return ( +
+ Logo +
/**
 * Tell whether the named connector is submitted with the "load_state" input
 * type (as opposed to "poll").
 */
export function isLoadState(connector_name: string): boolean {
  // TODO: centralize connector metadata like this somewhere instead of hardcoding it here
  const loadStateConnectors = new Set<string>(["web", "xenforo"]);
  return loadStateConnectors.has(connector_name);
}
Can be board or thread.", + }, + ], + }, asana: { description: "Configure Asana connector", values: [ @@ -1054,6 +1078,10 @@ export interface GoogleSitesConfig { base_url: string; } +export interface XenforoConfig { + base_url: string; +} + export interface ZendeskConfig {} export interface DropboxConfig {} diff --git a/web/src/lib/connectors/credentials.ts b/web/src/lib/connectors/credentials.ts index 481a2b3380a6..d7bcef0adafb 100644 --- a/web/src/lib/connectors/credentials.ts +++ b/web/src/lib/connectors/credentials.ts @@ -289,6 +289,7 @@ export const credentialTemplates: Record = { access_key_id: "", secret_access_key: "", } as OCICredentialJson, + xenforo: null, google_sites: null, file: null, wikipedia: null, diff --git a/web/src/lib/sources.ts b/web/src/lib/sources.ts index 6ee40e3c5f9a..7347964a7566 100644 --- a/web/src/lib/sources.ts +++ b/web/src/lib/sources.ts @@ -37,6 +37,7 @@ import { OCIStorageIcon, GoogleStorageIcon, ColorSlackIcon, + XenforoIcon, } from "@/components/icons/icons"; import { ValidSources } from "./types"; import { @@ -279,6 +280,11 @@ const SOURCE_METADATA_MAP: SourceMap = { category: SourceCategory.Storage, docs: "https://docs.danswer.dev/connectors/google_storage", }, + xenforo: { + icon: XenforoIcon, + displayName: "Xenforo", + category: SourceCategory.Messaging, + }, ingestion_api: { icon: GlobeIcon, displayName: "Ingestion", diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts index 391c953290b1..d106310563e3 100644 --- a/web/src/lib/types.ts +++ b/web/src/lib/types.ts @@ -251,6 +251,7 @@ const validSources = [ "s3", "r2", "google_cloud_storage", + "xenforo", "oci_storage", "not_applicable", "ingestion_api",