diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py
index e34b8b894d53..52314db920c8 100644
--- a/backend/danswer/configs/constants.py
+++ b/backend/danswer/configs/constants.py
@@ -107,6 +107,7 @@ class DocumentSource(str, Enum):
R2 = "r2"
GOOGLE_CLOUD_STORAGE = "google_cloud_storage"
OCI_STORAGE = "oci_storage"
+ XENFORO = "xenforo"
NOT_APPLICABLE = "not_applicable"
diff --git a/backend/danswer/connectors/factory.py b/backend/danswer/connectors/factory.py
index 6df16bea6411..7d6fc9c11678 100644
--- a/backend/danswer/connectors/factory.py
+++ b/backend/danswer/connectors/factory.py
@@ -42,6 +42,7 @@ from danswer.connectors.slack.load_connector import SlackLoadConnector
from danswer.connectors.teams.connector import TeamsConnector
from danswer.connectors.web.connector import WebConnector
from danswer.connectors.wikipedia.connector import WikipediaConnector
+from danswer.connectors.xenforo.connector import XenforoConnector
from danswer.connectors.zendesk.connector import ZendeskConnector
from danswer.connectors.zulip.connector import ZulipConnector
from danswer.db.credentials import backend_update_credential_json
@@ -97,6 +98,7 @@ def identify_connector_class(
DocumentSource.R2: BlobStorageConnector,
DocumentSource.GOOGLE_CLOUD_STORAGE: BlobStorageConnector,
DocumentSource.OCI_STORAGE: BlobStorageConnector,
+ DocumentSource.XENFORO: XenforoConnector,
}
connector_by_source = connector_map.get(source, {})
diff --git a/backend/danswer/connectors/xenforo/__init__.py b/backend/danswer/connectors/xenforo/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/backend/danswer/connectors/xenforo/connector.py b/backend/danswer/connectors/xenforo/connector.py
new file mode 100644
index 000000000000..7f5221543f1b
--- /dev/null
+++ b/backend/danswer/connectors/xenforo/connector.py
@@ -0,0 +1,244 @@
+"""
+This is the XenforoConnector class. It is used to connect to a Xenforo forum and load or update documents from the forum.
+
+To use this class, you need to provide the URL of the Xenforo forum board you want to connect to when creating an instance
+of the class. The URL should be a string that starts with 'http://' or 'https://', followed by the domain name of the
+forum, followed by the board name. For example:
+
+ base_url = 'https://www.example.com/forum/boards/some-topic/'
+
+The `load_from_state` method is used to load documents from the forum. It takes an optional `state` parameter, which
+can be used to specify a state from which to start loading documents.
+"""
+import re
+from datetime import datetime
+from datetime import timedelta
+from datetime import timezone
+from typing import Any
+from urllib.parse import urlparse
+
+import pytz
+import requests
+from bs4 import BeautifulSoup
+from bs4 import Tag
+
+from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.miscellaneous_utils import datetime_to_utc
+from danswer.connectors.interfaces import GenerateDocumentsOutput
+from danswer.connectors.interfaces import LoadConnector
+from danswer.connectors.models import BasicExpertInfo
+from danswer.connectors.models import Document
+from danswer.connectors.models import Section
+from danswer.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def get_title(soup: BeautifulSoup) -> str:
+ el = soup.find("h1", "p-title-value")
+ if not el:
+ return ""
+ title = el.text
+ for char in (";", ":", "!", "*", "/", "\\", "?", '"', "<", ">", "|"):
+ title = title.replace(char, "_")
+ return title
+
+
+def get_pages(soup: BeautifulSoup, url: str) -> list[str]:
+ page_tags = soup.select("li.pageNav-page")
+ page_numbers = []
+ for button in page_tags:
+ if re.match(r"^\d+$", button.text):
+ page_numbers.append(button.text)
+
+ max_pages = int(max(page_numbers, key=int)) if page_numbers else 1
+
+ all_pages = []
+ for x in range(1, max_pages + 1):
+ all_pages.append(f"{url}page-{x}")
+ return all_pages
+
+
+def parse_post_date(post_element: BeautifulSoup) -> datetime:
+ el = post_element.find("time")
+ if not isinstance(el, Tag) or "datetime" not in el.attrs:
+ return datetime.fromtimestamp(0, tz=timezone.utc)
+
+ date_value = el["datetime"]
+
+ # Ensure date_value is a string (if it's a list, take the first element)
+ if isinstance(date_value, list):
+ date_value = date_value[0]
+
+ post_date = datetime.strptime(date_value, "%Y-%m-%dT%H:%M:%S%z")
+ return datetime_to_utc(post_date)
+
+
+def scrape_page_posts(
+ soup: BeautifulSoup,
+ page_index: int,
+ url: str,
+ initial_run: bool,
+ start_time: datetime,
+) -> list:
+ title = get_title(soup)
+
+ documents = []
+ for post in soup.find_all("div", class_="message-inner"):
+ post_date = parse_post_date(post)
+ if initial_run or post_date > start_time:
+ el = post.find("div", class_="bbWrapper")
+ if not el:
+ continue
+ post_text = el.get_text(strip=True) + "\n"
+ author_tag = post.find("a", class_="username")
+ if author_tag is None:
+ author_tag = post.find("span", class_="username")
+ author = author_tag.get_text(strip=True) if author_tag else "Deleted author"
+ formatted_time = post_date.strftime("%Y-%m-%d %H:%M:%S")
+
+ # TODO: if a caller calls this for each page of a thread, it may see the
+ # same post multiple times if there is a sticky post
+ # that appears on each page of a thread.
+ # it's important to generate unique doc id's, so page index is part of the
+ # id. We may want to de-dupe this stuff inside the indexing service.
+ document = Document(
+ id=f"{DocumentSource.XENFORO.value}_{title}_{page_index}_{formatted_time}",
+ sections=[Section(link=url, text=post_text)],
+ title=title,
+ source=DocumentSource.XENFORO,
+ semantic_identifier=title,
+ primary_owners=[BasicExpertInfo(display_name=author)],
+ metadata={
+ "type": "post",
+ "author": author,
+ "time": formatted_time,
+ },
+ doc_updated_at=post_date,
+ )
+
+ documents.append(document)
+ return documents
+
+
+class XenforoConnector(LoadConnector):
+ # Class variable to track if the connector has been run before
+ has_been_run_before = False
+
+ def __init__(self, base_url: str) -> None:
+ self.base_url = base_url
+ self.initial_run = not XenforoConnector.has_been_run_before
+ self.start = datetime.now(pytz.utc) - timedelta(days=1)
+ self.cookies: dict[str, str] = {}
+ # mimic user browser to avoid being blocked by the website (see: https://www.useragents.me/)
+ self.headers = {
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
+ "Chrome/121.0.0.0 Safari/537.36"
+ }
+
+ def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
+ if credentials:
+ logger.warning("Unexpected credentials provided for Xenforo Connector")
+ return None
+
+ def load_from_state(self) -> GenerateDocumentsOutput:
+ # Standardize URL to always end in /.
+ if not self.base_url.endswith("/"):
+ self.base_url += "/"
+
+ # Remove all extra parameters from the end such as page, post.
+ matches = ("threads/", "boards/", "forums/")
+ for each in matches:
+ if each in self.base_url:
+ try:
+ self.base_url = self.base_url[
+ 0 : self.base_url.index(
+ "/", self.base_url.index(each) + len(each)
+ )
+ + 1
+ ]
+ except ValueError:
+ pass
+
+ doc_batch: list[Document] = []
+ all_threads = []
+
+ # If the URL contains "boards/" or "forums/", find all threads.
+ if "boards/" in self.base_url or "forums/" in self.base_url:
+ pages = get_pages(self.requestsite(self.base_url), self.base_url)
+
+ # Get all pages on thread_list_page
+ for pre_count, thread_list_page in enumerate(pages, start=1):
+ logger.info(
+ f"Getting pages from thread_list_page.. Current: {pre_count}/{len(pages)}"
+ )
+ all_threads += self.get_threads(thread_list_page)
+ # If the URL contains "threads/", add the thread to the list.
+ elif "threads/" in self.base_url:
+ all_threads.append(self.base_url)
+
+ # Process all threads
+ for thread_count, thread_url in enumerate(all_threads, start=1):
+ soup = self.requestsite(thread_url)
+ if soup is None:
+ logger.error(f"Failed to load page: {thread_url}")
+ continue
+ pages = get_pages(soup, thread_url)
+ # Getting all pages for all threads
+ for page_index, page in enumerate(pages, start=1):
+ logger.info(
+ f"Progress: Page {page_index}/{len(pages)} - Thread {thread_count}/{len(all_threads)}"
+ )
+ soup_page = self.requestsite(page)
+ doc_batch.extend(
+ scrape_page_posts(
+ soup_page, page_index, thread_url, self.initial_run, self.start
+ )
+ )
+ if doc_batch:
+ yield doc_batch
+
+ # Mark the initial run finished after all threads and pages have been processed
+ XenforoConnector.has_been_run_before = True
+
+ def get_threads(self, url: str) -> list[str]:
+ soup = self.requestsite(url)
+ thread_tags = soup.find_all(class_="structItem-title")
+ base_url = "{uri.scheme}://{uri.netloc}".format(uri=urlparse(url))
+ threads = []
+ for x in thread_tags:
+ y = x.find_all(href=True)
+ for element in y:
+ link = element["href"]
+ if "threads/" in link:
+ stripped = link[0 : link.rfind("/") + 1]
+ if base_url + stripped not in threads:
+ threads.append(base_url + stripped)
+ return threads
+
+ def requestsite(self, url: str) -> BeautifulSoup:
+ try:
+ response = requests.get(
+ url, cookies=self.cookies, headers=self.headers, timeout=10
+ )
+ if response.status_code != 200:
+ logger.error(
+ f"<{url}> Request Error: {response.status_code} - {response.reason}"
+ )
+ return BeautifulSoup(response.text, "html.parser")
+ except requests.exceptions.Timeout:
+ logger.error("Timed out Error.")
+ except Exception as e:
+ logger.error(f"Error on {url}")
+ logger.exception(e)
+ return BeautifulSoup("", "html.parser")
+
+
+if __name__ == "__main__":
+ connector = XenforoConnector(
+ # base_url="https://cassiopaea.org/forum/threads/how-to-change-your-emotional-state.41381/"
+ base_url="https://xenforo.com/community/threads/whats-new-with-enhanced-search-resource-manager-and-media-gallery-in-xenforo-2-3.220935/"
+ )
+ document_batches = connector.load_from_state()
+ print(next(document_batches))
diff --git a/web/public/Xenforo.svg b/web/public/Xenforo.svg
new file mode 100644
index 000000000000..fcf93ac92304
--- /dev/null
+++ b/web/public/Xenforo.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/web/src/app/admin/connectors/[connector]/AddConnectorPage.tsx b/web/src/app/admin/connectors/[connector]/AddConnectorPage.tsx
index 1beb199777f6..119d8391b2c1 100644
--- a/web/src/app/admin/connectors/[connector]/AddConnectorPage.tsx
+++ b/web/src/app/admin/connectors/[connector]/AddConnectorPage.tsx
@@ -28,6 +28,7 @@ import {
createConnectorValidationSchema,
defaultPruneFreqDays,
defaultRefreshFreqMinutes,
+ isLoadState,
} from "@/lib/connectors/connectors";
import { Modal } from "@/components/Modal";
import GDriveMain from "./pages/gdrive/GoogleDrivePage";
@@ -316,7 +317,7 @@ export default function AddConnector({
const { message, isSuccess, response } = await submitConnector(
{
connector_specific_config: transformedConnectorSpecificConfig,
- input_type: connector == "web" ? "load_state" : "poll", // single case
+ input_type: isLoadState(connector) ? "load_state" : "poll", // single case
name: name,
source: connector,
is_public: access_type == "public",
diff --git a/web/src/components/icons/icons.tsx b/web/src/components/icons/icons.tsx
index b14e532f4178..1f6f1f2e8d13 100644
--- a/web/src/components/icons/icons.tsx
+++ b/web/src/components/icons/icons.tsx
@@ -86,7 +86,7 @@ import clickupIcon from "../../../public/Clickup.svg";
import cohereIcon from "../../../public/Cohere.svg";
import voyageIcon from "../../../public/Voyage.png";
import googleIcon from "../../../public/Google.webp";
-
+import xenforoIcon from "../../../public/Xenforo.svg";
import { FaRobot } from "react-icons/fa";
export interface IconProps {
@@ -2811,6 +2811,21 @@ export const WindowsIcon = ({
);
};
+
+export const XenforoIcon = ({
+ size = 16,
+ className = defaultTailwindCSS,
+}: IconProps) => {
+ return (
+
+
+
+ );
+};
+
export const AsanaIcon = ({
size = 16,
className = defaultTailwindCSS,
diff --git a/web/src/lib/connectors/connectors.ts b/web/src/lib/connectors/connectors.ts
index dc11611a960e..19db22d51808 100644
--- a/web/src/lib/connectors/connectors.ts
+++ b/web/src/lib/connectors/connectors.ts
@@ -3,6 +3,16 @@ import { IsPublicGroupSelectorFormType } from "@/components/IsPublicGroupSelecto
import { ConfigurableSources, ValidInputTypes, ValidSources } from "../types";
import { AccessTypeGroupSelectorFormType } from "@/components/admin/connectors/AccessTypeGroupSelector";
+export function isLoadState(connector_name: string): boolean {
+ // TODO: centralize connector metadata like this somewhere instead of hardcoding it here
+ const loadStateConnectors = ["web", "xenforo"];
+ if (loadStateConnectors.includes(connector_name)) {
+ return true;
+ }
+
+ return false;
+}
+
export type InputType =
| "list"
| "text"
@@ -764,6 +774,20 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
},
],
},
+ xenforo: {
+ description: "Configure Xenforo connector",
+ values: [
+ {
+ type: "text",
+ query: "Enter forum or thread URL:",
+ label: "URL",
+ name: "base_url",
+ optional: false,
+ description:
+ "The XenForo v2.2 forum URL to index. Can be board or thread.",
+ },
+ ],
+ },
asana: {
description: "Configure Asana connector",
values: [
@@ -1054,6 +1078,10 @@ export interface GoogleSitesConfig {
base_url: string;
}
+export interface XenforoConfig {
+ base_url: string;
+}
+
export interface ZendeskConfig {}
export interface DropboxConfig {}
diff --git a/web/src/lib/connectors/credentials.ts b/web/src/lib/connectors/credentials.ts
index 481a2b3380a6..d7bcef0adafb 100644
--- a/web/src/lib/connectors/credentials.ts
+++ b/web/src/lib/connectors/credentials.ts
@@ -289,6 +289,7 @@ export const credentialTemplates: Record = {
access_key_id: "",
secret_access_key: "",
} as OCICredentialJson,
+ xenforo: null,
google_sites: null,
file: null,
wikipedia: null,
diff --git a/web/src/lib/sources.ts b/web/src/lib/sources.ts
index 6ee40e3c5f9a..7347964a7566 100644
--- a/web/src/lib/sources.ts
+++ b/web/src/lib/sources.ts
@@ -37,6 +37,7 @@ import {
OCIStorageIcon,
GoogleStorageIcon,
ColorSlackIcon,
+ XenforoIcon,
} from "@/components/icons/icons";
import { ValidSources } from "./types";
import {
@@ -279,6 +280,11 @@ const SOURCE_METADATA_MAP: SourceMap = {
category: SourceCategory.Storage,
docs: "https://docs.danswer.dev/connectors/google_storage",
},
+ xenforo: {
+ icon: XenforoIcon,
displayName: "XenForo",
+ category: SourceCategory.Messaging,
+ },
ingestion_api: {
icon: GlobeIcon,
displayName: "Ingestion",
diff --git a/web/src/lib/types.ts b/web/src/lib/types.ts
index 391c953290b1..d106310563e3 100644
--- a/web/src/lib/types.ts
+++ b/web/src/lib/types.ts
@@ -251,6 +251,7 @@ const validSources = [
"s3",
"r2",
"google_cloud_storage",
+ "xenforo",
"oci_storage",
"not_applicable",
"ingestion_api",