Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-09-27 20:38:32 +02:00)
Feature/xenforo (#2497)
* XenForo forum parser support
* clarify SSL cert reqs
* missed a file
* add isLoadState function, fix up XenForo for the data-driven connector approach
* fix a new edge case to skip an unexpected parsed element
* change DocumentSource to xenforo
* make doc id unique and comment what's happening
* remove stray log line
* address code review

Co-authored-by: sime2408 <simun.sunjic@gmail.com>
Co-authored-by: Richard Kuo <rkuo@rkuo.com>
@@ -107,6 +107,7 @@ class DocumentSource(str, Enum):
     R2 = "r2"
     GOOGLE_CLOUD_STORAGE = "google_cloud_storage"
     OCI_STORAGE = "oci_storage"
+    XENFORO = "xenforo"
     NOT_APPLICABLE = "not_applicable"
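Since DocumentSource subclasses str, the new member behaves like its raw string wherever the framework compares or embeds it. A minimal sketch of that behavior (re-creating only the one member, not danswer's full enum):

from enum import Enum

# Minimal stand-in for danswer's DocumentSource (only the new member shown).
class DocumentSource(str, Enum):
    XENFORO = "xenforo"

# A str-based Enum member compares equal to its raw value...
assert DocumentSource.XENFORO == "xenforo"
# ...and .value is the plain string the connector embeds in document ids.
assert f"{DocumentSource.XENFORO.value}_My Thread_1" == "xenforo_My Thread_1"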
@@ -42,6 +42,7 @@ from danswer.connectors.slack.load_connector import SlackLoadConnector
 from danswer.connectors.teams.connector import TeamsConnector
 from danswer.connectors.web.connector import WebConnector
 from danswer.connectors.wikipedia.connector import WikipediaConnector
+from danswer.connectors.xenforo.connector import XenforoConnector
 from danswer.connectors.zendesk.connector import ZendeskConnector
 from danswer.connectors.zulip.connector import ZulipConnector
 from danswer.db.credentials import backend_update_credential_json

@@ -97,6 +98,7 @@ def identify_connector_class(
         DocumentSource.R2: BlobStorageConnector,
         DocumentSource.GOOGLE_CLOUD_STORAGE: BlobStorageConnector,
         DocumentSource.OCI_STORAGE: BlobStorageConnector,
+        DocumentSource.XENFORO: XenforoConnector,
     }
     connector_by_source = connector_map.get(source, {})
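The map above is how identify_connector_class resolves a DocumentSource to a concrete connector class. A trimmed-down sketch of the lookup (import paths taken from the diff; the example URL is illustrative, and the {} fallback mirrors the real code, where some sources map to a dict keyed by input type):

from danswer.configs.constants import DocumentSource
from danswer.connectors.xenforo.connector import XenforoConnector

# Trimmed-down version of the map in identify_connector_class.
connector_map = {
    DocumentSource.XENFORO: XenforoConnector,
}
connector_cls = connector_map.get(DocumentSource.XENFORO, {})
# For XenForo the lookup yields the class itself, ready to instantiate:
connector = connector_cls(base_url="https://www.example.com/forum/boards/some-topic/")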
backend/danswer/connectors/xenforo/__init__.py (new file, 0 lines)
backend/danswer/connectors/xenforo/connector.py (new file, 244 lines)
@@ -0,0 +1,244 @@
"""
This is the XenforoConnector class. It is used to connect to a Xenforo forum and load or update documents from the forum.

To use this class, you need to provide the URL of the Xenforo forum board you want to connect to when creating an instance
of the class. The URL should be a string that starts with 'http://' or 'https://', followed by the domain name of the
forum, followed by the board name. For example:

    base_url = 'https://www.example.com/forum/boards/some-topic/'

The `load_from_state` method is used to load documents from the forum. It yields batches of Document objects, one batch
per thread processed.
"""
import re
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from typing import Any
from urllib.parse import urlparse

import pytz
import requests
from bs4 import BeautifulSoup
from bs4 import Tag

from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.miscellaneous_utils import datetime_to_utc
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger

logger = setup_logger()


def get_title(soup: BeautifulSoup) -> str:
    el = soup.find("h1", "p-title-value")
    if not el:
        return ""
    title = el.text
    # Strip characters that are unsafe in downstream identifiers.
    for char in (";", ":", "!", "*", "/", "\\", "?", '"', "<", ">", "|"):
        title = title.replace(char, "_")
    return title


def get_pages(soup: BeautifulSoup, url: str) -> list[str]:
    page_tags = soup.select("li.pageNav-page")
    page_numbers = []
    for button in page_tags:
        if re.match(r"^\d+$", button.text):
            page_numbers.append(button.text)

    max_pages = int(max(page_numbers, key=int)) if page_numbers else 1

    all_pages = []
    for x in range(1, max_pages + 1):
        all_pages.append(f"{url}page-{x}")
    return all_pages


def parse_post_date(post_element: BeautifulSoup) -> datetime:
    el = post_element.find("time")
    if not isinstance(el, Tag) or "datetime" not in el.attrs:
        return datetime.utcfromtimestamp(0).replace(tzinfo=timezone.utc)

    date_value = el["datetime"]

    # Ensure date_value is a string (if it's a list, take the first element)
    if isinstance(date_value, list):
        date_value = date_value[0]

    post_date = datetime.strptime(date_value, "%Y-%m-%dT%H:%M:%S%z")
    return datetime_to_utc(post_date)


def scrape_page_posts(
    soup: BeautifulSoup,
    page_index: int,
    url: str,
    initial_run: bool,
    start_time: datetime,
) -> list[Document]:
    title = get_title(soup)

    documents = []
    for post in soup.find_all("div", class_="message-inner"):
        post_date = parse_post_date(post)
        if initial_run or post_date > start_time:
            el = post.find("div", class_="bbWrapper")
            if not el:
                continue
            post_text = el.get_text(strip=True) + "\n"
            author_tag = post.find("a", class_="username")
            if author_tag is None:
                author_tag = post.find("span", class_="username")
            author = author_tag.get_text(strip=True) if author_tag else "Deleted author"
            formatted_time = post_date.strftime("%Y-%m-%d %H:%M:%S")

            # TODO: if a caller calls this for each page of a thread, it may see the
            # same post multiple times if there is a sticky post that appears on each
            # page of a thread. It's important to generate unique doc ids, so the page
            # index is part of the id. We may want to de-dupe this inside the indexing
            # service.
            document = Document(
                id=f"{DocumentSource.XENFORO.value}_{title}_{page_index}_{formatted_time}",
                sections=[Section(link=url, text=post_text)],
                title=title,
                source=DocumentSource.XENFORO,
                semantic_identifier=title,
                primary_owners=[BasicExpertInfo(display_name=author)],
                metadata={
                    "type": "post",
                    "author": author,
                    "time": formatted_time,
                },
                doc_updated_at=post_date,
            )

            documents.append(document)
    return documents


class XenforoConnector(LoadConnector):
    # Class variable to track if the connector has been run before
    has_been_run_before = False

    def __init__(self, base_url: str) -> None:
        self.base_url = base_url
        self.initial_run = not XenforoConnector.has_been_run_before
        self.start = datetime.utcnow().replace(tzinfo=pytz.utc) - timedelta(days=1)
        self.cookies: dict[str, str] = {}
        # mimic a user browser to avoid being blocked by the website (see: https://www.useragents.me/)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/121.0.0.0 Safari/537.36"
        }

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        if credentials:
            logger.warning("Unexpected credentials provided for Xenforo Connector")
        return None

    def load_from_state(self) -> GenerateDocumentsOutput:
        # Standardize URL to always end in /.
        if self.base_url[-1] != "/":
            self.base_url += "/"

        # Remove all extra parameters from the end such as page, post.
        matches = ("threads/", "boards/", "forums/")
        for each in matches:
            if each in self.base_url:
                try:
                    self.base_url = self.base_url[
                        0 : self.base_url.index(
                            "/", self.base_url.index(each) + len(each)
                        )
                        + 1
                    ]
                except ValueError:
                    pass

        doc_batch: list[Document] = []
        all_threads = []

        # If the URL contains "boards/" or "forums/", find all threads.
        if "boards/" in self.base_url or "forums/" in self.base_url:
            pages = get_pages(self.requestsite(self.base_url), self.base_url)

            # Get all pages on thread_list_page
            for pre_count, thread_list_page in enumerate(pages, start=1):
                logger.info(
                    f"Getting pages from thread_list_page.. Current: {pre_count}/{len(pages)}\r"
                )
                all_threads += self.get_threads(thread_list_page)
        # If the URL contains "threads/", add the thread to the list.
        elif "threads/" in self.base_url:
            all_threads.append(self.base_url)

        # Process all threads
        for thread_count, thread_url in enumerate(all_threads, start=1):
            soup = self.requestsite(thread_url)
            if soup is None:
                logger.error(f"Failed to load page: {self.base_url}")
                continue
            pages = get_pages(soup, thread_url)
            # Getting all pages for all threads
            for page_index, page in enumerate(pages, start=1):
                logger.info(
                    f"Progress: Page {page_index}/{len(pages)} - Thread {thread_count}/{len(all_threads)}\r"
                )
                soup_page = self.requestsite(page)
                doc_batch.extend(
                    scrape_page_posts(
                        soup_page, page_index, thread_url, self.initial_run, self.start
                    )
                )
            if doc_batch:
                yield doc_batch
                # Reset the batch so the next thread does not re-yield these docs.
                doc_batch = []

        # Mark the initial run finished after all threads and pages have been processed
        XenforoConnector.has_been_run_before = True

    def get_threads(self, url: str) -> list[str]:
        soup = self.requestsite(url)
        thread_tags = soup.find_all(class_="structItem-title")
        base_url = "{uri.scheme}://{uri.netloc}".format(uri=urlparse(url))
        threads = []
        for x in thread_tags:
            y = x.find_all(href=True)
            for element in y:
                link = element["href"]
                if "threads/" in link:
                    stripped = link[0 : link.rfind("/") + 1]
                    if base_url + stripped not in threads:
                        threads.append(base_url + stripped)
        return threads

    def requestsite(self, url: str) -> BeautifulSoup:
        try:
            response = requests.get(
                url, cookies=self.cookies, headers=self.headers, timeout=10
            )
            if response.status_code != 200:
                logger.error(
                    f"<{url}> Request Error: {response.status_code} - {response.reason}"
                )
            return BeautifulSoup(response.text, "html.parser")
        except requests.exceptions.Timeout:
            # requests raises its own Timeout exception, not the builtin TimeoutError
            logger.error(f"Timed out fetching {url}")
        except Exception as e:
            logger.error(f"Error on {url}")
            logger.exception(e)
        return BeautifulSoup("", "html.parser")


if __name__ == "__main__":
    connector = XenforoConnector(
        # base_url="https://cassiopaea.org/forum/threads/how-to-change-your-emotional-state.41381/"
        base_url="https://xenforo.com/community/threads/whats-new-with-enhanced-search-resource-manager-and-media-gallery-in-xenforo-2-3.220935/"
    )
    document_batches = connector.load_from_state()
    print(next(document_batches))
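The trickiest part of load_from_state above is the base-URL normalization, which strips everything after the boards/, forums/, or threads/ slug so that pagination suffixes like page-4 do not leak into the crawl root. A standalone re-implementation of just that slicing logic (the URL is illustrative, not from the codebase), to show what it does:

def normalize_base_url(base_url: str) -> str:
    # Standalone copy of the slicing logic in XenforoConnector.load_from_state.
    if base_url[-1] != "/":
        base_url += "/"
    for each in ("threads/", "boards/", "forums/"):
        if each in base_url:
            try:
                # Keep everything up to (and including) the "/" that closes the slug.
                base_url = base_url[
                    0 : base_url.index("/", base_url.index(each) + len(each)) + 1
                ]
            except ValueError:
                pass
    return base_url

# A trailing page suffix is trimmed back to the thread root:
assert (
    normalize_base_url("https://www.example.com/forum/threads/some-topic.123/page-4")
    == "https://www.example.com/forum/threads/some-topic.123/"
)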
web/public/Xenforo.svg (new file, 1 line, 2.1 KiB)
@@ -0,0 +1 @@
<svg enable-background="new 0 0 1024 1024" viewBox="0 0 1024 1024" xmlns="http://www.w3.org/2000/svg"><circle cx="512" cy="512" fill="#006296" r="512"/><path d="m235 511.5 35.9 52.3h-17.3l-26.2-40-26.3 40h-17.3l35.9-51.7-32.2-47.2h17.5l22.4 36m4.2-6.5 18.3-29.5h17.3l-27.9 40.3m49.9.6c0-2.2 0-4.3.2-6.2v-.7c.6-8.8 2.2-14.7 5-17.7 3.5-3.8 11.1-5.6 22.8-5.6s19.3 1.6 22.8 4.8c3.3 3 5 9.2 5.2 18.6h14.6c-.9-13-3.6-21.9-8.1-26.6-5.7-5.9-17.6-8.8-35.8-8.8-16.7 0-27.9 3.4-33.6 10.1-5.7 6.8-8.6 20-8.5 39.7 0 21.3 2.8 35.4 8.5 42 5.7 6.7 17.6 10 35.8 10 15.6 0 26.6-2.1 32.8-6.2s9.4-11.3 9.4-21.6l-.1-3.6h-14.8v3.1c0 6.8-1.7 11.1-5.2 13.2-3.5 2-11.1 3-22.7 2.9-12.3 0-20.1-2.1-23.4-6.4s-5-14.2-5-29.8h71.2v-7.8c0-1.2 0-2.4 0-3.5zm.2-7v.7c0-.2 0-.4 0-.7zm132.2-23.3c9.2 0 15.5 1.5 18.7 4.4s4.9 8.6 4.9 17.1v1.8h14.6c0-13.7-2.6-23-7.5-27.9s-14.6-7.4-28.7-7.4c-17.5 0-28.5 5.1-32.9 15.2l-.4-.3.4-13.4h-14v33.9h14.8c1.8-15.7 11.8-23.4 30.1-23.4zm-45 30.3h14.5v58h-14.5zm68.6 0h14.5v58h-14.5zm50.9-20.4v-49.3h70.6v-13.4h-86.6v76.2h82.5c.4-4.5 1.1-9 2.1-13.4zm-15.9 20.4h15.9v58h-15.9zm165.8 8.6c0 17.6-1.5 28.4-4.6 32.6s-11.1 6.3-24 6.3c-12.8 0-20.8-2.1-23.8-6.3-3.1-4.2-4.6-15-4.6-32.6 0-3.1 0-6 .1-8.6h-15.1c-.1 2.7-.2 5.6-.2 8.6 0 21.2 2.7 35 8.2 41.4 5.4 6.3 17.3 9.5 35.6 9.5s30.1-3.2 35.6-9.5 8.2-20.1 8.2-41.4c0-3 0-5.9-.2-8.6h-15.2zm-52.3-32.7c3.1-4.2 11-6.3 23.8-6.3s20.8 2.1 24 6.3c2 2.8 3.4 8.5 4 17.1h15.1c-1.1-12.6-3.6-21.2-7.6-25.9-5.4-6.3-17.3-9.5-35.6-9.5s-30.1 3.2-35.5 9.5c-4 4.6-6.5 13.3-7.6 25.9h15.1c.9-8.6 2.2-14.3 4.3-17.1zm81.2 24.1h14.5v58h-14.5zm38.7-31.2c11 0 16.5 5.1 16.5 15.2 0 .9-.1 2.7-.3 5.3l-.4 3.7h14.3l.2-8c0-18.8-9.1-28.1-27.3-28.1-13.2 0-22.6 4.7-28.3 14l-.3-.3 1.4-11.4h-14.5v34h14.7c1.5-16.3 9.4-24.4 24-24.4zm118.4-1.6c-5.4-6.3-17.3-9.5-35.6-9.5s-30.1 3.2-35.5 9.5c-4 4.6-6.5 13.3-7.6 25.9h15.1c.7-8.6 2-14.3 4-17.1 3.1-4.2 11-6.3 23.9-6.3 12.8 0 20.8 2.1 24 6.3 3.1 4.2 4.6 15.1 4.6 32.6 0 17.6-1.5 28.5-4.6 32.6-3.1 4.2-11.1 6.3-24 6.3s-20.9-2.1-23.9-6.3c-3.1-4.2-4.6-15-4.6-32.6 0-3.1 0-6 .1-8.6h-15.2c-.1 2.7-.2 5.6-.2 8.6 0 21.2 2.7 35 8.2 41.4 5.4 6.3 17.3 9.5 35.5 9.5 18.3 0 30.1-3.2 35.6-9.5s8.2-20.1 8.2-41.4c.1-21.3-2.6-35-8-41.4z" fill="#fff"/></svg>
@@ -28,6 +28,7 @@ import {
   createConnectorValidationSchema,
   defaultPruneFreqDays,
   defaultRefreshFreqMinutes,
+  isLoadState,
 } from "@/lib/connectors/connectors";
 import { Modal } from "@/components/Modal";
 import GDriveMain from "./pages/gdrive/GoogleDrivePage";

@@ -316,7 +317,7 @@ export default function AddConnector({
     const { message, isSuccess, response } = await submitConnector<any>(
       {
         connector_specific_config: transformedConnectorSpecificConfig,
-        input_type: connector == "web" ? "load_state" : "poll", // single case
+        input_type: isLoadState(connector) ? "load_state" : "poll", // single case
         name: name,
         source: connector,
         is_public: access_type == "public",
@@ -86,7 +86,7 @@ import clickupIcon from "../../../public/Clickup.svg";
 import cohereIcon from "../../../public/Cohere.svg";
 import voyageIcon from "../../../public/Voyage.png";
 import googleIcon from "../../../public/Google.webp";
+import xenforoIcon from "../../../public/Xenforo.svg";
 import { FaRobot } from "react-icons/fa";

 export interface IconProps {

@@ -2811,6 +2811,21 @@ export const WindowsIcon = ({
     </svg>
   );
 };

+export const XenforoIcon = ({
+  size = 16,
+  className = defaultTailwindCSS,
+}: IconProps) => {
+  return (
+    <div
+      style={{ width: `${size}px`, height: `${size}px` }}
+      className={`w-[${size}px] h-[${size}px] ` + className}
+    >
+      <Image src={xenforoIcon} alt="Logo" width="96" height="96" />
+    </div>
+  );
+};
+
 export const AsanaIcon = ({
   size = 16,
   className = defaultTailwindCSS,
@@ -3,6 +3,16 @@ import { IsPublicGroupSelectorFormType } from "@/components/IsPublicGroupSelecto
 import { ConfigurableSources, ValidInputTypes, ValidSources } from "../types";
 import { AccessTypeGroupSelectorFormType } from "@/components/admin/connectors/AccessTypeGroupSelector";

+export function isLoadState(connector_name: string): boolean {
+  // TODO: centralize connector metadata like this somewhere instead of hardcoding it here
+  const loadStateConnectors = ["web", "xenforo"];
+  if (loadStateConnectors.includes(connector_name)) {
+    return true;
+  }
+
+  return false;
+}
+
 export type InputType =
   | "list"
   | "text"
@@ -764,6 +774,20 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
       },
     ],
   },
+  xenforo: {
+    description: "Configure Xenforo connector",
+    values: [
+      {
+        type: "text",
+        query: "Enter forum or thread URL:",
+        label: "URL",
+        name: "base_url",
+        optional: false,
+        description:
+          "The XenForo v2.2 forum URL to index. Can be board or thread.",
+      },
+    ],
+  },
   asana: {
     description: "Configure Asana connector",
     values: [
@@ -1054,6 +1078,10 @@ export interface GoogleSitesConfig {
   base_url: string;
 }

+export interface XenforoConfig {
+  base_url: string;
+}
+
 export interface ZendeskConfig {}

 export interface DropboxConfig {}
@@ -289,6 +289,7 @@ export const credentialTemplates: Record<ValidSources, any> = {
     access_key_id: "",
     secret_access_key: "",
   } as OCICredentialJson,
+  xenforo: null,
   google_sites: null,
   file: null,
   wikipedia: null,
@@ -37,6 +37,7 @@ import {
   OCIStorageIcon,
   GoogleStorageIcon,
   ColorSlackIcon,
+  XenforoIcon,
 } from "@/components/icons/icons";
 import { ValidSources } from "./types";
 import {
@@ -279,6 +280,11 @@ const SOURCE_METADATA_MAP: SourceMap = {
     category: SourceCategory.Storage,
     docs: "https://docs.danswer.dev/connectors/google_storage",
   },
+  xenforo: {
+    icon: XenforoIcon,
+    displayName: "Xenforo",
+    category: SourceCategory.Messaging,
+  },
   ingestion_api: {
     icon: GlobeIcon,
     displayName: "Ingestion",
@@ -251,6 +251,7 @@ const validSources = [
   "s3",
   "r2",
   "google_cloud_storage",
+  "xenforo",
   "oci_storage",
   "not_applicable",
   "ingestion_api",