Mirror of https://github.com/danswer-ai/danswer.git, synced 2025-09-20 13:05:49 +02:00
Feature/xenforo (#2497)
* Xenforo forum parser support
* clarify ssl cert reqs
* missed a file
* add isLoadState function, fix up xenforo for data driven connector approach
* fixing a new edge case to skip an unexpected parsed element
* change documentsource to xenforo
* make doc id unique and comment what's happening
* remove stray log line
* address code review

---------

Co-authored-by: sime2408 <simun.sunjic@gmail.com>
Co-authored-by: Richard Kuo <rkuo@rkuo.com>
@@ -107,6 +107,7 @@ class DocumentSource(str, Enum):
     R2 = "r2"
     GOOGLE_CLOUD_STORAGE = "google_cloud_storage"
     OCI_STORAGE = "oci_storage"
+    XENFORO = "xenforo"
     NOT_APPLICABLE = "not_applicable"
@@ -42,6 +42,7 @@ from danswer.connectors.slack.load_connector import SlackLoadConnector
 from danswer.connectors.teams.connector import TeamsConnector
 from danswer.connectors.web.connector import WebConnector
 from danswer.connectors.wikipedia.connector import WikipediaConnector
+from danswer.connectors.xenforo.connector import XenforoConnector
 from danswer.connectors.zendesk.connector import ZendeskConnector
 from danswer.connectors.zulip.connector import ZulipConnector
 from danswer.db.credentials import backend_update_credential_json
@@ -97,6 +98,7 @@ def identify_connector_class(
         DocumentSource.R2: BlobStorageConnector,
         DocumentSource.GOOGLE_CLOUD_STORAGE: BlobStorageConnector,
         DocumentSource.OCI_STORAGE: BlobStorageConnector,
+        DocumentSource.XENFORO: XenforoConnector,
     }
     connector_by_source = connector_map.get(source, {})
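With the map entry above, the factory can resolve the new source to its connector class. A minimal sketch of how that lookup might be exercised, using the example URL from the connector's own docstring; the exact `identify_connector_class` signature and the factory module path are assumptions, not shown in this diff:

# Minimal sketch, assuming identify_connector_class(source) returns the class
# registered in connector_map; the real signature may also take an input type.
from danswer.configs.constants import DocumentSource
from danswer.connectors.factory import identify_connector_class  # assumed module path

connector_cls = identify_connector_class(DocumentSource.XENFORO)
connector = connector_cls(base_url="https://www.example.com/forum/boards/some-topic/")
for batch in connector.load_from_state():
    print(f"loaded {len(batch)} documents")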
backend/danswer/connectors/xenforo/__init__.py (new file, 0 lines)

backend/danswer/connectors/xenforo/connector.py (new file, 244 lines)
@@ -0,0 +1,244 @@
"""
This is the XenforoConnector class. It is used to connect to a Xenforo forum and load or update documents from the forum.

To use this class, you need to provide the URL of the Xenforo forum board you want to connect to when creating an instance
of the class. The URL should be a string that starts with 'http://' or 'https://', followed by the domain name of the
forum, followed by the board name. For example:

base_url = 'https://www.example.com/forum/boards/some-topic/'

The `load_from_state` method is used to load documents from the forum. It takes an optional `state` parameter, which
can be used to specify a state from which to start loading documents.
"""
import re
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from typing import Any
from urllib.parse import urlparse

import pytz
import requests
from bs4 import BeautifulSoup
from bs4 import Tag

from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.miscellaneous_utils import datetime_to_utc
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger

logger = setup_logger()


def get_title(soup: BeautifulSoup) -> str:
    el = soup.find("h1", "p-title-value")
    if not el:
        return ""
    title = el.text
    for char in (";", ":", "!", "*", "/", "\\", "?", '"', "<", ">", "|"):
        title = title.replace(char, "_")
    return title


def get_pages(soup: BeautifulSoup, url: str) -> list[str]:
    page_tags = soup.select("li.pageNav-page")
    page_numbers = []
    for button in page_tags:
        if re.match(r"^\d+$", button.text):
            page_numbers.append(button.text)

    max_pages = int(max(page_numbers, key=int)) if page_numbers else 1

    all_pages = []
    for x in range(1, int(max_pages) + 1):
        all_pages.append(f"{url}page-{x}")
    return all_pages


def parse_post_date(post_element: BeautifulSoup) -> datetime:
    el = post_element.find("time")
    if not isinstance(el, Tag) or "datetime" not in el.attrs:
        return datetime.utcfromtimestamp(0).replace(tzinfo=timezone.utc)

    date_value = el["datetime"]

    # Ensure date_value is a string (if it's a list, take the first element)
    if isinstance(date_value, list):
        date_value = date_value[0]

    post_date = datetime.strptime(date_value, "%Y-%m-%dT%H:%M:%S%z")
    return datetime_to_utc(post_date)


def scrape_page_posts(
    soup: BeautifulSoup,
    page_index: int,
    url: str,
    initial_run: bool,
    start_time: datetime,
) -> list:
    title = get_title(soup)

    documents = []
    for post in soup.find_all("div", class_="message-inner"):
        post_date = parse_post_date(post)
        if initial_run or post_date > start_time:
            el = post.find("div", class_="bbWrapper")
            if not el:
                continue
            post_text = el.get_text(strip=True) + "\n"
            author_tag = post.find("a", class_="username")
            if author_tag is None:
                author_tag = post.find("span", class_="username")
            author = author_tag.get_text(strip=True) if author_tag else "Deleted author"
            formatted_time = post_date.strftime("%Y-%m-%d %H:%M:%S")

            # TODO: if a caller calls this for each page of a thread, it may see the
            # same post multiple times if there is a sticky post
            # that appears on each page of a thread.
            # it's important to generate unique doc id's, so page index is part of the
            # id. We may want to de-dupe this stuff inside the indexing service.
            document = Document(
                id=f"{DocumentSource.XENFORO.value}_{title}_{page_index}_{formatted_time}",
                sections=[Section(link=url, text=post_text)],
                title=title,
                source=DocumentSource.XENFORO,
                semantic_identifier=title,
                primary_owners=[BasicExpertInfo(display_name=author)],
                metadata={
                    "type": "post",
                    "author": author,
                    "time": formatted_time,
                },
                doc_updated_at=post_date,
            )

            documents.append(document)
    return documents


class XenforoConnector(LoadConnector):
    # Class variable to track if the connector has been run before
    has_been_run_before = False

    def __init__(self, base_url: str) -> None:
        self.base_url = base_url
        self.initial_run = not XenforoConnector.has_been_run_before
        self.start = datetime.utcnow().replace(tzinfo=pytz.utc) - timedelta(days=1)
        self.cookies: dict[str, str] = {}
        # mimic user browser to avoid being blocked by the website (see: https://www.useragents.me/)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/121.0.0.0 Safari/537.36"
        }

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        if credentials:
            logger.warning("Unexpected credentials provided for Xenforo Connector")
        return None

    def load_from_state(self) -> GenerateDocumentsOutput:
        # Standardize URL to always end in /.
        if self.base_url[-1] != "/":
            self.base_url += "/"

        # Remove all extra parameters from the end such as page, post.
        matches = ("threads/", "boards/", "forums/")
        for each in matches:
            if each in self.base_url:
                try:
                    self.base_url = self.base_url[
                        0 : self.base_url.index(
                            "/", self.base_url.index(each) + len(each)
                        )
                        + 1
                    ]
                except ValueError:
                    pass

        doc_batch: list[Document] = []
        all_threads = []

        # If the URL contains "boards/" or "forums/", find all threads.
        if "boards/" in self.base_url or "forums/" in self.base_url:
            pages = get_pages(self.requestsite(self.base_url), self.base_url)

            # Get all pages on thread_list_page
            for pre_count, thread_list_page in enumerate(pages, start=1):
                logger.info(
                    f"Getting pages from thread_list_page.. Current: {pre_count}/{len(pages)}\r"
                )
                all_threads += self.get_threads(thread_list_page)
        # If the URL contains "threads/", add the thread to the list.
        elif "threads/" in self.base_url:
            all_threads.append(self.base_url)

        # Process all threads
        for thread_count, thread_url in enumerate(all_threads, start=1):
            soup = self.requestsite(thread_url)
            if soup is None:
                logger.error(f"Failed to load page: {self.base_url}")
                continue
            pages = get_pages(soup, thread_url)
            # Getting all pages for all threads
            for page_index, page in enumerate(pages, start=1):
                logger.info(
                    f"Progress: Page {page_index}/{len(pages)} - Thread {thread_count}/{len(all_threads)}\r"
                )
                soup_page = self.requestsite(page)
                doc_batch.extend(
                    scrape_page_posts(
                        soup_page, page_index, thread_url, self.initial_run, self.start
                    )
                )
            if doc_batch:
                yield doc_batch

        # Mark the initial run finished after all threads and pages have been processed
        XenforoConnector.has_been_run_before = True

    def get_threads(self, url: str) -> list[str]:
        soup = self.requestsite(url)
        thread_tags = soup.find_all(class_="structItem-title")
        base_url = "{uri.scheme}://{uri.netloc}".format(uri=urlparse(url))
        threads = []
        for x in thread_tags:
            y = x.find_all(href=True)
            for element in y:
                link = element["href"]
                if "threads/" in link:
                    stripped = link[0 : link.rfind("/") + 1]
                    if base_url + stripped not in threads:
                        threads.append(base_url + stripped)
        return threads

    def requestsite(self, url: str) -> BeautifulSoup:
        try:
            response = requests.get(
                url, cookies=self.cookies, headers=self.headers, timeout=10
            )
            if response.status_code != 200:
                logger.error(
                    f"<{url}> Request Error: {response.status_code} - {response.reason}"
                )
            return BeautifulSoup(response.text, "html.parser")
        except TimeoutError:
            logger.error("Timed out Error.")
        except Exception as e:
            logger.error(f"Error on {url}")
            logger.exception(e)
        return BeautifulSoup("", "html.parser")


if __name__ == "__main__":
    connector = XenforoConnector(
        # base_url="https://cassiopaea.org/forum/threads/how-to-change-your-emotional-state.41381/"
        base_url="https://xenforo.com/community/threads/whats-new-with-enhanced-search-resource-manager-and-media-gallery-in-xenforo-2-3.220935/"
    )
    document_batches = connector.load_from_state()
    print(next(document_batches))
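The TODO inside scrape_page_posts notes that a sticky post can reappear on every page of a thread, so the same content may be indexed more than once; the page index only keeps the document ids unique, it does not remove the duplicates. A small sketch of the kind of de-duplication the comment suggests, keyed on author, timestamp, and text rather than the page-specific id. This is illustrative only and not part of the commit; it relies only on the Document fields set above:

# Illustrative sketch only (not part of this commit): drop repeated posts that
# differ only by page index, keyed on (author, post time, text).
from danswer.connectors.models import Document


def dedupe_posts(documents: list[Document]) -> list[Document]:
    seen: set[tuple[str, str, str]] = set()
    unique_docs: list[Document] = []
    for doc in documents:
        key = (
            str(doc.metadata.get("author", "")),
            str(doc.metadata.get("time", "")),
            doc.sections[0].text if doc.sections else "",
        )
        if key not in seen:
            seen.add(key)
            unique_docs.append(doc)
    return unique_docs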
web/public/Xenforo.svg (new file, 1 line)

@@ -0,0 +1 @@
<svg enable-background="new 0 0 1024 1024" viewBox="0 0 1024 1024" xmlns="http://www.w3.org/2000/svg"><circle cx="512" cy="512" fill="#006296" r="512"/><path d="m235 511.5 35.9 52.3h-17.3l-26.2-40-26.3 40h-17.3l35.9-51.7-32.2-47.2h17.5l22.4 36m4.2-6.5 18.3-29.5h17.3l-27.9 40.3m49.9.6c0-2.2 0-4.3.2-6.2v-.7c.6-8.8 2.2-14.7 5-17.7 3.5-3.8 11.1-5.6 22.8-5.6s19.3 1.6 22.8 4.8c3.3 3 5 9.2 5.2 18.6h14.6c-.9-13-3.6-21.9-8.1-26.6-5.7-5.9-17.6-8.8-35.8-8.8-16.7 0-27.9 3.4-33.6 10.1-5.7 6.8-8.6 20-8.5 39.7 0 21.3 2.8 35.4 8.5 42 5.7 6.7 17.6 10 35.8 10 15.6 0 26.6-2.1 32.8-6.2s9.4-11.3 9.4-21.6l-.1-3.6h-14.8v3.1c0 6.8-1.7 11.1-5.2 13.2-3.5 2-11.1 3-22.7 2.9-12.3 0-20.1-2.1-23.4-6.4s-5-14.2-5-29.8h71.2v-7.8c0-1.2 0-2.4 0-3.5zm.2-7v.7c0-.2 0-.4 0-.7zm132.2-23.3c9.2 0 15.5 1.5 18.7 4.4s4.9 8.6 4.9 17.1v1.8h14.6c0-13.7-2.6-23-7.5-27.9s-14.6-7.4-28.7-7.4c-17.5 0-28.5 5.1-32.9 15.2l-.4-.3.4-13.4h-14v33.9h14.8c1.8-15.7 11.8-23.4 30.1-23.4zm-45 30.3h14.5v58h-14.5zm68.6 0h14.5v58h-14.5zm50.9-20.4v-49.3h70.6v-13.4h-86.6v76.2h82.5c.4-4.5 1.1-9 2.1-13.4zm-15.9 20.4h15.9v58h-15.9zm165.8 8.6c0 17.6-1.5 28.4-4.6 32.6s-11.1 6.3-24 6.3c-12.8 0-20.8-2.1-23.8-6.3-3.1-4.2-4.6-15-4.6-32.6 0-3.1 0-6 .1-8.6h-15.1c-.1 2.7-.2 5.6-.2 8.6 0 21.2 2.7 35 8.2 41.4 5.4 6.3 17.3 9.5 35.6 9.5s30.1-3.2 35.6-9.5 8.2-20.1 8.2-41.4c0-3 0-5.9-.2-8.6h-15.2zm-52.3-32.7c3.1-4.2 11-6.3 23.8-6.3s20.8 2.1 24 6.3c2 2.8 3.4 8.5 4 17.1h15.1c-1.1-12.6-3.6-21.2-7.6-25.9-5.4-6.3-17.3-9.5-35.6-9.5s-30.1 3.2-35.5 9.5c-4 4.6-6.5 13.3-7.6 25.9h15.1c.9-8.6 2.2-14.3 4.3-17.1zm81.2 24.1h14.5v58h-14.5zm38.7-31.2c11 0 16.5 5.1 16.5 15.2 0 .9-.1 2.7-.3 5.3l-.4 3.7h14.3l.2-8c0-18.8-9.1-28.1-27.3-28.1-13.2 0-22.6 4.7-28.3 14l-.3-.3 1.4-11.4h-14.5v34h14.7c1.5-16.3 9.4-24.4 24-24.4zm118.4-1.6c-5.4-6.3-17.3-9.5-35.6-9.5s-30.1 3.2-35.5 9.5c-4 4.6-6.5 13.3-7.6 25.9h15.1c.7-8.6 2-14.3 4-17.1 3.1-4.2 11-6.3 23.9-6.3 12.8 0 20.8 2.1 24 6.3 3.1 4.2 4.6 15.1 4.6 32.6 0 17.6-1.5 28.5-4.6 32.6-3.1 4.2-11.1 6.3-24 6.3s-20.9-2.1-23.9-6.3c-3.1-4.2-4.6-15-4.6-32.6 0-3.1 0-6 .1-8.6h-15.2c-.1 2.7-.2 5.6-.2 8.6 0 21.2 2.7 35 8.2 41.4 5.4 6.3 17.3 9.5 35.5 9.5 18.3 0 30.1-3.2 35.6-9.5s8.2-20.1 8.2-41.4c.1-21.3-2.6-35-8-41.4z" fill="#fff"/></svg>
@@ -28,6 +28,7 @@ import {
   createConnectorValidationSchema,
   defaultPruneFreqDays,
   defaultRefreshFreqMinutes,
+  isLoadState,
 } from "@/lib/connectors/connectors";
 import { Modal } from "@/components/Modal";
 import GDriveMain from "./pages/gdrive/GoogleDrivePage";
@@ -316,7 +317,7 @@ export default function AddConnector({
     const { message, isSuccess, response } = await submitConnector<any>(
       {
         connector_specific_config: transformedConnectorSpecificConfig,
-        input_type: connector == "web" ? "load_state" : "poll", // single case
+        input_type: isLoadState(connector) ? "load_state" : "poll", // single case
         name: name,
         source: connector,
         is_public: access_type == "public",
@@ -86,7 +86,7 @@ import clickupIcon from "../../../public/Clickup.svg";
 import cohereIcon from "../../../public/Cohere.svg";
 import voyageIcon from "../../../public/Voyage.png";
 import googleIcon from "../../../public/Google.webp";

+import xenforoIcon from "../../../public/Xenforo.svg";
 import { FaRobot } from "react-icons/fa";

 export interface IconProps {
@@ -2811,6 +2811,21 @@ export const WindowsIcon = ({
     </svg>
   );
 };
+
+export const XenforoIcon = ({
+  size = 16,
+  className = defaultTailwindCSS,
+}: IconProps) => {
+  return (
+    <div
+      style={{ width: `${size}px`, height: `${size}px` }}
+      className={`w-[${size}px] h-[${size}px] ` + className}
+    >
+      <Image src={xenforoIcon} alt="Logo" width="96" height="96" />
+    </div>
+  );
+};
+
 export const AsanaIcon = ({
   size = 16,
   className = defaultTailwindCSS,
@@ -3,6 +3,16 @@ import { IsPublicGroupSelectorFormType } from "@/components/IsPublicGroupSelecto
 import { ConfigurableSources, ValidInputTypes, ValidSources } from "../types";
 import { AccessTypeGroupSelectorFormType } from "@/components/admin/connectors/AccessTypeGroupSelector";

+export function isLoadState(connector_name: string): boolean {
+  // TODO: centralize connector metadata like this somewhere instead of hardcoding it here
+  const loadStateConnectors = ["web", "xenforo"];
+  if (loadStateConnectors.includes(connector_name)) {
+    return true;
+  }
+
+  return false;
+}
+
 export type InputType =
   | "list"
   | "text"
@@ -764,6 +774,20 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
       },
     ],
   },
+  xenforo: {
+    description: "Configure Xenforo connector",
+    values: [
+      {
+        type: "text",
+        query: "Enter forum or thread URL:",
+        label: "URL",
+        name: "base_url",
+        optional: false,
+        description:
+          "The XenForo v2.2 forum URL to index. Can be board or thread.",
+      },
+    ],
+  },
   asana: {
     description: "Configure Asana connector",
     values: [
@@ -1054,6 +1078,10 @@ export interface GoogleSitesConfig {
   base_url: string;
 }

+export interface XenforoConfig {
+  base_url: string;
+}
+
 export interface ZendeskConfig {}

 export interface DropboxConfig {}
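On the backend, the JSON matching `XenforoConfig` is what the connector receives as its connector-specific configuration and ultimately becomes the constructor argument of `XenforoConnector`. A hedged sketch of that handoff in Python; the unpacking step is an assumption for illustration, since this diff only shows the config shape and the constructor:

# Hedged sketch: the {"base_url": ...} payload captured by the Xenforo form,
# unpacked into the connector's constructor. The unpacking itself is assumed.
from danswer.connectors.xenforo.connector import XenforoConnector

connector_specific_config = {"base_url": "https://www.example.com/forum/boards/some-topic/"}
connector = XenforoConnector(**connector_specific_config)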
@@ -289,6 +289,7 @@ export const credentialTemplates: Record<ValidSources, any> = {
     access_key_id: "",
     secret_access_key: "",
   } as OCICredentialJson,
+  xenforo: null,
   google_sites: null,
   file: null,
   wikipedia: null,
@@ -37,6 +37,7 @@ import {
   OCIStorageIcon,
   GoogleStorageIcon,
   ColorSlackIcon,
+  XenforoIcon,
 } from "@/components/icons/icons";
 import { ValidSources } from "./types";
 import {
@@ -279,6 +280,11 @@ const SOURCE_METADATA_MAP: SourceMap = {
     category: SourceCategory.Storage,
     docs: "https://docs.danswer.dev/connectors/google_storage",
   },
+  xenforo: {
+    icon: XenforoIcon,
+    displayName: "Xenforo",
+    category: SourceCategory.Messaging,
+  },
   ingestion_api: {
     icon: GlobeIcon,
     displayName: "Ingestion",
@@ -251,6 +251,7 @@ const validSources = [
   "s3",
   "r2",
   "google_cloud_storage",
+  "xenforo",
   "oci_storage",
   "not_applicable",
   "ingestion_api",