Discourse Connector (#1420)

This commit is contained in:
Yuhong Sun 2024-05-05 16:54:08 -07:00 committed by GitHub
parent 03911de8b2
commit 060a8d0aad
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 522 additions and 0 deletions

View File

@ -94,6 +94,7 @@ class DocumentSource(str, Enum):
ZENDESK = "zendesk"
LOOPIO = "loopio"
SHAREPOINT = "sharepoint"
DISCOURSE = "discourse"
AXERO = "axero"

View File

@ -0,0 +1,215 @@
import time
import urllib.parse
from datetime import datetime
from datetime import timezone
from typing import Any
import requests
from pydantic import BaseModel
from requests import Response
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger
logger = setup_logger()
class DiscoursePerms(BaseModel):
    """Credential pair sent with every Discourse API request."""

    # Value sent in the "Api-Key" request header.
    api_key: str
    # Value sent in the "Api-Username" request header.
    api_username: str
@retry_builder()
def discourse_request(
    endpoint: str,
    perms: DiscoursePerms,
    params: dict | None = None,
    timeout: float = 60,
) -> Response:
    """Issue an authenticated GET request against a Discourse endpoint.

    Args:
        endpoint: Absolute URL to request.
        perms: API key and username, sent as the ``Api-Key`` and
            ``Api-Username`` headers.
        params: Optional query-string parameters.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled server
            would hang the whole indexing run.

    Returns:
        The successful ``Response``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # NOTE(review): retry_builder() is defined elsewhere; presumably it
    # re-invokes this function when it raises — confirm its retry policy.
    headers = {"Api-Key": perms.api_key, "Api-Username": perms.api_username}
    response = requests.get(
        endpoint, headers=headers, params=params, timeout=timeout
    )
    response.raise_for_status()
    return response
class DiscourseConnector(PollConnector):
    """Poll connector that converts Discourse topics into ``Document`` objects.

    A poll fetches the category list, reads ``latest.json`` once, keeps the
    topics whose last post falls inside the polling window and inside the
    allowed categories, and then fetches each remaining topic individually.
    """

    def __init__(
        self,
        base_url: str,
        categories: list[str] | None = None,
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        """Store the connector configuration.

        Args:
            base_url: Address of the Discourse site. ``https://`` is prepended
                when no URL scheme is present.
            categories: Category names to index (compared case-insensitively).
                ``None`` or an empty list means no category restriction.
            batch_size: Maximum number of documents per yielded batch.
        """
        parsed_url = urllib.parse.urlparse(base_url)
        if not parsed_url.scheme:
            base_url = "https://" + base_url
        self.base_url = base_url

        # Lower-cased once here so later comparisons are case-insensitive.
        self.categories = [c.lower() for c in categories] if categories else []
        # Filled by _get_categories_map(): category id -> category name.
        self.category_id_map: dict[int, str] = {}

        self.batch_size = batch_size
        # Set by load_credentials(); None until then.
        self.permissions: DiscoursePerms | None = None

    def _endpoint(self, path: str) -> str:
        """Return the absolute URL for ``path`` underneath ``self.base_url``."""
        # urljoin() replaces the final path segment of a base that lacks a
        # trailing slash ("https://host/forum" + "latest.json" would become
        # "https://host/latest.json"), so normalise the base first.
        return urllib.parse.urljoin(self.base_url.rstrip("/") + "/", path)

    def _get_categories_map(
        self,
    ) -> None:
        """Populate ``self.category_id_map`` from ``categories.json``.

        Only categories whose lower-cased name is in ``self.categories`` are
        kept; when no categories were configured, all returned ones are kept.
        """
        assert self.permissions is not None
        response = discourse_request(
            endpoint=self._endpoint("categories.json"),
            perms=self.permissions,
            params={"include_subcategories": True},
        )
        categories = response.json()["category_list"]["categories"]

        self.category_id_map = {
            category["id"]: category["name"]
            for category in categories
            if not self.categories or category["name"].lower() in self.categories
        }

    def _get_latest_topics(
        self, start: datetime | None, end: datetime | None
    ) -> list[int]:
        """Return ids of topics from ``latest.json`` that should be indexed.

        A topic is kept when it has a ``last_posted_at`` value inside
        ``[start, end]`` (either bound may be ``None``) and its category is
        allowed.

        NOTE: only the single ``latest.json`` response is examined; no further
        pages are requested.
        """
        assert self.permissions is not None

        valid_categories = set(self.category_id_map)
        if self.categories and not valid_categories:
            # A filter was configured but matched no category. Treating the
            # empty set as "no filter" would index every topic, which is the
            # opposite of what was asked for.
            logger.warning(
                "None of the configured Discourse categories were found: %s",
                self.categories,
            )
            return []
        filter_by_category = bool(valid_categories)

        response = discourse_request(
            endpoint=self._endpoint("latest.json"), perms=self.permissions
        )
        topics = response.json()["topic_list"]["topics"]

        topic_ids: list[int] = []
        for topic in topics:
            last_time = topic.get("last_posted_at")
            if not last_time:
                continue
            last_time_dt = time_str_to_utc(last_time)

            if start and start > last_time_dt:
                continue
            if end and end < last_time_dt:
                continue

            if filter_by_category and topic.get("category_id") not in valid_categories:
                continue

            topic_ids.append(topic["id"])

        return topic_ids

    def _get_doc_from_topic(self, topic_id: int) -> Document:
        """Fetch one topic and convert it into a ``Document``.

        Every post becomes one ``Section`` linking to the topic. The ``name``
        of the first post becomes the primary owner; distinct names on later
        posts become secondary owners.
        """
        assert self.permissions is not None
        response = discourse_request(
            endpoint=self._endpoint(f"t/{topic_id}.json"),
            perms=self.permissions,
        )
        topic = response.json()

        topic_url = self._endpoint(f"t/{topic['slug']}")

        sections = []
        poster = None
        responders = []
        seen_names: set[str] = set()
        for ind, post in enumerate(topic["post_stream"]["posts"]):
            name = post.get("name")
            if ind == 0:
                if name:
                    seen_names.add(name)
                    poster = BasicExpertInfo(display_name=name)
            elif name and name not in seen_names:
                seen_names.add(name)
                responders.append(BasicExpertInfo(display_name=name))

            sections.append(
                Section(link=topic_url, text=parse_html_page_basic(post["cooked"]))
            )

        metadata: dict[str, str | list[str]] = {}
        # The topic's category may be absent from the map (for example when
        # the map is empty), so look it up instead of indexing and raising
        # KeyError for the whole batch.
        category_name = self.category_id_map.get(topic.get("category_id"))
        if category_name is not None:
            metadata["category"] = category_name
        if topic.get("tags"):
            metadata["tags"] = topic["tags"]

        doc = Document(
            id="_".join([DocumentSource.DISCOURSE.value, str(topic["id"])]),
            sections=sections,
            source=DocumentSource.DISCOURSE,
            semantic_identifier=topic["title"],
            doc_updated_at=time_str_to_utc(topic["last_posted_at"]),
            primary_owners=[poster] if poster else None,
            secondary_owners=responders or None,
            metadata=metadata,
        )
        return doc

    def _yield_discourse_documents(
        self, topic_ids: list[int]
    ) -> GenerateDocumentsOutput:
        """Yield documents for ``topic_ids`` in lists of up to ``batch_size``."""
        doc_batch: list[Document] = []
        for topic_id in topic_ids:
            doc_batch.append(self._get_doc_from_topic(topic_id))

            if len(doc_batch) >= self.batch_size:
                yield doc_batch
                doc_batch = []

        # Flush the final, partially filled batch.
        if doc_batch:
            yield doc_batch

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """Read the API key and username from ``credentials``.

        Expects the keys ``discourse_api_key`` and ``discourse_api_username``.
        Always returns ``None``.
        """
        self.permissions = DiscoursePerms(
            api_key=credentials["discourse_api_key"],
            api_username=credentials["discourse_api_username"],
        )
        return None

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        """Return batches of documents for topics last posted in ``[start, end]``.

        Raises:
            ConnectorMissingCredentialError: If ``load_credentials`` has not
                been called yet. Raised immediately, not on first iteration.
        """
        if self.permissions is None:
            raise ConnectorMissingCredentialError("Discourse")

        # fromtimestamp(..., tz=utc) yields the same aware datetime as the
        # deprecated utcfromtimestamp(...).replace(tzinfo=utc).
        start_datetime = datetime.fromtimestamp(start, tz=timezone.utc)
        end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)

        self._get_categories_map()

        latest_topic_ids = self._get_latest_topics(
            start=start_datetime, end=end_datetime
        )

        return self._yield_discourse_documents(latest_topic_ids)
if __name__ == "__main__":
    # Manual smoke test: reads the site address and credentials from the
    # environment and prints the first batch of documents, if any.
    import os

    connector = DiscourseConnector(base_url=os.environ["DISCOURSE_BASE_URL"])
    connector.load_credentials(
        {
            "discourse_api_key": os.environ["DISCOURSE_API_KEY"],
            "discourse_api_username": os.environ["DISCOURSE_API_USERNAME"],
        }
    )

    current = time.time()
    one_year_ago = current - 24 * 60 * 60 * 365  # 365 days, in seconds
    latest_docs = connector.poll_source(one_year_ago, current)
    # A default keeps an empty result from surfacing as StopIteration.
    print(next(latest_docs, None))

View File

@ -6,6 +6,7 @@ from danswer.connectors.axero.connector import AxeroConnector
from danswer.connectors.bookstack.connector import BookstackConnector
from danswer.connectors.confluence.connector import ConfluenceConnector
from danswer.connectors.danswer_jira.connector import JiraConnector
from danswer.connectors.discourse.connector import DiscourseConnector
from danswer.connectors.document360.connector import Document360Connector
from danswer.connectors.file.connector import LocalFileConnector
from danswer.connectors.github.connector import GithubConnector
@ -71,6 +72,7 @@ def identify_connector_class(
DocumentSource.ZENDESK: ZendeskConnector,
DocumentSource.LOOPIO: LoopioConnector,
DocumentSource.SHAREPOINT: SharepointConnector,
DocumentSource.DISCOURSE: DiscourseConnector,
DocumentSource.AXERO: AxeroConnector,
}
connector_by_source = connector_map.get(source, {})

BIN
web/public/Discourse.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 42 KiB

View File

@ -0,0 +1,274 @@
"use client";
import * as Yup from "yup";
import { DiscourseIcon, TrashIcon } from "@/components/icons/icons";
import {
TextFormField,
TextArrayFieldBuilder,
} from "@/components/admin/connectors/Field";
import { HealthCheckBanner } from "@/components/health/healthcheck";
import { CredentialForm } from "@/components/admin/connectors/CredentialForm";
import {
Credential,
ConnectorIndexingStatus,
DiscourseConfig,
DiscourseCredentialJson,
} from "@/lib/types";
import useSWR, { useSWRConfig } from "swr";
import { fetcher } from "@/lib/fetcher";
import { LoadingAnimation } from "@/components/Loading";
import { adminDeleteCredential, linkCredential } from "@/lib/credential";
import { ConnectorForm } from "@/components/admin/connectors/ConnectorForm";
import { ConnectorsTable } from "@/components/admin/connectors/table/ConnectorsTable";
import { usePopup } from "@/components/admin/connectors/Popup";
import { usePublicCredentials } from "@/lib/hooks";
import { Card, Divider, Text, Title } from "@tremor/react";
import { AdminPageTitle } from "@/components/admin/Title";
// Admin page body for the Discourse connector: step 1 collects the API
// credential, step 2 lists existing connectors and offers a form to create
// a new one.
const Main = () => {
  const { popup, setPopup } = usePopup();

  const { mutate } = useSWRConfig();
  const {
    data: connectorIndexingStatuses,
    isLoading: isConnectorIndexingStatusesLoading,
    error: isConnectorIndexingStatusesError,
  } = useSWR<ConnectorIndexingStatus<any, any>[]>(
    "/api/manage/admin/connector/indexing-status",
    fetcher
  );
  const {
    data: credentialsData,
    isLoading: isCredentialsLoading,
    error: isCredentialsError,
    refreshCredentials,
  } = usePublicCredentials();

  // Show the spinner only while there is no data yet for either request.
  if (
    (!connectorIndexingStatuses && isConnectorIndexingStatusesLoading) ||
    (!credentialsData && isCredentialsLoading)
  ) {
    return <LoadingAnimation text="Loading" />;
  }

  if (isConnectorIndexingStatusesError || !connectorIndexingStatuses) {
    return <div>Failed to load connectors</div>;
  }

  if (isCredentialsError || !credentialsData) {
    return <div>Failed to load credentials</div>;
  }

  const discourseConnectorIndexingStatuses: ConnectorIndexingStatus<
    DiscourseConfig,
    DiscourseCredentialJson
  >[] = connectorIndexingStatuses.filter(
    (connectorIndexingStatus) =>
      connectorIndexingStatus.connector.source === "discourse"
  );
  // A credential counts as a Discourse credential when it carries a
  // discourse_api_username field; the first match is used.
  const discourseCredential: Credential<DiscourseCredentialJson> | undefined =
    credentialsData.find(
      (credential) => credential.credential_json?.discourse_api_username
    );

  return (
    <>
      {popup}
      <Text>
        This connector allows you to sync all your Discourse Topics into
        Danswer. More details on how to setup the Discourse connector can be
        found in{" "}
        <a
          className="text-link"
          href="https://docs.danswer.dev/connectors/discourse"
          target="_blank"
          rel="noopener noreferrer"
        >
          this guide.
        </a>
      </Text>

      <Title className="mb-2 mt-6 ml-auto mr-auto">
        Step 1: Provide your API Access info
      </Title>

      {discourseCredential ? (
        <>
          <div className="flex mb-1 text-sm">
            <p className="my-auto">Existing API Key: </p>
            <p className="ml-1 italic my-auto max-w-md truncate">
              {discourseCredential.credential_json?.discourse_api_key}
            </p>
            <button
              className="ml-1 hover:bg-hover rounded p-1"
              onClick={async () => {
                // Refuse to delete a credential while connectors still exist.
                if (discourseConnectorIndexingStatuses.length > 0) {
                  setPopup({
                    type: "error",
                    message:
                      "Must delete all connectors before deleting credentials",
                  });
                  return;
                }
                await adminDeleteCredential(discourseCredential.id);
                refreshCredentials();
              }}
            >
              <TrashIcon />
            </button>
          </div>
        </>
      ) : (
        <>
          <Card className="mt-4">
            <CredentialForm<DiscourseCredentialJson>
              formBody={
                <>
                  <TextFormField
                    name="discourse_api_username"
                    label="API Key Username:"
                  />
                  <TextFormField
                    name="discourse_api_key"
                    label="API Key:"
                    type="password"
                  />
                </>
              }
              validationSchema={Yup.object().shape({
                discourse_api_username: Yup.string().required(
                  "Please enter the Username associated with the API key"
                ),
                discourse_api_key: Yup.string().required(
                  "Please enter the API key"
                ),
              })}
              initialValues={{
                discourse_api_username: "",
                discourse_api_key: "",
              }}
              onSubmit={(isSuccess) => {
                if (isSuccess) {
                  refreshCredentials();
                }
              }}
            />
          </Card>
        </>
      )}

      <Title className="mb-2 mt-6 ml-auto mr-auto">
        Step 2: Which Categories do you want to make searchable?
      </Title>

      {discourseConnectorIndexingStatuses.length > 0 && (
        <>
          <Text className="mb-2">
            We pull Topics with new Posts every <b>10</b> minutes.
          </Text>
          <div className="mb-2">
            <ConnectorsTable<DiscourseConfig, DiscourseCredentialJson>
              connectorIndexingStatuses={discourseConnectorIndexingStatuses}
              liveCredential={discourseCredential}
              getCredential={(credential) =>
                credential.credential_json.discourse_api_username
              }
              specialColumns={[
                {
                  header: "Categories",
                  key: "categories",
                  getValue: (ccPairStatus) => {
                    const categories =
                      ccPairStatus.connector.connector_specific_config
                        .categories;
                    return categories && categories.length > 0
                      ? categories.join(", ")
                      : "";
                  },
                },
              ]}
              includeName={true}
              onUpdate={() =>
                mutate("/api/manage/admin/connector/indexing-status")
              }
              onCredentialLink={async (connectorId) => {
                if (discourseCredential) {
                  await linkCredential(connectorId, discourseCredential.id);
                  mutate("/api/manage/admin/connector/indexing-status");
                }
              }}
            />
          </div>
          <Divider />
        </>
      )}

      {discourseCredential ? (
        <>
          <Card className="mt-4">
            <h2 className="font-bold mb-3">Create a new Discourse Connector</h2>
            <ConnectorForm<DiscourseConfig>
              nameBuilder={(values) =>
                // An empty array is truthy, so the length must be checked
                // too; otherwise the "-All" name is never produced.
                values.categories && values.categories.length > 0
                  ? `${values.base_url}-${values.categories.join("_")}`
                  : `${values.base_url}-All`
              }
              source="discourse"
              inputType="poll"
              formBody={
                <>
                  <TextFormField
                    name="base_url"
                    label="Base URL:"
                    subtext="This might be something like https://danswer.discourse.group/ or https://community.yourcompany.com/"
                  />
                </>
              }
              formBodyBuilder={TextArrayFieldBuilder({
                name: "categories",
                label: "Categories:",
                subtext:
                  "Specify 0 or more Categories to index. If no Categories are specified, Topics from " +
                  "all categories will be indexed.",
              })}
              validationSchema={Yup.object().shape({
                base_url: Yup.string().required(
                  "Please enter the base URL of your Discourse site."
                ),
                categories: Yup.array().of(
                  Yup.string().required("Category names must be strings")
                ),
              })}
              initialValues={{
                categories: [],
                base_url: "",
              }}
              refreshFreq={10 * 60} // 10 minutes
              credentialId={discourseCredential.id}
            />
          </Card>
        </>
      ) : (
        <Text>
          Please provide your API Key Info in Step 1 first! Once done with that,
          you can then start indexing all your Discourse Topics.
        </Text>
      )}
    </>
  );
};
// Route entry point: health banner, page title, then the connector UI.
export default function Page() {
  const titleIcon = <DiscourseIcon size={32} />;

  return (
    <div className="mx-auto container">
      <div className="mb-4">
        <HealthCheckBanner />
      </div>

      <AdminPageTitle icon={titleIcon} title="Discourse" />

      <Main />
    </div>
  );
}

View File

@ -52,6 +52,7 @@ import document360Icon from "../../../public/Document360.png";
import googleSitesIcon from "../../../public/GoogleSites.png";
import zendeskIcon from "../../../public/Zendesk.svg";
import sharepointIcon from "../../../public/Sharepoint.png";
import discourseIcon from "../../../public/Discourse.png";
import { FaRobot } from "react-icons/fa";
interface IconProps {
@ -601,6 +602,18 @@ export const ZendeskIcon = ({
</div>
);
// Discourse logo rendered inside a square wrapper of `size` pixels.
export const DiscourseIcon = ({
  size = 16,
  className = defaultTailwindCSS,
}: IconProps) => {
  const sidePx = `${size}px`;
  const wrapperClasses = `w-[${sidePx}] h-[${sidePx}] ` + className;

  return (
    <div style={{ width: sidePx, height: sidePx }} className={wrapperClasses}>
      <Image src={discourseIcon} alt="Logo" width="96" height="96" />
    </div>
  );
};
export const AxeroIcon = ({
size = 16,
className = defaultTailwindCSS,

View File

@ -2,6 +2,7 @@ import {
AxeroIcon,
BookstackIcon,
ConfluenceIcon,
DiscourseIcon,
Document360Icon,
FileIcon,
GithubIcon,
@ -155,6 +156,11 @@ const SOURCE_METADATA_MAP: SourceMap = {
displayName: "Sharepoint",
category: SourceCategory.AppConnection,
},
discourse: {
icon: DiscourseIcon,
displayName: "Discourse",
category: SourceCategory.AppConnection,
},
axero: {
icon: AxeroIcon,
displayName: "Axero",

View File

@ -39,6 +39,7 @@ export type ValidSources =
| "loopio"
| "sharepoint"
| "zendesk"
| "discourse"
| "axero";
export type ValidInputTypes = "load_state" | "poll" | "event";
@ -118,6 +119,11 @@ export interface SharepointConfig {
sites?: string[];
}
// Connector-specific settings submitted when creating a Discourse connector.
export interface DiscourseConfig {
// Base URL of the Discourse site to index.
base_url: string;
// Category names to index; when omitted or empty, topics from all
// categories are indexed.
categories?: string[];
}
export interface AxeroConfig {
spaces?: string[];
}
@ -337,6 +343,11 @@ export interface SharepointCredentialJson {
aad_directory_id: string;
}
// Shape of the credential JSON stored for a Discourse connector.
export interface DiscourseCredentialJson {
// API key used to authenticate against the Discourse site.
discourse_api_key: string;
// Username associated with the API key.
discourse_api_username: string;
}
export interface AxeroCredentialJson {
base_url: string;
axero_api_token: string;