mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-25 15:30:59 +02:00
Axero Forums Support (#1287)
This commit is contained in:
parent
c7efce3bde
commit
4abf5f27a0
@ -4,6 +4,7 @@ from datetime import timezone
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
from pydantic import BaseModel
|
||||
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
@ -12,6 +13,10 @@ from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
|
||||
process_in_batches,
|
||||
)
|
||||
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
|
||||
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
|
||||
rate_limit_builder,
|
||||
)
|
||||
from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
from danswer.connectors.interfaces import PollConnector
|
||||
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
@ -31,6 +36,15 @@ def _get_auth_header(api_key: str) -> dict[str, str]:
|
||||
return {"Rest-Api-Key": api_key}
|
||||
|
||||
|
||||
@retry_builder()
|
||||
@rate_limit_builder(max_calls=5, period=1)
|
||||
def _rate_limited_request(
|
||||
endpoint: str, headers: dict, params: dict | None = None
|
||||
) -> Any:
|
||||
# https://my.axerosolutions.com/spaces/5/communifire-documentation/wiki/view/370/rest-api
|
||||
return requests.get(endpoint, headers=headers, params=params)
|
||||
|
||||
|
||||
# https://my.axerosolutions.com/spaces/5/communifire-documentation/wiki/view/595/rest-api-get-content-list
|
||||
def _get_entities(
|
||||
entity_type: int,
|
||||
@ -56,7 +70,9 @@ def _get_entities(
|
||||
if space_id is not None:
|
||||
params["SpaceID"] = space_id
|
||||
|
||||
res = requests.get(endpoint, headers=_get_auth_header(api_key), params=params)
|
||||
res = _rate_limited_request(
|
||||
endpoint, headers=_get_auth_header(api_key), params=params
|
||||
)
|
||||
res.raise_for_status()
|
||||
|
||||
# Axero limitations:
|
||||
@ -95,6 +111,126 @@ def _get_entities(
|
||||
return pages_to_return
|
||||
|
||||
|
||||
def _get_obj_by_id(obj_id: int, api_key: str, axero_base_url: str) -> dict:
|
||||
endpoint = axero_base_url + f"api/content/{obj_id}"
|
||||
res = _rate_limited_request(endpoint, headers=_get_auth_header(api_key))
|
||||
res.raise_for_status()
|
||||
|
||||
return res.json()
|
||||
|
||||
|
||||
class AxeroForum(BaseModel):
|
||||
doc_id: str
|
||||
title: str
|
||||
link: str
|
||||
initial_content: str
|
||||
responses: list[str]
|
||||
last_update: datetime
|
||||
|
||||
|
||||
def _map_post_to_parent(
|
||||
posts: dict,
|
||||
api_key: str,
|
||||
axero_base_url: str,
|
||||
) -> list[AxeroForum]:
|
||||
"""Cannot handle in batches since the posts aren't ordered or structured in any way
|
||||
may need to map any number of them to the initial post"""
|
||||
epoch_str = "1970-01-01T00:00:00.000"
|
||||
post_map: dict[int, AxeroForum] = {}
|
||||
|
||||
for ind, post in enumerate(posts):
|
||||
if (ind + 1) % 25 == 0:
|
||||
logger.debug(f"Processed {ind + 1} posts or responses")
|
||||
|
||||
post_time = time_str_to_utc(
|
||||
post.get("DateUpdated") or post.get("DateCreated") or epoch_str
|
||||
)
|
||||
p_id = post.get("ParentContentID")
|
||||
if p_id in post_map:
|
||||
axero_forum = post_map[p_id]
|
||||
axero_forum.responses.insert(0, post.get("ContentSummary"))
|
||||
axero_forum.last_update = max(axero_forum.last_update, post_time)
|
||||
else:
|
||||
initial_post_d = _get_obj_by_id(p_id, api_key, axero_base_url)[
|
||||
"ResponseData"
|
||||
]
|
||||
initial_post_time = time_str_to_utc(
|
||||
initial_post_d.get("DateUpdated")
|
||||
or initial_post_d.get("DateCreated")
|
||||
or epoch_str
|
||||
)
|
||||
post_map[p_id] = AxeroForum(
|
||||
doc_id="AXERO_" + str(initial_post_d.get("ContentID")),
|
||||
title=initial_post_d.get("ContentTitle"),
|
||||
link=initial_post_d.get("ContentURL"),
|
||||
initial_content=initial_post_d.get("ContentSummary"),
|
||||
responses=[post.get("ContentSummary")],
|
||||
last_update=max(post_time, initial_post_time),
|
||||
)
|
||||
|
||||
return list(post_map.values())
|
||||
|
||||
|
||||
def _get_forums(
|
||||
api_key: str,
|
||||
axero_base_url: str,
|
||||
space_id: str | None = None,
|
||||
) -> list[dict]:
|
||||
endpoint = axero_base_url + "api/content/list"
|
||||
page_num = 1
|
||||
pages_fetched = 0
|
||||
pages_to_return = []
|
||||
break_out = False
|
||||
|
||||
while True:
|
||||
params = {
|
||||
"EntityType": "54",
|
||||
"SortColumn": "DateUpdated",
|
||||
"SortOrder": "1", # descending
|
||||
"StartPage": str(page_num),
|
||||
}
|
||||
|
||||
if space_id is not None:
|
||||
params["SpaceID"] = space_id
|
||||
|
||||
res = _rate_limited_request(
|
||||
endpoint, headers=_get_auth_header(api_key), params=params
|
||||
)
|
||||
res.raise_for_status()
|
||||
|
||||
data = res.json()
|
||||
total_records = data["TotalRecords"]
|
||||
contents = data["ResponseData"]
|
||||
pages_fetched += len(contents)
|
||||
logger.debug(f"Fetched {pages_fetched} forums")
|
||||
|
||||
for page in contents:
|
||||
pages_to_return.append(page)
|
||||
|
||||
if pages_fetched >= total_records:
|
||||
break
|
||||
|
||||
page_num += 1
|
||||
|
||||
if break_out:
|
||||
break
|
||||
|
||||
return pages_to_return
|
||||
|
||||
|
||||
def _translate_forum_to_doc(af: AxeroForum) -> Document:
|
||||
doc = Document(
|
||||
id=af.doc_id,
|
||||
sections=[Section(link=af.link, text=reply) for reply in af.responses],
|
||||
source=DocumentSource.AXERO,
|
||||
semantic_identifier=af.title,
|
||||
doc_updated_at=af.last_update,
|
||||
metadata={},
|
||||
)
|
||||
|
||||
return doc
|
||||
|
||||
|
||||
def _translate_content_to_doc(content: dict) -> Document:
|
||||
page_text = ""
|
||||
summary = content.get("ContentSummary")
|
||||
@ -126,8 +262,7 @@ class AxeroConnector(PollConnector):
|
||||
include_article: bool = True,
|
||||
include_blog: bool = True,
|
||||
include_wiki: bool = True,
|
||||
# Forums not supported atm
|
||||
include_forum: bool = False,
|
||||
include_forum: bool = True,
|
||||
batch_size: int = INDEX_BATCH_SIZE,
|
||||
) -> None:
|
||||
self.include_article = include_article
|
||||
@ -165,12 +300,11 @@ class AxeroConnector(PollConnector):
|
||||
entity_types.append(4)
|
||||
if self.include_wiki:
|
||||
entity_types.append(9)
|
||||
if self.include_forum:
|
||||
raise NotImplementedError("Forums for Axero not supported currently")
|
||||
|
||||
iterable_space_ids = self.space_ids if self.space_ids else [None]
|
||||
|
||||
for space_id in iterable_space_ids:
|
||||
entity_types = []
|
||||
for entity in entity_types:
|
||||
axero_obj = _get_entities(
|
||||
entity_type=entity,
|
||||
@ -186,6 +320,31 @@ class AxeroConnector(PollConnector):
|
||||
batch_size=self.batch_size,
|
||||
)
|
||||
|
||||
if self.include_forum:
|
||||
forums_posts = _get_forums(
|
||||
api_key=self.axero_key,
|
||||
axero_base_url=self.base_url,
|
||||
space_id=space_id,
|
||||
)
|
||||
|
||||
all_axero_forums = _map_post_to_parent(
|
||||
posts=forums_posts,
|
||||
api_key=self.axero_key,
|
||||
axero_base_url=self.base_url,
|
||||
)
|
||||
|
||||
filtered_forums = [
|
||||
f
|
||||
for f in all_axero_forums
|
||||
if f.last_update >= start_datetime and f.last_update <= end_datetime
|
||||
]
|
||||
|
||||
yield from process_in_batches(
|
||||
objects=filtered_forums,
|
||||
process_function=_translate_forum_to_doc,
|
||||
batch_size=self.batch_size,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import os
|
||||
|
@ -150,8 +150,8 @@ const MainSection = () => {
|
||||
{axeroConnectorIndexingStatuses.length > 0 && (
|
||||
<>
|
||||
<Text className="mb-2">
|
||||
We pull the latest <i>Articles</i>, <i>Blogs</i>, and <i>Wikis</i>{" "}
|
||||
every <b>10</b> minutes.
|
||||
We pull the latest <i>Articles</i>, <i>Blogs</i>, <i>Wikis</i> and{" "}
|
||||
<i>Forums</i> once per day.
|
||||
</Text>
|
||||
<div className="mb-2">
|
||||
<ConnectorsTable<AxeroConfig, AxeroCredentialJson>
|
||||
@ -222,7 +222,7 @@ const MainSection = () => {
|
||||
initialValues={{
|
||||
spaces: [],
|
||||
}}
|
||||
refreshFreq={10 * 60} // 10 minutes
|
||||
refreshFreq={60 * 60 * 24} // 1 day
|
||||
credentialId={axeroCredential.id}
|
||||
/>
|
||||
</Card>
|
||||
|
Loading…
x
Reference in New Issue
Block a user