mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-19 12:03:54 +02:00
welcome to onyx
This commit is contained in:
0
backend/onyx/connectors/zulip/__init__.py
Normal file
0
backend/onyx/connectors/zulip/__init__.py
Normal file
140
backend/onyx/connectors/zulip/connector.py
Normal file
140
backend/onyx/connectors/zulip/connector.py
Normal file
@@ -0,0 +1,140 @@
|
||||
import os
|
||||
import tempfile
|
||||
from collections.abc import Generator
|
||||
from typing import Any
|
||||
from typing import List
|
||||
from typing import Tuple
|
||||
|
||||
from zulip import Client
|
||||
|
||||
from onyx.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.interfaces import GenerateDocumentsOutput
|
||||
from onyx.connectors.interfaces import LoadConnector
|
||||
from onyx.connectors.interfaces import PollConnector
|
||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.zulip.schemas import GetMessagesResponse
|
||||
from onyx.connectors.zulip.schemas import Message
|
||||
from onyx.connectors.zulip.utils import build_search_narrow
|
||||
from onyx.connectors.zulip.utils import call_api
|
||||
from onyx.connectors.zulip.utils import encode_zulip_narrow_operand
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
# Potential improvements
|
||||
# 1. Group messages by topic and create one document per topic per week
|
||||
# 2. Add end date support once https://github.com/zulip/zulip/issues/25436 is solved
|
||||
|
||||
# Module-level logger for the Zulip connector, created with the project's
# setup_logger helper.
logger = setup_logger()
|
||||
|
||||
|
||||
class ZulipConnector(LoadConnector, PollConnector):
    """Connector that turns messages from a Zulip realm into documents.

    Messages are fetched through the Zulip client starting at the newest
    message and walking backwards. Each message becomes one ``Document``
    holding a single ``Section`` whose link points at the message in the
    realm's web UI.
    """

    def __init__(
        self, realm_name: str, realm_url: str, batch_size: int = INDEX_BATCH_SIZE
    ) -> None:
        """Store the realm settings; the client is created in load_credentials.

        Args:
            realm_name: Name of the realm; used in the zuliprc file name.
            realm_url: Base URL of the realm; a trailing slash is added if missing.
            batch_size: Number of documents yielded per batch by the
                load/poll generators.
        """
        self.batch_size = batch_size
        self.realm_name = realm_name
        # Guarantee a trailing slash so "#narrow/..." can be appended directly.
        self.realm_url = realm_url if realm_url.endswith("/") else realm_url + "/"
        self.client: Client | None = None

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """Write the supplied zuliprc content to disk and build the Zulip client.

        Args:
            credentials: Mapping that must contain ``"zuliprc_content"``.

        Returns:
            Always ``None``.

        Raises:
            KeyError: If ``"zuliprc_content"`` is missing.
            FileNotFoundError: If no usable temporary directory can be found.
        """
        contents = credentials["zuliprc_content"]
        # The input field converts newlines to spaces in the provided
        # zuliprc file. This reverts them back to newlines.
        contents_spaces_to_newlines = contents.replace(" ", "\n")
        # Create a temporary zuliprc file. ``tempfile.tempdir`` is only a cache
        # that stays None until gettempdir() has run, so ask gettempdir()
        # directly instead of reading the global.
        tempdir = tempfile.gettempdir()
        config_file = os.path.join(tempdir, f"zuliprc-{self.realm_name}")
        with open(config_file, "w") as f:
            f.write(contents_spaces_to_newlines)
        self.client = Client(config_file=config_file)
        return None

    def _message_to_narrow_link(self, m: Message) -> str:
        """Build the web UI link that narrows to the given message."""
        stream_name = m.display_recipient  # assume str
        stream_operand = encode_zulip_narrow_operand(f"{m.stream_id}-{stream_name}")
        topic_operand = encode_zulip_narrow_operand(m.subject)

        narrow_link = f"{self.realm_url}#narrow/stream/{stream_operand}/topic/{topic_operand}/near/{m.id}"
        return narrow_link

    def _get_message_batch(self, anchor: str) -> Tuple[bool, List[Message]]:
        """Fetch one page of messages located before ``anchor``.

        Args:
            anchor: Anchor value passed to the Zulip search request.

        Returns:
            ``(end, messages)``. ``end`` is True when the response holds no
            messages or reports ``found_oldest``. ``messages`` is the response
            list in reversed order.

        Raises:
            ConnectorMissingCredentialError: If load_credentials was not called.
        """
        if self.client is None:
            raise ConnectorMissingCredentialError("Zulip")

        logger.info(f"Fetching messages starting with anchor={anchor}")
        request = build_search_narrow(
            limit=INDEX_BATCH_SIZE, anchor=anchor, apply_md=False
        )
        response = GetMessagesResponse(**call_api(self.client.get_messages, request))

        end = False
        if len(response.messages) == 0 or response.found_oldest:
            end = True

        # reverse, so that the last message is the new anchor
        # and the order is from newest to oldest
        return end, response.messages[::-1]

    def _message_to_doc(self, message: Message) -> Document:
        """Convert a single Zulip message into a ``Document``."""
        text = f"{message.sender_full_name}: {message.content}"

        return Document(
            id=f"{message.stream_id}__{message.id}",
            sections=[
                Section(
                    link=self._message_to_narrow_link(message),
                    text=text,
                )
            ],
            source=DocumentSource.ZULIP,
            semantic_identifier=message.display_recipient or message.subject,
            metadata={},
        )

    def _get_docs(
        self, anchor: str, start: SecondsSinceUnixEpoch | None = None
    ) -> Generator[Document, None, None]:
        """Yield documents page by page, walking backwards from ``anchor``.

        Args:
            anchor: Anchor for the first request.
            start: If given, iteration stops at the first message whose
                timestamp is older than this value.
        """
        message: Message | None = None
        previous_anchor_id: int | None = None
        while True:
            end, message_batch = self._get_message_batch(anchor)

            for message in message_batch:
                if start is not None and float(message.timestamp) < start:
                    return
                # Zulip returns the anchor message itself along with the
                # messages before it (include_anchor defaults to true), so the
                # previous page's oldest message shows up again here. Skip it
                # so the same document is not yielded twice.
                if message.id == previous_anchor_id:
                    continue
                yield self._message_to_doc(message)

            if end or message is None:
                return

            # Last message is oldest, use as next anchor
            previous_anchor_id = message.id
            anchor = str(message.id)

    def _poll_source(
        self, start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None
    ) -> GenerateDocumentsOutput:
        """Yield lists of at most ``self.batch_size`` documents.

        Args:
            start: Lower time bound forwarded to ``_get_docs``; ``None``
                disables the bound.
            end: Currently unused by this method.
        """
        # Since Zulip doesn't support searching by timestamp,
        # we have to always start from the newest message
        # and go backwards.
        anchor = "newest"

        docs = []
        for doc in self._get_docs(anchor=anchor, start=start):
            docs.append(doc)
            if len(docs) == self.batch_size:
                yield docs
                docs = []
        # Flush the final, partially filled batch.
        if docs:
            yield docs

    def load_from_state(self) -> GenerateDocumentsOutput:
        """Yield document batches with no time bound."""
        return self._poll_source(start=None, end=None)

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        """Yield document batches for messages not older than ``start``."""
        return self._poll_source(start, end)
|
43
backend/onyx/connectors/zulip/schemas.py
Normal file
43
backend/onyx/connectors/zulip/schemas.py
Normal file
@@ -0,0 +1,43 @@
|
||||
from typing import Any
|
||||
from typing import List
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic import Field
|
||||
|
||||
|
||||
class Message(BaseModel):
    """Schema for one entry of ``GetMessagesResponse.messages``."""

    id: int
    sender_id: int
    content: str
    recipient_id: int
    timestamp: int
    client: str
    is_me_message: bool
    sender_full_name: str
    sender_email: str
    sender_realm_str: str
    subject: str
    topic_links: Optional[List[Any]] = None
    # Explicit None default: under pydantic v2 an Optional annotation without
    # a default is still a *required* field. Per the Zulip API docs this key
    # is only present on edited messages, so it must be allowed to be absent.
    last_edit_timestamp: Optional[int] = None
    edit_history: Any = None
    reactions: List[Any]
    submessages: List[Any]
    flags: List[str] = Field(default_factory=list)
    display_recipient: Optional[str] = None
    type: Optional[str] = None
    stream_id: int
    # Explicit None defaults for the same reason as last_edit_timestamp;
    # this matches the implicit default these fields had under pydantic v1.
    avatar_url: Optional[str] = None
    content_type: Optional[str] = None
    rendered_content: Optional[str] = None
|
||||
|
||||
|
||||
class GetMessagesResponse(BaseModel):
    """Schema for the dictionary returned by a Zulip get-messages call.

    Only ``result`` and ``msg`` are required; every other field falls back
    to ``None`` (or an empty list for ``messages``) when absent.
    """

    result: str
    msg: str
    found_anchor: bool | None = None
    found_oldest: bool | None = None
    found_newest: bool | None = None
    history_limited: bool | None = None
    anchor: str | None = None
    # default_factory gives each instance its own fresh list.
    messages: list[Message] = Field(default_factory=list)
|
102
backend/onyx/connectors/zulip/utils.py
Normal file
102
backend/onyx/connectors/zulip/utils.py
Normal file
@@ -0,0 +1,102 @@
|
||||
import time
|
||||
from collections.abc import Callable
|
||||
from typing import Any
|
||||
from typing import Dict
|
||||
from typing import Optional
|
||||
from urllib.parse import quote
|
||||
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
# Module-level logger for the Zulip API helpers, created with the project's
# setup_logger helper.
logger = setup_logger()
|
||||
|
||||
|
||||
class ZulipAPIError(Exception):
|
||||
def __init__(self, code: Any = None, msg: str | None = None) -> None:
|
||||
self.code = code
|
||||
self.msg = msg
|
||||
|
||||
def __str__(self) -> str:
|
||||
return (
|
||||
f"Error occurred during Zulip API call: {self.msg}" + ""
|
||||
if self.code is None
|
||||
else f" ({self.code})"
|
||||
)
|
||||
|
||||
|
||||
class ZulipHTTPError(ZulipAPIError):
    """Zulip API error that carries an HTTP status code instead of an API code.

    Attributes:
        status_code: HTTP status supplied by the caller, or ``None``.
    """

    def __init__(self, msg: str | None = None, status_code: Any = None) -> None:
        # The API-level code is always left unset for HTTP failures.
        super().__init__(msg=msg, code=None)
        self.status_code = status_code

    def __str__(self) -> str:
        """Return a fixed description that embeds the HTTP status code."""
        return "HTTP error {} occurred during Zulip API call".format(
            self.status_code
        )
|
||||
|
||||
|
||||
def __call_with_retry(fun: Callable, *args: Any, **kwargs: Any) -> Dict[str, Any]:
    """Call ``fun`` and repeat the call for as long as Zulip reports a rate limit.

    Args:
        fun: Callable returning a Zulip API result dictionary.
        *args: Positional arguments forwarded to ``fun`` on every attempt.
        **kwargs: Keyword arguments forwarded to ``fun`` on every attempt.

    Returns:
        The first result that is not a ``RATE_LIMIT_HIT`` error. Other error
        results are returned unchanged for the caller to handle.

    Raises:
        KeyError: If a rate-limit result carries no ``"retry-after"`` entry.
    """
    # A loop rather than recursion: the stack stays flat however many times
    # the limit is hit, and every attempt reuses the same args AND kwargs
    # (the recursive version dropped kwargs on retry).
    while True:
        result = fun(*args, **kwargs)
        rate_limited = (
            result.get("result") == "error"
            and result.get("code") == "RATE_LIMIT_HIT"
        )
        if not rate_limited:
            return result

        # Wait one extra second on top of the reported delay.
        retry_after = float(result["retry-after"]) + 1
        # `warning` replaces the deprecated `warn` alias of the logging API.
        logger.warning(f"Rate limit hit, retrying after {retry_after} seconds")
        time.sleep(retry_after)
|
||||
|
||||
|
||||
def __raise_if_error(response: dict[str, Any]) -> None:
    """Turn an unsuccessful Zulip result dictionary into an exception.

    Args:
        response: Result dictionary returned by a Zulip client call.

    Raises:
        ZulipAPIError: If ``response["result"]`` is ``"error"``.
        ZulipHTTPError: If ``response["result"]`` is ``"http-error"``.
    """
    outcome = response.get("result")

    if outcome == "error":
        raise ZulipAPIError(code=response.get("code"), msg=response.get("msg"))

    if outcome == "http-error":
        raise ZulipHTTPError(
            msg=response.get("msg"),
            status_code=response.get("status_code"),
        )
|
||||
|
||||
|
||||
def call_api(fun: Callable, *args: Any, **kwargs: Any) -> Dict[str, Any]:
    """Invoke a Zulip client function and return its successful result.

    The call goes through the rate-limit retry helper; the final result is
    then checked and converted into an exception if it reports an error.

    Args:
        fun: Zulip client callable to invoke.
        *args: Positional arguments for ``fun``.
        **kwargs: Keyword arguments for ``fun``.

    Returns:
        The result dictionary produced by ``fun``.

    Raises:
        ZulipAPIError: If the result reports an API error.
        ZulipHTTPError: If the result reports an HTTP error.
    """
    api_result = __call_with_retry(fun, *args, **kwargs)
    __raise_if_error(api_result)
    return api_result
|
||||
|
||||
|
||||
def build_search_narrow(
    *,
    stream: Optional[str] = None,
    topic: Optional[str] = None,
    limit: int = 100,
    content: Optional[str] = None,
    apply_md: bool = False,
    anchor: str = "newest",
) -> Dict[str, Any]:
    """Assemble the request dictionary for a Zulip message search.

    Args:
        stream: If truthy, adds a ``stream`` filter with this operand.
        topic: If truthy, adds a ``topic`` filter with this operand.
        limit: Value used for ``num_before``.
        content: If truthy, adds a ``has`` filter with this operand.
        apply_md: Value used for ``apply_markdown``.
        anchor: Value used for ``anchor``.

    Returns:
        Dictionary with the keys ``anchor``, ``num_before``, ``num_after``
        (always 0), ``narrow`` and ``apply_markdown``.
    """
    requested = (("stream", stream), ("topic", topic), ("has", content))
    filters = [
        {"operator": operator, "operand": operand}
        for operator, operand in requested
        if operand
    ]

    # With no explicit filter at all, fall back to the public streams.
    if not filters:
        filters.append({"operator": "streams", "operand": "public"})

    request: Dict[str, Any] = {
        "anchor": anchor,
        "num_before": limit,
        "num_after": 0,
        "narrow": filters,
        "apply_markdown": apply_md,
    }
    return request
|
||||
|
||||
|
||||
def encode_zulip_narrow_operand(value: str) -> str:
    """Encode ``value`` for use inside a Zulip ``#narrow/...`` URL fragment.

    The value is percent-encoded, literal dots are escaped, and every ``%``
    is then replaced with ``.``.
    """
    # like https://github.com/zulip/zulip/blob/1577662a6/static/js/hash_util.js#L18-L25
    # safe characters necessary to make Python match Javascript's escaping behaviour,
    # see: https://stackoverflow.com/a/74439601
    percent_encoded = quote(value, safe="!~*'()")
    # Escape dots first so that they are turned into ".2E" by the final step.
    dots_escaped = percent_encoded.replace(".", "%2E")
    return dots_escaped.replace("%", ".")
|
Reference in New Issue
Block a user