mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-16 11:00:58 +02:00
Fix Zulip connector schema + links and enable temporal metadata (#4005)
This commit is contained in:
parent
7fd5d31dbe
commit
f371efc916
@ -1,9 +1,12 @@
|
|||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import urllib.parse
|
||||||
from collections.abc import Generator
|
from collections.abc import Generator
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from typing import List
|
from typing import List
|
||||||
from typing import Tuple
|
from typing import Tuple
|
||||||
|
from typing import Dict, Union
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
from zulip import Client
|
from zulip import Client
|
||||||
|
|
||||||
@ -36,9 +39,40 @@ class ZulipConnector(LoadConnector, PollConnector):
|
|||||||
) -> None:
|
) -> None:
|
||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
self.realm_name = realm_name
|
self.realm_name = realm_name
|
||||||
self.realm_url = realm_url if realm_url.endswith("/") else realm_url + "/"
|
|
||||||
|
# Clean and normalize the URL
|
||||||
|
realm_url = realm_url.strip().lower()
|
||||||
|
|
||||||
|
# Remove any trailing slashes
|
||||||
|
realm_url = realm_url.rstrip('/')
|
||||||
|
|
||||||
|
# Ensure the URL has a scheme
|
||||||
|
if not realm_url.startswith(('http://', 'https://')):
|
||||||
|
realm_url = f'https://{realm_url}'
|
||||||
|
|
||||||
|
try:
|
||||||
|
parsed = urllib.parse.urlparse(realm_url)
|
||||||
|
|
||||||
|
# Extract the base domain without any paths or ports
|
||||||
|
netloc = parsed.netloc.split(':')[0] # Remove port if present
|
||||||
|
|
||||||
|
if not netloc:
|
||||||
|
raise ValueError(
|
||||||
|
f"Invalid realm URL format: {realm_url}. "
|
||||||
|
f"URL must include a valid domain name."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Always use HTTPS for security
|
||||||
|
self.base_url = f"https://{netloc}"
|
||||||
self.client: Client | None = None
|
self.client: Client | None = None
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
raise ValueError(
|
||||||
|
f"Failed to parse Zulip realm URL: {realm_url}. "
|
||||||
|
f"Please provide a URL in the format: domain.com or https://domain.com. "
|
||||||
|
f"Error: {str(e)}"
|
||||||
|
)
|
||||||
|
|
||||||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
||||||
contents = credentials["zuliprc_content"]
|
contents = credentials["zuliprc_content"]
|
||||||
# The input field converts newlines to spaces in the provided
|
# The input field converts newlines to spaces in the provided
|
||||||
@ -55,12 +89,17 @@ class ZulipConnector(LoadConnector, PollConnector):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def _message_to_narrow_link(self, m: Message) -> str:
|
def _message_to_narrow_link(self, m: Message) -> str:
|
||||||
|
try:
|
||||||
stream_name = m.display_recipient # assume str
|
stream_name = m.display_recipient # assume str
|
||||||
stream_operand = encode_zulip_narrow_operand(f"{m.stream_id}-{stream_name}")
|
stream_operand = encode_zulip_narrow_operand(f"{m.stream_id}-{stream_name}")
|
||||||
topic_operand = encode_zulip_narrow_operand(m.subject)
|
topic_operand = encode_zulip_narrow_operand(m.subject)
|
||||||
|
|
||||||
narrow_link = f"{self.realm_url}#narrow/stream/{stream_operand}/topic/{topic_operand}/near/{m.id}"
|
narrow_link = f"{self.base_url}#narrow/stream/{stream_operand}/topic/{topic_operand}/near/{m.id}"
|
||||||
return narrow_link
|
return narrow_link
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error generating Zulip message link: {e}")
|
||||||
|
# Fallback to a basic link that at least includes the base URL
|
||||||
|
return f"{self.base_url}#narrow/id/{m.id}"
|
||||||
|
|
||||||
def _get_message_batch(self, anchor: str) -> Tuple[bool, List[Message]]:
|
def _get_message_batch(self, anchor: str) -> Tuple[bool, List[Message]]:
|
||||||
if self.client is None:
|
if self.client is None:
|
||||||
@ -83,6 +122,40 @@ class ZulipConnector(LoadConnector, PollConnector):
|
|||||||
def _message_to_doc(self, message: Message) -> Document:
|
def _message_to_doc(self, message: Message) -> Document:
|
||||||
text = f"{message.sender_full_name}: {message.content}"
|
text = f"{message.sender_full_name}: {message.content}"
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Convert timestamps to UTC datetime objects
|
||||||
|
post_time = datetime.fromtimestamp(message.timestamp, tz=timezone.utc)
|
||||||
|
edit_time = (
|
||||||
|
datetime.fromtimestamp(message.last_edit_timestamp, tz=timezone.utc)
|
||||||
|
if message.last_edit_timestamp is not None
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
|
||||||
|
# Use the most recent edit time if available, otherwise use post time
|
||||||
|
doc_time = edit_time if edit_time is not None else post_time
|
||||||
|
|
||||||
|
except (ValueError, TypeError) as e:
|
||||||
|
logger.warning(f"Failed to parse timestamp for message {message.id}: {e}")
|
||||||
|
post_time = None
|
||||||
|
edit_time = None
|
||||||
|
doc_time = None
|
||||||
|
|
||||||
|
metadata: Dict[str, Union[str, List[str]]] = {
|
||||||
|
"stream_name": str(message.display_recipient),
|
||||||
|
"topic": str(message.subject),
|
||||||
|
"sender_name": str(message.sender_full_name),
|
||||||
|
"sender_email": str(message.sender_email),
|
||||||
|
"message_timestamp": str(message.timestamp),
|
||||||
|
"message_id": str(message.id),
|
||||||
|
"stream_id": str(message.stream_id),
|
||||||
|
"has_reactions": str(len(message.reactions) > 0),
|
||||||
|
"content_type": str(message.content_type or "text"),
|
||||||
|
}
|
||||||
|
|
||||||
|
# Always include edit timestamp in metadata when available
|
||||||
|
if edit_time is not None:
|
||||||
|
metadata["edit_timestamp"] = str(message.last_edit_timestamp)
|
||||||
|
|
||||||
return Document(
|
return Document(
|
||||||
id=f"{message.stream_id}__{message.id}",
|
id=f"{message.stream_id}__{message.id}",
|
||||||
sections=[
|
sections=[
|
||||||
@ -92,8 +165,9 @@ class ZulipConnector(LoadConnector, PollConnector):
|
|||||||
)
|
)
|
||||||
],
|
],
|
||||||
source=DocumentSource.ZULIP,
|
source=DocumentSource.ZULIP,
|
||||||
semantic_identifier=message.display_recipient or message.subject,
|
semantic_identifier=f"{message.display_recipient} > {message.subject}",
|
||||||
metadata={},
|
metadata=metadata,
|
||||||
|
doc_updated_at=doc_time, # Use most recent edit time or post time
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_docs(
|
def _get_docs(
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
from typing import Any
|
from typing import Any
|
||||||
from typing import List
|
from typing import List
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from pydantic import Field
|
from pydantic import Field
|
||||||
@ -19,7 +20,7 @@ class Message(BaseModel):
|
|||||||
sender_realm_str: str
|
sender_realm_str: str
|
||||||
subject: str
|
subject: str
|
||||||
topic_links: Optional[List[Any]] = None
|
topic_links: Optional[List[Any]] = None
|
||||||
last_edit_timestamp: Optional[int]
|
last_edit_timestamp: Optional[int] = None
|
||||||
edit_history: Any = None
|
edit_history: Any = None
|
||||||
reactions: List[Any]
|
reactions: List[Any]
|
||||||
submessages: List[Any]
|
submessages: List[Any]
|
||||||
@ -39,5 +40,5 @@ class GetMessagesResponse(BaseModel):
|
|||||||
found_oldest: Optional[bool] = None
|
found_oldest: Optional[bool] = None
|
||||||
found_newest: Optional[bool] = None
|
found_newest: Optional[bool] = None
|
||||||
history_limited: Optional[bool] = None
|
history_limited: Optional[bool] = None
|
||||||
anchor: Optional[str] = None
|
anchor: Optional[Union[str, int]] = None
|
||||||
messages: List[Message] = Field(default_factory=list)
|
messages: List[Message] = Field(default_factory=list)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user