Add Document UpdatedAt times for most connectors (#605)
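The thread running through every hunk below is that each connector now stamps its `Document`s with a timezone-aware `doc_updated_at`. A minimal sketch of the three normalization patterns the diff relies on (values are illustrative; all three lines below represent the same instant):

    from datetime import datetime, timezone

    # 1. Aware source timestamp (Jira, Gong, Google Drive, Notion): convert to UTC.
    aware = datetime.fromisoformat("2023-10-15T09:30:12+02:00").astimezone(timezone.utc)

    # 2. Naive-but-already-UTC timestamp (GitHub, HubSpot): attach tzinfo without shifting.
    tagged = datetime(2023, 10, 15, 7, 30, 12).replace(tzinfo=timezone.utc)

    # 3. Epoch seconds (Slack): build an aware datetime directly.
    from_epoch = datetime.fromtimestamp(1697355012.0, tz=timezone.utc)

    print(aware, tagged, from_epoch)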
@@ -47,6 +47,7 @@ def format_document_soup(
     list_element_start = False
     verbatim_output = 0
     in_table = False
+    last_added_newline = False
     for e in document.descendants:
         verbatim_output -= 1
         if isinstance(e, bs4.element.NavigableString):
@@ -57,11 +58,29 @@ def format_document_soup(
                 # Tables are represented in natural language with rows separated by newlines
                 # Can't have newlines then in the table elements
                 element_text = element_text.replace("\n", " ").strip()
+
+            # Some tags are translated to spaces but in the logic underneath this section, we
+            # translate them to newlines as a browser should render them such as with br
+            # This logic here avoids a space after newline when it shouldn't be there.
+            if last_added_newline and element_text.startswith(" "):
+                element_text = element_text[1:]
+                last_added_newline = False
+
             if element_text:
-                if verbatim_output > 0:
-                    text += element_text
-                else:
-                    text += strip_newlines(element_text)
+                content_to_add = (
+                    element_text
+                    if verbatim_output > 0
+                    else strip_newlines(element_text)
+                )
+
+                # Don't join separate elements without any spacing
+                if (text and not text[-1].isspace()) and (
+                    content_to_add and not content_to_add[0].isspace()
+                ):
+                    text += " "
+
+                text += content_to_add
+
                 list_element_start = False
         elif isinstance(e, bs4.element.Tag):
             # table is standard HTML element
@@ -82,9 +101,14 @@ def format_document_soup(
             elif e.name in ["p", "div"]:
                 if not list_element_start:
                     text += "\n"
-            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
+            elif e.name in ["h1", "h2", "h3", "h4"]:
                 text += "\n"
                 list_element_start = False
+                last_added_newline = True
+            elif e.name == "br":
+                text += "\n"
+                list_element_start = False
+                last_added_newline = True
             elif e.name == "li":
                 text += "\n- "
                 list_element_start = True
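The new `content_to_add` logic only injects a separating space when neither side already supplies whitespace, and `last_added_newline` suppresses a stray space right after a header or `<br>` newline. A small sketch of calling the helper, assuming `format_document_soup` lives in the same html_utils module imported later in this diff and accepts a BeautifulSoup object:

    import bs4

    # Assumed import path, mirroring the parse_html_page_basic import below.
    from danswer.connectors.cross_connector_utils.html_utils import format_document_soup

    html = "<div><h1>Release Notes</h1><p>Fixed<br>a bug in <b>parsing</b>.</p></div>"
    soup = bs4.BeautifulSoup(html, "html.parser")

    # Headers and <br> become newlines; adjacent inline elements are joined
    # with at most one space, with no extra space right after a newline.
    print(format_document_soup(soup))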
@@ -3,6 +3,7 @@ from datetime import timezone
 from typing import Any
 from urllib.parse import urlparse
 
+from dateutil.parser import parse
 from jira import JIRA
 from jira.resources import Issue
 
@@ -59,6 +60,8 @@ def fetch_jira_issues_batch(
             logger.warning(f"Found Jira object not of type Issue {jira}")
             continue
 
+        ticket_updated_time = parse(jira.fields.updated)
+
         semantic_rep = (
             f"Jira Ticket Summary: {jira.fields.summary}\n"
             f"Description: {jira.fields.description}\n"
@@ -75,6 +78,7 @@ def fetch_jira_issues_batch(
                 sections=[Section(link=page_url, text=semantic_rep)],
                 source=DocumentSource.JIRA,
                 semantic_identifier=jira.fields.summary,
+                doc_updated_at=ticket_updated_time.astimezone(timezone.utc),
                 metadata={},
             )
         )
@@ -151,3 +155,17 @@ class JiraConnector(LoadConnector, PollConnector):
             start_ind += fetched_batch_size
             if fetched_batch_size < self.batch_size:
                 break
+
+
+if __name__ == "__main__":
+    import os
+
+    connector = JiraConnector(os.environ["JIRA_PROJECT_URL"])
+    connector.load_credentials(
+        {
+            "jira_user_email": os.environ["JIRA_USER_EMAIL"],
+            "jira_api_token": os.environ["JIRA_API_TOKEN"],
+        }
+    )
+    document_batches = connector.load_from_state()
+    print(next(document_batches))
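Jira's `updated` field arrives as an ISO-8601 string with an offset; `dateutil.parser.parse` turns it into an aware datetime that can then be normalized to UTC, as the hunk above does. A small sketch with an illustrative timestamp:

    from datetime import timezone

    from dateutil.parser import parse

    # Example value in the shape Jira typically returns (illustrative only).
    raw = "2023-10-15T09:30:12.000-0700"

    ticket_updated_time = parse(raw)
    print(ticket_updated_time.astimezone(timezone.utc))  # 2023-10-15 16:30:12+00:00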
@@ -5,10 +5,10 @@ from typing import List
 from typing import Optional
 
 import requests
-from bs4 import BeautifulSoup
 
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector
@@ -120,16 +120,21 @@ class Document360Connector(LoadConnector, PollConnector):
             if end is not None and updated_at > end:
                 continue
 
+            authors = [
+                author["email_id"]
+                for author in article_details.get("authors", [])
+                if author["email_id"]
+            ]
+
             doc_link = f"{DOCUMENT360_BASE_URL}/{self.portal_id}/document/v1/view/{article['id']}"
 
             html_content = article_details["html_content"]
-            soup = BeautifulSoup(html_content, "html.parser")
-            article_content = soup.get_text()
+            article_content = parse_html_page_basic(html_content)
             doc_text = (
                 f"workspace: {self.workspace}\n"
                 f"category: {article['category_name']}\n"
                 f"article: {article_details['title']} - "
-                f"{article_details.get('description', '')} - "
+                f"{article_details.get('description', '')}\n"
                 f"{article_content}"
             )
 
@@ -138,6 +143,8 @@ class Document360Connector(LoadConnector, PollConnector):
                 sections=[Section(link=doc_link, text=doc_text)],
                 source=DocumentSource.DOCUMENT360,
                 semantic_identifier=article_details["title"],
+                doc_updated_at=updated_at,
+                primary_owners=authors,
                 metadata={},
             )
 
@@ -163,14 +170,18 @@ class Document360Connector(LoadConnector, PollConnector):
 
 if __name__ == "__main__":
     import time
+    import os
 
-    document360_connector = Document360Connector("Your Workspace", ["Your categories"])
+    document360_connector = Document360Connector(os.environ["DOCUMENT360_WORKSPACE"])
     document360_connector.load_credentials(
-        {"portal_id": "Your Portal ID", "document360_api_token": "Your API Token"}
+        {
+            "portal_id": os.environ["DOCUMENT360_PORTAL_ID"],
+            "document360_api_token": os.environ["DOCUMENT360_API_TOKEN"],
+        }
     )
 
     current = time.time()
-    one_day_ago = current - 24 * 60 * 60  # 1 days
+    one_day_ago = current - 24 * 60 * 60 * 360  # 1 year
    latest_docs = document360_connector.poll_source(one_day_ago, current)
 
     for doc in latest_docs:
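The new `authors` list pulls owner emails out of the article payload and skips empty ones; `primary_owners` then rides along on the `Document`. A runnable sketch with made-up payload data whose field names mirror the comprehension above:

    # Illustrative payload shape, not an actual Document360 API response.
    article_details = {
        "authors": [
            {"email_id": "writer@example.com"},
            {"email_id": ""},  # filtered out by the `if author["email_id"]` guard
            {"email_id": "editor@example.com"},
        ]
    }

    authors = [
        author["email_id"]
        for author in article_details.get("authors", [])
        if author["email_id"]
    ]
    print(authors)  # ['writer@example.com', 'editor@example.com']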
@@ -1,4 +1,6 @@
 from collections.abc import Generator
+from datetime import datetime
+from datetime import timezone
 from pathlib import Path
 from typing import Any
 from typing import IO
@@ -41,6 +43,7 @@ def _open_files_at_location(
 def _process_file(
     file_name: str,
     file: IO[Any],
+    time_updated: datetime,
     pdf_pass: str | None = None,
 ) -> list[Document]:
     extension = get_file_ext(file_name)
@@ -63,6 +66,7 @@ def _process_file(
             sections=[Section(link=metadata.get("link", ""), text=file_content_raw)],
             source=DocumentSource.FILE,
             semantic_identifier=file_name,
+            doc_updated_at=time_updated,
             metadata={},
         )
     ]
@@ -85,10 +89,13 @@ class LocalFileConnector(LoadConnector):
     def load_from_state(self) -> GenerateDocumentsOutput:
         documents: list[Document] = []
         for file_location in self.file_locations:
+            current_datetime = datetime.now(timezone.utc)
            files = _open_files_at_location(file_location)
 
             for file_name, file in files:
-                documents.extend(_process_file(file_name, file, self.pdf_pass))
+                documents.extend(
+                    _process_file(file_name, file, current_datetime, self.pdf_pass)
+                )
 
                 if len(documents) >= self.batch_size:
                     yield documents
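Local files have no upstream timestamp, so each batch is stamped with one aware capture time. The distinction that matters is naive versus timezone-aware datetimes (standard-library behavior):

    from datetime import datetime, timezone

    naive = datetime.now()              # local wall clock, tzinfo is None
    aware = datetime.now(timezone.utc)  # explicit UTC, safe to compare and store

    print(naive.tzinfo)  # None
    print(aware.tzinfo)  # UTC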
@@ -1,6 +1,7 @@
 import itertools
 from collections.abc import Iterator
 from datetime import datetime
+from datetime import timezone
 from typing import Any
 from typing import cast
 
@@ -42,8 +43,11 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document:
         sections=[Section(link=pull_request.html_url, text=full_context)],
         source=DocumentSource.GITHUB,
         semantic_identifier=pull_request.title,
+        # updated_at is UTC time but is timezone unaware, explicitly add UTC
+        # as there is logic in indexing to prevent wrong timestamped docs
+        # due to local time discrepancies with UTC
+        doc_updated_at=pull_request.updated_at.replace(tzinfo=timezone.utc),
         metadata={
-            "last_modified": str(pull_request.last_modified),
             "merged": pull_request.merged,
             "state": pull_request.state,
         },
@@ -62,8 +66,9 @@ def _convert_issue_to_document(issue: Issue) -> Document:
         sections=[Section(link=issue.html_url, text=full_context)],
         source=DocumentSource.GITHUB,
         semantic_identifier=issue.title,
+        # updated_at is UTC time but is timezone unaware
+        doc_updated_at=issue.updated_at.replace(tzinfo=timezone.utc),
         metadata={
-            "last_modified": str(issue.updated_at),
             "state": issue.state,
         },
     )
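The comment above is the crux: PyGithub hands back naive datetimes that are already in UTC, so the right fix is to attach tzinfo with replace(); astimezone() would instead reinterpret the naive value through the local zone first. Standard-library behavior, sketched:

    from datetime import datetime, timezone

    # A naive value that is already UTC, as the comment above describes.
    naive_utc = datetime(2023, 10, 15, 16, 30, 12)

    tagged = naive_utc.replace(tzinfo=timezone.utc)  # same clock reading, now explicit UTC
    shifted = naive_utc.astimezone(timezone.utc)     # treats the naive value as local time first

    print(tagged)   # 2023-10-15 16:30:12+00:00
    print(shifted)  # differs from `tagged` unless the local zone is already UTC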
@@ -90,6 +90,9 @@ class GongConnector(LoadConnector, PollConnector):
             response = requests.post(
                 url, headers=self._get_auth_header(), json=body
             )
+            # If no calls in the range, just break out
+            if response.status_code == 404:
+                break
             response.raise_for_status()
 
             data = response.json()
@@ -223,6 +226,9 @@ class GongConnector(LoadConnector, PollConnector):
                     source=DocumentSource.GONG,
                     # Should not ever be Untitled as a call cannot be made without a Title
                     semantic_identifier=call_title or "Untitled",
+                    doc_updated_at=datetime.fromisoformat(
+                        call_metadata["started"]
+                    ).astimezone(timezone.utc),
                     metadata={"Start Time": call_metadata["started"]},
                 )
             )
@@ -270,6 +276,5 @@ if __name__ == "__main__":
     )
 
     current = time.time()
-    one_day_ago = current - 24 * 60 * 60  # 1 day
-    latest_docs = connector.poll_source(one_day_ago, current)
+    latest_docs = connector.load_from_state()
     print(next(latest_docs))
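The 404 check turns "no calls in the requested window" into a clean exit from the paging loop instead of an exception. A stripped-down version of that loop with placeholder URL, auth, and response fields (the field names are illustrative, not the real Gong schema):

    import requests

    def fetch_all(url: str, headers: dict[str, str], body: dict) -> list[dict]:
        results: list[dict] = []
        while True:
            response = requests.post(url, headers=headers, json=body)
            if response.status_code == 404:
                # Nothing in the requested range: stop quietly instead of raising.
                break
            response.raise_for_status()

            data = response.json()
            results.extend(data.get("items", []))

            cursor = data.get("next_cursor")
            if not cursor:
                break
            body["cursor"] = cursor
        return results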
@@ -1,8 +1,9 @@
-import datetime
 import io
 import tempfile
 from collections.abc import Iterator
 from collections.abc import Sequence
+from datetime import datetime
+from datetime import timezone
 from enum import Enum
 from itertools import chain
 from typing import Any
@@ -83,7 +84,7 @@ def _run_drive_file_query(
                 includeItemsFromAllDrives=include_shared,
                 fields=(
                     "nextPageToken, files(mimeType, id, name, "
-                    "webViewLink, shortcutDetails)"
+                    "modifiedTime, webViewLink, shortcutDetails)"
                 ),
                 pageToken=next_page_token,
                 q=query,
@@ -194,12 +195,10 @@ def _get_files(
 ) -> Iterator[GoogleDriveFileType]:
     query = f"mimeType != '{DRIVE_FOLDER_TYPE}' "
     if time_range_start is not None:
-        time_start = (
-            datetime.datetime.utcfromtimestamp(time_range_start).isoformat() + "Z"
-        )
+        time_start = datetime.utcfromtimestamp(time_range_start).isoformat() + "Z"
         query += f"and modifiedTime >= '{time_start}' "
     if time_range_end is not None:
-        time_stop = datetime.datetime.utcfromtimestamp(time_range_end).isoformat() + "Z"
+        time_stop = datetime.utcfromtimestamp(time_range_end).isoformat() + "Z"
         query += f"and modifiedTime <= '{time_stop}' "
     if folder_id:
         query += f"and '{folder_id}' in parents "
@@ -464,6 +463,9 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
                     ],
                     source=DocumentSource.GOOGLE_DRIVE,
                     semantic_identifier=file["name"],
+                    doc_updated_at=datetime.fromisoformat(
+                        file["modifiedTime"]
+                    ).astimezone(timezone.utc),
                     metadata={} if text_contents else {IGNORE_FOR_QA: True},
                 )
             )
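With `modifiedTime` now requested in the file fields, polling builds a Drive query bounded by the epoch-seconds window. A small sketch of that query construction with made-up timestamps (the folder MIME type constant is assumed here, since the diff doesn't show its value):

    from datetime import datetime

    DRIVE_FOLDER_TYPE = "application/vnd.google-apps.folder"  # assumed value

    time_range_start = 1_696_000_000.0  # epoch seconds, illustrative
    time_range_end = 1_696_086_400.0

    query = f"mimeType != '{DRIVE_FOLDER_TYPE}' "
    time_start = datetime.utcfromtimestamp(time_range_start).isoformat() + "Z"
    query += f"and modifiedTime >= '{time_start}' "
    time_stop = datetime.utcfromtimestamp(time_range_end).isoformat() + "Z"
    query += f"and modifiedTime <= '{time_stop}' "
    print(query)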
@@ -1,4 +1,5 @@
 from datetime import datetime
+from datetime import timezone
 from typing import Any
 
 import requests
@@ -106,6 +107,8 @@ class HubSpotConnector(LoadConnector, PollConnector):
                     sections=[Section(link=link, text=content_text)],
                     source=DocumentSource.HUBSPOT,
                     semantic_identifier=title,
+                    # Is already in tzutc, just replacing the timezone format
+                    doc_updated_at=ticket.updated_at.replace(tzinfo=timezone.utc),
                     metadata={},
                 )
             )
@@ -130,15 +133,11 @@ class HubSpotConnector(LoadConnector, PollConnector):
 
 if __name__ == "__main__":
     import os
-    import time
 
-    test_connector = HubSpotConnector()
-    test_connector.load_credentials(
+    connector = HubSpotConnector()
+    connector.load_credentials(
         {"hubspot_access_token": os.environ["HUBSPOT_ACCESS_TOKEN"]}
     )
-    all_docs = test_connector.load_from_state()
 
-    current = time.time()
-    one_day_ago = current - 24 * 60 * 60  # 1 day
-    latest_docs = test_connector.poll_source(one_day_ago, current)
-    print(latest_docs)
+    document_batches = connector.load_from_state()
+    print(next(document_batches))
@@ -2,6 +2,8 @@ import time
 from collections.abc import Generator
 from dataclasses import dataclass
 from dataclasses import fields
+from datetime import datetime
+from datetime import timezone
 from typing import Any
 from typing import Optional
 
@@ -191,6 +193,9 @@ class NotionConnector(LoadConnector, PollConnector):
                     ],
                     source=DocumentSource.NOTION,
                     semantic_identifier=page_title,
+                    doc_updated_at=datetime.fromisoformat(
+                        page.last_edited_time
+                    ).astimezone(timezone.utc),
                     metadata={},
                 )
             )
@@ -323,8 +328,7 @@
 if __name__ == "__main__":
     import os
 
-    root_page_id = os.environ.get("NOTION_ROOT_PAGE_ID")
-    connector = NotionConnector(root_page_id=root_page_id)
+    connector = NotionConnector()
     connector.load_credentials(
         {"notion_integration_token": os.environ.get("NOTION_INTEGRATION_TOKEN")}
     )
@@ -1,7 +1,8 @@
 import json
-import os
 from collections.abc import Callable
 from collections.abc import Generator
+from datetime import datetime
+from datetime import timezone
 from pathlib import Path
 from typing import Any
 from typing import cast
@@ -128,6 +129,18 @@ def get_thread(client: WebClient, channel_id: str, thread_id: str) -> ThreadType
     return threads
 
 
+def get_latest_message_time(thread: ThreadType) -> datetime:
+    max_ts = max([float(msg.get("ts", 0)) for msg in thread])
+    return datetime.fromtimestamp(max_ts, tz=timezone.utc)
+
+
+def get_event_time(event: dict[str, Any]) -> datetime | None:
+    ts = event.get("ts")
+    if not ts:
+        return None
+    return datetime.fromtimestamp(float(ts), tz=timezone.utc)
+
+
 def thread_to_doc(
     workspace: str,
     channel: ChannelType,
@@ -148,6 +161,7 @@ def thread_to_doc(
         ],
         source=DocumentSource.SLACK,
         semantic_identifier=channel["name"],
+        doc_updated_at=get_latest_message_time(thread),
         title="",  # slack docs don't really have a "title"
         metadata={},
     )
@@ -304,6 +318,7 @@ class SlackLoadConnector(LoadConnector):
                         source=matching_doc.source,
                         semantic_identifier=matching_doc.semantic_identifier,
                         title="",  # slack docs don't really have a "title"
+                        doc_updated_at=get_event_time(slack_event),
                         metadata=matching_doc.metadata,
                     )
 
@@ -322,6 +337,7 @@ class SlackLoadConnector(LoadConnector):
                         source=DocumentSource.SLACK,
                         semantic_identifier=channel["name"],
                         title="",  # slack docs don't really have a "title"
+                        doc_updated_at=get_event_time(slack_event),
                         metadata={},
                     )
 
@@ -403,3 +419,19 @@ class SlackPollConnector(PollConnector):
 
             if documents:
                 yield documents
+
+
+if __name__ == "__main__":
+    import os
+    import time
+
+    connector = SlackPollConnector(
+        workspace=os.environ["SLACK_WORKSPACE"], channels=[os.environ["SLACK_CHANNEL"]]
+    )
+    connector.load_credentials({"slack_bot_token": os.environ["SLACK_BOT_TOKEN"]})
+
+    current = time.time()
+    one_day_ago = current - 24 * 60 * 60  # 1 day
+    document_batches = connector.poll_source(one_day_ago, current)
+
+    print(next(document_batches))
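`get_latest_message_time` reduces a thread to its newest `ts` and returns an aware UTC datetime, which becomes the thread document's `doc_updated_at`. A runnable sketch with a fabricated thread (Slack `ts` values are epoch-seconds strings):

    from datetime import datetime, timezone

    # Fabricated thread; only the "ts" field matters here.
    thread = [
        {"ts": "1696000000.000100", "text": "first message"},
        {"ts": "1696003600.000200", "text": "latest reply"},
    ]

    max_ts = max(float(msg.get("ts", 0)) for msg in thread)
    print(datetime.fromtimestamp(max_ts, tz=timezone.utc))  # aware UTC time of the newest message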
@@ -1,5 +1,4 @@
 import io
-from datetime import datetime
 from enum import Enum
 from typing import Any
 from typing import cast
@@ -173,8 +172,6 @@ class WebConnector(LoadConnector):
             logger.info(f"Visiting {current_url}")
 
             try:
-                current_visit_time = datetime.now().strftime("%B %d, %Y, %H:%M:%S")
-
                 if restart_playwright:
                     playwright, context = start_playwright()
                     restart_playwright = False
@@ -192,7 +189,7 @@ class WebConnector(LoadConnector):
                         sections=[Section(link=current_url, text=page_text)],
                         source=DocumentSource.WEB,
                         semantic_identifier=current_url.split(".")[-1],
-                        metadata={"Time Visited": current_visit_time},
+                        metadata={},
                    )
                 )
                 continue