From 5ddc9b34abd5239eb92588852141a1359f54b7a5 Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Fri, 20 Oct 2023 17:03:28 -0700
Subject: [PATCH] Add Document UpdatedAt times for most connectors (#605)

---
 .../cross_connector_utils/html_utils.py        | 34 ++++++++++++++++---
 .../connectors/danswer_jira/connector.py       | 18 ++++++++++
 .../connectors/document360/connector.py        | 25 ++++++++++----
 backend/danswer/connectors/file/connector.py   |  9 ++++-
 .../danswer/connectors/github/connector.py     |  9 +++--
 backend/danswer/connectors/gong/connector.py   |  9 +++--
 .../connectors/google_drive/connector.py       | 14 ++++----
 .../danswer/connectors/hubspot/connector.py    | 15 ++++----
 .../danswer/connectors/notion/connector.py     |  8 +++--
 backend/danswer/connectors/slack/connector.py  | 34 ++++++++++++++++++-
 backend/danswer/connectors/web/connector.py    |  5 +--
 11 files changed, 142 insertions(+), 38 deletions(-)

diff --git a/backend/danswer/connectors/cross_connector_utils/html_utils.py b/backend/danswer/connectors/cross_connector_utils/html_utils.py
index 550658b47e75..0b4e9fade825 100644
--- a/backend/danswer/connectors/cross_connector_utils/html_utils.py
+++ b/backend/danswer/connectors/cross_connector_utils/html_utils.py
@@ -47,6 +47,7 @@ def format_document_soup(
     list_element_start = False
     verbatim_output = 0
     in_table = False
+    last_added_newline = False
     for e in document.descendants:
         verbatim_output -= 1
         if isinstance(e, bs4.element.NavigableString):
@@ -57,11 +58,29 @@ def format_document_soup(
                 # Tables are represented in natural language with rows separated by newlines
                 # Can't have newlines then in the table elements
                 element_text = element_text.replace("\n", " ").strip()
+
+            # Some tags are translated to spaces, but the logic below renders certain
+            # tags as newlines instead (as a browser would, e.g. for br). This check
+            # avoids leaving a stray space right after such a newline.
+            if last_added_newline and element_text.startswith(" "):
+                element_text = element_text[1:]
+                last_added_newline = False
+
             if element_text:
+                content_to_add = (
+                    element_text
+                    if verbatim_output > 0
+                    else strip_newlines(element_text)
+                )
+
+                # Don't join separate elements without any spacing
+                if (text and not text[-1].isspace()) and (
+                    content_to_add and not content_to_add[0].isspace()
+                ):
+                    text += " "
+
+                text += content_to_add
+
                 list_element_start = False
         elif isinstance(e, bs4.element.Tag):
             # table is standard HTML element
@@ -82,9 +101,14 @@ def format_document_soup(
             elif e.name in ["p", "div"]:
                 if not list_element_start:
                     text += "\n"
-            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
+            elif e.name in ["h1", "h2", "h3", "h4"]:
                 text += "\n"
                 list_element_start = False
+                last_added_newline = True
+            elif e.name == "br":
+                text += "\n"
+                list_element_start = False
+                last_added_newline = True
             elif e.name == "li":
                 text += "\n- "
                 list_element_start = True
diff --git a/backend/danswer/connectors/danswer_jira/connector.py b/backend/danswer/connectors/danswer_jira/connector.py
index d68fd5cd0abc..0bfd74f60ff9 100644
--- a/backend/danswer/connectors/danswer_jira/connector.py
+++ b/backend/danswer/connectors/danswer_jira/connector.py
@@ -3,6 +3,7 @@ from datetime import timezone
 from typing import Any
 from urllib.parse import urlparse
 
+from dateutil.parser import parse
 from jira import JIRA
 from jira.resources import Issue
 
@@ -59,6 +60,8 @@ def fetch_jira_issues_batch(
             logger.warning(f"Found Jira object not of type Issue {jira}")
             continue
 
+        ticket_updated_time = parse(jira.fields.updated)
+
         semantic_rep = (
             f"Jira Ticket Summary: {jira.fields.summary}\n"
             f"Description: {jira.fields.description}\n"
@@ -75,6 +78,7 @@ def fetch_jira_issues_batch(
                 sections=[Section(link=page_url, text=semantic_rep)],
                 source=DocumentSource.JIRA,
                 semantic_identifier=jira.fields.summary,
+                doc_updated_at=ticket_updated_time.astimezone(timezone.utc),
                 metadata={},
             )
         )
@@ -151,3 +155,17 @@ class JiraConnector(LoadConnector, PollConnector):
             start_ind += fetched_batch_size
             if fetched_batch_size < self.batch_size:
                 break
+
+
+if __name__ == "__main__":
+    import os
+
+    connector = JiraConnector(os.environ["JIRA_PROJECT_URL"])
+    connector.load_credentials(
+        {
+            "jira_user_email": os.environ["JIRA_USER_EMAIL"],
+            "jira_api_token": os.environ["JIRA_API_TOKEN"],
+        }
+    )
+    document_batches = connector.load_from_state()
+    print(next(document_batches))
diff --git a/backend/danswer/connectors/document360/connector.py b/backend/danswer/connectors/document360/connector.py
index 408f1519abd4..a6e4062c0abf 100644
--- a/backend/danswer/connectors/document360/connector.py
+++ b/backend/danswer/connectors/document360/connector.py
@@ -5,10 +5,10 @@ from typing import List
 from typing import Optional
 
 import requests
-from bs4 import BeautifulSoup
 
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector
@@ -120,16 +120,21 @@ class Document360Connector(LoadConnector, PollConnector):
             if end is not None and updated_at > end:
                 continue
 
+            authors = [
+                author["email_id"]
+                for author in article_details.get("authors", [])
+                if author["email_id"]
+            ]
+
             doc_link = f"{DOCUMENT360_BASE_URL}/{self.portal_id}/document/v1/view/{article['id']}"
 
             html_content = article_details["html_content"]
-            soup = BeautifulSoup(html_content, "html.parser")
-            article_content = soup.get_text()
+            article_content = parse_html_page_basic(html_content)
             doc_text = (
                 f"workspace: {self.workspace}\n"
                 f"category: {article['category_name']}\n"
                 f"article: {article_details['title']} - "
-                f"{article_details.get('description', '')} - "
+                f"{article_details.get('description', '')}\n"
                 f"{article_content}"
             )
 
@@ -138,6 +143,8 @@ class Document360Connector(LoadConnector, PollConnector):
                 sections=[Section(link=doc_link, text=doc_text)],
                 source=DocumentSource.DOCUMENT360,
                 semantic_identifier=article_details["title"],
+                doc_updated_at=updated_at,
+                primary_owners=authors,
                 metadata={},
             )
 
@@ -163,14 +170,18 @@ class Document360Connector(LoadConnector, PollConnector):
 
 if __name__ == "__main__":
     import time
+    import os
 
-    document360_connector = Document360Connector("Your Workspace", ["Your categories"])
+    document360_connector = Document360Connector(os.environ["DOCUMENT360_WORKSPACE"])
    document360_connector.load_credentials(
-        {"portal_id": "Your Portal ID", "document360_api_token": "Your API Token"}
+        {
+            "portal_id": os.environ["DOCUMENT360_PORTAL_ID"],
+            "document360_api_token": os.environ["DOCUMENT360_API_TOKEN"],
+        }
     )
 
     current = time.time()
-    one_day_ago = current - 24 * 60 * 60  # 1 days
+    one_year_ago = current - 24 * 60 * 60 * 360  # 1 year
 
-    latest_docs = document360_connector.poll_source(one_day_ago, current)
+    latest_docs = document360_connector.poll_source(one_year_ago, current)
     for doc in latest_docs:
diff --git a/backend/danswer/connectors/file/connector.py b/backend/danswer/connectors/file/connector.py
index b1a299ecca53..bcb291537c8a 100644
--- a/backend/danswer/connectors/file/connector.py
+++ b/backend/danswer/connectors/file/connector.py
@@ -1,4 +1,6 @@
 from collections.abc import Generator
+from datetime import datetime
+from datetime import timezone
 from pathlib import Path
 from typing import Any
 from typing import IO
@@ -41,6 +43,7 @@ def _open_files_at_location(
 def _process_file(
     file_name: str,
     file: IO[Any],
+    time_updated: datetime,
     pdf_pass: str | None = None,
 ) -> list[Document]:
     extension = get_file_ext(file_name)
@@ -63,6 +66,7 @@ def _process_file(
             sections=[Section(link=metadata.get("link", ""), text=file_content_raw)],
             source=DocumentSource.FILE,
             semantic_identifier=file_name,
+            doc_updated_at=time_updated,
             metadata={},
         )
     ]
@@ -85,10 +89,13 @@ class LocalFileConnector(LoadConnector):
     def load_from_state(self) -> GenerateDocumentsOutput:
         documents: list[Document] = []
         for file_location in self.file_locations:
+            current_datetime = datetime.now(timezone.utc)
             files = _open_files_at_location(file_location)
 
             for file_name, file in files:
-                documents.extend(_process_file(file_name, file, self.pdf_pass))
+                documents.extend(
+                    _process_file(file_name, file, current_datetime, self.pdf_pass)
+                )
 
                 if len(documents) >= self.batch_size:
                     yield documents
diff --git a/backend/danswer/connectors/github/connector.py b/backend/danswer/connectors/github/connector.py
index bafffe5b53d9..7fa19b1c4241 100644
--- a/backend/danswer/connectors/github/connector.py
+++ b/backend/danswer/connectors/github/connector.py
@@ -1,6 +1,7 @@
 import itertools
 from collections.abc import Iterator
 from datetime import datetime
+from datetime import timezone
 from typing import Any
 from typing import cast
 
@@ -42,8 +43,11 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document:
         sections=[Section(link=pull_request.html_url, text=full_context)],
         source=DocumentSource.GITHUB,
         semantic_identifier=pull_request.title,
+        # updated_at is UTC but timezone-unaware; attach UTC explicitly since
+        # indexing logic guards against wrongly timestamped docs caused by
+        # local time discrepancies with UTC
+        doc_updated_at=pull_request.updated_at.replace(tzinfo=timezone.utc),
         metadata={
-            "last_modified": str(pull_request.last_modified),
             "merged": pull_request.merged,
             "state": pull_request.state,
         },
@@ -62,8 +66,9 @@ def _convert_issue_to_document(issue: Issue) -> Document:
         sections=[Section(link=issue.html_url, text=full_context)],
         source=DocumentSource.GITHUB,
         semantic_identifier=issue.title,
+        # updated_at is UTC but timezone-unaware
+        doc_updated_at=issue.updated_at.replace(tzinfo=timezone.utc),
         metadata={
-            "last_modified": str(issue.updated_at),
             "state": issue.state,
         },
diff --git a/backend/danswer/connectors/gong/connector.py b/backend/danswer/connectors/gong/connector.py
index 378fd856d0c0..7f5f7f91ba91 100644
--- a/backend/danswer/connectors/gong/connector.py
+++ b/backend/danswer/connectors/gong/connector.py
@@ -90,6 +90,9 @@ class GongConnector(LoadConnector, PollConnector):
             response = requests.post(
                 url, headers=self._get_auth_header(), json=body
             )
+            # If no calls in the range, just break out
+            if response.status_code == 404:
+                break
             response.raise_for_status()
 
             data = response.json()
@@ -223,6 +226,9 @@ class GongConnector(LoadConnector, PollConnector):
                     source=DocumentSource.GONG,
                     # Should not ever be Untitled as a call cannot be made without a Title
                     semantic_identifier=call_title or "Untitled",
+                    doc_updated_at=datetime.fromisoformat(
+                        call_metadata["started"]
+                    ).astimezone(timezone.utc),
                     metadata={"Start Time": call_metadata["started"]},
                 )
             )
@@ -270,6 +276,5 @@ if __name__ == "__main__":
     )
 
     current = time.time()
-    one_day_ago = current - 24 * 60 * 60  # 1 day
-    latest_docs = connector.poll_source(one_day_ago, current)
+    latest_docs = connector.load_from_state()
     print(next(latest_docs))
diff --git a/backend/danswer/connectors/google_drive/connector.py b/backend/danswer/connectors/google_drive/connector.py
index 6310d168de01..5c56475f3a04 100644
--- a/backend/danswer/connectors/google_drive/connector.py
+++ b/backend/danswer/connectors/google_drive/connector.py
@@ -1,8 +1,9 @@
-import datetime
 import io
 import tempfile
 from collections.abc import Iterator
 from collections.abc import Sequence
+from datetime import datetime
+from datetime import timezone
 from enum import Enum
 from itertools import chain
 from typing import Any
@@ -83,7 +84,7 @@ def _run_drive_file_query(
         includeItemsFromAllDrives=include_shared,
         fields=(
             "nextPageToken, files(mimeType, id, name, "
-            "webViewLink, shortcutDetails)"
+            "modifiedTime, webViewLink, shortcutDetails)"
         ),
         pageToken=next_page_token,
         q=query,
@@ -194,12 +195,10 @@ def _get_files(
 ) -> Iterator[GoogleDriveFileType]:
     query = f"mimeType != '{DRIVE_FOLDER_TYPE}' "
     if time_range_start is not None:
-        time_start = (
-            datetime.datetime.utcfromtimestamp(time_range_start).isoformat() + "Z"
-        )
+        time_start = datetime.utcfromtimestamp(time_range_start).isoformat() + "Z"
         query += f"and modifiedTime >= '{time_start}' "
     if time_range_end is not None:
-        time_stop = datetime.datetime.utcfromtimestamp(time_range_end).isoformat() + "Z"
+        time_stop = datetime.utcfromtimestamp(time_range_end).isoformat() + "Z"
         query += f"and modifiedTime <= '{time_stop}' "
     if folder_id:
         query += f"and '{folder_id}' in parents "
@@ -464,6 +463,9 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
                     ],
                     source=DocumentSource.GOOGLE_DRIVE,
                     semantic_identifier=file["name"],
+                    doc_updated_at=datetime.fromisoformat(
+                        file["modifiedTime"]
+                    ).astimezone(timezone.utc),
                     metadata={} if text_contents else {IGNORE_FOR_QA: True},
                 )
             )
diff --git a/backend/danswer/connectors/hubspot/connector.py b/backend/danswer/connectors/hubspot/connector.py
index 1901061fcc18..e59a4c9eb486 100644
--- a/backend/danswer/connectors/hubspot/connector.py
+++ b/backend/danswer/connectors/hubspot/connector.py
@@ -1,4 +1,5 @@
 from datetime import datetime
+from datetime import timezone
 from typing import Any
 
 import requests
@@ -106,6 +107,8 @@ class HubSpotConnector(LoadConnector, PollConnector):
                     sections=[Section(link=link, text=content_text)],
                     source=DocumentSource.HUBSPOT,
                     semantic_identifier=title,
+                    # Already in UTC (tzutc); just normalizing the timezone object
+                    doc_updated_at=ticket.updated_at.replace(tzinfo=timezone.utc),
                     metadata={},
                 )
             )
@@ -130,15 +133,11 @@ class HubSpotConnector(LoadConnector, PollConnector):
 
 if __name__ == "__main__":
     import os
-    import time
 
-    test_connector = HubSpotConnector()
-    test_connector.load_credentials(
+    connector = HubSpotConnector()
+    connector.load_credentials(
         {"hubspot_access_token": os.environ["HUBSPOT_ACCESS_TOKEN"]}
     )
-    all_docs = test_connector.load_from_state()
 
-    current = time.time()
-    one_day_ago = current - 24 * 60 * 60  # 1 day
-    latest_docs = test_connector.poll_source(one_day_ago, current)
-    print(latest_docs)
+    document_batches = connector.load_from_state()
+    print(next(document_batches))
diff --git a/backend/danswer/connectors/notion/connector.py b/backend/danswer/connectors/notion/connector.py
index b030f1d2b5f4..2fa901fc6f8a 100644
--- a/backend/danswer/connectors/notion/connector.py
+++ b/backend/danswer/connectors/notion/connector.py
@@ -2,6 +2,8 @@ import time
 from collections.abc import Generator
 from dataclasses import dataclass
 from dataclasses import fields
+from datetime import datetime
+from datetime import timezone
 from typing import Any
 from typing import Optional
@@ -191,6 +193,9 @@ class NotionConnector(LoadConnector, PollConnector):
                     ],
                     source=DocumentSource.NOTION,
                     semantic_identifier=page_title,
+                    doc_updated_at=datetime.fromisoformat(
+                        page.last_edited_time
+                    ).astimezone(timezone.utc),
                     metadata={},
                 )
             )
@@ -323,8 +328,7 @@ class NotionConnector(LoadConnector, PollConnector):
 if __name__ == "__main__":
     import os
 
-    root_page_id = os.environ.get("NOTION_ROOT_PAGE_ID")
-    connector = NotionConnector(root_page_id=root_page_id)
+    connector = NotionConnector()
     connector.load_credentials(
         {"notion_integration_token": os.environ.get("NOTION_INTEGRATION_TOKEN")}
     )
diff --git a/backend/danswer/connectors/slack/connector.py b/backend/danswer/connectors/slack/connector.py
index 53e0fee5a8b6..596b93b227b8 100644
--- a/backend/danswer/connectors/slack/connector.py
+++ b/backend/danswer/connectors/slack/connector.py
@@ -1,7 +1,8 @@
 import json
-import os
 from collections.abc import Callable
 from collections.abc import Generator
+from datetime import datetime
+from datetime import timezone
 from pathlib import Path
 from typing import Any
 from typing import cast
@@ -128,6 +129,18 @@ def get_thread(client: WebClient, channel_id: str, thread_id: str) -> ThreadType
     return threads
 
 
+def get_latest_message_time(thread: ThreadType) -> datetime:
+    max_ts = max([float(msg.get("ts", 0)) for msg in thread])
+    return datetime.fromtimestamp(max_ts, tz=timezone.utc)
+
+
+def get_event_time(event: dict[str, Any]) -> datetime | None:
+    ts = event.get("ts")
+    if not ts:
+        return None
+    return datetime.fromtimestamp(float(ts), tz=timezone.utc)
+
+
 def thread_to_doc(
     workspace: str,
     channel: ChannelType,
@@ -148,6 +161,7 @@ def thread_to_doc(
         ],
         source=DocumentSource.SLACK,
         semantic_identifier=channel["name"],
+        doc_updated_at=get_latest_message_time(thread),
         title="",  # slack docs don't really have a "title"
         metadata={},
     )
@@ -304,6 +318,7 @@ class SlackLoadConnector(LoadConnector):
                 source=matching_doc.source,
                 semantic_identifier=matching_doc.semantic_identifier,
                 title="",  # slack docs don't really have a "title"
+                doc_updated_at=get_event_time(slack_event),
                 metadata=matching_doc.metadata,
             )
 
@@ -322,6 +337,7 @@ class SlackLoadConnector(LoadConnector):
                 source=DocumentSource.SLACK,
                 semantic_identifier=channel["name"],
                 title="",  # slack docs don't really have a "title"
+                doc_updated_at=get_event_time(slack_event),
                 metadata={},
             )
 
@@ -403,3 +419,19 @@ class SlackPollConnector(PollConnector):
 
         if documents:
             yield documents
+
+
+if __name__ == "__main__":
+    import os
+    import time
+
+    connector = SlackPollConnector(
+        workspace=os.environ["SLACK_WORKSPACE"], channels=[os.environ["SLACK_CHANNEL"]]
+    )
+    connector.load_credentials({"slack_bot_token": os.environ["SLACK_BOT_TOKEN"]})
+
+    current = time.time()
+    one_day_ago = current - 24 * 60 * 60  # 1 day
+    document_batches = connector.poll_source(one_day_ago, current)
+
+    print(next(document_batches))
diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py
index dfb1a878ea8c..99a1abb3adde 100644
--- a/backend/danswer/connectors/web/connector.py
+++ b/backend/danswer/connectors/web/connector.py
@@ -1,5 +1,4 @@
 import io
-from datetime import datetime
 from enum import Enum
 from typing import Any
 from typing import cast
@@ -173,8 +172,6 @@ class WebConnector(LoadConnector):
             logger.info(f"Visiting {current_url}")
 
             try:
-                current_visit_time = datetime.now().strftime("%B %d, %Y, %H:%M:%S")
-
                 if restart_playwright:
                     playwright, context = start_playwright()
                     restart_playwright = False
@@ -192,7 +189,7 @@ class WebConnector(LoadConnector):
                         sections=[Section(link=current_url, text=page_text)],
                         source=DocumentSource.WEB,
                         semantic_identifier=current_url.split(".")[-1],
-                        metadata={"Time Visited": current_visit_time},
+                        metadata={},
                     )
                 )
                 continue
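
Note on the convention the hunks above follow: doc_updated_at is normalized to a
timezone-aware UTC datetime before it reaches the Document model. A minimal
standalone sketch of the three conversion patterns the patch relies on; the
variable names below are illustrative only, not identifiers from the codebase:

    from datetime import datetime, timezone

    # Naive datetime already expressed in UTC (e.g. PyGithub's updated_at):
    # attach UTC without shifting the clock time.
    naive_utc = datetime(2023, 10, 20, 17, 3, 28)
    aware_from_naive = naive_utc.replace(tzinfo=timezone.utc)

    # ISO-8601 string (e.g. Notion's last_edited_time, Gong's "started"):
    # parse, then convert to UTC.
    iso_string = "2023-10-20T17:03:28+00:00"
    aware_from_iso = datetime.fromisoformat(iso_string).astimezone(timezone.utc)

    # Epoch seconds (e.g. a Slack message "ts"): build an aware datetime directly.
    epoch_seconds = 1697846608.0
    aware_from_epoch = datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)

    assert all(
        dt.tzinfo is not None
        for dt in (aware_from_naive, aware_from_iso, aware_from_epoch)
    )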