From 5ddc9b34abd5239eb92588852141a1359f54b7a5 Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Fri, 20 Oct 2023 17:03:28 -0700
Subject: [PATCH] Add Document UpdatedAt times for most connectors (#605)

---
 .../cross_connector_utils/html_utils.py        | 34 ++++++++++++++++---
 .../connectors/danswer_jira/connector.py       | 18 ++++++++++
 .../connectors/document360/connector.py        | 25 ++++++++++----
 backend/danswer/connectors/file/connector.py   |  9 ++++-
 .../danswer/connectors/github/connector.py     |  9 +++--
 backend/danswer/connectors/gong/connector.py   |  9 +++--
 .../connectors/google_drive/connector.py       | 14 ++++----
 .../danswer/connectors/hubspot/connector.py    | 15 ++++----
 .../danswer/connectors/notion/connector.py     |  8 +++--
 backend/danswer/connectors/slack/connector.py  | 34 ++++++++++++++++++-
 backend/danswer/connectors/web/connector.py    |  5 +--
 11 files changed, 142 insertions(+), 38 deletions(-)

diff --git a/backend/danswer/connectors/cross_connector_utils/html_utils.py b/backend/danswer/connectors/cross_connector_utils/html_utils.py
index 550658b47e75..0b4e9fade825 100644
--- a/backend/danswer/connectors/cross_connector_utils/html_utils.py
+++ b/backend/danswer/connectors/cross_connector_utils/html_utils.py
@@ -47,6 +47,7 @@ def format_document_soup(
     list_element_start = False
     verbatim_output = 0
     in_table = False
+    last_added_newline = False
     for e in document.descendants:
         verbatim_output -= 1
         if isinstance(e, bs4.element.NavigableString):
@@ -57,11 +58,29 @@ def format_document_soup(
                 # Tables are represented in natural language with rows separated by newlines
                 # Can't have newlines then in the table elements
                 element_text = element_text.replace("\n", " ").strip()
+
+            # Some tags are translated to spaces, but the logic below renders certain
+            # tags as newlines instead (as a browser would, e.g. for br). This check
+            # avoids leaving a stray space right after such a newline.
+            if last_added_newline and element_text.startswith(" "):
+                element_text = element_text[1:]
+                last_added_newline = False
+
             if element_text:
+                content_to_add = (
+                    element_text
+                    if verbatim_output > 0
+                    else strip_newlines(element_text)
+                )
+
+                # Don't join separate elements without any spacing
+                if (text and not text[-1].isspace()) and (
+                    content_to_add and not content_to_add[0].isspace()
+                ):
+                    text += " "
+
+                text += content_to_add
+
                 list_element_start = False
         elif isinstance(e, bs4.element.Tag):
             # table is standard HTML element
@@ -82,9 +101,14 @@ def format_document_soup(
             elif e.name in ["p", "div"]:
                 if not list_element_start:
                     text += "\n"
-            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
+            elif e.name in ["h1", "h2", "h3", "h4"]:
                 text += "\n"
                 list_element_start = False
+                last_added_newline = True
+            elif e.name == "br":
+                text += "\n"
+                list_element_start = False
+                last_added_newline = True
             elif e.name == "li":
                 text += "\n- "
                 list_element_start = True
diff --git a/backend/danswer/connectors/danswer_jira/connector.py b/backend/danswer/connectors/danswer_jira/connector.py
index d68fd5cd0abc..0bfd74f60ff9 100644
--- a/backend/danswer/connectors/danswer_jira/connector.py
+++ b/backend/danswer/connectors/danswer_jira/connector.py
@@ -3,6 +3,7 @@ from datetime import timezone
 from typing import Any
 from urllib.parse import urlparse
 
+from dateutil.parser import parse
 from jira import JIRA
 from jira.resources import Issue
 
@@ -59,6 +60,8 @@ def fetch_jira_issues_batch(
             logger.warning(f"Found Jira object not of type Issue {jira}")
             continue
 
+        ticket_updated_time = parse(jira.fields.updated)
+
         semantic_rep = (
             f"Jira Ticket Summary: {jira.fields.summary}\n"
             f"Description: {jira.fields.description}\n"
@@ -75,6 +78,7 @@ def fetch_jira_issues_batch(
                 sections=[Section(link=page_url, text=semantic_rep)],
                 source=DocumentSource.JIRA,
                 semantic_identifier=jira.fields.summary,
+                doc_updated_at=ticket_updated_time.astimezone(timezone.utc),
                 metadata={},
             )
         )
@@ -151,3 +155,17 @@ class JiraConnector(LoadConnector, PollConnector):
             start_ind += fetched_batch_size
             if fetched_batch_size < self.batch_size:
                 break
+
+
+if __name__ == "__main__":
+    import os
+
+    connector = JiraConnector(os.environ["JIRA_PROJECT_URL"])
+    connector.load_credentials(
+        {
+            "jira_user_email": os.environ["JIRA_USER_EMAIL"],
+            "jira_api_token": os.environ["JIRA_API_TOKEN"],
+        }
+    )
+    document_batches = connector.load_from_state()
+    print(next(document_batches))
diff --git a/backend/danswer/connectors/document360/connector.py b/backend/danswer/connectors/document360/connector.py
index 408f1519abd4..a6e4062c0abf 100644
--- a/backend/danswer/connectors/document360/connector.py
+++ b/backend/danswer/connectors/document360/connector.py
@@ -5,10 +5,10 @@ from typing import List
 from typing import Optional
 
 import requests
-from bs4 import BeautifulSoup
 
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector
@@ -120,16 +120,21 @@ class Document360Connector(LoadConnector, PollConnector):
             if end is not None and updated_at > end:
                 continue
 
+            authors = [
+                author["email_id"]
+                for author in article_details.get("authors", [])
+                if author["email_id"]
+            ]
+
             doc_link = f"{DOCUMENT360_BASE_URL}/{self.portal_id}/document/v1/view/{article['id']}"
 
             html_content = article_details["html_content"]
-            soup = BeautifulSoup(html_content, "html.parser")
-            article_content = soup.get_text()
+            article_content = parse_html_page_basic(html_content)
             doc_text = (
                 f"workspace: {self.workspace}\n"
                 f"category: {article['category_name']}\n"
                 f"article: {article_details['title']} - "
-                f"{article_details.get('description', '')} - "
+                f"{article_details.get('description', '')}\n"
                 f"{article_content}"
             )
 
@@ -138,6 +143,8 @@ class Document360Connector(LoadConnector, PollConnector):
                 sections=[Section(link=doc_link, text=doc_text)],
                 source=DocumentSource.DOCUMENT360,
                 semantic_identifier=article_details["title"],
+                doc_updated_at=updated_at,
+                primary_owners=authors,
                 metadata={},
             )
 
@@ -163,14 +170,18 @@ class Document360Connector(LoadConnector, PollConnector):
 
 if __name__ == "__main__":
     import time
+    import os
 
-    document360_connector = Document360Connector("Your Workspace", ["Your categories"])
+    document360_connector = Document360Connector(os.environ["DOCUMENT360_WORKSPACE"])
    document360_connector.load_credentials(
-        {"portal_id": "Your Portal ID", "document360_api_token": "Your API Token"}
+        {
+            "portal_id": os.environ["DOCUMENT360_PORTAL_ID"],
+            "document360_api_token": os.environ["DOCUMENT360_API_TOKEN"],
+        }
     )
 
     current = time.time()
-    one_day_ago = current - 24 * 60 * 60  # 1 days
+    one_year_ago = current - 24 * 60 * 60 * 360  # 1 year
 
-    latest_docs = document360_connector.poll_source(one_day_ago, current)
+    latest_docs = document360_connector.poll_source(one_year_ago, current)
     for doc in latest_docs:
diff --git a/backend/danswer/connectors/file/connector.py b/backend/danswer/connectors/file/connector.py
index b1a299ecca53..bcb291537c8a 100644
--- a/backend/danswer/connectors/file/connector.py
+++ b/backend/danswer/connectors/file/connector.py
@@ -1,4 +1,6 @@
 from collections.abc import Generator
+from datetime import datetime
+from datetime import timezone
 from pathlib import Path
 from typing import Any
 from typing import IO
@@ -41,6 +43,7 @@ def _open_files_at_location(
 def _process_file(
     file_name: str,
     file: IO[Any],
+    time_updated: datetime,
     pdf_pass: str | None = None,
 ) -> list[Document]:
     extension = get_file_ext(file_name)
@@ -63,6 +66,7 @@ def _process_file(
             sections=[Section(link=metadata.get("link", ""), text=file_content_raw)],
             source=DocumentSource.FILE,
             semantic_identifier=file_name,
+            doc_updated_at=time_updated,
             metadata={},
         )
     ]
@@ -85,10 +89,13 @@ class LocalFileConnector(LoadConnector):
     def load_from_state(self) -> GenerateDocumentsOutput:
         documents: list[Document] = []
         for file_location in self.file_locations:
+            current_datetime = datetime.now(timezone.utc)
             files = _open_files_at_location(file_location)
 
             for file_name, file in files:
-                documents.extend(_process_file(file_name, file, self.pdf_pass))
+                documents.extend(
+                    _process_file(file_name, file, current_datetime, self.pdf_pass)
+                )
 
                 if len(documents) >= self.batch_size:
                     yield documents
diff --git a/backend/danswer/connectors/github/connector.py b/backend/danswer/connectors/github/connector.py
index bafffe5b53d9..7fa19b1c4241 100644
--- a/backend/danswer/connectors/github/connector.py
+++ b/backend/danswer/connectors/github/connector.py
@@ -1,6 +1,7 @@
 import itertools
 from collections.abc import Iterator
 from datetime import datetime
+from datetime import timezone
 from typing import Any
 from typing import cast
 
@@ -42,8 +43,11 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document:
         sections=[Section(link=pull_request.html_url, text=full_context)],
         source=DocumentSource.GITHUB,
         semantic_identifier=pull_request.title,
+        # updated_at is UTC but timezone-unaware; attach UTC explicitly since
+        # indexing logic guards against wrongly timestamped docs caused by
+        # local time discrepancies with UTC
+        doc_updated_at=pull_request.updated_at.replace(tzinfo=timezone.utc),
         metadata={
-            "last_modified": str(pull_request.last_modified),
             "merged": pull_request.merged,
             "state": pull_request.state,
         },
@@ -62,8 +66,9 @@ def _convert_issue_to_document(issue: Issue) -> Document:
         sections=[Section(link=issue.html_url, text=full_context)],
         source=DocumentSource.GITHUB,
         semantic_identifier=issue.title,
+        # updated_at is UTC but timezone-unaware
+        doc_updated_at=issue.updated_at.replace(tzinfo=timezone.utc),
         metadata={
-            "last_modified": str(issue.updated_at),
             "state": issue.state,
         },
diff --git a/backend/danswer/connectors/gong/connector.py b/backend/danswer/connectors/gong/connector.py
index 378fd856d0c0..7f5f7f91ba91 100644
--- a/backend/danswer/connectors/gong/connector.py
+++ b/backend/danswer/connectors/gong/connector.py
@@ -90,6 +90,9 @@ class GongConnector(LoadConnector, PollConnector):
             response = requests.post(
                 url, headers=self._get_auth_header(), json=body
             )
+            # If no calls in the range, just break out
+            if response.status_code == 404:
+                break
             response.raise_for_status()
 
             data = response.json()
@@ -223,6 +226,9 @@ class GongConnector(LoadConnector, PollConnector):
                     source=DocumentSource.GONG,
                     # Should not ever be Untitled as a call cannot be made without a Title
                     semantic_identifier=call_title or "Untitled",
+                    doc_updated_at=datetime.fromisoformat(
+                        call_metadata["started"]
+                    ).astimezone(timezone.utc),
                     metadata={"Start Time": call_metadata["started"]},
                 )
             )
@@ -270,6 +276,5 @@ if __name__ == "__main__":
     )
 
     current = time.time()
-    one_day_ago = current - 24 * 60 * 60  # 1 day
-    latest_docs = connector.poll_source(one_day_ago, current)
+    latest_docs = connector.load_from_state()
     print(next(latest_docs))
diff --git a/backend/danswer/connectors/google_drive/connector.py b/backend/danswer/connectors/google_drive/connector.py
index 6310d168de01..5c56475f3a04 100644
--- a/backend/danswer/connectors/google_drive/connector.py
+++ b/backend/danswer/connectors/google_drive/connector.py
@@ -1,8 +1,9 @@
-import datetime
 import io
 import tempfile
 from collections.abc import Iterator
 from collections.abc import Sequence
+from datetime import datetime
+from datetime import timezone
 from enum import Enum
 from itertools import chain
 from typing import Any
@@ -83,7 +84,7 @@ def _run_drive_file_query(
         includeItemsFromAllDrives=include_shared,
         fields=(
             "nextPageToken, files(mimeType, id, name, "
-            "webViewLink, shortcutDetails)"
+            "modifiedTime, webViewLink, shortcutDetails)"
         ),
         pageToken=next_page_token,
         q=query,
@@ -194,12 +195,10 @@ def _get_files(
 ) -> Iterator[GoogleDriveFileType]:
     query = f"mimeType != '{DRIVE_FOLDER_TYPE}' "
     if time_range_start is not None:
-        time_start = (
-            datetime.datetime.utcfromtimestamp(time_range_start).isoformat() + "Z"
-        )
+        time_start = datetime.utcfromtimestamp(time_range_start).isoformat() + "Z"
         query += f"and modifiedTime >= '{time_start}' "
     if time_range_end is not None:
-        time_stop = datetime.datetime.utcfromtimestamp(time_range_end).isoformat() + "Z"
+        time_stop = datetime.utcfromtimestamp(time_range_end).isoformat() + "Z"
         query += f"and modifiedTime <= '{time_stop}' "
     if folder_id:
         query += f"and '{folder_id}' in parents "
@@ -464,6 +463,9 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
                     ],
                     source=DocumentSource.GOOGLE_DRIVE,
                     semantic_identifier=file["name"],
+                    doc_updated_at=datetime.fromisoformat(
+                        file["modifiedTime"]
+                    ).astimezone(timezone.utc),
                     metadata={} if text_contents else {IGNORE_FOR_QA: True},
                 )
             )
diff --git a/backend/danswer/connectors/hubspot/connector.py b/backend/danswer/connectors/hubspot/connector.py
index 1901061fcc18..e59a4c9eb486 100644
--- a/backend/danswer/connectors/hubspot/connector.py
+++ b/backend/danswer/connectors/hubspot/connector.py
@@ -1,4 +1,5 @@
 from datetime import datetime
+from datetime import timezone
 from typing import Any
 
 import requests
@@ -106,6 +107,8 @@ class HubSpotConnector(LoadConnector, PollConnector):
                     sections=[Section(link=link, text=content_text)],
                     source=DocumentSource.HUBSPOT,
                     semantic_identifier=title,
+                    # Already in UTC (tzutc); just normalizing the timezone object
+                    doc_updated_at=ticket.updated_at.replace(tzinfo=timezone.utc),
                     metadata={},
                 )
             )
@@ -130,15 +133,11 @@ class HubSpotConnector(LoadConnector, PollConnector):
 
 if __name__ == "__main__":
     import os
-    import time
 
-    test_connector = HubSpotConnector()
-    test_connector.load_credentials(
+    connector = HubSpotConnector()
+    connector.load_credentials(
         {"hubspot_access_token": os.environ["HUBSPOT_ACCESS_TOKEN"]}
     )
-    all_docs = test_connector.load_from_state()
 
-    current = time.time()
-    one_day_ago = current - 24 * 60 * 60  # 1 day
-    latest_docs = test_connector.poll_source(one_day_ago, current)
-    print(latest_docs)
+    document_batches = connector.load_from_state()
+    print(next(document_batches))
diff --git a/backend/danswer/connectors/notion/connector.py b/backend/danswer/connectors/notion/connector.py
index b030f1d2b5f4..2fa901fc6f8a 100644
--- a/backend/danswer/connectors/notion/connector.py
+++ b/backend/danswer/connectors/notion/connector.py
@@ -2,6 +2,8 @@ import time
 from collections.abc import Generator
 from dataclasses import dataclass
 from dataclasses import fields
+from datetime import datetime
+from datetime import timezone
 from typing import Any
 from typing import Optional
@@ -191,6 +193,9 @@ class NotionConnector(LoadConnector, PollConnector):
                     ],
                     source=DocumentSource.NOTION,
                     semantic_identifier=page_title,
+                    doc_updated_at=datetime.fromisoformat(
+                        page.last_edited_time
+                    ).astimezone(timezone.utc),
                     metadata={},
                 )
             )
@@ -323,8 +328,7 @@ class NotionConnector(LoadConnector, PollConnector):
 if __name__ == "__main__":
     import os
 
-    root_page_id = os.environ.get("NOTION_ROOT_PAGE_ID")
-    connector = NotionConnector(root_page_id=root_page_id)
+    connector = NotionConnector()
     connector.load_credentials(
         {"notion_integration_token": os.environ.get("NOTION_INTEGRATION_TOKEN")}
     )
diff --git a/backend/danswer/connectors/slack/connector.py b/backend/danswer/connectors/slack/connector.py
index 53e0fee5a8b6..596b93b227b8 100644
--- a/backend/danswer/connectors/slack/connector.py
+++ b/backend/danswer/connectors/slack/connector.py
@@ -1,7 +1,8 @@
 import json
-import os
 from collections.abc import Callable
 from collections.abc import Generator
+from datetime import datetime
+from datetime import timezone
 from pathlib import Path
 from typing import Any
 from typing import cast
@@ -128,6 +129,18 @@ def get_thread(client: WebClient, channel_id: str, thread_id: str) -> ThreadType
     return threads
 
 
+def get_latest_message_time(thread: ThreadType) -> datetime:
+    max_ts = max([float(msg.get("ts", 0)) for msg in thread])
+    return datetime.fromtimestamp(max_ts, tz=timezone.utc)
+
+
+def get_event_time(event: dict[str, Any]) -> datetime | None:
+    ts = event.get("ts")
+    if not ts:
+        return None
+    return datetime.fromtimestamp(float(ts), tz=timezone.utc)
+
+
 def thread_to_doc(
     workspace: str,
     channel: ChannelType,
@@ -148,6 +161,7 @@ def thread_to_doc(
         ],
         source=DocumentSource.SLACK,
         semantic_identifier=channel["name"],
+        doc_updated_at=get_latest_message_time(thread),
         title="",  # slack docs don't really have a "title"
         metadata={},
     )
@@ -304,6 +318,7 @@ class SlackLoadConnector(LoadConnector):
                 source=matching_doc.source,
                 semantic_identifier=matching_doc.semantic_identifier,
                 title="",  # slack docs don't really have a "title"
+                doc_updated_at=get_event_time(slack_event),
                 metadata=matching_doc.metadata,
             )
 
@@ -322,6 +337,7 @@ class SlackLoadConnector(LoadConnector):
                 source=DocumentSource.SLACK,
                 semantic_identifier=channel["name"],
                 title="",  # slack docs don't really have a "title"
+                doc_updated_at=get_event_time(slack_event),
                 metadata={},
             )
 
@@ -403,3 +419,19 @@ class SlackPollConnector(PollConnector):
 
         if documents:
             yield documents
+
+
+if __name__ == "__main__":
+    import os
+    import time
+
+    connector = SlackPollConnector(
+        workspace=os.environ["SLACK_WORKSPACE"], channels=[os.environ["SLACK_CHANNEL"]]
+    )
+    connector.load_credentials({"slack_bot_token": os.environ["SLACK_BOT_TOKEN"]})
+
+    current = time.time()
+    one_day_ago = current - 24 * 60 * 60  # 1 day
+    document_batches = connector.poll_source(one_day_ago, current)
+
+    print(next(document_batches))
diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py
index dfb1a878ea8c..99a1abb3adde 100644
--- a/backend/danswer/connectors/web/connector.py
+++ b/backend/danswer/connectors/web/connector.py
@@ -1,5 +1,4 @@
 import io
-from datetime import datetime
 from enum import Enum
 from typing import Any
 from typing import cast
@@ -173,8 +172,6 @@ class WebConnector(LoadConnector):
             logger.info(f"Visiting {current_url}")
 
             try:
-                current_visit_time = datetime.now().strftime("%B %d, %Y, %H:%M:%S")
-
                 if restart_playwright:
                     playwright, context = start_playwright()
                     restart_playwright = False
@@ -192,7 +189,7 @@ class WebConnector(LoadConnector):
                         sections=[Section(link=current_url, text=page_text)],
                         source=DocumentSource.WEB,
                         semantic_identifier=current_url.split(".")[-1],
-                        metadata={"Time Visited": current_visit_time},
+                        metadata={},
                     )
                 )
                 continue
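
Note on the convention the hunks above follow: doc_updated_at is normalized to a
timezone-aware UTC datetime before it reaches the Document model. A minimal
standalone sketch of the three conversion patterns the patch relies on; the
variable names below are illustrative only, not identifiers from the codebase:

    from datetime import datetime, timezone

    # Naive datetime already expressed in UTC (e.g. PyGithub's updated_at):
    # attach UTC without shifting the clock time.
    naive_utc = datetime(2023, 10, 20, 17, 3, 28)
    aware_from_naive = naive_utc.replace(tzinfo=timezone.utc)

    # ISO-8601 string (e.g. Notion's last_edited_time, Gong's "started"):
    # parse, then convert to UTC.
    iso_string = "2023-10-20T17:03:28+00:00"
    aware_from_iso = datetime.fromisoformat(iso_string).astimezone(timezone.utc)

    # Epoch seconds (e.g. a Slack message "ts"): build an aware datetime directly.
    epoch_seconds = 1697846608.0
    aware_from_epoch = datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)

    assert all(
        dt.tzinfo is not None
        for dt in (aware_from_naive, aware_from_iso, aware_from_epoch)
    )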