Add Document UpdatedAt times for most connectors (#605)

Yuhong Sun
2023-10-20 17:03:28 -07:00
committed by GitHub
parent a7099a1917
commit 5ddc9b34ab
11 changed files with 142 additions and 38 deletions

View File

@@ -47,6 +47,7 @@ def format_document_soup(
    list_element_start = False
    verbatim_output = 0
    in_table = False
    last_added_newline = False
    for e in document.descendants:
        verbatim_output -= 1
        if isinstance(e, bs4.element.NavigableString):
@@ -57,11 +58,29 @@ def format_document_soup(
                # Tables are represented in natural language with rows separated by newlines
                # Can't have newlines then in the table elements
                element_text = element_text.replace("\n", " ").strip()
            # Some tags normally map to spaces, but the logic below renders certain
            # tags (such as br) as newlines, the way a browser would.
            # This check avoids leaving a stray space right after such a newline.
            if last_added_newline and element_text.startswith(" "):
                element_text = element_text[1:]
                last_added_newline = False
            if element_text:
                if verbatim_output > 0:
                    text += element_text
                else:
                    text += strip_newlines(element_text)
                content_to_add = (
                    element_text
                    if verbatim_output > 0
                    else strip_newlines(element_text)
                )
                # Don't join separate elements without any spacing
                if (text and not text[-1].isspace()) and (
                    content_to_add and not content_to_add[0].isspace()
                ):
                    text += " "
                text += content_to_add
                list_element_start = False
        elif isinstance(e, bs4.element.Tag):
            # table is standard HTML element
@@ -82,9 +101,14 @@ def format_document_soup(
elif e.name in ["p", "div"]:
if not list_element_start:
text += "\n"
elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
elif e.name in ["h1", "h2", "h3", "h4"]:
text += "\n"
list_element_start = False
last_added_newline = True
elif e.name == "br":
text += "\n"
list_element_start = False
last_added_newline = True
elif e.name == "li":
text += "\n- "
list_element_start = True
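
Taken together, the new last_added_newline flag and the join check give format_document_soup a browser-like whitespace model: adjacent inline fragments are joined by a single space, and the space a browser would collapse after a rendered line break is dropped. A minimal standalone distillation of that rule (the function name and inputs are illustrative, not the real helper):

def join_texts(chunks: list[str]) -> str:
    text = ""
    last_added_newline = False
    for chunk in chunks:
        if chunk == "\n":  # stands in for a rendered br/h1-h4 tag
            text += "\n"
            last_added_newline = True
            continue
        # Drop the space a browser would collapse right after a line break
        if last_added_newline and chunk.startswith(" "):
            chunk = chunk[1:]
        last_added_newline = False
        # Don't join separate elements without any spacing
        if (text and not text[-1].isspace()) and (chunk and not chunk[0].isspace()):
            text += " "
        text += chunk
    return text

assert join_texts(["Hello", "world"]) == "Hello world"
assert join_texts(["line one", "\n", " line two"]) == "line one\nline two"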

View File

@@ -3,6 +3,7 @@ from datetime import timezone
from typing import Any
from urllib.parse import urlparse
from dateutil.parser import parse
from jira import JIRA
from jira.resources import Issue
@@ -59,6 +60,8 @@ def fetch_jira_issues_batch(
logger.warning(f"Found Jira object not of type Issue {jira}")
continue
ticket_updated_time = parse(jira.fields.updated)
semantic_rep = (
f"Jira Ticket Summary: {jira.fields.summary}\n"
f"Description: {jira.fields.description}\n"
@@ -75,6 +78,7 @@ def fetch_jira_issues_batch(
                sections=[Section(link=page_url, text=semantic_rep)],
                source=DocumentSource.JIRA,
                semantic_identifier=jira.fields.summary,
                doc_updated_at=ticket_updated_time.astimezone(timezone.utc),
                metadata={},
            )
        )
@@ -151,3 +155,17 @@ class JiraConnector(LoadConnector, PollConnector):
            start_ind += fetched_batch_size
            if fetched_batch_size < self.batch_size:
                break


if __name__ == "__main__":
    import os

    connector = JiraConnector(os.environ["JIRA_PROJECT_URL"])
    connector.load_credentials(
        {
            "jira_user_email": os.environ["JIRA_USER_EMAIL"],
            "jira_api_token": os.environ["JIRA_API_TOKEN"],
        }
    )
    document_batches = connector.load_from_state()
    print(next(document_batches))
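
Jira reports fields.updated as an ISO-8601 string carrying a UTC offset; dateutil.parser.parse returns an aware datetime, and astimezone(timezone.utc) then normalizes it for indexing. A small sketch, with an assumed sample value:

from datetime import timezone

from dateutil.parser import parse

# Assumed example of a Jira "updated" value; the offset varies by instance
ticket_updated_time = parse("2023-10-20T17:03:28.000-0700")
print(ticket_updated_time.astimezone(timezone.utc))
# 2023-10-21 00:03:28+00:00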

View File

@@ -5,10 +5,10 @@ from typing import List
from typing import Optional
import requests
from bs4 import BeautifulSoup
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
@@ -120,16 +120,21 @@ class Document360Connector(LoadConnector, PollConnector):
            if end is not None and updated_at > end:
                continue

            authors = [
                author["email_id"]
                for author in article_details.get("authors", [])
                if author["email_id"]
            ]

            doc_link = f"{DOCUMENT360_BASE_URL}/{self.portal_id}/document/v1/view/{article['id']}"

            html_content = article_details["html_content"]
            soup = BeautifulSoup(html_content, "html.parser")
            article_content = soup.get_text()
            article_content = parse_html_page_basic(html_content)

            doc_text = (
                f"workspace: {self.workspace}\n"
                f"category: {article['category_name']}\n"
                f"article: {article_details['title']} - "
                f"{article_details.get('description', '')} - "
                f"{article_details.get('description', '')}\n"
                f"{article_content}"
            )
@@ -138,6 +143,8 @@ class Document360Connector(LoadConnector, PollConnector):
                    sections=[Section(link=doc_link, text=doc_text)],
                    source=DocumentSource.DOCUMENT360,
                    semantic_identifier=article_details["title"],
                    doc_updated_at=updated_at,
                    primary_owners=authors,
                    metadata={},
                )
@@ -163,14 +170,18 @@ class Document360Connector(LoadConnector, PollConnector):
if __name__ == "__main__":
import time
import os
document360_connector = Document360Connector("Your Workspace", ["Your categories"])
document360_connector = Document360Connector(os.environ["DOCUMENT360_WORKSPACE"])
document360_connector.load_credentials(
{"portal_id": "Your Portal ID", "document360_api_token": "Your API Token"}
{
"portal_id": os.environ["DOCUMENT360_PORTAL_ID"],
"document360_api_token": os.environ["DOCUMENT360_API_TOKEN"],
}
)
current = time.time()
one_day_ago = current - 24 * 60 * 60 # 1 days
one_day_ago = current - 24 * 60 * 60 * 360 # 1 year
latest_docs = document360_connector.poll_source(one_day_ago, current)
for doc in latest_docs:
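
The swap from raw soup.get_text() to the shared parse_html_page_basic matters because get_text() joins adjacent elements with no separator at all, while the shared helper (presumably routing through the format_document_soup fixes above) preserves readable boundaries. The failure mode is easy to reproduce:

from bs4 import BeautifulSoup

html = "<p>First paragraph</p><p>Second<br>line</p>"
# get_text() runs adjacent elements together with no separator
print(BeautifulSoup(html, "html.parser").get_text())
# First paragraphSecondline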

View File

@@ -1,4 +1,6 @@
from collections.abc import Generator
from datetime import datetime
from datetime import timezone
from pathlib import Path
from typing import Any
from typing import IO
@@ -41,6 +43,7 @@ def _open_files_at_location(
def _process_file(
    file_name: str,
    file: IO[Any],
    time_updated: datetime,
    pdf_pass: str | None = None,
) -> list[Document]:
    extension = get_file_ext(file_name)
@@ -63,6 +66,7 @@ def _process_file(
            sections=[Section(link=metadata.get("link", ""), text=file_content_raw)],
            source=DocumentSource.FILE,
            semantic_identifier=file_name,
            doc_updated_at=time_updated,
            metadata={},
        )
    ]
@@ -85,10 +89,13 @@ class LocalFileConnector(LoadConnector):
    def load_from_state(self) -> GenerateDocumentsOutput:
        documents: list[Document] = []
        for file_location in self.file_locations:
            current_datetime = datetime.now(timezone.utc)
            files = _open_files_at_location(file_location)

            for file_name, file in files:
                documents.extend(_process_file(file_name, file, self.pdf_pass))
                documents.extend(
                    _process_file(file_name, file, current_datetime, self.pdf_pass)
                )

                if len(documents) >= self.batch_size:
                    yield documents
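
Since local files carry no reliable modified-at of their own in this flow, the connector stamps ingestion time instead: datetime.now(timezone.utc) is taken once per file location, so every document produced from that location shares the same doc_updated_at.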

View File

@@ -1,6 +1,7 @@
import itertools
from collections.abc import Iterator
from datetime import datetime
from datetime import timezone
from typing import Any
from typing import cast
@@ -42,8 +43,11 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document:
        sections=[Section(link=pull_request.html_url, text=full_context)],
        source=DocumentSource.GITHUB,
        semantic_identifier=pull_request.title,
        # updated_at is UTC but timezone-unaware; explicitly attach UTC, since
        # indexing logic guards against wrongly timestamped docs caused by
        # local-time discrepancies with UTC
        doc_updated_at=pull_request.updated_at.replace(tzinfo=timezone.utc),
        metadata={
            "last_modified": str(pull_request.last_modified),
            "merged": pull_request.merged,
            "state": pull_request.state,
        },
    )
@@ -62,8 +66,9 @@ def _convert_issue_to_document(issue: Issue) -> Document:
        sections=[Section(link=issue.html_url, text=full_context)],
        source=DocumentSource.GITHUB,
        semantic_identifier=issue.title,
        # updated_at is UTC but timezone-unaware
        doc_updated_at=issue.updated_at.replace(tzinfo=timezone.utc),
        metadata={
            "last_modified": str(issue.updated_at),
            "state": issue.state,
        },
    )
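
The comments above flag the subtle part: PyGithub hands back updated_at as a naive datetime that is already in UTC, so replace(tzinfo=...) is the right call because it attaches the zone without shifting the clock; astimezone() on a naive value would instead assume local time and shift it. A sketch of the difference:

from datetime import datetime, timezone

naive_utc = datetime(2023, 10, 20, 17, 3, 28)  # naive, but already UTC

# Correct here: attach UTC without changing the wall-clock value
print(naive_utc.replace(tzinfo=timezone.utc))
# 2023-10-20 17:03:28+00:00

# Wrong here: a naive datetime is assumed to be local time and shifted,
# e.g. on a UTC-7 machine this prints 2023-10-21 00:03:28+00:00
print(naive_utc.astimezone(timezone.utc))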

View File

@@ -90,6 +90,9 @@ class GongConnector(LoadConnector, PollConnector):
            response = requests.post(
                url, headers=self._get_auth_header(), json=body
            )
            # If no calls in the range, just break out
            if response.status_code == 404:
                break
            response.raise_for_status()

            data = response.json()
@@ -223,6 +226,9 @@ class GongConnector(LoadConnector, PollConnector):
                        source=DocumentSource.GONG,
                        # Should not ever be Untitled as a call cannot be made without a Title
                        semantic_identifier=call_title or "Untitled",
                        doc_updated_at=datetime.fromisoformat(
                            call_metadata["started"]
                        ).astimezone(timezone.utc),
                        metadata={"Start Time": call_metadata["started"]},
                    )
                )
@@ -270,6 +276,5 @@ if __name__ == "__main__":
    )
    current = time.time()
    one_day_ago = current - 24 * 60 * 60  # 1 day
    latest_docs = connector.poll_source(one_day_ago, current)
    latest_docs = connector.load_from_state()
    print(next(latest_docs))
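
Unlike the GitHub case, Gong's "started" value arrives as an offset-qualified ISO-8601 string, so datetime.fromisoformat produces an aware datetime and astimezone(timezone.utc) is the correct normalization. A sketch with an assumed sample value:

from datetime import datetime, timezone

started = "2023-10-20T10:00:00-07:00"  # assumed call_metadata["started"] shape
print(datetime.fromisoformat(started).astimezone(timezone.utc))
# 2023-10-20 17:00:00+00:00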

View File

@@ -1,8 +1,9 @@
import datetime
import io
import tempfile
from collections.abc import Iterator
from collections.abc import Sequence
from datetime import datetime
from datetime import timezone
from enum import Enum
from itertools import chain
from typing import Any
@@ -83,7 +84,7 @@ def _run_drive_file_query(
            includeItemsFromAllDrives=include_shared,
            fields=(
                "nextPageToken, files(mimeType, id, name, "
                "webViewLink, shortcutDetails)"
                "modifiedTime, webViewLink, shortcutDetails)"
            ),
            pageToken=next_page_token,
            q=query,
@@ -194,12 +195,10 @@ def _get_files(
) -> Iterator[GoogleDriveFileType]:
    query = f"mimeType != '{DRIVE_FOLDER_TYPE}' "
    if time_range_start is not None:
        time_start = (
            datetime.datetime.utcfromtimestamp(time_range_start).isoformat() + "Z"
        )
        time_start = datetime.utcfromtimestamp(time_range_start).isoformat() + "Z"
        query += f"and modifiedTime >= '{time_start}' "
    if time_range_end is not None:
        time_stop = datetime.datetime.utcfromtimestamp(time_range_end).isoformat() + "Z"
        time_stop = datetime.utcfromtimestamp(time_range_end).isoformat() + "Z"
        query += f"and modifiedTime <= '{time_stop}' "
    if folder_id:
        query += f"and '{folder_id}' in parents "
@@ -464,6 +463,9 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
                        ],
                        source=DocumentSource.GOOGLE_DRIVE,
                        semantic_identifier=file["name"],
                        doc_updated_at=datetime.fromisoformat(
                            file["modifiedTime"]
                        ).astimezone(timezone.utc),
                        metadata={} if text_contents else {IGNORE_FOR_QA: True},
                    )
                )
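
With from datetime import datetime in place, the query bounds are built by rendering epoch seconds as a naive UTC ISO string plus a trailing "Z", the RFC 3339 form Drive's modifiedTime filter expects; the newly requested modifiedTime field is what feeds doc_updated_at. For example:

from datetime import datetime

time_range_start = 1697846608.0  # epoch seconds, as supplied by poll_source
time_start = datetime.utcfromtimestamp(time_range_start).isoformat() + "Z"
print(time_start)  # 2023-10-21T00:03:28Z

# 'application/vnd.google-apps.folder' is Drive's folder MIME type
query = f"mimeType != 'application/vnd.google-apps.folder' and modifiedTime >= '{time_start}' "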

View File

@@ -1,4 +1,5 @@
from datetime import datetime
from datetime import timezone
from typing import Any
import requests
@@ -106,6 +107,8 @@ class HubSpotConnector(LoadConnector, PollConnector):
                sections=[Section(link=link, text=content_text)],
                source=DocumentSource.HUBSPOT,
                semantic_identifier=title,
                # Already timezone-aware in UTC (tzutc); just swap in the stdlib timezone object
                doc_updated_at=ticket.updated_at.replace(tzinfo=timezone.utc),
                metadata={},
            )
        )
@@ -130,15 +133,11 @@ class HubSpotConnector(LoadConnector, PollConnector):
if __name__ == "__main__":
    import os
    import time

    test_connector = HubSpotConnector()
    test_connector.load_credentials(
    connector = HubSpotConnector()
    connector.load_credentials(
        {"hubspot_access_token": os.environ["HUBSPOT_ACCESS_TOKEN"]}
    )
    all_docs = test_connector.load_from_state()

    current = time.time()
    one_day_ago = current - 24 * 60 * 60  # 1 day
    latest_docs = test_connector.poll_source(one_day_ago, current)
    print(latest_docs)
    document_batches = connector.load_from_state()
    print(next(document_batches))
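
Per the comment, the HubSpot client already returns updated_at aware in UTC (dateutil's tzutc), so replace(tzinfo=timezone.utc) merely swaps the tzinfo implementation; the instant is unchanged only because both zones are UTC. A sketch with an assumed sample value:

from datetime import datetime, timezone

from dateutil.tz import tzutc

updated_at = datetime(2023, 10, 20, 17, 3, 28, tzinfo=tzutc())
# Same instant; only the tzinfo object changes
print(updated_at.replace(tzinfo=timezone.utc))
# 2023-10-20 17:03:28+00:00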

View File

@@ -2,6 +2,8 @@ import time
from collections.abc import Generator
from dataclasses import dataclass
from dataclasses import fields
from datetime import datetime
from datetime import timezone
from typing import Any
from typing import Optional
@@ -191,6 +193,9 @@ class NotionConnector(LoadConnector, PollConnector):
                    ],
                    source=DocumentSource.NOTION,
                    semantic_identifier=page_title,
                    doc_updated_at=datetime.fromisoformat(
                        page.last_edited_time
                    ).astimezone(timezone.utc),
                    metadata={},
                )
            )
@@ -323,8 +328,7 @@ class NotionConnector(LoadConnector, PollConnector):
if __name__ == "__main__":
    import os

    root_page_id = os.environ.get("NOTION_ROOT_PAGE_ID")
    connector = NotionConnector(root_page_id=root_page_id)
    connector = NotionConnector()
    connector.load_credentials(
        {"notion_integration_token": os.environ.get("NOTION_INTEGRATION_TOKEN")}
    )
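
Notion's last_edited_time is ISO-8601 with a trailing "Z" (e.g. 2023-10-20T17:03:00.000Z); note that datetime.fromisoformat only accepts the "Z" suffix on Python 3.11 and later. A sketch with an assumed sample value:

from datetime import datetime, timezone

last_edited_time = "2023-10-20T17:03:00.000Z"  # "Z" parsing needs Python 3.11+
print(datetime.fromisoformat(last_edited_time).astimezone(timezone.utc))
# 2023-10-20 17:03:00+00:00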

View File

@@ -1,7 +1,8 @@
import json
import os
from collections.abc import Callable
from collections.abc import Generator
from datetime import datetime
from datetime import timezone
from pathlib import Path
from typing import Any
from typing import cast
@@ -128,6 +129,18 @@ def get_thread(client: WebClient, channel_id: str, thread_id: str) -> ThreadType
    return threads


def get_latest_message_time(thread: ThreadType) -> datetime:
    max_ts = max([float(msg.get("ts", 0)) for msg in thread])
    return datetime.fromtimestamp(max_ts, tz=timezone.utc)


def get_event_time(event: dict[str, Any]) -> datetime | None:
    ts = event.get("ts")
    if not ts:
        return None
    return datetime.fromtimestamp(float(ts), tz=timezone.utc)


def thread_to_doc(
    workspace: str,
    channel: ChannelType,
@@ -148,6 +161,7 @@ def thread_to_doc(
        ],
        source=DocumentSource.SLACK,
        semantic_identifier=channel["name"],
        doc_updated_at=get_latest_message_time(thread),
        title="",  # slack docs don't really have a "title"
        metadata={},
    )
@@ -304,6 +318,7 @@ class SlackLoadConnector(LoadConnector):
                source=matching_doc.source,
                semantic_identifier=matching_doc.semantic_identifier,
                title="",  # slack docs don't really have a "title"
                doc_updated_at=get_event_time(slack_event),
                metadata=matching_doc.metadata,
            )
@@ -322,6 +337,7 @@ class SlackLoadConnector(LoadConnector):
                source=DocumentSource.SLACK,
                semantic_identifier=channel["name"],
                title="",  # slack docs don't really have a "title"
                doc_updated_at=get_event_time(slack_event),
                metadata={},
            )
@@ -403,3 +419,19 @@ class SlackPollConnector(PollConnector):
            if documents:
                yield documents


if __name__ == "__main__":
    import os
    import time

    connector = SlackPollConnector(
        workspace=os.environ["SLACK_WORKSPACE"], channels=[os.environ["SLACK_CHANNEL"]]
    )
    connector.load_credentials({"slack_bot_token": os.environ["SLACK_BOT_TOKEN"]})
    current = time.time()
    one_day_ago = current - 24 * 60 * 60  # 1 day
    document_batches = connector.poll_source(one_day_ago, current)
    print(next(document_batches))
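
Slack "ts" values are epoch-seconds strings with microsecond precision, so the new helpers reduce to a float conversion plus datetime.fromtimestamp(..., tz=timezone.utc). A quick check with assumed sample messages:

from datetime import datetime, timezone

thread = [{"ts": "1697846608.000200"}, {"ts": "1697846999.000100"}]
max_ts = max(float(msg.get("ts", 0)) for msg in thread)
print(datetime.fromtimestamp(max_ts, tz=timezone.utc))
# 2023-10-21 00:09:59.000100+00:00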

View File

@@ -1,5 +1,4 @@
import io
from datetime import datetime
from enum import Enum
from typing import Any
from typing import cast
@@ -173,8 +172,6 @@ class WebConnector(LoadConnector):
logger.info(f"Visiting {current_url}")
try:
current_visit_time = datetime.now().strftime("%B %d, %Y, %H:%M:%S")
if restart_playwright:
playwright, context = start_playwright()
restart_playwright = False
@@ -192,7 +189,7 @@ class WebConnector(LoadConnector):
                        sections=[Section(link=current_url, text=page_text)],
                        source=DocumentSource.WEB,
                        semantic_identifier=current_url.split(".")[-1],
                        metadata={"Time Visited": current_visit_time},
                        metadata={},
                    )
                )
                continue