Add Document UpdatedAt times for most connectors (#605)
HTML utils (format_document_soup):

@@ -47,6 +47,7 @@ def format_document_soup(
     list_element_start = False
     verbatim_output = 0
     in_table = False
+    last_added_newline = False
     for e in document.descendants:
         verbatim_output -= 1
         if isinstance(e, bs4.element.NavigableString):
@@ -57,11 +58,29 @@ def format_document_soup(
                 # Tables are represented in natural language with rows separated by newlines
                 # Can't have newlines then in the table elements
                 element_text = element_text.replace("\n", " ").strip()
+
+            # Some tags are translated to spaces but in the logic underneath this section, we
+            # translate them to newlines as a browser should render them such as with br
+            # This logic here avoids a space after newline when it shouldn't be there.
+            if last_added_newline and element_text.startswith(" "):
+                element_text = element_text[1:]
+                last_added_newline = False
+
             if element_text:
-                if verbatim_output > 0:
-                    text += element_text
-                else:
-                    text += strip_newlines(element_text)
+                content_to_add = (
+                    element_text
+                    if verbatim_output > 0
+                    else strip_newlines(element_text)
+                )
+
+                # Don't join separate elements without any spacing
+                if (text and not text[-1].isspace()) and (
+                    content_to_add and not content_to_add[0].isspace()
+                ):
+                    text += " "
+
+                text += content_to_add
+
                 list_element_start = False
         elif isinstance(e, bs4.element.Tag):
             # table is standard HTML element
@@ -82,9 +101,14 @@ def format_document_soup(
             elif e.name in ["p", "div"]:
                 if not list_element_start:
                     text += "\n"
-            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
+            elif e.name in ["h1", "h2", "h3", "h4"]:
                 text += "\n"
                 list_element_start = False
+                last_added_newline = True
+            elif e.name == "br":
+                text += "\n"
+                list_element_start = False
+                last_added_newline = True
             elif e.name == "li":
                 text += "\n- "
                 list_element_start = True
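Note: a minimal standalone sketch (not the danswer function itself) of the whitespace
handling these hunks introduce. After a tag that renders as a newline, the following
text node often keeps its leading space; the new last_added_newline flag drops it:

    import bs4

    soup = bs4.BeautifulSoup("line one<br> line two", "html.parser")
    text = ""
    last_added_newline = False
    for e in soup.descendants:
        if isinstance(e, bs4.element.NavigableString):
            element_text = str(e)
            if last_added_newline and element_text.startswith(" "):
                element_text = element_text[1:]  # drop the space right after our newline
                last_added_newline = False
            text += element_text
        elif isinstance(e, bs4.element.Tag) and e.name == "br":
            text += "\n"
            last_added_newline = True

    print(repr(text))  # 'line one\nline two', not 'line one\n line two'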
Jira connector:

@@ -3,6 +3,7 @@ from datetime import timezone
 from typing import Any
 from urllib.parse import urlparse
 
+from dateutil.parser import parse
 from jira import JIRA
 from jira.resources import Issue
 
@@ -59,6 +60,8 @@ def fetch_jira_issues_batch(
             logger.warning(f"Found Jira object not of type Issue {jira}")
             continue
 
+        ticket_updated_time = parse(jira.fields.updated)
+
         semantic_rep = (
             f"Jira Ticket Summary: {jira.fields.summary}\n"
             f"Description: {jira.fields.description}\n"
@@ -75,6 +78,7 @@ def fetch_jira_issues_batch(
                 sections=[Section(link=page_url, text=semantic_rep)],
                 source=DocumentSource.JIRA,
                 semantic_identifier=jira.fields.summary,
+                doc_updated_at=ticket_updated_time.astimezone(timezone.utc),
                 metadata={},
             )
         )
@@ -151,3 +155,17 @@ class JiraConnector(LoadConnector, PollConnector):
             start_ind += fetched_batch_size
             if fetched_batch_size < self.batch_size:
                 break
+
+
+if __name__ == "__main__":
+    import os
+
+    connector = JiraConnector(os.environ["JIRA_PROJECT_URL"])
+    connector.load_credentials(
+        {
+            "jira_user_email": os.environ["JIRA_USER_EMAIL"],
+            "jira_api_token": os.environ["JIRA_API_TOKEN"],
+        }
+    )
+    document_batches = connector.load_from_state()
+    print(next(document_batches))
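Note: the new doc_updated_at assumes dateutil can parse Jira's offset-bearing
"updated" strings; a small sketch with a made-up timestamp:

    from datetime import timezone

    from dateutil.parser import parse

    ticket_updated_time = parse("2023-10-15T08:30:00.000-0400")  # hypothetical value
    print(ticket_updated_time.astimezone(timezone.utc).isoformat())
    # 2023-10-15T12:30:00+00:00 -- offset-aware and normalized to UTC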
Document360 connector:

@@ -5,10 +5,10 @@ from typing import List
 from typing import Optional
 
 import requests
-from bs4 import BeautifulSoup
 
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector
@@ -120,16 +120,21 @@ class Document360Connector(LoadConnector, PollConnector):
             if end is not None and updated_at > end:
                 continue
 
+            authors = [
+                author["email_id"]
+                for author in article_details.get("authors", [])
+                if author["email_id"]
+            ]
+
             doc_link = f"{DOCUMENT360_BASE_URL}/{self.portal_id}/document/v1/view/{article['id']}"
 
             html_content = article_details["html_content"]
-            soup = BeautifulSoup(html_content, "html.parser")
-            article_content = soup.get_text()
+            article_content = parse_html_page_basic(html_content)
+
             doc_text = (
                 f"workspace: {self.workspace}\n"
                 f"category: {article['category_name']}\n"
                 f"article: {article_details['title']} - "
-                f"{article_details.get('description', '')} - "
+                f"{article_details.get('description', '')}\n"
                 f"{article_content}"
             )
 
@@ -138,6 +143,8 @@ class Document360Connector(LoadConnector, PollConnector):
                     sections=[Section(link=doc_link, text=doc_text)],
                     source=DocumentSource.DOCUMENT360,
                     semantic_identifier=article_details["title"],
+                    doc_updated_at=updated_at,
+                    primary_owners=authors,
                     metadata={},
                 )
             )
@@ -163,14 +170,18 @@ class Document360Connector(LoadConnector, PollConnector):
 
 if __name__ == "__main__":
     import time
+    import os
 
-    document360_connector = Document360Connector("Your Workspace", ["Your categories"])
+    document360_connector = Document360Connector(os.environ["DOCUMENT360_WORKSPACE"])
     document360_connector.load_credentials(
-        {"portal_id": "Your Portal ID", "document360_api_token": "Your API Token"}
+        {
+            "portal_id": os.environ["DOCUMENT360_PORTAL_ID"],
+            "document360_api_token": os.environ["DOCUMENT360_API_TOKEN"],
+        }
     )
 
     current = time.time()
-    one_day_ago = current - 24 * 60 * 60  # 1 days
+    one_day_ago = current - 24 * 60 * 60 * 360  # 1 year
     latest_docs = document360_connector.poll_source(one_day_ago, current)
 
     for doc in latest_docs:
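Note: the new authors list keeps only entries with a non-empty email_id; a quick
illustration with made-up payload data:

    article_details = {
        "authors": [
            {"email_id": "alice@example.com"},
            {"email_id": ""},  # filtered out
            {"email_id": "bob@example.com"},
        ]
    }
    authors = [
        author["email_id"]
        for author in article_details.get("authors", [])
        if author["email_id"]
    ]
    print(authors)  # ['alice@example.com', 'bob@example.com']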
File connector:

@@ -1,4 +1,6 @@
 from collections.abc import Generator
+from datetime import datetime
+from datetime import timezone
 from pathlib import Path
 from typing import Any
 from typing import IO
@@ -41,6 +43,7 @@ def _open_files_at_location(
 def _process_file(
     file_name: str,
     file: IO[Any],
+    time_updated: datetime,
     pdf_pass: str | None = None,
 ) -> list[Document]:
     extension = get_file_ext(file_name)
@@ -63,6 +66,7 @@ def _process_file(
             sections=[Section(link=metadata.get("link", ""), text=file_content_raw)],
             source=DocumentSource.FILE,
             semantic_identifier=file_name,
+            doc_updated_at=time_updated,
             metadata={},
         )
     ]
@@ -85,10 +89,13 @@ class LocalFileConnector(LoadConnector):
     def load_from_state(self) -> GenerateDocumentsOutput:
         documents: list[Document] = []
         for file_location in self.file_locations:
+            current_datetime = datetime.now(timezone.utc)
             files = _open_files_at_location(file_location)
 
             for file_name, file in files:
-                documents.extend(_process_file(file_name, file, self.pdf_pass))
+                documents.extend(
+                    _process_file(file_name, file, current_datetime, self.pdf_pass)
+                )
 
                 if len(documents) >= self.batch_size:
                     yield documents
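Note: local files carry no update time of their own in this connector, so the
indexing time stands in for it: one UTC snapshot is taken per file location and
shared by every document produced from it. A minimal sketch of the pattern:

    from datetime import datetime, timezone

    current_datetime = datetime.now(timezone.utc)  # captured once per location
    for file_name in ["a.txt", "b.txt"]:  # hypothetical batch
        print(file_name, current_datetime.isoformat())  # same doc_updated_at for all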
GitHub connector:

@@ -1,6 +1,7 @@
 import itertools
 from collections.abc import Iterator
 from datetime import datetime
+from datetime import timezone
 from typing import Any
 from typing import cast
 
@@ -42,8 +43,11 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document:
         sections=[Section(link=pull_request.html_url, text=full_context)],
         source=DocumentSource.GITHUB,
         semantic_identifier=pull_request.title,
+        # updated_at is UTC time but is timezone unaware, explicitly add UTC
+        # as there is logic in indexing to prevent wrong timestamped docs
+        # due to local time discrepancies with UTC
+        doc_updated_at=pull_request.updated_at.replace(tzinfo=timezone.utc),
         metadata={
-            "last_modified": str(pull_request.last_modified),
             "merged": pull_request.merged,
             "state": pull_request.state,
         },
@@ -62,8 +66,9 @@ def _convert_issue_to_document(issue: Issue) -> Document:
         sections=[Section(link=issue.html_url, text=full_context)],
         source=DocumentSource.GITHUB,
         semantic_identifier=issue.title,
+        # updated_at is UTC time but is timezone unaware
+        doc_updated_at=issue.updated_at.replace(tzinfo=timezone.utc),
         metadata={
-            "last_modified": str(issue.updated_at),
             "state": issue.state,
         },
     )
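Note: the comments in the hunks above carry the key subtlety. PyGithub's updated_at
is a naive datetime whose wall-clock value is already UTC, so replace(tzinfo=timezone.utc)
just labels it; calling astimezone() on a naive value would instead treat it as local
time and shift the clock. A sketch with a made-up timestamp:

    from datetime import datetime, timezone

    naive_utc = datetime(2023, 10, 20, 14, 0, 0)  # hypothetical updated_at from the API
    labeled = naive_utc.replace(tzinfo=timezone.utc)
    print(labeled.isoformat())  # 2023-10-20T14:00:00+00:00 -- same wall clock, now aware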
Gong connector:

@@ -90,6 +90,9 @@ class GongConnector(LoadConnector, PollConnector):
             response = requests.post(
                 url, headers=self._get_auth_header(), json=body
             )
+            # If no calls in the range, just break out
+            if response.status_code == 404:
+                break
             response.raise_for_status()
 
             data = response.json()
@@ -223,6 +226,9 @@ class GongConnector(LoadConnector, PollConnector):
                     source=DocumentSource.GONG,
                     # Should not ever be Untitled as a call cannot be made without a Title
                     semantic_identifier=call_title or "Untitled",
+                    doc_updated_at=datetime.fromisoformat(
+                        call_metadata["started"]
+                    ).astimezone(timezone.utc),
                     metadata={"Start Time": call_metadata["started"]},
                 )
             )
@@ -270,6 +276,5 @@ if __name__ == "__main__":
     )
 
     current = time.time()
-    one_day_ago = current - 24 * 60 * 60  # 1 day
-    latest_docs = connector.poll_source(one_day_ago, current)
+    latest_docs = connector.load_from_state()
     print(next(latest_docs))
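Note: a sketch of the started-time conversion, assuming (as the code implies) that
call_metadata["started"] is an ISO-8601 string with a UTC offset; the value is made up:

    from datetime import datetime, timezone

    call_started = "2023-10-20T09:15:00-07:00"  # hypothetical call_metadata["started"]
    doc_updated_at = datetime.fromisoformat(call_started).astimezone(timezone.utc)
    print(doc_updated_at.isoformat())  # 2023-10-20T16:15:00+00:00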
Google Drive connector:

@@ -1,8 +1,9 @@
-import datetime
 import io
 import tempfile
 from collections.abc import Iterator
 from collections.abc import Sequence
+from datetime import datetime
+from datetime import timezone
 from enum import Enum
 from itertools import chain
 from typing import Any
@@ -83,7 +84,7 @@ def _run_drive_file_query(
         includeItemsFromAllDrives=include_shared,
         fields=(
             "nextPageToken, files(mimeType, id, name, "
-            "webViewLink, shortcutDetails)"
+            "modifiedTime, webViewLink, shortcutDetails)"
         ),
         pageToken=next_page_token,
         q=query,
@@ -194,12 +195,10 @@ def _get_files(
 ) -> Iterator[GoogleDriveFileType]:
     query = f"mimeType != '{DRIVE_FOLDER_TYPE}' "
     if time_range_start is not None:
-        time_start = (
-            datetime.datetime.utcfromtimestamp(time_range_start).isoformat() + "Z"
-        )
+        time_start = datetime.utcfromtimestamp(time_range_start).isoformat() + "Z"
         query += f"and modifiedTime >= '{time_start}' "
     if time_range_end is not None:
-        time_stop = datetime.datetime.utcfromtimestamp(time_range_end).isoformat() + "Z"
+        time_stop = datetime.utcfromtimestamp(time_range_end).isoformat() + "Z"
         query += f"and modifiedTime <= '{time_stop}' "
     if folder_id:
         query += f"and '{folder_id}' in parents "
@@ -464,6 +463,9 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
                     ],
                     source=DocumentSource.GOOGLE_DRIVE,
                     semantic_identifier=file["name"],
+                    doc_updated_at=datetime.fromisoformat(
+                        file["modifiedTime"]
+                    ).astimezone(timezone.utc),
                     metadata={} if text_contents else {IGNORE_FOR_QA: True},
                 )
             )
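Note: Drive's files.list takes RFC 3339 timestamps in modifiedTime clauses, which is
what the isoformat() + "Z" construction produces. A sketch of the query building with
an arbitrary epoch value, writing out the folder MIME type that DRIVE_FOLDER_TYPE
presumably holds:

    from datetime import datetime

    time_range_start = 1697760000.0  # hypothetical epoch seconds
    time_start = datetime.utcfromtimestamp(time_range_start).isoformat() + "Z"
    query = "mimeType != 'application/vnd.google-apps.folder' "
    query += f"and modifiedTime >= '{time_start}' "
    print(query)  # ... and modifiedTime >= '2023-10-20T00:00:00Z'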
HubSpot connector:

@@ -1,4 +1,5 @@
 from datetime import datetime
+from datetime import timezone
 from typing import Any
 
 import requests
@@ -106,6 +107,8 @@ class HubSpotConnector(LoadConnector, PollConnector):
                     sections=[Section(link=link, text=content_text)],
                     source=DocumentSource.HUBSPOT,
                     semantic_identifier=title,
+                    # Is already in tzutc, just replacing the timezone format
+                    doc_updated_at=ticket.updated_at.replace(tzinfo=timezone.utc),
                     metadata={},
                 )
             )
@@ -130,15 +133,11 @@ class HubSpotConnector(LoadConnector, PollConnector):
 
 if __name__ == "__main__":
     import os
-    import time
 
-    test_connector = HubSpotConnector()
-    test_connector.load_credentials(
+    connector = HubSpotConnector()
+    connector.load_credentials(
         {"hubspot_access_token": os.environ["HUBSPOT_ACCESS_TOKEN"]}
     )
-    all_docs = test_connector.load_from_state()
 
-    current = time.time()
-    one_day_ago = current - 24 * 60 * 60  # 1 day
-    latest_docs = test_connector.poll_source(one_day_ago, current)
-    print(latest_docs)
+    document_batches = connector.load_from_state()
+    print(next(document_batches))
Notion connector:

@@ -2,6 +2,8 @@ import time
 from collections.abc import Generator
 from dataclasses import dataclass
 from dataclasses import fields
+from datetime import datetime
+from datetime import timezone
 from typing import Any
 from typing import Optional
 
@@ -191,6 +193,9 @@ class NotionConnector(LoadConnector, PollConnector):
                     ],
                     source=DocumentSource.NOTION,
                     semantic_identifier=page_title,
+                    doc_updated_at=datetime.fromisoformat(
+                        page.last_edited_time
+                    ).astimezone(timezone.utc),
                     metadata={},
                 )
             )
@@ -323,8 +328,7 @@ class NotionConnector(LoadConnector, PollConnector):
 if __name__ == "__main__":
     import os
 
-    root_page_id = os.environ.get("NOTION_ROOT_PAGE_ID")
-    connector = NotionConnector(root_page_id=root_page_id)
+    connector = NotionConnector()
     connector.load_credentials(
         {"notion_integration_token": os.environ.get("NOTION_INTEGRATION_TOKEN")}
     )
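Note: same conversion pattern for Notion's ISO-8601 last_edited_time; be aware that
fromisoformat() only accepts a trailing "Z" on Python 3.11+, so this made-up value
uses an explicit offset:

    from datetime import datetime, timezone

    last_edited_time = "2023-10-20T12:00:00.000+00:00"  # hypothetical
    print(datetime.fromisoformat(last_edited_time).astimezone(timezone.utc).isoformat())
    # 2023-10-20T12:00:00+00:00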
Slack connector:

@@ -1,7 +1,8 @@
 import json
-import os
 from collections.abc import Callable
 from collections.abc import Generator
+from datetime import datetime
+from datetime import timezone
 from pathlib import Path
 from typing import Any
 from typing import cast
@@ -128,6 +129,18 @@ def get_thread(client: WebClient, channel_id: str, thread_id: str) -> ThreadType
     return threads
 
 
+def get_latest_message_time(thread: ThreadType) -> datetime:
+    max_ts = max([float(msg.get("ts", 0)) for msg in thread])
+    return datetime.fromtimestamp(max_ts, tz=timezone.utc)
+
+
+def get_event_time(event: dict[str, Any]) -> datetime | None:
+    ts = event.get("ts")
+    if not ts:
+        return None
+    return datetime.fromtimestamp(float(ts), tz=timezone.utc)
+
+
 def thread_to_doc(
     workspace: str,
     channel: ChannelType,
@@ -148,6 +161,7 @@ def thread_to_doc(
         ],
         source=DocumentSource.SLACK,
         semantic_identifier=channel["name"],
+        doc_updated_at=get_latest_message_time(thread),
         title="",  # slack docs don't really have a "title"
         metadata={},
     )
@@ -304,6 +318,7 @@ class SlackLoadConnector(LoadConnector):
                     source=matching_doc.source,
                     semantic_identifier=matching_doc.semantic_identifier,
                     title="",  # slack docs don't really have a "title"
+                    doc_updated_at=get_event_time(slack_event),
                     metadata=matching_doc.metadata,
                 )
 
@@ -322,6 +337,7 @@ class SlackLoadConnector(LoadConnector):
                 source=DocumentSource.SLACK,
                 semantic_identifier=channel["name"],
                 title="",  # slack docs don't really have a "title"
+                doc_updated_at=get_event_time(slack_event),
                 metadata={},
             )
 
@@ -403,3 +419,19 @@ class SlackPollConnector(PollConnector):
 
         if documents:
             yield documents
+
+
+if __name__ == "__main__":
+    import os
+    import time
+
+    connector = SlackPollConnector(
+        workspace=os.environ["SLACK_WORKSPACE"], channels=[os.environ["SLACK_CHANNEL"]]
+    )
+    connector.load_credentials({"slack_bot_token": os.environ["SLACK_BOT_TOKEN"]})
+
+    current = time.time()
+    one_day_ago = current - 24 * 60 * 60  # 1 day
+    document_batches = connector.poll_source(one_day_ago, current)
+
+    print(next(document_batches))
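Note: Slack "ts" values are epoch seconds encoded as strings, so the newest message
time is just the max after a float conversion; a sketch of get_latest_message_time
on a made-up thread:

    from datetime import datetime, timezone

    thread = [{"ts": "1697804400.000200"}, {"ts": "1697808000.000100"}]  # hypothetical
    max_ts = max([float(msg.get("ts", 0)) for msg in thread])
    print(datetime.fromtimestamp(max_ts, tz=timezone.utc).isoformat())
    # 2023-10-20T13:20:00.000100+00:00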
Web connector:

@@ -1,5 +1,4 @@
 import io
-from datetime import datetime
 from enum import Enum
 from typing import Any
 from typing import cast
@@ -173,8 +172,6 @@ class WebConnector(LoadConnector):
                 logger.info(f"Visiting {current_url}")
 
                 try:
-                    current_visit_time = datetime.now().strftime("%B %d, %Y, %H:%M:%S")
-
                     if restart_playwright:
                         playwright, context = start_playwright()
                         restart_playwright = False
@@ -192,7 +189,7 @@ class WebConnector(LoadConnector):
                             sections=[Section(link=current_url, text=page_text)],
                             source=DocumentSource.WEB,
                             semantic_identifier=current_url.split(".")[-1],
-                            metadata={"Time Visited": current_visit_time},
+                            metadata={},
                         )
                     )
                     continue