Add Document UpdatedAt times for most connectors (#605)

Yuhong Sun
2023-10-20 17:03:28 -07:00
committed by GitHub
parent a7099a1917
commit 5ddc9b34ab
11 changed files with 142 additions and 38 deletions

View File

@@ -47,6 +47,7 @@ def format_document_soup(
    list_element_start = False
    verbatim_output = 0
    in_table = False
    last_added_newline = False
    for e in document.descendants:
        verbatim_output -= 1
        if isinstance(e, bs4.element.NavigableString):
@@ -57,11 +58,29 @@ def format_document_soup(
                # Tables are represented in natural language with rows separated by newlines
                # Can't have newlines then in the table elements
                element_text = element_text.replace("\n", " ").strip()
            # Some tags normally map to spaces, but the logic below renders certain
            # tags (such as br) as newlines, the way a browser would.
            # This check avoids leaving a stray space right after such a newline.
            if last_added_newline and element_text.startswith(" "):
                element_text = element_text[1:]
                last_added_newline = False
            if element_text:
                if verbatim_output > 0:
                    text += element_text
                else:
                    text += strip_newlines(element_text)
                content_to_add = (
                    element_text
                    if verbatim_output > 0
                    else strip_newlines(element_text)
                )
                # Don't join separate elements without any spacing
                if (text and not text[-1].isspace()) and (
                    content_to_add and not content_to_add[0].isspace()
                ):
                    text += " "
                text += content_to_add
                list_element_start = False
        elif isinstance(e, bs4.element.Tag):
            # table is standard HTML element
@@ -82,9 +101,14 @@ def format_document_soup(
elif e.name in ["p", "div"]:
if not list_element_start:
text += "\n"
elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
elif e.name in ["h1", "h2", "h3", "h4"]:
text += "\n"
list_element_start = False
last_added_newline = True
elif e.name == "br":
text += "\n"
list_element_start = False
last_added_newline = True
elif e.name == "li":
text += "\n- "
list_element_start = True
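
Taken together, the new last_added_newline flag and the join check give format_document_soup a browser-like whitespace model: adjacent inline fragments are joined by a single space, and the space a browser would collapse after a rendered line break is dropped. A minimal standalone distillation of that rule (the function name and inputs are illustrative, not the real helper):

def join_texts(chunks: list[str]) -> str:
    text = ""
    last_added_newline = False
    for chunk in chunks:
        if chunk == "\n":  # stands in for a rendered br/h1-h4 tag
            text += "\n"
            last_added_newline = True
            continue
        # Drop the space a browser would collapse right after a line break
        if last_added_newline and chunk.startswith(" "):
            chunk = chunk[1:]
        last_added_newline = False
        # Don't join separate elements without any spacing
        if (text and not text[-1].isspace()) and (chunk and not chunk[0].isspace()):
            text += " "
        text += chunk
    return text

assert join_texts(["Hello", "world"]) == "Hello world"
assert join_texts(["line one", "\n", " line two"]) == "line one\nline two"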

View File

@@ -3,6 +3,7 @@ from datetime import timezone
from typing import Any
from urllib.parse import urlparse
from dateutil.parser import parse
from jira import JIRA
from jira.resources import Issue
@@ -59,6 +60,8 @@ def fetch_jira_issues_batch(
logger.warning(f"Found Jira object not of type Issue {jira}")
continue
ticket_updated_time = parse(jira.fields.updated)
semantic_rep = (
f"Jira Ticket Summary: {jira.fields.summary}\n"
f"Description: {jira.fields.description}\n"
@@ -75,6 +78,7 @@ def fetch_jira_issues_batch(
                sections=[Section(link=page_url, text=semantic_rep)],
                source=DocumentSource.JIRA,
                semantic_identifier=jira.fields.summary,
                doc_updated_at=ticket_updated_time.astimezone(timezone.utc),
                metadata={},
            )
        )
@@ -151,3 +155,17 @@ class JiraConnector(LoadConnector, PollConnector):
            start_ind += fetched_batch_size
            if fetched_batch_size < self.batch_size:
                break


if __name__ == "__main__":
    import os

    connector = JiraConnector(os.environ["JIRA_PROJECT_URL"])
    connector.load_credentials(
        {
            "jira_user_email": os.environ["JIRA_USER_EMAIL"],
            "jira_api_token": os.environ["JIRA_API_TOKEN"],
        }
    )
    document_batches = connector.load_from_state()
    print(next(document_batches))
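
Jira reports fields.updated as an ISO-8601 string carrying a UTC offset; dateutil.parser.parse returns an aware datetime, and astimezone(timezone.utc) then normalizes it for indexing. A small sketch, with an assumed sample value:

from datetime import timezone

from dateutil.parser import parse

# Assumed example of a Jira "updated" value; the offset varies by instance
ticket_updated_time = parse("2023-10-20T17:03:28.000-0700")
print(ticket_updated_time.astimezone(timezone.utc))
# 2023-10-21 00:03:28+00:00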

View File

@@ -5,10 +5,10 @@ from typing import List
from typing import Optional
import requests
from bs4 import BeautifulSoup
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
@@ -120,16 +120,21 @@ class Document360Connector(LoadConnector, PollConnector):
            if end is not None and updated_at > end:
                continue

            authors = [
                author["email_id"]
                for author in article_details.get("authors", [])
                if author["email_id"]
            ]

            doc_link = f"{DOCUMENT360_BASE_URL}/{self.portal_id}/document/v1/view/{article['id']}"

            html_content = article_details["html_content"]
            soup = BeautifulSoup(html_content, "html.parser")
            article_content = soup.get_text()
            article_content = parse_html_page_basic(html_content)

            doc_text = (
                f"workspace: {self.workspace}\n"
                f"category: {article['category_name']}\n"
                f"article: {article_details['title']} - "
                f"{article_details.get('description', '')} - "
                f"{article_details.get('description', '')}\n"
                f"{article_content}"
            )
@@ -138,6 +143,8 @@ class Document360Connector(LoadConnector, PollConnector):
                    sections=[Section(link=doc_link, text=doc_text)],
                    source=DocumentSource.DOCUMENT360,
                    semantic_identifier=article_details["title"],
                    doc_updated_at=updated_at,
                    primary_owners=authors,
                    metadata={},
                )
@@ -163,14 +170,18 @@ class Document360Connector(LoadConnector, PollConnector):
if __name__ == "__main__":
import time
import os
document360_connector = Document360Connector("Your Workspace", ["Your categories"])
document360_connector = Document360Connector(os.environ["DOCUMENT360_WORKSPACE"])
document360_connector.load_credentials(
{"portal_id": "Your Portal ID", "document360_api_token": "Your API Token"}
{
"portal_id": os.environ["DOCUMENT360_PORTAL_ID"],
"document360_api_token": os.environ["DOCUMENT360_API_TOKEN"],
}
)
current = time.time()
one_day_ago = current - 24 * 60 * 60 # 1 days
one_day_ago = current - 24 * 60 * 60 * 360 # 1 year
latest_docs = document360_connector.poll_source(one_day_ago, current)
for doc in latest_docs:
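
The swap from raw soup.get_text() to the shared parse_html_page_basic matters because get_text() joins adjacent elements with no separator at all, while the shared helper (presumably routing through the format_document_soup fixes above) preserves readable boundaries. The failure mode is easy to reproduce:

from bs4 import BeautifulSoup

html = "<p>First paragraph</p><p>Second<br>line</p>"
# get_text() runs adjacent elements together with no separator
print(BeautifulSoup(html, "html.parser").get_text())
# First paragraphSecondline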

View File

@@ -1,4 +1,6 @@
from collections.abc import Generator
from datetime import datetime
from datetime import timezone
from pathlib import Path
from typing import Any
from typing import IO
@@ -41,6 +43,7 @@ def _open_files_at_location(
def _process_file(
    file_name: str,
    file: IO[Any],
    time_updated: datetime,
    pdf_pass: str | None = None,
) -> list[Document]:
    extension = get_file_ext(file_name)
@@ -63,6 +66,7 @@ def _process_file(
            sections=[Section(link=metadata.get("link", ""), text=file_content_raw)],
            source=DocumentSource.FILE,
            semantic_identifier=file_name,
            doc_updated_at=time_updated,
            metadata={},
        )
    ]
@@ -85,10 +89,13 @@ class LocalFileConnector(LoadConnector):
    def load_from_state(self) -> GenerateDocumentsOutput:
        documents: list[Document] = []
        for file_location in self.file_locations:
            current_datetime = datetime.now(timezone.utc)
            files = _open_files_at_location(file_location)

            for file_name, file in files:
                documents.extend(_process_file(file_name, file, self.pdf_pass))
                documents.extend(
                    _process_file(file_name, file, current_datetime, self.pdf_pass)
                )

                if len(documents) >= self.batch_size:
                    yield documents
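
Since local files carry no reliable modified-at of their own in this flow, the connector stamps ingestion time instead: datetime.now(timezone.utc) is taken once per file location, so every document produced from that location shares the same doc_updated_at.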

View File

@@ -1,6 +1,7 @@
import itertools
from collections.abc import Iterator
from datetime import datetime
from datetime import timezone
from typing import Any
from typing import cast
@@ -42,8 +43,11 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document:
        sections=[Section(link=pull_request.html_url, text=full_context)],
        source=DocumentSource.GITHUB,
        semantic_identifier=pull_request.title,
        # updated_at is UTC but timezone-unaware; explicitly attach UTC, since
        # indexing logic guards against wrongly timestamped docs caused by
        # local-time discrepancies with UTC
        doc_updated_at=pull_request.updated_at.replace(tzinfo=timezone.utc),
        metadata={
            "last_modified": str(pull_request.last_modified),
            "merged": pull_request.merged,
            "state": pull_request.state,
        },
    )
@@ -62,8 +66,9 @@ def _convert_issue_to_document(issue: Issue) -> Document:
        sections=[Section(link=issue.html_url, text=full_context)],
        source=DocumentSource.GITHUB,
        semantic_identifier=issue.title,
        # updated_at is UTC but timezone-unaware
        doc_updated_at=issue.updated_at.replace(tzinfo=timezone.utc),
        metadata={
            "last_modified": str(issue.updated_at),
            "state": issue.state,
        },
    )
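
The comments above flag the subtle part: PyGithub hands back updated_at as a naive datetime that is already in UTC, so replace(tzinfo=...) is the right call because it attaches the zone without shifting the clock; astimezone() on a naive value would instead assume local time and shift it. A sketch of the difference:

from datetime import datetime, timezone

naive_utc = datetime(2023, 10, 20, 17, 3, 28)  # naive, but already UTC

# Correct here: attach UTC without changing the wall-clock value
print(naive_utc.replace(tzinfo=timezone.utc))
# 2023-10-20 17:03:28+00:00

# Wrong here: a naive datetime is assumed to be local time and shifted,
# e.g. on a UTC-7 machine this prints 2023-10-21 00:03:28+00:00
print(naive_utc.astimezone(timezone.utc))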

View File

@@ -90,6 +90,9 @@ class GongConnector(LoadConnector, PollConnector):
            response = requests.post(
                url, headers=self._get_auth_header(), json=body
            )
            # If no calls in the range, just break out
            if response.status_code == 404:
                break
            response.raise_for_status()

            data = response.json()
@@ -223,6 +226,9 @@ class GongConnector(LoadConnector, PollConnector):
                        source=DocumentSource.GONG,
                        # Should not ever be Untitled as a call cannot be made without a Title
                        semantic_identifier=call_title or "Untitled",
                        doc_updated_at=datetime.fromisoformat(
                            call_metadata["started"]
                        ).astimezone(timezone.utc),
                        metadata={"Start Time": call_metadata["started"]},
                    )
                )
@@ -270,6 +276,5 @@ if __name__ == "__main__":
    )
    current = time.time()
    one_day_ago = current - 24 * 60 * 60  # 1 day
    latest_docs = connector.poll_source(one_day_ago, current)
    latest_docs = connector.load_from_state()
    print(next(latest_docs))
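
Unlike the GitHub case, Gong's "started" value arrives as an offset-qualified ISO-8601 string, so datetime.fromisoformat produces an aware datetime and astimezone(timezone.utc) is the correct normalization. A sketch with an assumed sample value:

from datetime import datetime, timezone

started = "2023-10-20T10:00:00-07:00"  # assumed call_metadata["started"] shape
print(datetime.fromisoformat(started).astimezone(timezone.utc))
# 2023-10-20 17:00:00+00:00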

View File

@@ -1,8 +1,9 @@
import datetime
import io
import tempfile
from collections.abc import Iterator
from collections.abc import Sequence
from datetime import datetime
from datetime import timezone
from enum import Enum
from itertools import chain
from typing import Any
@@ -83,7 +84,7 @@ def _run_drive_file_query(
            includeItemsFromAllDrives=include_shared,
            fields=(
                "nextPageToken, files(mimeType, id, name, "
                "webViewLink, shortcutDetails)"
                "modifiedTime, webViewLink, shortcutDetails)"
            ),
            pageToken=next_page_token,
            q=query,
@@ -194,12 +195,10 @@ def _get_files(
) -> Iterator[GoogleDriveFileType]:
    query = f"mimeType != '{DRIVE_FOLDER_TYPE}' "
    if time_range_start is not None:
        time_start = (
            datetime.datetime.utcfromtimestamp(time_range_start).isoformat() + "Z"
        )
        time_start = datetime.utcfromtimestamp(time_range_start).isoformat() + "Z"
        query += f"and modifiedTime >= '{time_start}' "
    if time_range_end is not None:
        time_stop = datetime.datetime.utcfromtimestamp(time_range_end).isoformat() + "Z"
        time_stop = datetime.utcfromtimestamp(time_range_end).isoformat() + "Z"
        query += f"and modifiedTime <= '{time_stop}' "
    if folder_id:
        query += f"and '{folder_id}' in parents "
@@ -464,6 +463,9 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
                        ],
                        source=DocumentSource.GOOGLE_DRIVE,
                        semantic_identifier=file["name"],
                        doc_updated_at=datetime.fromisoformat(
                            file["modifiedTime"]
                        ).astimezone(timezone.utc),
                        metadata={} if text_contents else {IGNORE_FOR_QA: True},
                    )
                )
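
With from datetime import datetime in place, the query bounds are built by rendering epoch seconds as a naive UTC ISO string plus a trailing "Z", the RFC 3339 form Drive's modifiedTime filter expects; the newly requested modifiedTime field is what feeds doc_updated_at. For example:

from datetime import datetime

time_range_start = 1697846608.0  # epoch seconds, as supplied by poll_source
time_start = datetime.utcfromtimestamp(time_range_start).isoformat() + "Z"
print(time_start)  # 2023-10-21T00:03:28Z

# 'application/vnd.google-apps.folder' is Drive's folder MIME type
query = f"mimeType != 'application/vnd.google-apps.folder' and modifiedTime >= '{time_start}' "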

View File

@@ -1,4 +1,5 @@
from datetime import datetime
from datetime import timezone
from typing import Any
import requests
@@ -106,6 +107,8 @@ class HubSpotConnector(LoadConnector, PollConnector):
                sections=[Section(link=link, text=content_text)],
                source=DocumentSource.HUBSPOT,
                semantic_identifier=title,
                # Already timezone-aware in UTC (tzutc); just swap in the stdlib timezone object
                doc_updated_at=ticket.updated_at.replace(tzinfo=timezone.utc),
                metadata={},
            )
        )
@@ -130,15 +133,11 @@ class HubSpotConnector(LoadConnector, PollConnector):
if __name__ == "__main__":
    import os
    import time

    test_connector = HubSpotConnector()
    test_connector.load_credentials(
    connector = HubSpotConnector()
    connector.load_credentials(
        {"hubspot_access_token": os.environ["HUBSPOT_ACCESS_TOKEN"]}
    )
    all_docs = test_connector.load_from_state()

    current = time.time()
    one_day_ago = current - 24 * 60 * 60  # 1 day
    latest_docs = test_connector.poll_source(one_day_ago, current)
    print(latest_docs)
    document_batches = connector.load_from_state()
    print(next(document_batches))
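
Per the comment, the HubSpot client already returns updated_at aware in UTC (dateutil's tzutc), so replace(tzinfo=timezone.utc) merely swaps the tzinfo implementation; the instant is unchanged only because both zones are UTC. A sketch with an assumed sample value:

from datetime import datetime, timezone

from dateutil.tz import tzutc

updated_at = datetime(2023, 10, 20, 17, 3, 28, tzinfo=tzutc())
# Same instant; only the tzinfo object changes
print(updated_at.replace(tzinfo=timezone.utc))
# 2023-10-20 17:03:28+00:00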

View File

@@ -2,6 +2,8 @@ import time
from collections.abc import Generator
from dataclasses import dataclass
from dataclasses import fields
from datetime import datetime
from datetime import timezone
from typing import Any
from typing import Optional
@@ -191,6 +193,9 @@ class NotionConnector(LoadConnector, PollConnector):
                    ],
                    source=DocumentSource.NOTION,
                    semantic_identifier=page_title,
                    doc_updated_at=datetime.fromisoformat(
                        page.last_edited_time
                    ).astimezone(timezone.utc),
                    metadata={},
                )
            )
@@ -323,8 +328,7 @@ class NotionConnector(LoadConnector, PollConnector):
if __name__ == "__main__":
    import os

    root_page_id = os.environ.get("NOTION_ROOT_PAGE_ID")
    connector = NotionConnector(root_page_id=root_page_id)
    connector = NotionConnector()
    connector.load_credentials(
        {"notion_integration_token": os.environ.get("NOTION_INTEGRATION_TOKEN")}
    )
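
Notion's last_edited_time is ISO-8601 with a trailing "Z" (e.g. 2023-10-20T17:03:00.000Z); note that datetime.fromisoformat only accepts the "Z" suffix on Python 3.11 and later. A sketch with an assumed sample value:

from datetime import datetime, timezone

last_edited_time = "2023-10-20T17:03:00.000Z"  # "Z" parsing needs Python 3.11+
print(datetime.fromisoformat(last_edited_time).astimezone(timezone.utc))
# 2023-10-20 17:03:00+00:00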

View File

@@ -1,7 +1,8 @@
import json
import os
from collections.abc import Callable
from collections.abc import Generator
from datetime import datetime
from datetime import timezone
from pathlib import Path
from typing import Any
from typing import cast
@@ -128,6 +129,18 @@ def get_thread(client: WebClient, channel_id: str, thread_id: str) -> ThreadType
    return threads


def get_latest_message_time(thread: ThreadType) -> datetime:
    max_ts = max([float(msg.get("ts", 0)) for msg in thread])
    return datetime.fromtimestamp(max_ts, tz=timezone.utc)


def get_event_time(event: dict[str, Any]) -> datetime | None:
    ts = event.get("ts")
    if not ts:
        return None
    return datetime.fromtimestamp(float(ts), tz=timezone.utc)


def thread_to_doc(
    workspace: str,
    channel: ChannelType,
@@ -148,6 +161,7 @@ def thread_to_doc(
        ],
        source=DocumentSource.SLACK,
        semantic_identifier=channel["name"],
        doc_updated_at=get_latest_message_time(thread),
        title="",  # slack docs don't really have a "title"
        metadata={},
    )
@@ -304,6 +318,7 @@ class SlackLoadConnector(LoadConnector):
                source=matching_doc.source,
                semantic_identifier=matching_doc.semantic_identifier,
                title="",  # slack docs don't really have a "title"
                doc_updated_at=get_event_time(slack_event),
                metadata=matching_doc.metadata,
            )
@@ -322,6 +337,7 @@ class SlackLoadConnector(LoadConnector):
                source=DocumentSource.SLACK,
                semantic_identifier=channel["name"],
                title="",  # slack docs don't really have a "title"
                doc_updated_at=get_event_time(slack_event),
                metadata={},
            )
@@ -403,3 +419,19 @@ class SlackPollConnector(PollConnector):
            if documents:
                yield documents


if __name__ == "__main__":
    import os
    import time

    connector = SlackPollConnector(
        workspace=os.environ["SLACK_WORKSPACE"], channels=[os.environ["SLACK_CHANNEL"]]
    )
    connector.load_credentials({"slack_bot_token": os.environ["SLACK_BOT_TOKEN"]})
    current = time.time()
    one_day_ago = current - 24 * 60 * 60  # 1 day
    document_batches = connector.poll_source(one_day_ago, current)
    print(next(document_batches))
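
Slack "ts" values are epoch-seconds strings with microsecond precision, so the new helpers reduce to a float conversion plus datetime.fromtimestamp(..., tz=timezone.utc). A quick check with assumed sample messages:

from datetime import datetime, timezone

thread = [{"ts": "1697846608.000200"}, {"ts": "1697846999.000100"}]
max_ts = max(float(msg.get("ts", 0)) for msg in thread)
print(datetime.fromtimestamp(max_ts, tz=timezone.utc))
# 2023-10-21 00:09:59.000100+00:00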

View File

@@ -1,5 +1,4 @@
import io
from datetime import datetime
from enum import Enum
from typing import Any
from typing import cast
@@ -173,8 +172,6 @@ class WebConnector(LoadConnector):
logger.info(f"Visiting {current_url}")
try:
current_visit_time = datetime.now().strftime("%B %d, %Y, %H:%M:%S")
if restart_playwright:
playwright, context = start_playwright()
restart_playwright = False
@@ -192,7 +189,7 @@ class WebConnector(LoadConnector):
                        sections=[Section(link=current_url, text=page_text)],
                        source=DocumentSource.WEB,
                        semantic_identifier=current_url.split(".")[-1],
                        metadata={"Time Visited": current_visit_time},
                        metadata={},
                    )
                )
                continue