Add Document UpdatedAt times for most connectors (#605)
HTML utils (format_document_soup):

@@ -47,6 +47,7 @@ def format_document_soup(
     list_element_start = False
     verbatim_output = 0
     in_table = False
+    last_added_newline = False
     for e in document.descendants:
         verbatim_output -= 1
         if isinstance(e, bs4.element.NavigableString):
@@ -57,11 +58,29 @@ def format_document_soup(
                 # Tables are represented in natural language with rows separated by newlines
                 # Can't have newlines then in the table elements
                 element_text = element_text.replace("\n", " ").strip()
+
+            # Some tags are translated to spaces but in the logic underneath this section, we
+            # translate them to newlines as a browser should render them such as with br
+            # This logic here avoids a space after newline when it shouldn't be there.
+            if last_added_newline and element_text.startswith(" "):
+                element_text = element_text[1:]
+                last_added_newline = False
+
             if element_text:
-                if verbatim_output > 0:
-                    text += element_text
-                else:
-                    text += strip_newlines(element_text)
+                content_to_add = (
+                    element_text
+                    if verbatim_output > 0
+                    else strip_newlines(element_text)
+                )
+
+                # Don't join separate elements without any spacing
+                if (text and not text[-1].isspace()) and (
+                    content_to_add and not content_to_add[0].isspace()
+                ):
+                    text += " "
+
+                text += content_to_add
+
                 list_element_start = False
         elif isinstance(e, bs4.element.Tag):
             # table is standard HTML element
@@ -82,9 +101,14 @@ def format_document_soup(
             elif e.name in ["p", "div"]:
                 if not list_element_start:
                     text += "\n"
-            elif e.name in ["br", "h1", "h2", "h3", "h4", "tr", "th", "td"]:
+            elif e.name in ["h1", "h2", "h3", "h4"]:
                 text += "\n"
                 list_element_start = False
+                last_added_newline = True
+            elif e.name == "br":
+                text += "\n"
+                list_element_start = False
+                last_added_newline = True
             elif e.name == "li":
                 text += "\n- "
                 list_element_start = True
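Note: a minimal standalone sketch (not the danswer function itself) of the whitespace
handling these hunks introduce. After a tag that renders as a newline, the following
text node often keeps its leading space; the new last_added_newline flag drops it:

    import bs4

    soup = bs4.BeautifulSoup("line one<br> line two", "html.parser")
    text = ""
    last_added_newline = False
    for e in soup.descendants:
        if isinstance(e, bs4.element.NavigableString):
            element_text = str(e)
            if last_added_newline and element_text.startswith(" "):
                element_text = element_text[1:]  # drop the space right after our newline
                last_added_newline = False
            text += element_text
        elif isinstance(e, bs4.element.Tag) and e.name == "br":
            text += "\n"
            last_added_newline = True

    print(repr(text))  # 'line one\nline two', not 'line one\n line two'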
Jira connector:

@@ -3,6 +3,7 @@ from datetime import timezone
 from typing import Any
 from urllib.parse import urlparse
 
+from dateutil.parser import parse
 from jira import JIRA
 from jira.resources import Issue
 
@@ -59,6 +60,8 @@ def fetch_jira_issues_batch(
             logger.warning(f"Found Jira object not of type Issue {jira}")
             continue
 
+        ticket_updated_time = parse(jira.fields.updated)
+
         semantic_rep = (
             f"Jira Ticket Summary: {jira.fields.summary}\n"
             f"Description: {jira.fields.description}\n"
@@ -75,6 +78,7 @@ def fetch_jira_issues_batch(
                 sections=[Section(link=page_url, text=semantic_rep)],
                 source=DocumentSource.JIRA,
                 semantic_identifier=jira.fields.summary,
+                doc_updated_at=ticket_updated_time.astimezone(timezone.utc),
                 metadata={},
             )
         )
@@ -151,3 +155,17 @@ class JiraConnector(LoadConnector, PollConnector):
             start_ind += fetched_batch_size
             if fetched_batch_size < self.batch_size:
                 break
+
+
+if __name__ == "__main__":
+    import os
+
+    connector = JiraConnector(os.environ["JIRA_PROJECT_URL"])
+    connector.load_credentials(
+        {
+            "jira_user_email": os.environ["JIRA_USER_EMAIL"],
+            "jira_api_token": os.environ["JIRA_API_TOKEN"],
+        }
+    )
+    document_batches = connector.load_from_state()
+    print(next(document_batches))
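Note: the new doc_updated_at assumes dateutil can parse Jira's offset-bearing
"updated" strings; a small sketch with a made-up timestamp:

    from datetime import timezone

    from dateutil.parser import parse

    ticket_updated_time = parse("2023-10-15T08:30:00.000-0400")  # hypothetical value
    print(ticket_updated_time.astimezone(timezone.utc).isoformat())
    # 2023-10-15T12:30:00+00:00 -- offset-aware and normalized to UTC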
Document360 connector:

@@ -5,10 +5,10 @@ from typing import List
 from typing import Optional
 
 import requests
-from bs4 import BeautifulSoup
 
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
+from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic
 from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.interfaces import PollConnector
@@ -120,16 +120,21 @@ class Document360Connector(LoadConnector, PollConnector):
             if end is not None and updated_at > end:
                 continue
 
+            authors = [
+                author["email_id"]
+                for author in article_details.get("authors", [])
+                if author["email_id"]
+            ]
+
             doc_link = f"{DOCUMENT360_BASE_URL}/{self.portal_id}/document/v1/view/{article['id']}"
 
             html_content = article_details["html_content"]
-            soup = BeautifulSoup(html_content, "html.parser")
-            article_content = soup.get_text()
+            article_content = parse_html_page_basic(html_content)
+
             doc_text = (
                 f"workspace: {self.workspace}\n"
                 f"category: {article['category_name']}\n"
                 f"article: {article_details['title']} - "
-                f"{article_details.get('description', '')} - "
+                f"{article_details.get('description', '')}\n"
                 f"{article_content}"
             )
 
@@ -138,6 +143,8 @@ class Document360Connector(LoadConnector, PollConnector):
                     sections=[Section(link=doc_link, text=doc_text)],
                     source=DocumentSource.DOCUMENT360,
                     semantic_identifier=article_details["title"],
+                    doc_updated_at=updated_at,
+                    primary_owners=authors,
                     metadata={},
                 )
             )
@@ -163,14 +170,18 @@ class Document360Connector(LoadConnector, PollConnector):
 
 if __name__ == "__main__":
     import time
+    import os
 
-    document360_connector = Document360Connector("Your Workspace", ["Your categories"])
+    document360_connector = Document360Connector(os.environ["DOCUMENT360_WORKSPACE"])
     document360_connector.load_credentials(
-        {"portal_id": "Your Portal ID", "document360_api_token": "Your API Token"}
+        {
+            "portal_id": os.environ["DOCUMENT360_PORTAL_ID"],
+            "document360_api_token": os.environ["DOCUMENT360_API_TOKEN"],
+        }
     )
 
     current = time.time()
-    one_day_ago = current - 24 * 60 * 60  # 1 days
+    one_day_ago = current - 24 * 60 * 60 * 360  # 1 year
     latest_docs = document360_connector.poll_source(one_day_ago, current)
 
     for doc in latest_docs:
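Note: the new authors list keeps only entries with a non-empty email_id; a quick
illustration with made-up payload data:

    article_details = {
        "authors": [
            {"email_id": "alice@example.com"},
            {"email_id": ""},  # filtered out
            {"email_id": "bob@example.com"},
        ]
    }
    authors = [
        author["email_id"]
        for author in article_details.get("authors", [])
        if author["email_id"]
    ]
    print(authors)  # ['alice@example.com', 'bob@example.com']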
File connector:

@@ -1,4 +1,6 @@
 from collections.abc import Generator
+from datetime import datetime
+from datetime import timezone
 from pathlib import Path
 from typing import Any
 from typing import IO
@@ -41,6 +43,7 @@ def _open_files_at_location(
 def _process_file(
     file_name: str,
     file: IO[Any],
+    time_updated: datetime,
     pdf_pass: str | None = None,
 ) -> list[Document]:
     extension = get_file_ext(file_name)
@@ -63,6 +66,7 @@ def _process_file(
             sections=[Section(link=metadata.get("link", ""), text=file_content_raw)],
             source=DocumentSource.FILE,
             semantic_identifier=file_name,
+            doc_updated_at=time_updated,
             metadata={},
         )
     ]
@@ -85,10 +89,13 @@ class LocalFileConnector(LoadConnector):
     def load_from_state(self) -> GenerateDocumentsOutput:
         documents: list[Document] = []
         for file_location in self.file_locations:
+            current_datetime = datetime.now(timezone.utc)
             files = _open_files_at_location(file_location)
 
             for file_name, file in files:
-                documents.extend(_process_file(file_name, file, self.pdf_pass))
+                documents.extend(
+                    _process_file(file_name, file, current_datetime, self.pdf_pass)
+                )
 
                 if len(documents) >= self.batch_size:
                     yield documents
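Note: local files carry no update time of their own in this connector, so the
indexing time stands in for it: one UTC snapshot is taken per file location and
shared by every document produced from it. A minimal sketch of the pattern:

    from datetime import datetime, timezone

    current_datetime = datetime.now(timezone.utc)  # captured once per location
    for file_name in ["a.txt", "b.txt"]:  # hypothetical batch
        print(file_name, current_datetime.isoformat())  # same doc_updated_at for all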
GitHub connector:

@@ -1,6 +1,7 @@
 import itertools
 from collections.abc import Iterator
 from datetime import datetime
+from datetime import timezone
 from typing import Any
 from typing import cast
 
@@ -42,8 +43,11 @@ def _convert_pr_to_document(pull_request: PullRequest) -> Document:
         sections=[Section(link=pull_request.html_url, text=full_context)],
         source=DocumentSource.GITHUB,
         semantic_identifier=pull_request.title,
+        # updated_at is UTC time but is timezone unaware, explicitly add UTC
+        # as there is logic in indexing to prevent wrong timestamped docs
+        # due to local time discrepancies with UTC
+        doc_updated_at=pull_request.updated_at.replace(tzinfo=timezone.utc),
         metadata={
-            "last_modified": str(pull_request.last_modified),
             "merged": pull_request.merged,
             "state": pull_request.state,
         },
@@ -62,8 +66,9 @@ def _convert_issue_to_document(issue: Issue) -> Document:
         sections=[Section(link=issue.html_url, text=full_context)],
         source=DocumentSource.GITHUB,
         semantic_identifier=issue.title,
+        # updated_at is UTC time but is timezone unaware
+        doc_updated_at=issue.updated_at.replace(tzinfo=timezone.utc),
         metadata={
-            "last_modified": str(issue.updated_at),
             "state": issue.state,
         },
     )
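Note: the comments in the hunks above carry the key subtlety. PyGithub's updated_at
is a naive datetime whose wall-clock value is already UTC, so replace(tzinfo=timezone.utc)
just labels it; calling astimezone() on a naive value would instead treat it as local
time and shift the clock. A sketch with a made-up timestamp:

    from datetime import datetime, timezone

    naive_utc = datetime(2023, 10, 20, 14, 0, 0)  # hypothetical updated_at from the API
    labeled = naive_utc.replace(tzinfo=timezone.utc)
    print(labeled.isoformat())  # 2023-10-20T14:00:00+00:00 -- same wall clock, now aware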
Gong connector:

@@ -90,6 +90,9 @@ class GongConnector(LoadConnector, PollConnector):
             response = requests.post(
                 url, headers=self._get_auth_header(), json=body
             )
+            # If no calls in the range, just break out
+            if response.status_code == 404:
+                break
             response.raise_for_status()
 
             data = response.json()
@@ -223,6 +226,9 @@ class GongConnector(LoadConnector, PollConnector):
                     source=DocumentSource.GONG,
                     # Should not ever be Untitled as a call cannot be made without a Title
                     semantic_identifier=call_title or "Untitled",
+                    doc_updated_at=datetime.fromisoformat(
+                        call_metadata["started"]
+                    ).astimezone(timezone.utc),
                     metadata={"Start Time": call_metadata["started"]},
                 )
             )
@@ -270,6 +276,5 @@ if __name__ == "__main__":
     )
 
     current = time.time()
-    one_day_ago = current - 24 * 60 * 60  # 1 day
-    latest_docs = connector.poll_source(one_day_ago, current)
+    latest_docs = connector.load_from_state()
     print(next(latest_docs))
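Note: a sketch of the started-time conversion, assuming (as the code implies) that
call_metadata["started"] is an ISO-8601 string with a UTC offset; the value is made up:

    from datetime import datetime, timezone

    call_started = "2023-10-20T09:15:00-07:00"  # hypothetical call_metadata["started"]
    doc_updated_at = datetime.fromisoformat(call_started).astimezone(timezone.utc)
    print(doc_updated_at.isoformat())  # 2023-10-20T16:15:00+00:00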
Google Drive connector:

@@ -1,8 +1,9 @@
-import datetime
 import io
 import tempfile
 from collections.abc import Iterator
 from collections.abc import Sequence
+from datetime import datetime
+from datetime import timezone
 from enum import Enum
 from itertools import chain
 from typing import Any
@@ -83,7 +84,7 @@ def _run_drive_file_query(
         includeItemsFromAllDrives=include_shared,
         fields=(
             "nextPageToken, files(mimeType, id, name, "
-            "webViewLink, shortcutDetails)"
+            "modifiedTime, webViewLink, shortcutDetails)"
         ),
         pageToken=next_page_token,
         q=query,
@@ -194,12 +195,10 @@ def _get_files(
 ) -> Iterator[GoogleDriveFileType]:
     query = f"mimeType != '{DRIVE_FOLDER_TYPE}' "
     if time_range_start is not None:
-        time_start = (
-            datetime.datetime.utcfromtimestamp(time_range_start).isoformat() + "Z"
-        )
+        time_start = datetime.utcfromtimestamp(time_range_start).isoformat() + "Z"
         query += f"and modifiedTime >= '{time_start}' "
     if time_range_end is not None:
-        time_stop = datetime.datetime.utcfromtimestamp(time_range_end).isoformat() + "Z"
+        time_stop = datetime.utcfromtimestamp(time_range_end).isoformat() + "Z"
         query += f"and modifiedTime <= '{time_stop}' "
     if folder_id:
         query += f"and '{folder_id}' in parents "
@@ -464,6 +463,9 @@ class GoogleDriveConnector(LoadConnector, PollConnector):
                     ],
                     source=DocumentSource.GOOGLE_DRIVE,
                     semantic_identifier=file["name"],
+                    doc_updated_at=datetime.fromisoformat(
+                        file["modifiedTime"]
+                    ).astimezone(timezone.utc),
                     metadata={} if text_contents else {IGNORE_FOR_QA: True},
                 )
             )
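Note: Drive's files.list takes RFC 3339 timestamps in modifiedTime clauses, which is
what the isoformat() + "Z" construction produces. A sketch of the query building with
an arbitrary epoch value, writing out the folder MIME type that DRIVE_FOLDER_TYPE
presumably holds:

    from datetime import datetime

    time_range_start = 1697760000.0  # hypothetical epoch seconds
    time_start = datetime.utcfromtimestamp(time_range_start).isoformat() + "Z"
    query = "mimeType != 'application/vnd.google-apps.folder' "
    query += f"and modifiedTime >= '{time_start}' "
    print(query)  # ... and modifiedTime >= '2023-10-20T00:00:00Z'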
HubSpot connector:

@@ -1,4 +1,5 @@
 from datetime import datetime
+from datetime import timezone
 from typing import Any
 
 import requests
@@ -106,6 +107,8 @@ class HubSpotConnector(LoadConnector, PollConnector):
                     sections=[Section(link=link, text=content_text)],
                     source=DocumentSource.HUBSPOT,
                     semantic_identifier=title,
+                    # Is already in tzutc, just replacing the timezone format
+                    doc_updated_at=ticket.updated_at.replace(tzinfo=timezone.utc),
                     metadata={},
                 )
             )
@@ -130,15 +133,11 @@ class HubSpotConnector(LoadConnector, PollConnector):
 
 if __name__ == "__main__":
     import os
-    import time
 
-    test_connector = HubSpotConnector()
-    test_connector.load_credentials(
+    connector = HubSpotConnector()
+    connector.load_credentials(
         {"hubspot_access_token": os.environ["HUBSPOT_ACCESS_TOKEN"]}
     )
-    all_docs = test_connector.load_from_state()
 
-    current = time.time()
-    one_day_ago = current - 24 * 60 * 60  # 1 day
-    latest_docs = test_connector.poll_source(one_day_ago, current)
-    print(latest_docs)
+    document_batches = connector.load_from_state()
+    print(next(document_batches))
Notion connector:

@@ -2,6 +2,8 @@ import time
 from collections.abc import Generator
 from dataclasses import dataclass
 from dataclasses import fields
+from datetime import datetime
+from datetime import timezone
 from typing import Any
 from typing import Optional
 
@@ -191,6 +193,9 @@ class NotionConnector(LoadConnector, PollConnector):
                     ],
                     source=DocumentSource.NOTION,
                     semantic_identifier=page_title,
+                    doc_updated_at=datetime.fromisoformat(
+                        page.last_edited_time
+                    ).astimezone(timezone.utc),
                     metadata={},
                 )
             )
@@ -323,8 +328,7 @@ class NotionConnector(LoadConnector, PollConnector):
 if __name__ == "__main__":
     import os
 
-    root_page_id = os.environ.get("NOTION_ROOT_PAGE_ID")
-    connector = NotionConnector(root_page_id=root_page_id)
+    connector = NotionConnector()
     connector.load_credentials(
         {"notion_integration_token": os.environ.get("NOTION_INTEGRATION_TOKEN")}
     )
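Note: same conversion pattern for Notion's ISO-8601 last_edited_time; be aware that
fromisoformat() only accepts a trailing "Z" on Python 3.11+, so this made-up value
uses an explicit offset:

    from datetime import datetime, timezone

    last_edited_time = "2023-10-20T12:00:00.000+00:00"  # hypothetical
    print(datetime.fromisoformat(last_edited_time).astimezone(timezone.utc).isoformat())
    # 2023-10-20T12:00:00+00:00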
Slack connector:

@@ -1,7 +1,8 @@
 import json
-import os
 from collections.abc import Callable
 from collections.abc import Generator
+from datetime import datetime
+from datetime import timezone
 from pathlib import Path
 from typing import Any
 from typing import cast
@@ -128,6 +129,18 @@ def get_thread(client: WebClient, channel_id: str, thread_id: str) -> ThreadType
     return threads
 
 
+def get_latest_message_time(thread: ThreadType) -> datetime:
+    max_ts = max([float(msg.get("ts", 0)) for msg in thread])
+    return datetime.fromtimestamp(max_ts, tz=timezone.utc)
+
+
+def get_event_time(event: dict[str, Any]) -> datetime | None:
+    ts = event.get("ts")
+    if not ts:
+        return None
+    return datetime.fromtimestamp(float(ts), tz=timezone.utc)
+
+
 def thread_to_doc(
     workspace: str,
     channel: ChannelType,
@@ -148,6 +161,7 @@ def thread_to_doc(
         ],
         source=DocumentSource.SLACK,
         semantic_identifier=channel["name"],
+        doc_updated_at=get_latest_message_time(thread),
         title="",  # slack docs don't really have a "title"
         metadata={},
     )
@@ -304,6 +318,7 @@ class SlackLoadConnector(LoadConnector):
                     source=matching_doc.source,
                     semantic_identifier=matching_doc.semantic_identifier,
                     title="",  # slack docs don't really have a "title"
+                    doc_updated_at=get_event_time(slack_event),
                     metadata=matching_doc.metadata,
                 )
 
@@ -322,6 +337,7 @@ class SlackLoadConnector(LoadConnector):
                 source=DocumentSource.SLACK,
                 semantic_identifier=channel["name"],
                 title="",  # slack docs don't really have a "title"
+                doc_updated_at=get_event_time(slack_event),
                 metadata={},
             )
 
@@ -403,3 +419,19 @@ class SlackPollConnector(PollConnector):
 
         if documents:
             yield documents
+
+
+if __name__ == "__main__":
+    import os
+    import time
+
+    connector = SlackPollConnector(
+        workspace=os.environ["SLACK_WORKSPACE"], channels=[os.environ["SLACK_CHANNEL"]]
+    )
+    connector.load_credentials({"slack_bot_token": os.environ["SLACK_BOT_TOKEN"]})
+
+    current = time.time()
+    one_day_ago = current - 24 * 60 * 60  # 1 day
+    document_batches = connector.poll_source(one_day_ago, current)
+
+    print(next(document_batches))
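Note: Slack "ts" values are epoch seconds encoded as strings, so the newest message
time is just the max after a float conversion; a sketch of get_latest_message_time
on a made-up thread:

    from datetime import datetime, timezone

    thread = [{"ts": "1697804400.000200"}, {"ts": "1697808000.000100"}]  # hypothetical
    max_ts = max([float(msg.get("ts", 0)) for msg in thread])
    print(datetime.fromtimestamp(max_ts, tz=timezone.utc).isoformat())
    # 2023-10-20T13:20:00.000100+00:00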
Web connector:

@@ -1,5 +1,4 @@
 import io
-from datetime import datetime
 from enum import Enum
 from typing import Any
 from typing import cast
@@ -173,8 +172,6 @@ class WebConnector(LoadConnector):
                 logger.info(f"Visiting {current_url}")
 
                 try:
-                    current_visit_time = datetime.now().strftime("%B %d, %Y, %H:%M:%S")
-
                     if restart_playwright:
                         playwright, context = start_playwright()
                         restart_playwright = False
@@ -192,7 +189,7 @@ class WebConnector(LoadConnector):
                             sections=[Section(link=current_url, text=page_text)],
                             source=DocumentSource.WEB,
                             semantic_identifier=current_url.split(".")[-1],
-                            metadata={"Time Visited": current_visit_time},
+                            metadata={},
                         )
                     )
                     continue