From b0f76b97ef3bcf8d6b76609e1febaaa9af771ec8 Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Thu, 2 Nov 2023 14:27:06 -0700 Subject: [PATCH] Guru and Productboard Time Updated (#683) --- .../cross_connector_utils/time_utils.py | 16 +++++++++++ .../connectors/document360/connector.py | 4 +-- backend/danswer/connectors/gong/connector.py | 2 -- backend/danswer/connectors/guru/connector.py | 28 +++++++++++++++++++ .../connectors/productboard/connector.py | 26 ++++++++++++++--- 5 files changed, 68 insertions(+), 8 deletions(-) create mode 100644 backend/danswer/connectors/cross_connector_utils/time_utils.py diff --git a/backend/danswer/connectors/cross_connector_utils/time_utils.py b/backend/danswer/connectors/cross_connector_utils/time_utils.py new file mode 100644 index 0000000000..bab6d2e1f7 --- /dev/null +++ b/backend/danswer/connectors/cross_connector_utils/time_utils.py @@ -0,0 +1,16 @@ +from datetime import datetime +from datetime import timezone + +from dateutil.parser import parse + + +def datetime_to_utc(dt: datetime) -> datetime: + if dt.tzinfo is None or dt.tzinfo.utcoffset(dt) is None: + dt = dt.replace(tzinfo=timezone.utc) + + return dt.astimezone(timezone.utc) + + +def time_str_to_utc(datetime_str: str) -> datetime: + dt = parse(datetime_str) + return datetime_to_utc(dt) diff --git a/backend/danswer/connectors/document360/connector.py b/backend/danswer/connectors/document360/connector.py index 685cf6bf3e..5324aa1d75 100644 --- a/backend/danswer/connectors/document360/connector.py +++ b/backend/danswer/connectors/document360/connector.py @@ -190,8 +190,8 @@ if __name__ == "__main__": ) current = time.time() - one_day_ago = current - 24 * 60 * 60 * 360 # 1 year - latest_docs = document360_connector.poll_source(one_day_ago, current) + one_year_ago = current - 24 * 60 * 60 * 360 + latest_docs = document360_connector.poll_source(one_year_ago, current) for doc in latest_docs: print(doc) diff --git a/backend/danswer/connectors/gong/connector.py b/backend/danswer/connectors/gong/connector.py index d34447783c..44172ff14e 100644 --- a/backend/danswer/connectors/gong/connector.py +++ b/backend/danswer/connectors/gong/connector.py @@ -292,7 +292,6 @@ class GongConnector(LoadConnector, PollConnector): if __name__ == "__main__": import os - import time connector = GongConnector() connector.load_credentials( @@ -302,6 +301,5 @@ if __name__ == "__main__": } ) - current = time.time() latest_docs = connector.load_from_state() print(next(latest_docs)) diff --git a/backend/danswer/connectors/guru/connector.py b/backend/danswer/connectors/guru/connector.py index c9e9fb6d0f..d60ccc50c3 100644 --- a/backend/danswer/connectors/guru/connector.py +++ b/backend/danswer/connectors/guru/connector.py @@ -8,6 +8,7 @@ import requests from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource from danswer.connectors.cross_connector_utils.html_utils import parse_html_page_basic +from danswer.connectors.cross_connector_utils.time_utils import time_str_to_utc from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector @@ -77,13 +78,25 @@ class GuruConnector(LoadConnector, PollConnector): title = card["preferredPhrase"] link = GURU_CARDS_URL + card["slug"] content_text = title + "\n" + parse_html_page_basic(card["content"]) + last_updated = time_str_to_utc(card["lastModified"]) + last_verified = ( + time_str_to_utc(card.get("lastVerified")) + if card.get("lastVerified") + else None + ) + # For Danswer, we decay document score overtime, either last_updated or + # last_verified is a good enough signal for the document's recency + latest_time = ( + max(last_verified, last_updated) if last_verified else last_updated + ) doc_batch.append( Document( id=card["id"], sections=[Section(link=link, text=content_text)], source=DocumentSource.GURU, semantic_identifier=title, + doc_updated_at=latest_time, metadata={}, ) ) @@ -109,3 +122,18 @@ class GuruConnector(LoadConnector, PollConnector): end_time = unixtime_to_guru_time_str(end) return self._process_cards(start_time, end_time) + + +if __name__ == "__main__": + import os + + connector = GuruConnector() + connector.load_credentials( + { + "guru_user": os.environ["GURU_USER"], + "guru_user_token": os.environ["GURU_USER_TOKEN"], + } + ) + + latest_docs = connector.load_from_state() + print(next(latest_docs)) diff --git a/backend/danswer/connectors/productboard/connector.py b/backend/danswer/connectors/productboard/connector.py index eb5f41506d..14a54c1ac8 100644 --- a/backend/danswer/connectors/productboard/connector.py +++ b/backend/danswer/connectors/productboard/connector.py @@ -10,6 +10,7 @@ from retry import retry from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource +from danswer.connectors.cross_connector_utils.time_utils import time_str_to_utc from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import SecondsSinceUnixEpoch @@ -108,11 +109,11 @@ class ProductboardConnector(PollConnector): ], semantic_identifier=feature["name"], source=DocumentSource.PRODUCTBOARD, + doc_updated_at=time_str_to_utc(feature["updatedAt"]), metadata={ "productboard_entity_type": feature["type"], "status": feature["status"]["name"], "owner": self._get_owner_email(feature), - "updated_at": feature["updatedAt"], }, ) @@ -136,10 +137,10 @@ class ProductboardConnector(PollConnector): ], semantic_identifier=component["name"], source=DocumentSource.PRODUCTBOARD, + doc_updated_at=time_str_to_utc(component["updatedAt"]), metadata={ "productboard_entity_type": "component", "owner": self._get_owner_email(component), - "updated_at": component["updatedAt"], }, ) @@ -164,10 +165,10 @@ class ProductboardConnector(PollConnector): ], semantic_identifier=product["name"], source=DocumentSource.PRODUCTBOARD, + doc_updated_at=time_str_to_utc(product["updatedAt"]), metadata={ "productboard_entity_type": "product", "owner": self._get_owner_email(product), - "updated_at": product["updatedAt"], }, ) @@ -190,11 +191,11 @@ class ProductboardConnector(PollConnector): ], semantic_identifier=objective["name"], source=DocumentSource.PRODUCTBOARD, + doc_updated_at=time_str_to_utc(objective["updatedAt"]), metadata={ "productboard_entity_type": "release", "state": objective["state"], "owner": self._get_owner_email(objective), - "updated_at": objective["updatedAt"], }, ) @@ -252,3 +253,20 @@ class ProductboardConnector(PollConnector): if document_batch: yield document_batch + + +if __name__ == "__main__": + import os + import time + + connector = ProductboardConnector() + connector.load_credentials( + { + "productboard_access_token": os.environ["PRODUCTBOARD_ACCESS_TOKEN"], + } + ) + + current = time.time() + one_year_ago = current - 24 * 60 * 60 * 360 + latest_docs = connector.poll_source(one_year_ago, current) + print(next(latest_docs))