diff --git a/backend/danswer/background/indexing/run_indexing.py b/backend/danswer/background/indexing/run_indexing.py index 37ee7130cf47..0d7960159866 100644 --- a/backend/danswer/background/indexing/run_indexing.py +++ b/backend/danswer/background/indexing/run_indexing.py @@ -1,11 +1,13 @@ import time from datetime import datetime +from datetime import timedelta from datetime import timezone import torch from sqlalchemy.orm import Session from danswer.background.indexing.checkpointing import get_time_windows_for_index_attempt +from danswer.configs.app_configs import POLL_CONNECTOR_OFFSET from danswer.connectors.factory import instantiate_connector from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector @@ -121,6 +123,11 @@ def _run_indexing( source_type=db_connector.source, ) ): + window_start = max( + window_start - timedelta(minutes=POLL_CONNECTOR_OFFSET), + datetime(1970, 1, 1, tzinfo=timezone.utc), + ) + doc_batch_generator = _get_document_generator( db_session=db_session, attempt=index_attempt, diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py index cf9c3a5a68a8..89646264ca68 100644 --- a/backend/danswer/configs/app_configs.py +++ b/backend/danswer/configs/app_configs.py @@ -116,6 +116,8 @@ POSTGRES_DB = os.environ.get("POSTGRES_DB") or "postgres" ##### # Connector Configs ##### +POLL_CONNECTOR_OFFSET = 30 # Minutes overlap between poll windows + GOOGLE_DRIVE_INCLUDE_SHARED = False GOOGLE_DRIVE_FOLLOW_SHORTCUTS = False diff --git a/backend/danswer/connectors/github/connector.py b/backend/danswer/connectors/github/connector.py index a2a583f98163..0b68458ed22c 100644 --- a/backend/danswer/connectors/github/connector.py +++ b/backend/danswer/connectors/github/connector.py @@ -151,9 +151,7 @@ class GithubConnector(LoadConnector, PollConnector): def poll_source( self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch ) -> GenerateDocumentsOutput: - # Sometimes Issues are not updated right away, giving more than enough buffer with 1h - offset_start = max(0, int(start - 60 * 60)) - start_datetime = datetime.utcfromtimestamp(offset_start) + start_datetime = datetime.utcfromtimestamp(start) end_datetime = datetime.utcfromtimestamp(end) return self._fetch_from_github(start_datetime, end_datetime)