Poll Connector Window Overlap (#930)

This commit is contained in:
Yuhong Sun
2024-01-11 11:10:01 -08:00
committed by GitHub
parent 54347e100f
commit 2a139fd529
3 changed files with 10 additions and 3 deletions

View File

@@ -1,11 +1,13 @@
import time import time
from datetime import datetime from datetime import datetime
from datetime import timedelta
from datetime import timezone from datetime import timezone
import torch import torch
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from danswer.background.indexing.checkpointing import get_time_windows_for_index_attempt from danswer.background.indexing.checkpointing import get_time_windows_for_index_attempt
from danswer.configs.app_configs import POLL_CONNECTOR_OFFSET
from danswer.connectors.factory import instantiate_connector from danswer.connectors.factory import instantiate_connector
from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import LoadConnector
@@ -121,6 +123,11 @@ def _run_indexing(
source_type=db_connector.source, source_type=db_connector.source,
) )
): ):
window_start = max(
window_start - timedelta(minutes=POLL_CONNECTOR_OFFSET),
datetime(1970, 1, 1, tzinfo=timezone.utc),
)
doc_batch_generator = _get_document_generator( doc_batch_generator = _get_document_generator(
db_session=db_session, db_session=db_session,
attempt=index_attempt, attempt=index_attempt,

View File

@@ -116,6 +116,8 @@ POSTGRES_DB = os.environ.get("POSTGRES_DB") or "postgres"
##### #####
# Connector Configs # Connector Configs
##### #####
# Overlap, in minutes, between consecutive poll windows. The indexing loop
# subtracts this from each window's start (clamped at the Unix epoch), so every
# poll re-reads the tail end of the previous window.
POLL_CONNECTOR_OFFSET = 30 # Minutes overlap between poll windows
GOOGLE_DRIVE_INCLUDE_SHARED = False GOOGLE_DRIVE_INCLUDE_SHARED = False
GOOGLE_DRIVE_FOLLOW_SHORTCUTS = False GOOGLE_DRIVE_FOLLOW_SHORTCUTS = False

View File

@@ -151,9 +151,7 @@ class GithubConnector(LoadConnector, PollConnector):
def poll_source(
    self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
) -> GenerateDocumentsOutput:
    """Fetch documents for the poll window ``[start, end]``.

    Args:
        start: Window start, in seconds since the Unix epoch.
        end: Window end, in seconds since the Unix epoch.

    Returns:
        Whatever ``self._fetch_from_github`` produces for the window.

    The bounds are passed on as *naive* datetimes expressed in UTC, exactly
    as ``datetime.utcfromtimestamp`` used to produce them. No look-back
    buffer is applied here.
    NOTE(review): the overlap between windows appears to be added by the
    caller via POLL_CONNECTOR_OFFSET -- confirm before relying on it.
    """
    # Local import: only `datetime` is known to be imported at file level.
    from datetime import timezone

    def _naive_utc(seconds: float) -> datetime:
        # datetime.utcfromtimestamp() is deprecated since Python 3.12.
        # Convert through an aware UTC datetime and drop tzinfo so the value
        # handed downstream is the same naive-UTC datetime as before.
        return datetime.fromtimestamp(seconds, tz=timezone.utc).replace(tzinfo=None)

    start_datetime = _naive_utc(start)
    end_datetime = _naive_utc(end)
    return self._fetch_from_github(start_datetime, end_datetime)