Poll Connector Window Overlap (#930)

This commit is contained in:
Yuhong Sun
2024-01-11 11:10:01 -08:00
committed by GitHub
parent 54347e100f
commit 2a139fd529
3 changed files with 10 additions and 3 deletions

View File

@@ -1,11 +1,13 @@
import time
from datetime import datetime
from datetime import timedelta
from datetime import timezone
import torch
from sqlalchemy.orm import Session
from danswer.background.indexing.checkpointing import get_time_windows_for_index_attempt
from danswer.configs.app_configs import POLL_CONNECTOR_OFFSET
from danswer.connectors.factory import instantiate_connector
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
@@ -121,6 +123,11 @@ def _run_indexing(
source_type=db_connector.source,
)
):
window_start = max(
window_start - timedelta(minutes=POLL_CONNECTOR_OFFSET),
datetime(1970, 1, 1, tzinfo=timezone.utc),
)
doc_batch_generator = _get_document_generator(
db_session=db_session,
attempt=index_attempt,

View File

@@ -116,6 +116,8 @@ POSTGRES_DB = os.environ.get("POSTGRES_DB") or "postgres"
#####
# Connector Configs
#####
POLL_CONNECTOR_OFFSET = 30 # Overlap between consecutive poll windows, in minutes
GOOGLE_DRIVE_INCLUDE_SHARED = False
GOOGLE_DRIVE_FOLLOW_SHORTCUTS = False

View File

@@ -151,9 +151,7 @@ class GithubConnector(LoadConnector, PollConnector):
def poll_source(
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
) -> GenerateDocumentsOutput:
# Issues are sometimes not updated right away, so start 1h earlier to leave more than enough buffer
offset_start = max(0, int(start - 60 * 60))
start_datetime = datetime.utcfromtimestamp(offset_start)
start_datetime = datetime.utcfromtimestamp(start)
end_datetime = datetime.utcfromtimestamp(end)
return self._fetch_from_github(start_datetime, end_datetime)