Indexing Job has timezone discrepancy with DB making Poll timeframes incorrect (#231)

This commit is contained in:
Yuhong Sun
2023-07-23 21:59:00 -07:00
committed by GitHub
parent 59f27e83bf
commit e019db0bc7
2 changed files with 14 additions and 3 deletions

View File

@@ -157,8 +157,11 @@ def run_indexing_jobs(db_session: Session) -> None:
last_run_time = get_last_successful_attempt_start_time(
attempt.connector_id, attempt.credential_id, db_session
)
# Covers the very unlikely case where tiny variations in the DB time offset check coincide with
# a new document being created
safe_last_run_time = max(last_run_time - 1, 0.0)
doc_batch_generator = runnable_connector.poll_source(
last_run_time, time.time()
safe_last_run_time, time.time()
)
else:

View File

@@ -8,6 +8,7 @@ from danswer.configs.app_configs import POSTGRES_HOST
from danswer.configs.app_configs import POSTGRES_PASSWORD
from danswer.configs.app_configs import POSTGRES_PORT
from danswer.configs.app_configs import POSTGRES_USER
from danswer.utils.logger import setup_logger
from sqlalchemy import text
from sqlalchemy.engine import create_engine
from sqlalchemy.engine import Engine
@@ -16,6 +17,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.ext.asyncio import create_async_engine
from sqlalchemy.orm import Session
logger = setup_logger()
SYNC_DB_API = "psycopg2"
ASYNC_DB_API = "asyncpg"
@@ -28,6 +30,10 @@ _ASYNC_ENGINE: AsyncEngine | None = None
def get_db_current_time(db_session: Session) -> datetime:
"""Get the current time from Postgres, representing the start of the transaction.
Within the same transaction this value will not update.
The returned datetime object should be timezone-aware; the default Postgres timezone is UTC.
"""
result = db_session.execute(text("SELECT NOW()")).scalar()
if result is None:
raise ValueError("Database did not return a time")
@@ -37,9 +43,11 @@ def get_db_current_time(db_session: Session) -> datetime:
def translate_db_time_to_server_time(
    db_time: datetime, db_session: Session
) -> datetime:
    """Shift a database timestamp by the current server/database clock offset.

    The offset is measured as the server's current UTC time minus the
    database's current time (as returned by get_db_current_time) and is
    then added to db_time.

    If a different database driver is used which does not include timezone
    info, this should hit an exception rather than being wrong: Python
    refuses to subtract a naive datetime from an aware one (TypeError).

    Args:
        db_time: A timestamp expressed on the database's clock.
        db_session: Session used to query the database's current time.

    Returns:
        db_time adjusted by the measured server-minus-database offset.
    """
    # Use an aware UTC "now" so the subtraction below compares like with like;
    # a naive datetime.now() would silently depend on the server's local zone.
    server_now = datetime.now(timezone.utc)
    db_now = get_db_current_time(db_session)
    # Deliberately no tzinfo stripping on db_now: aware - aware is exact, and
    # aware - naive fails loudly instead of producing a wrong offset.
    time_diff = server_now - db_now
    return db_time + time_diff