quick hack to prevent resyncing the same doc

This commit is contained in:
Richard Kuo (Danswer)
2025-01-05 03:05:32 -08:00
parent af2061c4db
commit 7fb92d42a0
3 changed files with 20 additions and 0 deletions

View File

@@ -898,6 +898,9 @@ def vespa_metadata_sync_task(
# the sync might repeat again later # the sync might repeat again later
mark_document_as_synced(document_id, db_session) mark_document_as_synced(document_id, db_session)
r = get_redis_client(tenant_id=tenant_id)
r.hdel(RedisConnectorCredentialPair.SYNCING_HASH, document_id)
task_logger.info(f"doc={document_id} action=sync chunks={chunks_affected}") task_logger.info(f"doc={document_id} action=sync chunks={chunks_affected}")
except SoftTimeLimitExceeded: except SoftTimeLimitExceeded:
task_logger.info(f"SoftTimeLimitExceeded exception. doc={document_id}") task_logger.info(f"SoftTimeLimitExceeded exception. doc={document_id}")

View File

@@ -30,6 +30,8 @@ class RedisConnectorCredentialPair(RedisObjectHelper):
FENCE_PREFIX = PREFIX + "_fence" FENCE_PREFIX = PREFIX + "_fence"
TASKSET_PREFIX = PREFIX + "_taskset" TASKSET_PREFIX = PREFIX + "_taskset"
SYNCING_HASH = PREFIX + ":vespa_syncing"
def __init__(self, tenant_id: str | None, id: int) -> None: def __init__(self, tenant_id: str | None, id: int) -> None:
super().__init__(tenant_id, str(id)) super().__init__(tenant_id, str(id))
@@ -64,6 +66,9 @@ class RedisConnectorCredentialPair(RedisObjectHelper):
lock: RedisLock, lock: RedisLock,
tenant_id: str | None, tenant_id: str | None,
) -> tuple[int, int] | None: ) -> tuple[int, int] | None:
# an arbitrary number in seconds to prevent the same doc from syncing repeatedly
SYNC_EXPIRATION = 24 * 60 * 60
last_lock_time = time.monotonic() last_lock_time = time.monotonic()
async_results = [] async_results = []
@@ -92,6 +97,10 @@ class RedisConnectorCredentialPair(RedisObjectHelper):
if doc.id in self.skip_docs: if doc.id in self.skip_docs:
continue continue
# is the document sync already queued?
if redis_client.hexists(self.SYNCING_HASH, doc.id):
continue
# celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac" # celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac"
# the key for the result is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac" # the key for the result is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac"
# we prefix the task id so it's easier to keep track of who created the task # we prefix the task id so it's easier to keep track of who created the task
@@ -104,6 +113,11 @@ class RedisConnectorCredentialPair(RedisObjectHelper):
RedisConnectorCredentialPair.get_taskset_key(), custom_task_id RedisConnectorCredentialPair.get_taskset_key(), custom_task_id
) )
# track the doc.id in redis so that we don't resubmit it repeatedly
redis_client.hset(self.SYNCING_HASH, doc.id, custom_task_id)
# hset takes no `ex` kwarg (hash fields have no per-field TTL);
# expire the whole tracking hash instead so stale entries age out
redis_client.expire(self.SYNCING_HASH, SYNC_EXPIRATION)
# Priority on sync's triggered by new indexing should be medium # Priority on sync's triggered by new indexing should be medium
result = celery_app.send_task( result = celery_app.send_task(
OnyxCeleryTask.VESPA_METADATA_SYNC_TASK, OnyxCeleryTask.VESPA_METADATA_SYNC_TASK,

View File

@@ -110,6 +110,9 @@ class TenantRedis(redis.Redis):
"sadd", "sadd",
"srem", "srem",
"scard", "scard",
"hexists",
"hset",
"hdel",
] # Regular methods that need simple prefixing ] # Regular methods that need simple prefixing
if item == "scan_iter": if item == "scan_iter":