Feature/indexing logs (#3002)

* improve logging around indexing tasks

* task_logger doesn't work inside the spawned task
Author: rkuo-danswer
Date: 2024-10-31 09:43:46 -07:00 (committed by GitHub)
Parent: ff9d7141a9
Commit: 231ab3fb5d

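The second commit message is the important detail here: Celery's task_logger pulls task metadata from the currently executing task's context, and connector_indexing_task below runs in a separately spawned process where no task context exists. A minimal sketch of the distinction, with illustrative module names (not the repo's layout):

    import logging
    import multiprocessing

    from celery import Celery
    from celery.utils.log import get_task_logger

    app = Celery("example")
    task_logger = get_task_logger(__name__)  # record formatting expects an active celery task
    logger = logging.getLogger(__name__)     # plain stdlib logger, needs no task context

    def spawned_entrypoint() -> None:
        # no celery task is bound inside the spawned process, so the
        # task-aware logger has no task_name/task_id to report; the
        # plain module logger behaves normally here
        logger.info("spawned work: safe to log")

    @app.task
    def proxy() -> None:
        task_logger.info("proxy: fine, this runs inside a real celery task")
        p = multiprocessing.get_context("spawn").Process(target=spawned_entrypoint)
        p.start()
        p.join()

This is why the hunks below swap task_logger for logger inside connector_indexing_task, while the proxy task keeps using task_logger.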

@@ -391,7 +391,12 @@ def connector_indexing_proxy_task(
     tenant_id: str | None,
 ) -> None:
     """celery tasks are forked, but forking is unstable. This proxies work to a spawned task."""
+    task_logger.info(
+        f"Indexing proxy - starting: attempt={index_attempt_id} "
+        f"tenant={tenant_id} "
+        f"cc_pair={cc_pair_id} "
+        f"search_settings={search_settings_id}"
+    )
     client = SimpleJobClient()
 
     job = client.submit(
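SimpleJobClient and the job handle it returns come from the repo and are not part of this diff (the submit arguments are elided between hunks). As a hedged sketch of the pattern the docstring describes, spawning a clean process instead of relying on celery's fork and polling it from the task:

    import multiprocessing as mp
    from typing import Any, Callable

    class SpawnedJob:
        """Assumed stand-in for the handle returned by SimpleJobClient.submit()."""

        def __init__(self, process: mp.Process) -> None:
            self.process = process

        def done(self) -> bool:
            return not self.process.is_alive()  # done once the process has exited

        @property
        def status(self) -> str:
            if self.process.is_alive():
                return "running"
            return "error" if self.process.exitcode != 0 else "finished"

        def release(self) -> None:
            self.process.close()  # free OS resources held by the finished process

    def submit(fn: Callable[..., Any], *args: Any) -> SpawnedJob:
        ctx = mp.get_context("spawn")  # fresh interpreter, none of fork's inherited state
        process = ctx.Process(target=fn, args=args, daemon=True)
        process.start()
        return SpawnedJob(process)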
@@ -405,17 +410,31 @@ def connector_indexing_proxy_task(
     )
 
     if not job:
+        task_logger.info(
+            f"Indexing proxy - spawn failed: attempt={index_attempt_id} "
+            f"tenant={tenant_id} "
+            f"cc_pair={cc_pair_id} "
+            f"search_settings={search_settings_id}"
+        )
         return
 
+    task_logger.info(
+        f"Indexing proxy - spawn succeeded: attempt={index_attempt_id} "
+        f"tenant={tenant_id} "
+        f"cc_pair={cc_pair_id} "
+        f"search_settings={search_settings_id}"
+    )
+
     while True:
         sleep(10)
-        with get_session_with_tenant(tenant_id) as db_session:
-            index_attempt = get_index_attempt(
-                db_session=db_session, index_attempt_id=index_attempt_id
-            )
-
-        # do nothing for ongoing jobs that haven't been stopped
-        if not job.done():
-            if not index_attempt:
-                continue
+
+        # do nothing for ongoing jobs that haven't been stopped
+        if not job.done():
+            with get_session_with_tenant(tenant_id) as db_session:
+                index_attempt = get_index_attempt(
+                    db_session=db_session, index_attempt_id=index_attempt_id
+                )
+
+                if not index_attempt:
+                    continue
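Reassembled from the split-diff fragments above, the reworked loop now checks job.done() before touching the database, so a session is only opened while the spawned job is still running (names are the ones already imported by the surrounding module):

    while True:
        sleep(10)

        # do nothing for ongoing jobs that haven't been stopped
        if not job.done():
            with get_session_with_tenant(tenant_id) as db_session:
                index_attempt = get_index_attempt(
                    db_session=db_session, index_attempt_id=index_attempt_id
                )

                if not index_attempt:
                    continue
                # ... remaining attempt checks elided between hunks ...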
@@ -423,11 +442,24 @@ def connector_indexing_proxy_task(
                 continue
 
         if job.status == "error":
-            logger.error(job.exception())
+            task_logger.error(
+                f"Indexing proxy - spawned task exceptioned: "
+                f"attempt={index_attempt_id} "
+                f"tenant={tenant_id} "
+                f"cc_pair={cc_pair_id} "
+                f"search_settings={search_settings_id} "
+                f"error={job.exception()}"
+            )
 
         job.release()
         break
 
+    task_logger.info(
+        f"Indexing proxy - finished: attempt={index_attempt_id} "
+        f"tenant={tenant_id} "
+        f"cc_pair={cc_pair_id} "
+        f"search_settings={search_settings_id}"
+    )
     return
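All four proxy log lines repeat the same id quadruple. A small helper along these lines (hypothetical, not part of the commit) would keep the start/spawn/error/finish messages uniform:

    def _attempt_fields(
        index_attempt_id: int,
        tenant_id: str | None,
        cc_pair_id: int,
        search_settings_id: int,
    ) -> str:
        # shared suffix for the "Indexing proxy - ..." messages
        return (
            f"attempt={index_attempt_id} "
            f"tenant={tenant_id} "
            f"cc_pair={cc_pair_id} "
            f"search_settings={search_settings_id}"
        )

    # usage: task_logger.info(f"Indexing proxy - finished: {_attempt_fields(1, None, 2, 3)}")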
@@ -449,7 +481,17 @@ def connector_indexing_task(
     Returns None if the task did not run (possibly due to a conflict).
     Otherwise, returns an int >= 0 representing the number of indexed docs.
+
+    NOTE: if an exception is raised out of this task, the primary worker will detect
+    that the task transitioned to a "READY" state but the generator_complete_key doesn't exist.
+    This will cause the primary worker to abort the indexing attempt and clean up.
     """
+    logger.info(
+        f"Indexing spawned task starting: attempt={index_attempt_id} "
+        f"tenant={tenant_id} "
+        f"cc_pair={cc_pair_id} "
+        f"search_settings={search_settings_id}"
+    )
 
     attempt = None
     n_final_progress = 0
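The new NOTE in the docstring describes the primary worker's side of the protocol, which is outside this diff. A hedged sketch of what that detection could look like; the function name and key handling are assumptions derived from the NOTE, not repo code:

    from celery.result import AsyncResult
    from redis import Redis

    def spawned_task_aborted(app, r: Redis, celery_task_id: str, generator_complete_key: str) -> bool:
        result = AsyncResult(celery_task_id, app=app)
        # ready() covers the READY states (SUCCESS / FAILURE / REVOKED), i.e.
        # the task is no longer running; if it got there without writing its
        # completion key, an exception escaped connector_indexing_task
        return result.ready() and r.get(generator_complete_key) is None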
@@ -488,19 +530,19 @@ def connector_indexing_task(
                     cast(str, fence_json)
                 )
             except ValueError:
-                task_logger.exception(
+                logger.exception(
                     f"connector_indexing_task: fence_data not decodeable: fence={rci.fence_key}"
                 )
                 raise
 
             if fence_data.index_attempt_id is None or fence_data.celery_task_id is None:
-                task_logger.info(
+                logger.info(
                     f"connector_indexing_task - Waiting for fence: fence={rci.fence_key}"
                 )
                 sleep(1)
                 continue
 
-            task_logger.info(
+            logger.info(
                 f"connector_indexing_task - Fence found, continuing...: fence={rci.fence_key}"
             )
             break
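For context on the fence being polled here: it is a redis key that the primary worker fills with the attempt id and celery task id, and the spawned task spins until both are present. A condensed sketch of that handshake, with an assumed payload model mirroring the two checks in the diff:

    import json
    from dataclasses import dataclass
    from time import sleep

    from redis import Redis

    @dataclass
    class FenceData:
        # fields mirror the checks above; the real model lives in the repo
        index_attempt_id: int | None = None
        celery_task_id: str | None = None

    def wait_for_fence(r: Redis, fence_key: str) -> FenceData:
        while True:
            raw = r.get(fence_key)
            if raw is not None:
                data = FenceData(**json.loads(raw))
                if data.index_attempt_id is not None and data.celery_task_id is not None:
                    return data
            sleep(1)  # primary worker hasn't populated the fence yet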
@@ -512,7 +554,7 @@ def connector_indexing_task(
     acquired = lock.acquire(blocking=False)
     if not acquired:
-        task_logger.warning(
+        logger.warning(
             f"Indexing task already running, exiting...: "
             f"cc_pair={cc_pair_id} search_settings={search_settings_id}"
         )
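The non-blocking acquire in this hunk is what guarantees one indexing run per (cc_pair, search_settings) pair: a second task logs a warning and exits rather than queueing behind the lock. A minimal redis-py sketch of the same pattern (key name and timeout are assumptions):

    from redis import Redis

    cc_pair_id, search_settings_id = 1, 1  # placeholder ids
    r = Redis()
    lock = r.lock(
        f"indexing_lock:cc_pair_{cc_pair_id}:search_settings_{search_settings_id}",
        timeout=3600,  # safety net: a crashed holder cannot wedge the lock forever
    )

    acquired = lock.acquire(blocking=False)  # do not wait; a holder means work is underway
    if not acquired:
        raise SystemExit("indexing already running for this pair")
    try:
        pass  # ... indexing work ...
    finally:
        if lock.owned():  # only the owner may release
            lock.release()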
@@ -555,6 +597,13 @@ def connector_indexing_task(
             rcs.fence_key, rci.generator_progress_key, lock, r
         )
 
+        logger.info(
+            f"Indexing spawned task running entrypoint: attempt={index_attempt_id} "
+            f"tenant={tenant_id} "
+            f"cc_pair={cc_pair_id} "
+            f"search_settings={search_settings_id}"
+        )
+
         run_indexing_entrypoint(
             index_attempt_id,
             tenant_id,
@@ -573,7 +622,12 @@ def connector_indexing_task(
         r.set(rci.generator_complete_key, HTTPStatus.OK.value)
     except Exception as e:
-        task_logger.exception(f"Indexing failed: cc_pair={cc_pair_id}")
+        logger.exception(
+            f"Indexing spawned task failed: attempt={index_attempt_id} "
+            f"tenant={tenant_id} "
+            f"cc_pair={cc_pair_id} "
+            f"search_settings={search_settings_id}"
+        )
         if attempt:
             with get_session_with_tenant(tenant_id) as db_session:
                 mark_attempt_failed(attempt, db_session, failure_reason=str(e))
@@ -587,4 +641,10 @@ def connector_indexing_task(
         if lock.owned():
             lock.release()
 
+    logger.info(
+        f"Indexing spawned task finished: attempt={index_attempt_id} "
+        f"tenant={tenant_id} "
+        f"cc_pair={cc_pair_id} "
+        f"search_settings={search_settings_id}"
+    )
     return n_final_progress
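The happy path in the hunk at -573 writes HTTPStatus.OK to generator_complete_key, the other half of the protocol from the NOTE: a READY task plus a missing key tells the primary worker the attempt died. A compact sketch of the signal from the spawned side:

    from http import HTTPStatus

    from redis import Redis

    def signal_complete(r: Redis, generator_complete_key: str) -> None:
        # written only after indexing finishes cleanly; if an exception
        # escapes first, the key stays absent and the primary worker
        # aborts and cleans up the attempt
        r.set(generator_complete_key, HTTPStatus.OK.value)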