Fix Indexing Frozen (#660)

Yuhong Sun
2023-10-30 20:49:39 -07:00
committed by GitHub
parent 517a539d7e
commit 0618b59de6
3 changed files with 26 additions and 12 deletions

View File

@@ -3,6 +3,7 @@ import time
 from datetime import datetime
 from datetime import timezone
 
+import dask
 import torch
 from dask.distributed import Client
 from dask.distributed import Future
@@ -44,6 +45,10 @@ from danswer.utils.logger import setup_logger
 logger = setup_logger()
 
+# If the indexing dies, it's most likely due to resource constraints,
+# restarting just delays the eventual failure, not useful to the user
+dask.config.set({"distributed.scheduler.allowed-failures": 0})
+
 _UNEXPECTED_STATE_FAILURE_REASON = (
     "Stopped mid run, likely due to the background process being killed"
 )
 
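Note on the allowed-failures setting: by default the Dask scheduler reschedules a task whose worker died, which in this case only delays the eventual resource failure while the attempt looks stuck. A minimal sketch of the behavior, not the project's code (the cluster setup and submitted task are illustrative only):

# Minimal sketch, not the project's code: with allowed-failures at 0, a task
# whose worker dies is marked "error" immediately instead of being retried.
import dask
from dask.distributed import Client, LocalCluster

dask.config.set({"distributed.scheduler.allowed-failures": 0})

if __name__ == "__main__":
    cluster = LocalCluster(n_workers=1, threads_per_worker=1)
    client = Client(cluster)
    # Submit work as usual; if the worker process is killed (e.g. OOM), the
    # returned Future transitions to status "error" rather than re-running.
    future = client.submit(sum, [1, 2, 3])
    print(future.result())
    client.close()
    cluster.close()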
@@ -144,6 +149,9 @@ def cleanup_indexing_jobs(
         if not job.done():
             continue
 
+        if job.status == "error":
+            logger.error(job.exception())
+
         job.release()
         del existing_jobs_copy[attempt_id]
         index_attempt = get_index_attempt(
@@ -156,7 +164,7 @@ def cleanup_indexing_jobs(
             )
             continue
 
-        if index_attempt.status == IndexingStatus.IN_PROGRESS:
+        if index_attempt.status == IndexingStatus.IN_PROGRESS or job.status == "error":
             mark_run_failed(
                 db_session=db_session,
                 index_attempt=index_attempt,
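Context for the cleanup change: a Dask future can end up in the "error" state (for example after the worker is killed) while the corresponding index attempt row still says IN_PROGRESS, which is the "frozen" symptom in the issue. A rough sketch of the pattern; only the dask.distributed.Future API (done/status/exception/release) is real, the rest is a stand-in:

# Rough sketch of the cleanup loop above; the database update is stubbed out.
from dask.distributed import Future

def reap_finished_jobs(existing_jobs: dict[int, Future]) -> dict[int, Future]:
    remaining = existing_jobs.copy()
    for attempt_id, job in existing_jobs.items():
        if not job.done():
            continue  # still running, leave it alone
        if job.status == "error":
            # Surface the worker-side exception in the logs and make sure the
            # attempt is marked failed instead of staying IN_PROGRESS forever.
            print(f"attempt {attempt_id} failed: {job.exception()}")
        job.release()  # drop our reference so the scheduler can forget the task
        del remaining[attempt_id]
    return remaining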
@@ -286,10 +294,10 @@ def _run_indexing(
         run_dt=run_dt,
     )
 
+    net_doc_change = 0
+    document_count = 0
+    chunk_count = 0
     try:
-        net_doc_change = 0
-        document_count = 0
-        chunk_count = 0
         for doc_batch in doc_batch_generator:
             logger.debug(
                 f"Indexing batch of documents: {[doc.to_short_descriptor() for doc in doc_batch]}"
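Hoisting the counters matters because the error-handling path after the loop reads them; if the generator raised before the first batch, counters defined inside the try would not exist yet. A small illustration of the failure mode being avoided (names shortened, not the project's code):

# Illustration only: initializing the counters before the try means any
# except/finally handler can read them even if the generator fails early.
def run_indexing_sketch(doc_batch_generator):
    net_doc_change = 0
    document_count = 0
    chunk_count = 0
    try:
        for doc_batch in doc_batch_generator:
            document_count += len(doc_batch)
    except Exception as e:
        # Safe: the counters always exist here, even if no batch was yielded.
        print(f"failed after {document_count} docs: {e}")
        raise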
@@ -418,7 +426,14 @@ def kickoff_indexing_jobs(
 ) -> dict[int, Future]:
     existing_jobs_copy = existing_jobs.copy()
 
-    new_indexing_attempts = get_not_started_index_attempts(db_session)
+    # Don't include jobs waiting in the Dask queue that just haven't started running
+    # Also (rarely) don't include for jobs that started but haven't updated the indexing tables yet
+    new_indexing_attempts = [
+        attempt
+        for attempt in get_not_started_index_attempts(db_session)
+        if attempt.id not in existing_jobs
+    ]
+
     logger.info(f"Found {len(new_indexing_attempts)} new indexing tasks.")
 
     if not new_indexing_attempts:
@@ -440,9 +455,6 @@ def kickoff_indexing_jobs(
             )
             continue
 
-        if attempt.id in existing_jobs:
-            continue
-
         logger.info(
             f"Kicking off indexing attempt for connector: '{attempt.connector.name}', "
            f"with config: '{attempt.connector.connector_specific_config}', and "
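The list comprehension replaces the old in-loop skip: attempts that already have a Dask future (queued but not yet running, or running but not yet reflected in the indexing tables) are excluded before anything is submitted. A self-contained sketch of that de-duplication, where IndexAttempt is a stand-in for the ORM model in the diff:

# Stand-in types only; mirrors the filtering in kickoff_indexing_jobs.
from dataclasses import dataclass

@dataclass
class IndexAttempt:
    id: int

def pick_new_attempts(
    not_started: list[IndexAttempt], existing_jobs: dict[int, object]
) -> list[IndexAttempt]:
    return [attempt for attempt in not_started if attempt.id not in existing_jobs]

# Attempt 2 already has a future, so only 1 and 3 get kicked off.
print(pick_new_attempts([IndexAttempt(1), IndexAttempt(2), IndexAttempt(3)], {2: object()}))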

View File

@@ -211,5 +211,5 @@ app = get_application()
 
 
 if __name__ == "__main__":
-    logger.info(f"Running QA Service on http://{APP_HOST}:{str(APP_PORT)}/")
+    logger.info(f"Starting Danswer Backend on http://{APP_HOST}:{str(APP_PORT)}/")
     uvicorn.run(app, host=APP_HOST, port=APP_PORT)

View File

@@ -1,9 +1,11 @@
 [supervisord]
 nodaemon=true
-logfile=/dev/stdout
-logfile_maxbytes=0
+logfile=/var/log/supervisord.log
 
-[program:indexing]
+# Indexing is the heaviest job, also requires some CPU intensive steps
+# Cannot place this in Celery for now because Celery must run as a single process (see note below)
+# Indexing uses multi-processing to speed things up
+[program:document_indexing]
 command=python danswer/background/update.py
 stdout_logfile=/var/log/update.log
 stdout_logfile_maxbytes=52428800
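The supervisord comments refer to the indexing job driving a multi-process Dask setup, which cannot live inside a single-process Celery worker. A hedged sketch of what such an entrypoint can look like; the worker counts and loop structure are illustrative, not the project's actual values:

# Illustrative only: a background entrypoint that runs a local Dask cluster
# with real worker processes, which is why it is its own supervisord program.
from dask.distributed import Client, LocalCluster

def main() -> None:
    cluster = LocalCluster(
        n_workers=2,  # separate OS processes for the CPU-heavy steps
        threads_per_worker=1,
    )
    client = Client(cluster)
    try:
        # ... poll for new index attempts and submit them to `client` ...
        pass
    finally:
        client.close()
        cluster.close()

if __name__ == "__main__":
    main()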