mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-25 23:40:58 +02:00
Small improvements to checkpoint pickup logic (#4634)
This commit is contained in:
parent
9be3da2357
commit
47767c1666
@ -20,8 +20,7 @@ from onyx.utils.object_size_check import deep_getsizeof
|
|||||||
|
|
||||||
logger = setup_logger()
|
logger = setup_logger()
|
||||||
|
|
||||||
_NUM_RECENT_ATTEMPTS_TO_CONSIDER = 20
|
_NUM_RECENT_ATTEMPTS_TO_CONSIDER = 50
|
||||||
_NUM_DOCS_INDEXED_TO_BE_VALID_CHECKPOINT = 100
|
|
||||||
|
|
||||||
|
|
||||||
def _build_checkpoint_pointer(index_attempt_id: int) -> str:
|
def _build_checkpoint_pointer(index_attempt_id: int) -> str:
|
||||||
@ -80,34 +79,56 @@ def get_latest_valid_checkpoint(
|
|||||||
db_session=db_session,
|
db_session=db_session,
|
||||||
limit=_NUM_RECENT_ATTEMPTS_TO_CONSIDER,
|
limit=_NUM_RECENT_ATTEMPTS_TO_CONSIDER,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# don't keep using checkpoints if we've had a bunch of failed attempts in a row
|
||||||
|
# where we make no progress. Only do this if we have had at least
|
||||||
|
# _NUM_RECENT_ATTEMPTS_TO_CONSIDER completed attempts.
|
||||||
|
if len(checkpoint_candidates) == _NUM_RECENT_ATTEMPTS_TO_CONSIDER:
|
||||||
|
had_any_progress = False
|
||||||
|
for candidate in checkpoint_candidates:
|
||||||
|
if (
|
||||||
|
candidate.total_docs_indexed is not None
|
||||||
|
and candidate.total_docs_indexed > 0
|
||||||
|
) or candidate.status.is_successful():
|
||||||
|
had_any_progress = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if not had_any_progress:
|
||||||
|
logger.warning(
|
||||||
|
f"{_NUM_RECENT_ATTEMPTS_TO_CONSIDER} consecutive failed attempts without progress "
|
||||||
|
f"found for cc_pair={cc_pair_id}. Ignoring checkpoint to let the run start "
|
||||||
|
"from scratch."
|
||||||
|
)
|
||||||
|
return connector.build_dummy_checkpoint()
|
||||||
|
|
||||||
|
# filter out any candidates that don't meet the criteria
|
||||||
checkpoint_candidates = [
|
checkpoint_candidates = [
|
||||||
candidate
|
candidate
|
||||||
for candidate in checkpoint_candidates
|
for candidate in checkpoint_candidates
|
||||||
if (
|
if (
|
||||||
candidate.poll_range_start == window_start
|
candidate.poll_range_start == window_start
|
||||||
and candidate.poll_range_end == window_end
|
and candidate.poll_range_end == window_end
|
||||||
and candidate.status == IndexingStatus.FAILED
|
and (
|
||||||
|
candidate.status == IndexingStatus.FAILED
|
||||||
|
# if the background job was killed (and thus the attempt was canceled)
|
||||||
|
# we still want to use the checkpoint so that we can pick up where we left off
|
||||||
|
or candidate.status == IndexingStatus.CANCELED
|
||||||
|
)
|
||||||
and candidate.checkpoint_pointer is not None
|
and candidate.checkpoint_pointer is not None
|
||||||
|
# NOTE: There are a couple connectors that may make progress but not have
|
||||||
|
# any "total_docs_indexed". E.g. they are going through
|
||||||
|
# Slack channels, and tons of them don't have any updates.
|
||||||
|
# Leaving the below in as historical context / in-case we want to use it again.
|
||||||
# we want to make sure that the checkpoint is actually useful
|
# we want to make sure that the checkpoint is actually useful
|
||||||
# if it's only gone through a few docs, it's probably not worth
|
# if it's only gone through a few docs, it's probably not worth
|
||||||
# using. This also avoids weird cases where a connector is basically
|
# using. This also avoids weird cases where a connector is basically
|
||||||
# non-functional but still "makes progress" by slowly moving the
|
# non-functional but still "makes progress" by slowly moving the
|
||||||
# checkpoint forward run after run
|
# checkpoint forward run after run
|
||||||
and candidate.total_docs_indexed
|
# and candidate.total_docs_indexed
|
||||||
and candidate.total_docs_indexed > _NUM_DOCS_INDEXED_TO_BE_VALID_CHECKPOINT
|
# and candidate.total_docs_indexed > 100
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
# don't keep using checkpoints if we've had a bunch of failed attempts in a row
|
|
||||||
# for now, capped at 10
|
|
||||||
if len(checkpoint_candidates) == _NUM_RECENT_ATTEMPTS_TO_CONSIDER:
|
|
||||||
logger.warning(
|
|
||||||
f"{_NUM_RECENT_ATTEMPTS_TO_CONSIDER} consecutive failed attempts found "
|
|
||||||
f"for cc_pair={cc_pair_id}. Ignoring checkpoint to let the run start "
|
|
||||||
"from scratch."
|
|
||||||
)
|
|
||||||
return connector.build_dummy_checkpoint()
|
|
||||||
|
|
||||||
# assumes latest checkpoint is the furthest along. This only isn't true
|
# assumes latest checkpoint is the furthest along. This only isn't true
|
||||||
# if something else has gone wrong.
|
# if something else has gone wrong.
|
||||||
latest_valid_checkpoint_candidate = (
|
latest_valid_checkpoint_candidate = (
|
||||||
@ -116,6 +137,9 @@ def get_latest_valid_checkpoint(
|
|||||||
|
|
||||||
checkpoint = connector.build_dummy_checkpoint()
|
checkpoint = connector.build_dummy_checkpoint()
|
||||||
if latest_valid_checkpoint_candidate is None:
|
if latest_valid_checkpoint_candidate is None:
|
||||||
|
logger.info(
|
||||||
|
f"No valid checkpoint found for cc_pair={cc_pair_id}. Starting from scratch."
|
||||||
|
)
|
||||||
return checkpoint
|
return checkpoint
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user