Connector checkpointing (#3876)

* wip checkpointing/continue on failure

more stuff for checkpointing

Basic implementation

FE stuff

More checkpointing/failure handling

rebase

rebase

initial scaffolding for IT

IT to test checkpointing

Cleanup

cleanup

Fix it

Rebase

Add todo

Fix actions IT

Test more

Pagination + fixes + cleanup

Fix IT networking

fix it

* rebase

* Address misc comments

* Address comments

* Remove unused router

* rebase

* Fix mypy

* Fixes

* fix it

* Fix tests

* Add drop index

* Add retries

* reset lock timeout

* Try hard drop of schema

* Add timeout/retries to downgrade

* rebase

* test

* test

* test

* Close all connections

* test closing idle only

* Fix it

* fix

* try using null pool

* Test

* fix

* rebase

* log

* Fix

* apply null pool

* Fix other test

* Fix quality checks

* Test not using the fixture

* Fix ordering

* fix test

* Change pooling behavior
This commit is contained in:
Chris Weaver
2025-02-15 18:34:39 -08:00
committed by GitHub
parent bc087fc20e
commit f1fc8ac19b
68 changed files with 3333 additions and 1102 deletions

View File

@@ -1,23 +1,21 @@
import traceback
from collections.abc import Callable
from functools import partial
from http import HTTPStatus
from typing import Protocol
import httpx
from pydantic import BaseModel
from pydantic import ConfigDict
from sqlalchemy.orm import Session
from onyx.access.access import get_access_for_documents
from onyx.access.models import DocumentAccess
from onyx.configs.app_configs import INDEXING_EXCEPTION_LIMIT
from onyx.configs.app_configs import MAX_DOCUMENT_CHARS
from onyx.configs.constants import DEFAULT_BOOST
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
get_experts_stores_representations,
)
from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import Document
from onyx.connectors.models import DocumentFailure
from onyx.connectors.models import IndexAttemptMetadata
from onyx.db.document import fetch_chunk_counts_for_documents
from onyx.db.document import get_documents_by_ids
@@ -29,7 +27,6 @@ from onyx.db.document import update_docs_updated_at__no_commit
from onyx.db.document import upsert_document_by_connector_credential_pair
from onyx.db.document import upsert_documents
from onyx.db.document_set import fetch_document_sets_for_documents
from onyx.db.index_attempt import create_index_attempt_error
from onyx.db.models import Document as DBDocument
from onyx.db.search_settings import get_current_search_settings
from onyx.db.tag import create_or_add_document_tag
@@ -41,10 +38,12 @@ from onyx.document_index.interfaces import DocumentIndex
from onyx.document_index.interfaces import DocumentMetadata
from onyx.document_index.interfaces import IndexBatchParams
from onyx.indexing.chunker import Chunker
from onyx.indexing.embedder import embed_chunks_with_failure_handling
from onyx.indexing.embedder import IndexingEmbedder
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.indexing.models import DocAwareChunk
from onyx.indexing.models import DocMetadataAwareIndexChunk
from onyx.indexing.vector_db_insertion import write_chunks_to_vector_db_with_backoff
from onyx.utils.logger import setup_logger
from onyx.utils.timing import log_function_time
@@ -67,6 +66,8 @@ class IndexingPipelineResult(BaseModel):
# number of chunks that were inserted into Vespa
total_chunks: int
failures: list[ConnectorFailure]
class IndexingPipelineProtocol(Protocol):
def __call__(
@@ -156,14 +157,10 @@ def index_doc_batch_with_handler(
document_index: DocumentIndex,
document_batch: list[Document],
index_attempt_metadata: IndexAttemptMetadata,
attempt_id: int | None,
db_session: Session,
ignore_time_skip: bool = False,
tenant_id: str | None = None,
) -> IndexingPipelineResult:
index_pipeline_result = IndexingPipelineResult(
new_docs=0, total_docs=len(document_batch), total_chunks=0
)
try:
index_pipeline_result = index_doc_batch(
chunker=chunker,
@@ -176,47 +173,25 @@ def index_doc_batch_with_handler(
tenant_id=tenant_id,
)
except Exception as e:
if isinstance(e, httpx.HTTPStatusError):
if e.response.status_code == HTTPStatus.INSUFFICIENT_STORAGE:
logger.error(
"NOTE: HTTP Status 507 Insufficient Storage indicates "
"you need to allocate more memory or disk space to the "
"Vespa/index container."
logger.exception(f"Failed to index document batch: {document_batch}")
index_pipeline_result = IndexingPipelineResult(
new_docs=0,
total_docs=len(document_batch),
total_chunks=0,
failures=[
ConnectorFailure(
failed_document=DocumentFailure(
document_id=document.id,
document_link=(
document.sections[0].link if document.sections else None
),
),
failure_message=str(e),
exception=e,
)
if INDEXING_EXCEPTION_LIMIT == 0:
raise
trace = traceback.format_exc()
create_index_attempt_error(
attempt_id,
batch=index_attempt_metadata.batch_num,
docs=document_batch,
exception_msg=str(e),
exception_traceback=trace,
db_session=db_session,
for document in document_batch
],
)
logger.exception(
f"Indexing batch {index_attempt_metadata.batch_num} failed. msg='{e}' trace='{trace}'"
)
index_attempt_metadata.num_exceptions += 1
if index_attempt_metadata.num_exceptions == INDEXING_EXCEPTION_LIMIT:
logger.warning(
f"Maximum number of exceptions for this index attempt "
f"({INDEXING_EXCEPTION_LIMIT}) has been reached. "
f"The next exception will abort the indexing attempt."
)
elif index_attempt_metadata.num_exceptions > INDEXING_EXCEPTION_LIMIT:
logger.warning(
f"Maximum number of exceptions for this index attempt "
f"({INDEXING_EXCEPTION_LIMIT}) has been exceeded."
)
raise RuntimeError(
f"Maximum exception limit of {INDEXING_EXCEPTION_LIMIT} exceeded."
)
else:
pass
return index_pipeline_result
@@ -376,8 +351,12 @@ def index_doc_batch(
document_ids=[doc.id for doc in filtered_documents],
db_session=db_session,
)
db_session.commit()
return IndexingPipelineResult(
new_docs=0, total_docs=len(filtered_documents), total_chunks=0
new_docs=0,
total_docs=len(filtered_documents),
total_chunks=0,
failures=[],
)
doc_descriptors = [
@@ -390,10 +369,19 @@ def index_doc_batch(
logger.debug(f"Starting indexing process for documents: {doc_descriptors}")
logger.debug("Starting chunking")
# NOTE: no special handling for failures here, since the chunker is not
# a common source of failure for the indexing pipeline
chunks: list[DocAwareChunk] = chunker.chunk(ctx.updatable_docs)
logger.debug("Starting embedding")
chunks_with_embeddings = embedder.embed_chunks(chunks) if chunks else []
chunks_with_embeddings, embedding_failures = (
embed_chunks_with_failure_handling(
chunks=chunks,
embedder=embedder,
)
if chunks
else ([], [])
)
updatable_ids = [doc.id for doc in ctx.updatable_docs]
@@ -459,7 +447,11 @@ def index_doc_batch(
# A document will not be spread across different batches, so all the
# documents with chunks in this set, are fully represented by the chunks
# in this set
insertion_records = document_index.index(
(
insertion_records,
vector_db_write_failures,
) = write_chunks_to_vector_db_with_backoff(
document_index=document_index,
chunks=access_aware_chunks,
index_batch_params=IndexBatchParams(
doc_id_to_previous_chunk_cnt=doc_id_to_previous_chunk_cnt,
@@ -519,6 +511,7 @@ def index_doc_batch(
new_docs=len([r for r in insertion_records if r.already_existed is False]),
total_docs=len(filtered_documents),
total_chunks=len(access_aware_chunks),
failures=vector_db_write_failures + embedding_failures,
)
return result
@@ -531,7 +524,6 @@ def build_indexing_pipeline(
db_session: Session,
chunker: Chunker | None = None,
ignore_time_skip: bool = False,
attempt_id: int | None = None,
tenant_id: str | None = None,
callback: IndexingHeartbeatInterface | None = None,
) -> IndexingPipelineProtocol:
@@ -553,7 +545,6 @@ def build_indexing_pipeline(
embedder=embedder,
document_index=document_index,
ignore_time_skip=ignore_time_skip,
attempt_id=attempt_id,
db_session=db_session,
tenant_id=tenant_id,
)