mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-01 10:40:47 +02:00
* checkpointed confluence * confluence checkpointing tested * fixed integration tests * attempt to fix connector test flakiness * fix rebase
71 lines
2.5 KiB
Python
71 lines
2.5 KiB
Python
from typing import cast
|
|
from typing import TypeVar
|
|
|
|
from onyx.connectors.connector_runner import CheckpointOutputWrapper
|
|
from onyx.connectors.interfaces import CheckpointedConnector
|
|
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
|
from onyx.connectors.models import ConnectorCheckpoint
|
|
from onyx.connectors.models import ConnectorFailure
|
|
from onyx.connectors.models import Document
|
|
|
|
# Upper bound on the number of load_from_checkpoint passes a loader in this
# module will make; once a loader's pass counter exceeds it, a RuntimeError is
# raised so a checkpoint that never stops reporting `has_more` cannot loop forever.
_ITERATION_LIMIT = 100_000
|
|
|
|
# Type variable for a connector's checkpoint type; bound to ConnectorCheckpoint
# so only that class or its subclasses can be substituted.
CT = TypeVar("CT", bound=ConnectorCheckpoint)
|
|
|
|
|
|
def load_all_docs_from_checkpoint_connector(
    connector: CheckpointedConnector[CT],
    start: SecondsSinceUnixEpoch,
    end: SecondsSinceUnixEpoch,
) -> list[Document]:
    """Drain a checkpointed connector and return every document it yields.

    Starting from ``connector.build_dummy_checkpoint()``, the connector's
    ``load_from_checkpoint(start, end, checkpoint)`` is invoked repeatedly
    (wrapped in ``CheckpointOutputWrapper``) until the current checkpoint's
    ``has_more`` is falsy.

    Args:
        connector: The checkpointed connector to read from.
        start: Passed through as the first argument of ``load_from_checkpoint``.
        end: Passed through as the second argument of ``load_from_checkpoint``.

    Returns:
        All non-``None`` documents, in the order they were produced.

    Raises:
        RuntimeError: As soon as any failure is yielded, or when the number of
            completed passes exceeds ``_ITERATION_LIMIT``.
    """
    collected: list[Document] = []
    current = cast(CT, connector.build_dummy_checkpoint())
    passes = 0

    while current.has_more:
        wrapper = CheckpointOutputWrapper[CT]()
        # The generator is created with the checkpoint as it is *now*;
        # rebinding `current` inside the loop only affects the next pass.
        batch = wrapper(connector.load_from_checkpoint(start, end, current))

        for doc, failure, upcoming in batch:
            # Any failure aborts the whole load.
            if failure is not None:
                raise RuntimeError(f"Failed to load documents: {failure}")
            if doc is not None:
                collected.append(doc)
            if upcoming is not None:
                current = upcoming

        # Safety valve against a checkpoint that never finishes.
        passes += 1
        if passes > _ITERATION_LIMIT:
            raise RuntimeError("Too many iterations. Infinite loop?")

    return collected
|
|
|
|
|
|
def load_everything_from_checkpoint_connector(
    connector: CheckpointedConnector[CT],
    start: SecondsSinceUnixEpoch,
    end: SecondsSinceUnixEpoch,
) -> list[Document | ConnectorFailure]:
    """Drain a checkpointed connector, keeping failures alongside documents.

    Same looping scheme as ``load_all_docs_from_checkpoint_connector``: begin
    with ``connector.build_dummy_checkpoint()`` and keep calling
    ``load_from_checkpoint(start, end, checkpoint)`` while the current
    checkpoint's ``has_more`` is truthy. Unlike that function, a yielded
    failure is recorded in the result instead of raising.

    Args:
        connector: The checkpointed connector to read from.
        start: Passed through as the first argument of ``load_from_checkpoint``.
        end: Passed through as the second argument of ``load_from_checkpoint``.

    Returns:
        Every non-``None`` failure and document in production order; within a
        single yielded item the failure is placed before the document.

    Raises:
        RuntimeError: When the number of completed passes exceeds
            ``_ITERATION_LIMIT``.
    """
    results: list[Document | ConnectorFailure] = []
    passes = 0
    current = connector.build_dummy_checkpoint()

    while current.has_more:
        wrapper = CheckpointOutputWrapper[CT]()
        raw_batch = connector.load_from_checkpoint(start, end, current)

        for doc, problem, upcoming in wrapper(raw_batch):
            # Failure first, then document: preserves the original append order.
            results.extend(
                entry for entry in (problem, doc) if entry is not None
            )
            if upcoming is not None:
                current = upcoming

        passes += 1
        if passes <= _ITERATION_LIMIT:
            continue
        # Safety valve against a checkpoint that never finishes.
        raise RuntimeError("Too many iterations. Infinite loop?")

    return results
|