mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-08 06:00:49 +02:00
* wip checkpointing/continue on failure more stuff for checkpointing Basic implementation FE stuff More checkpointing/failure handling rebase rebase initial scaffolding for IT IT to test checkpointing Cleanup cleanup Fix it Rebase Add todo Fix actions IT Test more Pagination + fixes + cleanup Fix IT networking fix it * rebase * Address misc comments * Address comments * Remove unused router * rebase * Fix mypy * Fixes * fix it * Fix tests * Add drop index * Add retries * reset lock timeout * Try hard drop of schema * Add timeout/retries to downgrade * rebase * test * test * test * Close all connections * test closing idle only * Fix it * fix * try using null pool * Test * fix * rebase * log * Fix * apply null pool * Fix other test * Fix quality checks * Test not using the fixture * Fix ordering * fix test * Change pooling behavior
178 lines
6.5 KiB
Python
178 lines
6.5 KiB
Python
import sys
|
|
import time
|
|
from collections.abc import Generator
|
|
from datetime import datetime
|
|
|
|
from onyx.connectors.interfaces import BaseConnector
|
|
from onyx.connectors.interfaces import CheckpointConnector
|
|
from onyx.connectors.interfaces import CheckpointOutput
|
|
from onyx.connectors.interfaces import LoadConnector
|
|
from onyx.connectors.interfaces import PollConnector
|
|
from onyx.connectors.models import ConnectorCheckpoint
|
|
from onyx.connectors.models import ConnectorFailure
|
|
from onyx.connectors.models import Document
|
|
from onyx.utils.logger import setup_logger
|
|
|
|
|
|
# Module-level logger, obtained from the project's `setup_logger` helper.
logger = setup_logger()
|
|
|
|
|
|
# Pair of datetimes describing an indexing window: ConnectorRunner.run reads
# index 0 as the start and index 1 as the end (each converted via .timestamp()).
TimeRange = tuple[datetime, datetime]
|
|
|
|
|
|
class CheckpointOutputWrapper:
    """
    Adapts a CheckpointOutput generator into a stream of uniform triples.

    A connector yields Documents / ConnectorFailures and *returns* its new
    checkpoint at the very end (convenient for implementors: exactly one
    checkpoint, always last). Consumers instead receive
    `(document, failure, checkpoint)` triples in which exactly one slot is
    populated, with the checkpoint triple emitted last.
    """

    def __init__(self) -> None:
        # Populated with the wrapped generator's return value once it is exhausted.
        self.next_checkpoint: ConnectorCheckpoint | None = None

    def __call__(
        self,
        checkpoint_connector_generator: CheckpointOutput,
    ) -> Generator[
        tuple[Document | None, ConnectorFailure | None, ConnectorCheckpoint | None],
        None,
        None,
    ]:
        """
        Iterate the connector generator and re-emit its output as triples.

        Yields `(document, None, None)` for each Document,
        `(None, failure, None)` for each ConnectorFailure, and finally
        `(None, None, checkpoint)` with the generator's return value.

        Raises:
            ValueError: the generator yielded something that is neither a
                Document nor a ConnectorFailure.
            RuntimeError: the generator finished without returning a checkpoint.
        """

        def _capture_final_checkpoint(source: CheckpointOutput) -> CheckpointOutput:
            # `yield from` evaluates to the delegated generator's return value,
            # which is how the final checkpoint is retrieved and stored.
            final_checkpoint = yield from source
            self.next_checkpoint = final_checkpoint
            return final_checkpoint  # not used

        for item in _capture_final_checkpoint(checkpoint_connector_generator):
            if isinstance(item, Document):
                yield item, None, None
                continue

            if isinstance(item, ConnectorFailure):
                yield None, item, None
                continue

            raise ValueError(f"Invalid document_or_failure type: {type(item)}")

        final_checkpoint = self.next_checkpoint
        if final_checkpoint is None:
            raise RuntimeError(
                "Checkpoint is None. This should never happen - the connector should always return a checkpoint."
            )

        yield None, None, final_checkpoint
|
|
|
|
|
|
class ConnectorRunner:
    """
    Handles:
        - Batching
        - Additional exception logging
        - Combining different connector types to a single interface
    """

    def __init__(
        self,
        connector: BaseConnector,
        batch_size: int,
        time_range: TimeRange | None = None,
    ) -> None:
        """
        Args:
            connector: the connector to drive; `run` dispatches on whether it
                is a CheckpointConnector, PollConnector or LoadConnector.
            batch_size: number of documents accumulated before a batch is
                yielded (used on the CheckpointConnector path).
            time_range: (start, end) window; required by `run` for
                CheckpointConnector and PollConnector.
        """
        self.connector = connector
        self.time_range = time_range
        self.batch_size = batch_size

        # Documents collected from a CheckpointConnector that have not been yielded yet.
        self.doc_batch: list[Document] = []

    @staticmethod
    def _format_locals(local_vars: dict[str, object], limit: int = 1024) -> str:
        """Render `key: value` lines for frame locals, capped at `limit` characters.

        Formatting arbitrary objects can itself raise (e.g. a broken
        `__str__`/`__repr__`); such values are replaced by a placeholder so
        that building the diagnostic message never hides the original error.
        Rendering stops once the joined text is guaranteed to exceed `limit`,
        so locals past the cut-off are never stringified.
        """
        lines: list[str] = []
        remaining = limit
        for key, value in local_vars.items():
            try:
                line = f"{key}: {value}"
            except Exception:
                line = f"{key}: <unprintable {type(value).__name__}>"
            lines.append(line)

            # +1 accounts for the newline separator; once the budget goes
            # negative the joined string is at least `limit` characters long.
            remaining -= len(line) + 1
            if remaining < 0:
                break

        return "\n".join(lines)[:limit]

    def run(
        self, checkpoint: ConnectorCheckpoint
    ) -> Generator[
        tuple[
            list[Document] | None, ConnectorFailure | None, ConnectorCheckpoint | None
        ],
        None,
        None,
    ]:
        """Adds additional exception logging to the connector.

        Yields `(documents, failure, checkpoint)` triples in which one slot is
        populated:

        - CheckpointConnector: failures are yielded as they arrive, documents
          are yielded in batches once `batch_size` is reached, any remainder
          is flushed, and the next checkpoint is yielded last.
        - PollConnector / LoadConnector: every batch produced by the connector
          is forwarded as-is, followed by a dummy checkpoint whose `has_more`
          is False.

        Raises:
            ValueError: `time_range` is missing for a CheckpointConnector or
                PollConnector, or the connector is of an unsupported type.
            Exception: anything raised while running the connector is logged
                (exception type plus the locals of the innermost frame) and
                then re-raised unchanged.
        """
        try:
            if isinstance(self.connector, CheckpointConnector):
                if self.time_range is None:
                    raise ValueError("time_range is required for CheckpointConnector")

                start = time.monotonic()
                checkpoint_connector_generator = self.connector.load_from_checkpoint(
                    start=self.time_range[0].timestamp(),
                    end=self.time_range[1].timestamp(),
                    checkpoint=checkpoint,
                )
                next_checkpoint: ConnectorCheckpoint | None = None
                # this is guaranteed to always run at least once with next_checkpoint being non-None
                for document, failure, next_checkpoint in CheckpointOutputWrapper()(
                    checkpoint_connector_generator
                ):
                    if document is not None:
                        self.doc_batch.append(document)

                    if failure is not None:
                        yield None, failure, None

                    if len(self.doc_batch) >= self.batch_size:
                        yield self.doc_batch, None, None
                        # rebind rather than clear: the consumer holds the yielded list
                        self.doc_batch = []

                # yield remaining documents
                if len(self.doc_batch) > 0:
                    yield self.doc_batch, None, None
                    self.doc_batch = []

                yield None, None, next_checkpoint

                logger.debug(
                    f"Connector took {time.monotonic() - start} seconds to get to the next checkpoint."
                )

            else:
                # Non-checkpointed connectors finish in a single pass, so they
                # report a dummy checkpoint marked as having nothing more to do.
                finished_checkpoint = ConnectorCheckpoint.build_dummy_checkpoint()
                finished_checkpoint.has_more = False

                if isinstance(self.connector, PollConnector):
                    if self.time_range is None:
                        raise ValueError("time_range is required for PollConnector")

                    for document_batch in self.connector.poll_source(
                        start=self.time_range[0].timestamp(),
                        end=self.time_range[1].timestamp(),
                    ):
                        yield document_batch, None, None

                    yield None, None, finished_checkpoint
                elif isinstance(self.connector, LoadConnector):
                    for document_batch in self.connector.load_from_state():
                        yield document_batch, None, None

                    yield None, None, finished_checkpoint
                else:
                    raise ValueError(f"Invalid connector. type: {type(self.connector)}")
        except Exception:
            exc_type, _, exc_traceback = sys.exc_info()

            # Traverse the traceback to find the last frame where the exception was raised
            tb = exc_traceback
            if tb is None:
                logger.error("No traceback found for exception")
                raise

            while tb.tb_next:
                tb = tb.tb_next  # Move to the next frame in the traceback

            # Get the local variables from the frame where the exception occurred.
            # Formatting is guarded so a misbehaving local cannot replace the
            # connector's own exception with a formatting error.
            local_vars_str = self._format_locals(tb.tb_frame.f_locals)
            logger.error(
                f"Error in connector. type: {exc_type};\n"
                f"local_vars below -> \n{local_vars_str[:1024]}"
            )
            raise
|