mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-25 15:30:59 +02:00
Fix Undeleteable Connectors (#1507)
This commit is contained in:
parent
57452b1030
commit
6e5d9f33d2
@ -0,0 +1,31 @@
|
||||
"""Remove Last Attempt Status from CC Pair
|
||||
|
||||
Revision ID: ec85f2b3c544
|
||||
Revises: 3879338f8ba1
|
||||
Create Date: 2024-05-23 21:39:46.126010
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "ec85f2b3c544"
|
||||
down_revision = "3879338f8ba1"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Drop the ``last_attempt_status`` column from ``connector_credential_pair``."""
    table_name = "connector_credential_pair"
    column_name = "last_attempt_status"
    op.drop_column(table_name, column_name)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Re-add ``last_attempt_status`` to ``connector_credential_pair``.

    The column is recreated as a nullable VARCHAR. This function only adds
    the column; it does not repopulate any values.
    """
    last_attempt_status = sa.Column(
        "last_attempt_status",
        sa.VARCHAR(),
        autoincrement=False,
        nullable=True,
    )
    op.add_column("connector_credential_pair", last_attempt_status)
|
@ -68,7 +68,9 @@ def cleanup_connector_credential_pair_task(
|
||||
f"{connector_id} and Credential ID: {credential_id} does not exist."
|
||||
)
|
||||
|
||||
deletion_attempt_disallowed_reason = check_deletion_attempt_is_allowed(cc_pair)
|
||||
deletion_attempt_disallowed_reason = check_deletion_attempt_is_allowed(
|
||||
connector_credential_pair=cc_pair, db_session=db_session
|
||||
)
|
||||
if deletion_attempt_disallowed_reason:
|
||||
raise ValueError(deletion_attempt_disallowed_reason)
|
||||
|
||||
|
@ -160,19 +160,19 @@ def _run_indexing(
|
||||
source_type=db_connector.source,
|
||||
)
|
||||
):
|
||||
window_start = max(
|
||||
window_start - timedelta(minutes=POLL_CONNECTOR_OFFSET),
|
||||
datetime(1970, 1, 1, tzinfo=timezone.utc),
|
||||
)
|
||||
|
||||
doc_batch_generator, is_listing_complete = _get_document_generator(
|
||||
db_session=db_session,
|
||||
attempt=index_attempt,
|
||||
start_time=window_start,
|
||||
end_time=window_end,
|
||||
)
|
||||
|
||||
try:
|
||||
window_start = max(
|
||||
window_start - timedelta(minutes=POLL_CONNECTOR_OFFSET),
|
||||
datetime(1970, 1, 1, tzinfo=timezone.utc),
|
||||
)
|
||||
|
||||
doc_batch_generator, is_listing_complete = _get_document_generator(
|
||||
db_session=db_session,
|
||||
attempt=index_attempt,
|
||||
start_time=window_start,
|
||||
end_time=window_end,
|
||||
)
|
||||
|
||||
all_connector_doc_ids: set[str] = set()
|
||||
for doc_batch in doc_batch_generator:
|
||||
# Check if connector is disabled mid run and stop if so unless it's the secondary
|
||||
@ -263,7 +263,6 @@ def _run_indexing(
|
||||
db_session=db_session,
|
||||
connector_id=db_connector.id,
|
||||
credential_id=db_credential.id,
|
||||
attempt_status=IndexingStatus.IN_PROGRESS,
|
||||
net_docs=net_doc_change,
|
||||
run_dt=run_end_dt,
|
||||
)
|
||||
@ -294,7 +293,6 @@ def _run_indexing(
|
||||
db_session=db_session,
|
||||
connector_id=index_attempt.connector.id,
|
||||
credential_id=index_attempt.credential.id,
|
||||
attempt_status=IndexingStatus.FAILED,
|
||||
net_docs=net_doc_change,
|
||||
)
|
||||
raise e
|
||||
@ -309,7 +307,6 @@ def _run_indexing(
|
||||
db_session=db_session,
|
||||
connector_id=db_connector.id,
|
||||
credential_id=db_credential.id,
|
||||
attempt_status=IndexingStatus.SUCCESS,
|
||||
run_dt=run_end_dt,
|
||||
)
|
||||
|
||||
@ -343,15 +340,7 @@ def _prepare_index_attempt(db_session: Session, index_attempt_id: int) -> IndexA
|
||||
|
||||
# only commit once, to make sure this all happens in a single transaction
|
||||
mark_attempt_in_progress__no_commit(attempt)
|
||||
is_primary = attempt.embedding_model.status == IndexModelStatus.PRESENT
|
||||
if is_primary:
|
||||
update_connector_credential_pair(
|
||||
db_session=db_session,
|
||||
connector_id=attempt.connector.id,
|
||||
credential_id=attempt.credential.id,
|
||||
attempt_status=IndexingStatus.IN_PROGRESS,
|
||||
)
|
||||
else:
|
||||
if attempt.embedding_model.status != IndexModelStatus.PRESENT:
|
||||
db_session.commit()
|
||||
|
||||
return attempt
|
||||
|
@ -17,8 +17,6 @@ from danswer.configs.app_configs import DASK_JOB_CLIENT_ENABLED
|
||||
from danswer.configs.app_configs import DISABLE_INDEX_UPDATE_ON_SWAP
|
||||
from danswer.configs.app_configs import NUM_INDEXING_WORKERS
|
||||
from danswer.db.connector import fetch_connectors
|
||||
from danswer.db.connector_credential_pair import mark_all_in_progress_cc_pairs_failed
|
||||
from danswer.db.connector_credential_pair import update_connector_credential_pair
|
||||
from danswer.db.embedding_model import get_current_db_embedding_model
|
||||
from danswer.db.embedding_model import get_secondary_db_embedding_model
|
||||
from danswer.db.engine import get_db_current_time
|
||||
@ -119,17 +117,6 @@ def _mark_run_failed(
|
||||
db_session=db_session,
|
||||
failure_reason=failure_reason,
|
||||
)
|
||||
if (
|
||||
index_attempt.connector_id is not None
|
||||
and index_attempt.credential_id is not None
|
||||
and index_attempt.embedding_model.status == IndexModelStatus.PRESENT
|
||||
):
|
||||
update_connector_credential_pair(
|
||||
db_session=db_session,
|
||||
connector_id=index_attempt.connector_id,
|
||||
credential_id=index_attempt.credential_id,
|
||||
attempt_status=IndexingStatus.FAILED,
|
||||
)
|
||||
|
||||
|
||||
"""Main funcs"""
|
||||
@ -192,16 +179,6 @@ def create_indexing_jobs(existing_jobs: dict[int, Future | SimpleJob]) -> None:
|
||||
connector.id, credential.id, model.id, db_session
|
||||
)
|
||||
|
||||
# CC-Pair will have the status that it should for the primary index
|
||||
# Will be re-sync-ed once the indices are swapped
|
||||
if model.status == IndexModelStatus.PRESENT:
|
||||
update_connector_credential_pair(
|
||||
db_session=db_session,
|
||||
connector_id=connector.id,
|
||||
credential_id=credential.id,
|
||||
attempt_status=IndexingStatus.NOT_STARTED,
|
||||
)
|
||||
|
||||
|
||||
def cleanup_indexing_jobs(
|
||||
existing_jobs: dict[int, Future | SimpleJob],
|
||||
@ -391,11 +368,6 @@ def update_loop(delay: int = 10, num_workers: int = NUM_INDEXING_WORKERS) -> Non
|
||||
|
||||
existing_jobs: dict[int, Future | SimpleJob] = {}
|
||||
|
||||
with Session(engine) as db_session:
|
||||
# Previous version did not always clean up cc-pairs well, leaving some connectors undeletable
|
||||
# This ensures that bad states get cleaned up
|
||||
mark_all_in_progress_cc_pairs_failed(db_session)
|
||||
|
||||
while True:
|
||||
start = time.time()
|
||||
start_time_utc = datetime.utcfromtimestamp(start).strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
@ -193,7 +193,7 @@ class DiscourseConnector(PollConnector):
|
||||
start=start_datetime, end=end_datetime
|
||||
)
|
||||
|
||||
return self._yield_discourse_documents(latest_topic_ids)
|
||||
yield from self._yield_discourse_documents(latest_topic_ids)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -207,7 +207,7 @@ class ProductboardConnector(PollConnector):
|
||||
):
|
||||
return True
|
||||
else:
|
||||
logger.error(f"Unable to find updated_at for document '{document.id}'")
|
||||
logger.debug(f"Unable to find updated_at for document '{document.id}'")
|
||||
|
||||
return False
|
||||
|
||||
|
@ -4,7 +4,6 @@ from fastapi import HTTPException
|
||||
from sqlalchemy import delete
|
||||
from sqlalchemy import desc
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy import update
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from danswer.db.connector import fetch_connector_by_id
|
||||
@ -96,7 +95,6 @@ def update_connector_credential_pair(
|
||||
db_session: Session,
|
||||
connector_id: int,
|
||||
credential_id: int,
|
||||
attempt_status: IndexingStatus,
|
||||
net_docs: int | None = None,
|
||||
run_dt: datetime | None = None,
|
||||
) -> None:
|
||||
@ -107,13 +105,9 @@ def update_connector_credential_pair(
|
||||
f"and credential id {credential_id}"
|
||||
)
|
||||
return
|
||||
cc_pair.last_attempt_status = attempt_status
|
||||
# simply don't update last_successful_index_time if run_dt is not specified
|
||||
# at worst, this would result in re-indexing documents that were already indexed
|
||||
if (
|
||||
attempt_status == IndexingStatus.SUCCESS
|
||||
or attempt_status == IndexingStatus.IN_PROGRESS
|
||||
) and run_dt is not None:
|
||||
if run_dt is not None:
|
||||
cc_pair.last_successful_index_time = run_dt
|
||||
if net_docs is not None:
|
||||
cc_pair.total_docs_indexed += net_docs
|
||||
@ -132,20 +126,6 @@ def delete_connector_credential_pair__no_commit(
|
||||
db_session.execute(stmt)
|
||||
|
||||
|
||||
def mark_all_in_progress_cc_pairs_failed(
    db_session: Session,
) -> None:
    """Mark every cc-pair whose last attempt status is IN_PROGRESS as FAILED.

    Issues a single bulk UPDATE through ``db_session`` and then commits it.
    """
    is_in_progress = (
        ConnectorCredentialPair.last_attempt_status == IndexingStatus.IN_PROGRESS
    )
    fail_in_progress = (
        update(ConnectorCredentialPair)
        .where(is_in_progress)
        .values(last_attempt_status=IndexingStatus.FAILED)
    )
    db_session.execute(fail_in_progress)
    db_session.commit()
|
||||
|
||||
|
||||
def associate_default_cc_pair(db_session: Session) -> None:
|
||||
existing_association = (
|
||||
db_session.query(ConnectorCredentialPair)
|
||||
@ -297,12 +277,4 @@ def resync_cc_pair(
|
||||
last_success.time_started if last_success else None
|
||||
)
|
||||
|
||||
last_run = find_latest_index_attempt(
|
||||
connector_id=cc_pair.connector_id,
|
||||
credential_id=cc_pair.credential_id,
|
||||
only_include_success=False,
|
||||
db_session=db_session,
|
||||
)
|
||||
cc_pair.last_attempt_status = last_run.status if last_run else None
|
||||
|
||||
db_session.commit()
|
||||
|
@ -1,9 +1,14 @@
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from danswer.db.embedding_model import get_current_db_embedding_model
|
||||
from danswer.db.index_attempt import get_last_attempt
|
||||
from danswer.db.models import ConnectorCredentialPair
|
||||
from danswer.db.models import IndexingStatus
|
||||
|
||||
|
||||
def check_deletion_attempt_is_allowed(
|
||||
connector_credential_pair: ConnectorCredentialPair,
|
||||
db_session: Session,
|
||||
allow_scheduled: bool = False,
|
||||
) -> str | None:
|
||||
"""
|
||||
@ -21,9 +26,22 @@ def check_deletion_attempt_is_allowed(
|
||||
if not connector_credential_pair.connector.disabled:
|
||||
return base_error_msg + " Connector must be paused."
|
||||
|
||||
if connector_credential_pair.last_attempt_status == IndexingStatus.IN_PROGRESS or (
|
||||
connector_credential_pair.last_attempt_status == IndexingStatus.NOT_STARTED
|
||||
and not allow_scheduled
|
||||
connector_id = connector_credential_pair.connector_id
|
||||
credential_id = connector_credential_pair.credential_id
|
||||
current_embedding_model = get_current_db_embedding_model(db_session)
|
||||
|
||||
last_indexing = get_last_attempt(
|
||||
connector_id=connector_id,
|
||||
credential_id=credential_id,
|
||||
embedding_model_id=current_embedding_model.id,
|
||||
db_session=db_session,
|
||||
)
|
||||
|
||||
if not last_indexing:
|
||||
return None
|
||||
|
||||
if last_indexing.status == IndexingStatus.IN_PROGRESS or (
|
||||
last_indexing.status == IndexingStatus.NOT_STARTED and not allow_scheduled
|
||||
):
|
||||
return (
|
||||
base_error_msg
|
||||
|
@ -270,9 +270,6 @@ class ConnectorCredentialPair(Base):
|
||||
last_successful_index_time: Mapped[datetime.datetime | None] = mapped_column(
|
||||
DateTime(timezone=True), default=None
|
||||
)
|
||||
last_attempt_status: Mapped[IndexingStatus | None] = mapped_column(
|
||||
Enum(IndexingStatus, native_enum=False)
|
||||
)
|
||||
total_docs_indexed: Mapped[int] = mapped_column(Integer, default=0)
|
||||
|
||||
connector: Mapped["Connector"] = relationship(
|
||||
|
@ -418,7 +418,9 @@ def get_connector_indexing_status(
|
||||
credential=CredentialSnapshot.from_credential_db_model(credential),
|
||||
public_doc=cc_pair.is_public,
|
||||
owner=credential.user.email if credential.user else "",
|
||||
last_status=cc_pair.last_attempt_status,
|
||||
last_status=latest_index_attempt.status
|
||||
if latest_index_attempt
|
||||
else None,
|
||||
last_success=cc_pair.last_successful_index_time,
|
||||
docs_indexed=cc_pair_to_document_cnt.get(
|
||||
(connector.id, credential.id), 0
|
||||
@ -438,6 +440,7 @@ def get_connector_indexing_status(
|
||||
),
|
||||
is_deletable=check_deletion_attempt_is_allowed(
|
||||
connector_credential_pair=cc_pair,
|
||||
db_session=db_session,
|
||||
# allow scheduled indexing attempts here, since on deletion request we will cancel them
|
||||
allow_scheduled=True,
|
||||
)
|
||||
|
@ -175,7 +175,9 @@ def create_deletion_attempt_for_connector_id(
|
||||
)
|
||||
|
||||
# Check if the deletion attempt should be allowed
|
||||
deletion_attempt_disallowed_reason = check_deletion_attempt_is_allowed(cc_pair)
|
||||
deletion_attempt_disallowed_reason = check_deletion_attempt_is_allowed(
|
||||
connector_credential_pair=cc_pair, db_session=db_session
|
||||
)
|
||||
if deletion_attempt_disallowed_reason:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
|
Loading…
x
Reference in New Issue
Block a user