mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-03-26 17:51:54 +01:00
Address bug with automatic document set cleanup on connector deletion
This commit is contained in:
parent
3e05c4fa67
commit
876c6fdaa6
@ -10,6 +10,7 @@ are multiple connector / credential pairs that have indexed it
|
||||
connector / credential pair from the access list
|
||||
(6) delete all relevant entries from postgres
|
||||
"""
|
||||
import time
|
||||
from collections.abc import Callable
|
||||
from typing import cast
|
||||
|
||||
@ -23,9 +24,6 @@ from danswer.db.connector import fetch_connector_by_id
|
||||
from danswer.db.connector_credential_pair import (
|
||||
delete_connector_credential_pair__no_commit,
|
||||
)
|
||||
from danswer.db.connector_credential_pair import (
|
||||
delete_document_set_relationships_for_cc_pair__no_commit,
|
||||
)
|
||||
from danswer.db.connector_credential_pair import get_connector_credential_pair
|
||||
from danswer.db.deletion_attempt import check_deletion_attempt_is_allowed
|
||||
from danswer.db.document import delete_document_by_connector_credential_pair
|
||||
@ -33,6 +31,10 @@ from danswer.db.document import delete_documents_complete
|
||||
from danswer.db.document import get_document_connector_cnts
|
||||
from danswer.db.document import get_documents_for_connector_credential_pair
|
||||
from danswer.db.document import prepare_to_modify_documents
|
||||
from danswer.db.document_set import get_document_sets_by_ids
|
||||
from danswer.db.document_set import (
|
||||
mark_cc_pair__document_set_relationships_to_be_deleted__no_commit,
|
||||
)
|
||||
from danswer.db.engine import get_sqlalchemy_engine
|
||||
from danswer.db.index_attempt import delete_index_attempts
|
||||
from danswer.db.models import ConnectorCredentialPair
|
||||
@ -103,38 +105,46 @@ def _delete_connector_credential_pair_batch(
|
||||
db_session.commit()
|
||||
|
||||
|
||||
def postgres_cc_pair_cleanup__no_commit(
|
||||
def cleanup_synced_entities(
|
||||
cc_pair: ConnectorCredentialPair, db_session: Session
|
||||
) -> None:
|
||||
"""Cleans up all rows in Postgres related to the specified
|
||||
connector_credential_pair + deletes the connector itself if there are
|
||||
no other credentials left for the connector
|
||||
"""
|
||||
connector_id = cc_pair.connector_id
|
||||
credential_id = cc_pair.credential_id
|
||||
"""Updates the document sets associated with the connector / credential pair,
|
||||
then relies on the document set sync script to kick off Celery jobs which will
|
||||
sync these updates to Vespa.
|
||||
|
||||
delete_index_attempts(
|
||||
db_session=db_session,
|
||||
connector_id=connector_id,
|
||||
credential_id=credential_id,
|
||||
Waits until the document sets are synced before returning."""
|
||||
logger.info(f"Cleaning up Document Sets for CC Pair with ID: '{cc_pair.id}'")
|
||||
document_sets_ids_to_sync = list(
|
||||
mark_cc_pair__document_set_relationships_to_be_deleted__no_commit(
|
||||
cc_pair_id=cc_pair.id,
|
||||
db_session=db_session,
|
||||
)
|
||||
)
|
||||
delete_document_set_relationships_for_cc_pair__no_commit(
|
||||
cc_pair_id=cc_pair.id,
|
||||
db_session=db_session,
|
||||
db_session.commit()
|
||||
|
||||
# wait till all document sets are synced before continuing
|
||||
while True:
|
||||
all_synced = True
|
||||
document_sets = get_document_sets_by_ids(
|
||||
db_session=db_session, document_set_ids=document_sets_ids_to_sync
|
||||
)
|
||||
for document_set in document_sets:
|
||||
if not document_set.is_up_to_date:
|
||||
all_synced = False
|
||||
|
||||
if all_synced:
|
||||
break
|
||||
|
||||
# wait for 30 seconds before checking again
|
||||
db_session.commit() # end transaction
|
||||
logger.info(
|
||||
f"Document sets '{document_sets_ids_to_sync}' not synced yet, waiting 30s"
|
||||
)
|
||||
time.sleep(30)
|
||||
|
||||
logger.info(
|
||||
f"Finished cleaning up Document Sets for CC Pair with ID: '{cc_pair.id}'"
|
||||
)
|
||||
delete_connector_credential_pair__no_commit(
|
||||
db_session=db_session,
|
||||
connector_id=connector_id,
|
||||
credential_id=credential_id,
|
||||
)
|
||||
# if there are no credentials left, delete the connector
|
||||
connector = fetch_connector_by_id(
|
||||
db_session=db_session,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
if not connector or not len(connector.credentials):
|
||||
logger.debug("Found no credentials left for connector, deleting connector")
|
||||
db_session.delete(connector)
|
||||
|
||||
|
||||
def _delete_connector_credential_pair(
|
||||
@ -164,15 +174,36 @@ def _delete_connector_credential_pair(
|
||||
)
|
||||
num_docs_deleted += len(documents)
|
||||
|
||||
# cleanup everything else up
|
||||
postgres_cleanup__no_commit = cast(
|
||||
# Clean up document sets / access information from Postgres
|
||||
# and sync these updates to Vespa
|
||||
cleanup_synced_entities__versioned = cast(
|
||||
Callable[[ConnectorCredentialPair, Session], None],
|
||||
fetch_versioned_implementation(
|
||||
"danswer.background.connector_deletion",
|
||||
"postgres_cc_pair_cleanup__no_commit",
|
||||
"cleanup_synced_entities",
|
||||
),
|
||||
)
|
||||
postgres_cleanup__no_commit(cc_pair, db_session)
|
||||
cleanup_synced_entities__versioned(cc_pair, db_session)
|
||||
|
||||
# clean up the rest of the related Postgres entities
|
||||
delete_index_attempts(
|
||||
db_session=db_session,
|
||||
connector_id=connector_id,
|
||||
credential_id=credential_id,
|
||||
)
|
||||
delete_connector_credential_pair__no_commit(
|
||||
db_session=db_session,
|
||||
connector_id=connector_id,
|
||||
credential_id=credential_id,
|
||||
)
|
||||
# if there are no credentials left, delete the connector
|
||||
connector = fetch_connector_by_id(
|
||||
db_session=db_session,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
if not connector or not len(connector.credentials):
|
||||
logger.debug("Found no credentials left for connector, deleting connector")
|
||||
db_session.delete(connector)
|
||||
db_session.commit()
|
||||
|
||||
logger.info(
|
||||
|
@ -29,7 +29,9 @@ def _document_sync_loop() -> None:
|
||||
# kick off new tasks
|
||||
with Session(get_sqlalchemy_engine()) as db_session:
|
||||
# check if any document sets are not synced
|
||||
document_set_info = fetch_document_sets(db_session=db_session)
|
||||
document_set_info = fetch_document_sets(
|
||||
db_session=db_session, include_outdated=True
|
||||
)
|
||||
for document_set, _ in document_set_info:
|
||||
if not document_set.is_up_to_date:
|
||||
if document_set.id in _ExistingTaskCache:
|
||||
|
@ -9,7 +9,6 @@ from sqlalchemy.orm import Session
|
||||
from danswer.db.connector import fetch_connector_by_id
|
||||
from danswer.db.credentials import fetch_credential_by_id
|
||||
from danswer.db.models import ConnectorCredentialPair
|
||||
from danswer.db.models import DocumentSet__ConnectorCredentialPair
|
||||
from danswer.db.models import IndexingStatus
|
||||
from danswer.db.models import User
|
||||
from danswer.server.models import StatusResponse
|
||||
@ -82,16 +81,6 @@ def update_connector_credential_pair(
|
||||
db_session.commit()
|
||||
|
||||
|
||||
def delete_document_set_relationships_for_cc_pair__no_commit(
    cc_pair_id: int, db_session: Session
) -> None:
    """Issues a DELETE for every Document Set <-> CC Pair relationship row
    whose connector / credential pair is `cc_pair_id`.

    NOTE: does not commit transaction, this must be done by the caller"""
    belongs_to_cc_pair = (
        DocumentSet__ConnectorCredentialPair.connector_credential_pair_id == cc_pair_id
    )
    db_session.execute(
        delete(DocumentSet__ConnectorCredentialPair).where(belongs_to_cc_pair)
    )
|
||||
|
||||
|
||||
def delete_connector_credential_pair__no_commit(
|
||||
db_session: Session,
|
||||
connector_id: int,
|
||||
|
@ -49,6 +49,14 @@ def get_document_set_by_id(
|
||||
)
|
||||
|
||||
|
||||
def get_document_sets_by_ids(
    db_session: Session, document_set_ids: list[int]
) -> Sequence[DocumentSetDBModel]:
    """Fetches the document sets whose ID is contained in `document_set_ids`."""
    id_is_requested = DocumentSetDBModel.id.in_(document_set_ids)
    query = select(DocumentSetDBModel).where(id_is_requested)
    matching_document_sets = db_session.scalars(query).all()
    return matching_document_sets
|
||||
|
||||
|
||||
def insert_document_set(
|
||||
document_set_creation_request: DocumentSetCreationRequest,
|
||||
user_id: UUID | None,
|
||||
@ -196,36 +204,72 @@ def mark_document_set_as_to_be_deleted(
|
||||
raise
|
||||
|
||||
|
||||
def mark_cc_pair__document_set_relationships_to_be_deleted__no_commit(
    cc_pair_id: int, db_session: Session
) -> set[int]:
    """Marks all CC Pair -> Document Set relationships for the specified
    `cc_pair_id` as not current and returns the set of all document set IDs
    affected. Every affected document set is also flagged as not up to date.

    NOTE: raises a `ValueError` if any of the document sets are currently syncing
    to avoid getting into a bad state. This is checked before anything is
    modified, so nothing has been changed on the session when the error is raised.

    NOTE: does not commit transaction, this must be done by the caller"""
    relationships = db_session.scalars(
        select(DocumentSet__ConnectorCredentialPair).where(
            DocumentSet__ConnectorCredentialPair.connector_credential_pair_id
            == cc_pair_id
        )
    ).all()

    # Validate everything up front. Interleaving the check with the updates
    # would (a) leave earlier relationships modified if a later one fails and
    # (b) make a document set reached through more than one relationship row
    # look like it is "syncing" purely because of our own update below.
    if any(
        not relationship.document_set.is_up_to_date for relationship in relationships
    ):
        raise ValueError(
            "Cannot delete CC pair while it is attached to a document set "
            "that is syncing. Please wait for the document set to finish "
            "syncing, and then try again."
        )

    document_set_ids_touched: set[int] = set()
    for relationship in relationships:
        relationship.is_current = False
        relationship.document_set.is_up_to_date = False
        document_set_ids_touched.add(relationship.document_set_id)

    return document_set_ids_touched
|
||||
|
||||
|
||||
def fetch_document_sets(
|
||||
db_session: Session,
|
||||
db_session: Session, include_outdated: bool = False
|
||||
) -> list[tuple[DocumentSetDBModel, list[ConnectorCredentialPair]]]:
|
||||
"""Return is a list where each element contains a tuple of:
|
||||
1. The document set itself
|
||||
2. All CC pairs associated with the document set"""
|
||||
stmt = (
|
||||
select(DocumentSetDBModel, ConnectorCredentialPair)
|
||||
.join(
|
||||
DocumentSet__ConnectorCredentialPair,
|
||||
DocumentSetDBModel.id
|
||||
== DocumentSet__ConnectorCredentialPair.document_set_id,
|
||||
isouter=True, # outer join is needed to also fetch document sets with no cc pairs
|
||||
)
|
||||
.join(
|
||||
ConnectorCredentialPair,
|
||||
ConnectorCredentialPair.id
|
||||
== DocumentSet__ConnectorCredentialPair.connector_credential_pair_id,
|
||||
isouter=True, # outer join is needed to also fetch document sets with no cc pairs
|
||||
)
|
||||
)
|
||||
if not include_outdated:
|
||||
stmt = stmt.where(
|
||||
or_(
|
||||
DocumentSet__ConnectorCredentialPair.is_current == True, # noqa: E712
|
||||
# `None` handles case where no CC Pairs exist for a Document Set
|
||||
DocumentSet__ConnectorCredentialPair.is_current.is_(None),
|
||||
)
|
||||
)
|
||||
|
||||
results = cast(
|
||||
list[tuple[DocumentSetDBModel, ConnectorCredentialPair | None]],
|
||||
db_session.execute(
|
||||
select(DocumentSetDBModel, ConnectorCredentialPair)
|
||||
.join(
|
||||
DocumentSet__ConnectorCredentialPair,
|
||||
DocumentSetDBModel.id
|
||||
== DocumentSet__ConnectorCredentialPair.document_set_id,
|
||||
isouter=True, # outer join is needed to also fetch document sets with no cc pairs
|
||||
)
|
||||
.join(
|
||||
ConnectorCredentialPair,
|
||||
ConnectorCredentialPair.id
|
||||
== DocumentSet__ConnectorCredentialPair.connector_credential_pair_id,
|
||||
isouter=True, # outer join is needed to also fetch document sets with no cc pairs
|
||||
)
|
||||
.where(
|
||||
or_(
|
||||
DocumentSet__ConnectorCredentialPair.is_current
|
||||
== True, # noqa: E712
|
||||
DocumentSet__ConnectorCredentialPair.is_current.is_(None),
|
||||
)
|
||||
)
|
||||
).all(),
|
||||
db_session.execute(stmt).all(),
|
||||
)
|
||||
|
||||
aggregated_results: dict[
|
||||
|
@ -119,6 +119,8 @@ class DocumentSet__ConnectorCredentialPair(Base):
|
||||
primary_key=True,
|
||||
)
|
||||
|
||||
document_set: Mapped["DocumentSet"] = relationship("DocumentSet")
|
||||
|
||||
|
||||
class ConnectorCredentialPair(Base):
|
||||
"""Connectors and Credentials can have a many-to-many relationship
|
||||
|
@ -129,7 +129,6 @@ const DocumentSetTable = ({
|
||||
},
|
||||
]}
|
||||
data={documentSets
|
||||
.filter((documentSet) => documentSet.cc_pair_descriptors.length > 0)
|
||||
.slice((page - 1) * numToDisplay, page * numToDisplay)
|
||||
.map((documentSet) => {
|
||||
return {
|
||||
@ -170,10 +169,14 @@ const DocumentSetTable = ({
|
||||
),
|
||||
status: documentSet.is_up_to_date ? (
|
||||
<div className="text-emerald-600">Up to date!</div>
|
||||
) : (
|
||||
) : documentSet.cc_pair_descriptors.length > 0 ? (
|
||||
<div className="text-gray-300 w-10">
|
||||
<LoadingAnimation text="Syncing" />
|
||||
</div>
|
||||
) : (
|
||||
<div className="text-red-500 w-10">
|
||||
<LoadingAnimation text="Deleting" />
|
||||
</div>
|
||||
),
|
||||
delete: (
|
||||
<div
|
||||
@ -182,13 +185,13 @@ const DocumentSetTable = ({
|
||||
const response = await deleteDocumentSet(documentSet.id);
|
||||
if (response.ok) {
|
||||
setPopup({
|
||||
message: `Document set "${documentSet.name}" deleted`,
|
||||
message: `Document set "${documentSet.name}" scheduled for deletion`,
|
||||
type: "success",
|
||||
});
|
||||
} else {
|
||||
const errorMsg = (await response.json()).detail;
|
||||
setPopup({
|
||||
message: `Failed to delete document set - ${errorMsg}`,
|
||||
message: `Failed to schedule document set for deletion - ${errorMsg}`,
|
||||
type: "error",
|
||||
});
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user