diff --git a/backend/danswer/background/indexing/run_indexing.py b/backend/danswer/background/indexing/run_indexing.py index 1dcd8494f..6241af6f5 100644 --- a/backend/danswer/background/indexing/run_indexing.py +++ b/backend/danswer/background/indexing/run_indexing.py @@ -10,6 +10,7 @@ from danswer.background.connector_deletion import ( _delete_connector_credential_pair_batch, ) from danswer.background.indexing.checkpointing import get_time_windows_for_index_attempt +from danswer.configs.app_configs import DISABLE_DOCUMENT_CLEANUP from danswer.configs.app_configs import POLL_CONNECTOR_OFFSET from danswer.connectors.factory import instantiate_connector from danswer.connectors.interfaces import GenerateDocumentsOutput @@ -232,7 +233,7 @@ def _run_indexing( docs_removed_from_index=0, ) - if is_listing_complete: + if is_listing_complete and not DISABLE_DOCUMENT_CLEANUP: # clean up all documents from the index that have not been returned from the connector all_indexed_document_ids = { d.id diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py index 7a9ef380f..3df3145c2 100644 --- a/backend/danswer/configs/app_configs.py +++ b/backend/danswer/configs/app_configs.py @@ -196,6 +196,10 @@ ENABLE_MINI_CHUNK = os.environ.get("ENABLE_MINI_CHUNK", "").lower() == "true" MINI_CHUNK_SIZE = 150 # Timeout to wait for job's last update before killing it, in hours CLEANUP_INDEXING_JOBS_TIMEOUT = int(os.environ.get("CLEANUP_INDEXING_JOBS_TIMEOUT", 3)) +# If set to true, documents that a Load connector no longer returns (i.e. that "no longer exist" at the source) are kept in the index instead of being cleaned up +DISABLE_DOCUMENT_CLEANUP = ( + os.environ.get("DISABLE_DOCUMENT_CLEANUP", "").lower() == "true" +) ##### diff --git a/deployment/docker_compose/docker-compose.dev.yml b/deployment/docker_compose/docker-compose.dev.yml index d8671ca8c..e33073152 100644 --- a/deployment/docker_compose/docker-compose.dev.yml +++ b/deployment/docker_compose/docker-compose.dev.yml @@ -140,6 +140,7 @@ services: - 
GONG_CONNECTOR_START_TIME=${GONG_CONNECTOR_START_TIME:-} - NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP=${NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP:-} - GITHUB_CONNECTOR_BASE_URL=${GITHUB_CONNECTOR_BASE_URL:-} + - DISABLE_DOCUMENT_CLEANUP=${DISABLE_DOCUMENT_CLEANUP:-} # Danswer SlackBot Configs - DANSWER_BOT_SLACK_APP_TOKEN=${DANSWER_BOT_SLACK_APP_TOKEN:-} - DANSWER_BOT_SLACK_BOT_TOKEN=${DANSWER_BOT_SLACK_BOT_TOKEN:-}