diff --git a/backend/danswer/background/indexing/run_indexing.py b/backend/danswer/background/indexing/run_indexing.py
index b7359f19ce4c..1182dd08f499 100644
--- a/backend/danswer/background/indexing/run_indexing.py
+++ b/backend/danswer/background/indexing/run_indexing.py
@@ -7,6 +7,7 @@ from datetime import timezone
 from sqlalchemy.orm import Session
 
 from danswer.background.indexing.checkpointing import get_time_windows_for_index_attempt
+from danswer.configs.app_configs import INDEXING_SIZE_WARNING_THRESHOLD
 from danswer.configs.app_configs import POLL_CONNECTOR_OFFSET
 from danswer.connectors.factory import instantiate_connector
 from danswer.connectors.interfaces import GenerateDocumentsOutput
@@ -200,9 +201,22 @@ def _run_indexing(
                 # Likely due to user manually disabling it or model swap
                 raise RuntimeError("Index Attempt was canceled")
 
-            logger.debug(
-                f"Indexing batch of documents: {[doc.to_short_descriptor() for doc in doc_batch]}"
-            )
+            batch_description = []
+            for doc in doc_batch:
+                batch_description.append(doc.to_short_descriptor())
+
+                doc_size = 0
+                for section in doc.sections:
+                    doc_size += len(section.text)
+
+                if doc_size > INDEXING_SIZE_WARNING_THRESHOLD:
+                    logger.warning(
+                        f"Document size: doc='{doc.to_short_descriptor()}' "
+                        f"size={doc_size} "
+                        f"threshold={INDEXING_SIZE_WARNING_THRESHOLD}"
+                    )
+
+            logger.debug(f"Indexing batch of documents: {batch_description}")
 
             new_docs, total_batch_chunks = indexing_pipeline(
                 document_batch=doc_batch,
diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py
index adfb1c712316..50f7fc5ad54c 100644
--- a/backend/danswer/configs/app_configs.py
+++ b/backend/danswer/configs/app_configs.py
@@ -202,6 +202,11 @@
 CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING = (
     os.environ.get("CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING", "").lower() == "true"
 )
+# Attachments exceeding this size will not be retrieved (in bytes)
+CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD = int(
+    os.environ.get("CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD", 50 * 1024 * 1024)
+)
+
 JIRA_CONNECTOR_LABELS_TO_SKIP = [
     ignored_tag
     for ignored_tag in os.environ.get("JIRA_CONNECTOR_LABELS_TO_SKIP", "").split(",")
@@ -277,6 +282,10 @@
 SKIP_METADATA_IN_CHUNK = os.environ.get("SKIP_METADATA_IN_CHUNK", "").lower() == "true"
 # Timeout to wait for job's last update before killing it, in hours
 CLEANUP_INDEXING_JOBS_TIMEOUT = int(os.environ.get("CLEANUP_INDEXING_JOBS_TIMEOUT", 3))
+# The indexer will warn in the logs whenever a document exceeds this threshold (in bytes)
+INDEXING_SIZE_WARNING_THRESHOLD = int(
+    os.environ.get("INDEXING_SIZE_WARNING_THRESHOLD", 100 * 1024 * 1024)
+)
 
 #####
 # Miscellaneous
diff --git a/backend/danswer/connectors/confluence/connector.py b/backend/danswer/connectors/confluence/connector.py
index 30a9032c9767..3e366ead1c5c 100644
--- a/backend/danswer/connectors/confluence/connector.py
+++ b/backend/danswer/connectors/confluence/connector.py
@@ -13,6 +13,7 @@
 import bs4
 from atlassian import Confluence  # type:ignore
 from requests import HTTPError
+from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD
 from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES
 from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
 from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING
@@ -560,6 +561,17 @@ class ConfluenceConnector(LoadConnector, PollConnector):
             if attachment["title"] not in files_in_used:
                 continue
 
+            download_link = confluence_client.url + attachment["_links"]["download"]
+
+            attachment_size = attachment["extensions"]["fileSize"]
+            if attachment_size > CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD:
+                logger.warning(
+                    f"Skipping {download_link} due to size. "
+                    f"size={attachment_size} "
+                    f"threshold={CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD}"
+                )
+                continue
+
             download_link = confluence_client.url + attachment["_links"]["download"]
             response = confluence_client._session.get(download_link)
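
Note on the indexing hunk (commentary and sketch, not part of the patch): the size check in _run_indexing only warns and never skips, so oversized documents are still indexed. Both knobs are environment-driven, with INDEXING_SIZE_WARNING_THRESHOLD defaulting to 100 MiB and CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD to 50 MiB. Below is a minimal, self-contained sketch of the per-document check; Document, Section, and describe_batch are simplified stand-ins for illustration, not danswer's real classes.

from dataclasses import dataclass

# Default mirrors the new app_configs value (100 MiB).
INDEXING_SIZE_WARNING_THRESHOLD = 100 * 1024 * 1024


@dataclass
class Section:
    text: str


@dataclass
class Document:
    id: str
    sections: list[Section]

    def to_short_descriptor(self) -> str:
        # Simplified; the real descriptor includes more identifying fields.
        return f"ID: '{self.id}'"


def describe_batch(doc_batch: list[Document]) -> list[str]:
    """Collect short descriptors, warning on any oversized document."""
    batch_description = []
    for doc in doc_batch:
        batch_description.append(doc.to_short_descriptor())
        # Total character count across all sections, mirroring the
        # sum of len(section.text) in the _run_indexing hunk.
        doc_size = sum(len(section.text) for section in doc.sections)
        if doc_size > INDEXING_SIZE_WARNING_THRESHOLD:
            print(
                f"Document size: doc='{doc.to_short_descriptor()}' "
                f"size={doc_size} threshold={INDEXING_SIZE_WARNING_THRESHOLD}"
            )
    return batch_description

One caveat: len(section.text) counts characters, not bytes, so the comparison against a byte-denominated threshold is only approximate for non-ASCII content; since this path merely logs, the imprecision is harmless.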
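
The Confluence hunk, by contrast, hard-skips: any attachment whose reported extensions.fileSize exceeds the threshold is dropped before a download request is issued. A standalone sketch of that gate follows, assuming the metadata dict shape the diff relies on; the sample attachment values are hypothetical.

# Default mirrors the new app_configs value (50 MiB).
CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD = 50 * 1024 * 1024


def should_skip_attachment(attachment: dict, base_url: str) -> bool:
    """Return True if the attachment is too large to download."""
    download_link = base_url + attachment["_links"]["download"]
    attachment_size = attachment["extensions"]["fileSize"]
    if attachment_size > CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD:
        # Mirrors the connector's logger.warning; no HTTP request is made.
        print(
            f"Skipping {download_link} due to size. "
            f"size={attachment_size} "
            f"threshold={CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD}"
        )
        return True
    return False


# A 60 MiB attachment exceeds the 50 MiB default and is skipped.
attachment = {
    "title": "big-export.pdf",
    "_links": {"download": "/download/attachments/123/big-export.pdf"},
    "extensions": {"fileSize": 60 * 1024 * 1024},
}
assert should_skip_attachment(attachment, "https://wiki.example.com")

Also worth flagging in review: the added block computes download_link, and the pre-existing assignment just below it is left in place, so the value is computed twice on the non-skip path. Redundant, though harmless.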