From d2774f897994865ca179610add9563ee61f3f34e Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Wed, 24 Apr 2024 16:50:40 -0700 Subject: [PATCH] k --- backend/danswer/configs/app_configs.py | 1 + backend/danswer/connectors/web/connector.py | 6 ++++++ deployment/docker_compose/docker-compose.dev.yml | 1 + deployment/kubernetes/env-configmap.yaml | 1 + 4 files changed, 9 insertions(+) diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py index 3f18435f5..2521a2a27 100644 --- a/backend/danswer/configs/app_configs.py +++ b/backend/danswer/configs/app_configs.py @@ -147,6 +147,7 @@ WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get( WEB_CONNECTOR_OAUTH_CLIENT_ID = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_ID") WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_SECRET") WEB_CONNECTOR_OAUTH_TOKEN_URL = os.environ.get("WEB_CONNECTOR_OAUTH_TOKEN_URL") +WEB_CONNECTOR_VALIDATE_URLS = os.environ.get("WEB_CONNECTOR_VALIDATE_URLS") NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP = ( os.environ.get("NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP", "").lower() diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py index c5fb6db26..355b8392d 100644 --- a/backend/danswer/connectors/web/connector.py +++ b/backend/danswer/connectors/web/connector.py @@ -20,6 +20,7 @@ from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_ID from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL +from danswer.configs.app_configs import WEB_CONNECTOR_VALIDATE_URLS from danswer.configs.constants import DocumentSource from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup @@ -49,7 +50,11 @@ def protected_url_check(url: str) -> None: - Fetching this is assumed to be relatively fast compared to other bottlenecks like reading the page or embedding the contents - To be extra safe, all IPs associated with the URL must be global + - This is to prevent misuse and not explicit attacks """ + if not WEB_CONNECTOR_VALIDATE_URLS: + return + parse = urlparse(url) if parse.scheme != "http" and parse.scheme != "https": raise ValueError("URL must be of scheme https?://") @@ -262,6 +267,7 @@ class WebConnector(LoadConnector): final_page = page.url if final_page != current_url: logger.info(f"Redirected to {final_page}") + protected_url_check(final_page) current_url = final_page if current_url in visited_links: logger.info("Redirected page already indexed") diff --git a/deployment/docker_compose/docker-compose.dev.yml b/deployment/docker_compose/docker-compose.dev.yml index f3a569340..83ace4795 100644 --- a/deployment/docker_compose/docker-compose.dev.yml +++ b/deployment/docker_compose/docker-compose.dev.yml @@ -147,6 +147,7 @@ services: - EXPERIMENTAL_CHECKPOINTING_ENABLED=${EXPERIMENTAL_CHECKPOINTING_ENABLED:-} - CONFLUENCE_CONNECTOR_LABELS_TO_SKIP=${CONFLUENCE_CONNECTOR_LABELS_TO_SKIP:-} - JIRA_CONNECTOR_LABELS_TO_SKIP=${JIRA_CONNECTOR_LABELS_TO_SKIP:-} + - WEB_CONNECTOR_VALIDATE_URLS=${WEB_CONNECTOR_VALIDATE_URLS:-} - JIRA_API_VERSION=${JIRA_API_VERSION:-} - GONG_CONNECTOR_START_TIME=${GONG_CONNECTOR_START_TIME:-} - NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP=${NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP:-} diff --git a/deployment/kubernetes/env-configmap.yaml b/deployment/kubernetes/env-configmap.yaml index 88ed9e096..e88e996a8 100644 --- a/deployment/kubernetes/env-configmap.yaml +++ b/deployment/kubernetes/env-configmap.yaml @@ -55,6 +55,7 @@ data: EXPERIMENTAL_CHECKPOINTING_ENABLED: "" CONFLUENCE_CONNECTOR_LABELS_TO_SKIP: "" JIRA_API_VERSION: "" + WEB_CONNECTOR_VALIDATE_URLS: "" GONG_CONNECTOR_START_TIME: "" NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP: "" # DanswerBot SlackBot Configs