This commit is contained in:
Yuhong Sun 2024-04-24 16:50:40 -07:00
parent 0b1695f616
commit d2774f8979
4 changed files with 9 additions and 0 deletions

View File

@ -147,6 +147,7 @@ WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get(
WEB_CONNECTOR_OAUTH_CLIENT_ID = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_ID")
WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_SECRET")
WEB_CONNECTOR_OAUTH_TOKEN_URL = os.environ.get("WEB_CONNECTOR_OAUTH_TOKEN_URL")
# Opt-in flag for the web connector's URL validation. It is consumed as a
# boolean, so parse it into a real bool here: a raw env string such as "false"
# or "0" is non-empty and would otherwise count as truthy. Unset or empty means
# disabled; any other non-empty value still enables it.
WEB_CONNECTOR_VALIDATE_URLS = os.environ.get(
    "WEB_CONNECTOR_VALIDATE_URLS", ""
).strip().lower() not in ("", "false", "0", "no", "off")
NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP = (
os.environ.get("NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP", "").lower()

View File

@ -20,6 +20,7 @@ from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_ID
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
from danswer.configs.app_configs import WEB_CONNECTOR_VALIDATE_URLS
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup
@ -49,7 +50,11 @@ def protected_url_check(url: str) -> None:
- Fetching this is assumed to be relatively fast compared to other bottlenecks like reading
the page or embedding the contents
- To be extra safe, all IPs associated with the URL must be global
- This is to prevent misuse and not explicit attacks
"""
if not WEB_CONNECTOR_VALIDATE_URLS:
return
parse = urlparse(url)
if parse.scheme != "http" and parse.scheme != "https":
raise ValueError("URL must be of scheme https?://")
@ -262,6 +267,7 @@ class WebConnector(LoadConnector):
final_page = page.url
if final_page != current_url:
logger.info(f"Redirected to {final_page}")
protected_url_check(final_page)
current_url = final_page
if current_url in visited_links:
logger.info("Redirected page already indexed")

View File

@ -147,6 +147,7 @@ services:
- EXPERIMENTAL_CHECKPOINTING_ENABLED=${EXPERIMENTAL_CHECKPOINTING_ENABLED:-}
- CONFLUENCE_CONNECTOR_LABELS_TO_SKIP=${CONFLUENCE_CONNECTOR_LABELS_TO_SKIP:-}
- JIRA_CONNECTOR_LABELS_TO_SKIP=${JIRA_CONNECTOR_LABELS_TO_SKIP:-}
- WEB_CONNECTOR_VALIDATE_URLS=${WEB_CONNECTOR_VALIDATE_URLS:-}
- JIRA_API_VERSION=${JIRA_API_VERSION:-}
- GONG_CONNECTOR_START_TIME=${GONG_CONNECTOR_START_TIME:-}
- NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP=${NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP:-}

View File

@ -55,6 +55,7 @@ data:
EXPERIMENTAL_CHECKPOINTING_ENABLED: ""
CONFLUENCE_CONNECTOR_LABELS_TO_SKIP: ""
JIRA_API_VERSION: ""
WEB_CONNECTOR_VALIDATE_URLS: ""
GONG_CONNECTOR_START_TIME: ""
NOTION_CONNECTOR_ENABLE_RECURSIVE_PAGE_LOOKUP: ""
# DanswerBot SlackBot Configs