From d7b7714d865b17aaca913e2c29cb32c3ddf8492e Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Sat, 16 Sep 2023 23:43:24 -0700 Subject: [PATCH] Cleanup for Mintlify Websites (#453) --- backend/danswer/configs/app_configs.py | 4 ++-- backend/danswer/connectors/web/connector.py | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py index 18a8df088..2f0103041 100644 --- a/backend/danswer/configs/app_configs.py +++ b/backend/danswer/configs/app_configs.py @@ -124,10 +124,10 @@ FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get( ) # TODO these should be available for frontend configuration, via advanced options expandable WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get( - "WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,header,footer" + "WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,footer" ).split(",") WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get( - "WEB_CONNECTOR_IGNORED_ELEMENTS", "nav,header,footer,meta,script,style,symbol,aside" + "WEB_CONNECTOR_IGNORED_ELEMENTS", "nav,footer,meta,script,style,symbol,aside" ).split(",") WEB_CONNECTOR_OAUTH_CLIENT_ID = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_ID") WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_SECRET") diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py index 100e280db..da830b210 100644 --- a/backend/danswer/connectors/web/connector.py +++ b/backend/danswer/connectors/web/connector.py @@ -1,4 +1,5 @@ import io +from copy import copy from datetime import datetime from typing import Any from typing import cast @@ -32,6 +33,9 @@ from danswer.utils.text_processing import format_document_soup logger = setup_logger() +MINTLIFY_UNWANTED = ["sticky", "hidden"] + + def is_valid_url(url: str) -> bool: try: result = urlparse(url) @@ -90,11 +94,13 @@ class WebConnector(LoadConnector): def __init__( self, base_url: str, + mintlify_cleanup: bool = True, # Mostly ok to apply to other websites as well batch_size: int = INDEX_BATCH_SIZE, ) -> None: if "://" not in base_url: base_url = "https://" + base_url self.base_url = base_url + self.mintlify_cleanup = mintlify_cleanup self.batch_size = batch_size def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: @@ -171,7 +177,10 @@ class WebConnector(LoadConnector): title_tag.extract() # Heuristics based cleaning of elements based on css classes - for undesired_element in WEB_CONNECTOR_IGNORED_CLASSES: + unwanted_classes = copy(WEB_CONNECTOR_IGNORED_CLASSES) + if self.mintlify_cleanup: + unwanted_classes.extend(MINTLIFY_UNWANTED) + for undesired_element in unwanted_classes: [ tag.extract() for tag in soup.find_all( @@ -182,7 +191,8 @@ class WebConnector(LoadConnector): for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS: [tag.extract() for tag in soup.find_all(undesired_tag)] - page_text = format_document_soup(soup) + # 200B is ZeroWidthSpace which we don't care for + page_text = format_document_soup(soup).replace("\u200B", "") doc_batch.append( Document(