Cleanup for Mintlify Websites (#453)

2025-06-19 04:20:57 +02:00 · 2023-09-16 23:43:24 -07:00 · 2023-09-16 23:43:24 -07:00 · d7b7714d86
commit d7b7714d86
parent 6b305c56b3
2 changed files with 14 additions and 4 deletions
--- a/backend/danswer/configs/app_configs.py
+++ b/backend/danswer/configs/app_configs.py
@ -124,10 +124,10 @@ FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
 )
 # TODO: these should be configurable from the frontend, via an expandable "advanced options" section
 WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get(
-    "WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,header,footer"
+    "WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,footer"
 ).split(",")
 WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get(
-    "WEB_CONNECTOR_IGNORED_ELEMENTS", "nav,header,footer,meta,script,style,symbol,aside"
+    "WEB_CONNECTOR_IGNORED_ELEMENTS", "nav,footer,meta,script,style,symbol,aside"
 ).split(",")
 WEB_CONNECTOR_OAUTH_CLIENT_ID = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_ID")
 WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_SECRET")
--- a/backend/danswer/connectors/web/connector.py
+++ b/backend/danswer/connectors/web/connector.py
@ -1,4 +1,5 @@
 import io
 from copy import copy
 from datetime import datetime
 from typing import Any
 from typing import cast
@ -32,6 +33,9 @@ from danswer.utils.text_processing import format_document_soup
 logger = setup_logger()
 MINTLIFY_UNWANTED = ["sticky", "hidden"]
 def is_valid_url(url: str) -> bool:
    try:
        result = urlparse(url)
@ -90,11 +94,13 @@ class WebConnector(LoadConnector):
    def __init__(
        self,
        base_url: str,
        mintlify_cleanup: bool = True,  # Mostly ok to apply to other websites as well
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        """Store the settings for this web connector.

        Args:
            base_url: URL to start from. When the value contains no ``://``,
                ``https://`` is prepended before it is stored.
            mintlify_cleanup: When true, the CSS classes listed in
                ``MINTLIFY_UNWANTED`` are added to the ignored classes
                during page cleaning.
            batch_size: Stored on the instance as ``self.batch_size``.
        """
        # Treat a value without any scheme separator as a bare host/path and
        # default it to HTTPS.
        if "://" not in base_url:
            base_url = "https://" + base_url
        self.base_url = base_url
        self.mintlify_cleanup = mintlify_cleanup
        self.batch_size = batch_size
    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
@ -171,7 +177,10 @@ class WebConnector(LoadConnector):
                    title_tag.extract()
                # Heuristics based cleaning of elements based on css classes
-                for undesired_element in WEB_CONNECTOR_IGNORED_CLASSES:
+                unwanted_classes = copy(WEB_CONNECTOR_IGNORED_CLASSES)
                if self.mintlify_cleanup:
                    unwanted_classes.extend(MINTLIFY_UNWANTED)
                for undesired_element in unwanted_classes:
                    [
                        tag.extract()
                        for tag in soup.find_all(
@ -182,7 +191,8 @@ class WebConnector(LoadConnector):
                for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
                    [tag.extract() for tag in soup.find_all(undesired_tag)]
-                page_text = format_document_soup(soup)
+                # U+200B is a zero-width space; it carries no visible content, so strip it
                page_text = format_document_soup(soup).replace("\u200B", "")
                doc_batch.append(
                    Document(