Cleanup for Mintlify Websites (#453)

This commit is contained in:
Yuhong Sun 2023-09-16 23:43:24 -07:00 committed by GitHub
parent 6b305c56b3
commit d7b7714d86
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 14 additions and 4 deletions

View File

@ -124,10 +124,10 @@ FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
) )
# TODO these should be available for frontend configuration, via advanced options expandable # TODO these should be available for frontend configuration, via advanced options expandable
WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get( WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get(
"WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,header,footer" "WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,footer"
).split(",") ).split(",")
WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get( WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get(
"WEB_CONNECTOR_IGNORED_ELEMENTS", "nav,header,footer,meta,script,style,symbol,aside" "WEB_CONNECTOR_IGNORED_ELEMENTS", "nav,footer,meta,script,style,symbol,aside"
).split(",") ).split(",")
WEB_CONNECTOR_OAUTH_CLIENT_ID = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_ID") WEB_CONNECTOR_OAUTH_CLIENT_ID = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_ID")
WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_SECRET") WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_SECRET")

View File

@ -1,4 +1,5 @@
import io import io
from copy import copy
from datetime import datetime from datetime import datetime
from typing import Any from typing import Any
from typing import cast from typing import cast
@ -32,6 +33,9 @@ from danswer.utils.text_processing import format_document_soup
logger = setup_logger() logger = setup_logger()
MINTLIFY_UNWANTED = ["sticky", "hidden"]
def is_valid_url(url: str) -> bool: def is_valid_url(url: str) -> bool:
try: try:
result = urlparse(url) result = urlparse(url)
@ -90,11 +94,13 @@ class WebConnector(LoadConnector):
def __init__( def __init__(
self, self,
base_url: str, base_url: str,
mintlify_cleanup: bool = True, # Mostly ok to apply to other websites as well
batch_size: int = INDEX_BATCH_SIZE, batch_size: int = INDEX_BATCH_SIZE,
) -> None: ) -> None:
if "://" not in base_url: if "://" not in base_url:
base_url = "https://" + base_url base_url = "https://" + base_url
self.base_url = base_url self.base_url = base_url
self.mintlify_cleanup = mintlify_cleanup
self.batch_size = batch_size self.batch_size = batch_size
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
@ -171,7 +177,10 @@ class WebConnector(LoadConnector):
title_tag.extract() title_tag.extract()
# Heuristics based cleaning of elements based on css classes # Heuristics based cleaning of elements based on css classes
for undesired_element in WEB_CONNECTOR_IGNORED_CLASSES: unwanted_classes = copy(WEB_CONNECTOR_IGNORED_CLASSES)
if self.mintlify_cleanup:
unwanted_classes.extend(MINTLIFY_UNWANTED)
for undesired_element in unwanted_classes:
[ [
tag.extract() tag.extract()
for tag in soup.find_all( for tag in soup.find_all(
@ -182,7 +191,8 @@ class WebConnector(LoadConnector):
for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS: for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
[tag.extract() for tag in soup.find_all(undesired_tag)] [tag.extract() for tag in soup.find_all(undesired_tag)]
page_text = format_document_soup(soup) # 200B is ZeroWidthSpace which we don't care for
page_text = format_document_soup(soup).replace("\u200B", "")
doc_batch.append( doc_batch.append(
Document( Document(