Cleanup for Mintlify Websites (#453)

This commit is contained in:
Yuhong Sun 2023-09-16 23:43:24 -07:00 committed by GitHub
parent 6b305c56b3
commit d7b7714d86
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 14 additions and 4 deletions

View File

@ -124,10 +124,10 @@ FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
)
# TODO: these should be configurable from the frontend, via an expandable advanced-options section
WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get(
"WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,header,footer"
"WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,footer"
).split(",")
WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get(
"WEB_CONNECTOR_IGNORED_ELEMENTS", "nav,header,footer,meta,script,style,symbol,aside"
"WEB_CONNECTOR_IGNORED_ELEMENTS", "nav,footer,meta,script,style,symbol,aside"
).split(",")
WEB_CONNECTOR_OAUTH_CLIENT_ID = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_ID")
WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_SECRET")

View File

@ -1,4 +1,5 @@
import io
from copy import copy
from datetime import datetime
from typing import Any
from typing import cast
@ -32,6 +33,9 @@ from danswer.utils.text_processing import format_document_soup
logger = setup_logger()
# CSS class names stripped in addition to WEB_CONNECTOR_IGNORED_CLASSES when a
# connector's mintlify_cleanup flag is set.
MINTLIFY_UNWANTED = ["sticky", "hidden"]
def is_valid_url(url: str) -> bool:
try:
result = urlparse(url)
@ -90,11 +94,13 @@ class WebConnector(LoadConnector):
def __init__(
    self,
    base_url: str,
    mintlify_cleanup: bool = True,  # Mostly ok to apply to other websites as well
    batch_size: int = INDEX_BATCH_SIZE,
) -> None:
    """Store the connector settings, defaulting the URL scheme to https.

    Args:
        base_url: URL string. If it contains no "://", it is prefixed
            with "https://" before being stored.
        mintlify_cleanup: Stored on the instance; when true, the
            MINTLIFY_UNWANTED classes are stripped along with the
            configured ignored classes.
        batch_size: Stored on the instance as given.
    """
    has_scheme = "://" in base_url
    self.base_url = base_url if has_scheme else "https://" + base_url
    self.mintlify_cleanup = mintlify_cleanup
    self.batch_size = batch_size
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
@ -171,7 +177,10 @@ class WebConnector(LoadConnector):
title_tag.extract()
# Heuristics based cleaning of elements based on css classes
for undesired_element in WEB_CONNECTOR_IGNORED_CLASSES:
unwanted_classes = copy(WEB_CONNECTOR_IGNORED_CLASSES)
if self.mintlify_cleanup:
unwanted_classes.extend(MINTLIFY_UNWANTED)
for undesired_element in unwanted_classes:
[
tag.extract()
for tag in soup.find_all(
@ -182,7 +191,8 @@ class WebConnector(LoadConnector):
for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
[tag.extract() for tag in soup.find_all(undesired_tag)]
page_text = format_document_soup(soup)
# U+200B is the zero-width space, which adds nothing to the text, so strip it
page_text = format_document_soup(soup).replace("\u200B", "")
doc_batch.append(
Document(