mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-19 04:20:57 +02:00
Cleanup for Mintlify Websites (#453)
This commit is contained in:
parent
6b305c56b3
commit
d7b7714d86
@ -124,10 +124,10 @@ FILE_CONNECTOR_TMP_STORAGE_PATH = os.environ.get(
|
|||||||
)
|
)
|
||||||
# TODO: these should be configurable from the frontend via an expandable "advanced options" section
|
# TODO: these should be configurable from the frontend via an expandable "advanced options" section
|
||||||
WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get(
|
WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get(
|
||||||
"WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,header,footer"
|
"WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,footer"
|
||||||
).split(",")
|
).split(",")
|
||||||
WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get(
|
WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get(
|
||||||
"WEB_CONNECTOR_IGNORED_ELEMENTS", "nav,header,footer,meta,script,style,symbol,aside"
|
"WEB_CONNECTOR_IGNORED_ELEMENTS", "nav,footer,meta,script,style,symbol,aside"
|
||||||
).split(",")
|
).split(",")
|
||||||
WEB_CONNECTOR_OAUTH_CLIENT_ID = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_ID")
|
WEB_CONNECTOR_OAUTH_CLIENT_ID = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_ID")
|
||||||
WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_SECRET")
|
WEB_CONNECTOR_OAUTH_CLIENT_SECRET = os.environ.get("WEB_CONNECTOR_OAUTH_CLIENT_SECRET")
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import io
|
import io
|
||||||
|
from copy import copy
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from typing import cast
|
from typing import cast
|
||||||
@ -32,6 +33,9 @@ from danswer.utils.text_processing import format_document_soup
|
|||||||
logger = setup_logger()
|
logger = setup_logger()
|
||||||
|
|
||||||
|
|
||||||
|
MINTLIFY_UNWANTED = ["sticky", "hidden"]
|
||||||
|
|
||||||
|
|
||||||
def is_valid_url(url: str) -> bool:
|
def is_valid_url(url: str) -> bool:
|
||||||
try:
|
try:
|
||||||
result = urlparse(url)
|
result = urlparse(url)
|
||||||
@ -90,11 +94,13 @@ class WebConnector(LoadConnector):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
base_url: str,
|
base_url: str,
|
||||||
|
mintlify_cleanup: bool = True, # Mostly ok to apply to other websites as well
|
||||||
batch_size: int = INDEX_BATCH_SIZE,
|
batch_size: int = INDEX_BATCH_SIZE,
|
||||||
) -> None:
|
) -> None:
|
||||||
if "://" not in base_url:
|
if "://" not in base_url:
|
||||||
base_url = "https://" + base_url
|
base_url = "https://" + base_url
|
||||||
self.base_url = base_url
|
self.base_url = base_url
|
||||||
|
self.mintlify_cleanup = mintlify_cleanup
|
||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
|
|
||||||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
||||||
@ -171,7 +177,10 @@ class WebConnector(LoadConnector):
|
|||||||
title_tag.extract()
|
title_tag.extract()
|
||||||
|
|
||||||
# Heuristics based cleaning of elements based on css classes
|
# Heuristics based cleaning of elements based on css classes
|
||||||
for undesired_element in WEB_CONNECTOR_IGNORED_CLASSES:
|
unwanted_classes = copy(WEB_CONNECTOR_IGNORED_CLASSES)
|
||||||
|
if self.mintlify_cleanup:
|
||||||
|
unwanted_classes.extend(MINTLIFY_UNWANTED)
|
||||||
|
for undesired_element in unwanted_classes:
|
||||||
[
|
[
|
||||||
tag.extract()
|
tag.extract()
|
||||||
for tag in soup.find_all(
|
for tag in soup.find_all(
|
||||||
@ -182,7 +191,8 @@ class WebConnector(LoadConnector):
|
|||||||
for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
|
for undesired_tag in WEB_CONNECTOR_IGNORED_ELEMENTS:
|
||||||
[tag.extract() for tag in soup.find_all(undesired_tag)]
|
[tag.extract() for tag in soup.find_all(undesired_tag)]
|
||||||
|
|
||||||
page_text = format_document_soup(soup)
|
# U+200B is the zero-width space character; strip it because it adds nothing to the extracted text
|
||||||
|
page_text = format_document_soup(soup).replace("\u200B", "")
|
||||||
|
|
||||||
doc_batch.append(
|
doc_batch.append(
|
||||||
Document(
|
Document(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user