Add more flexibility to the Web Connector (#462)

Yuhong Sun 2023-09-19 20:18:25 -07:00 committed by GitHub
parent da6dd5b617
commit 4b98e47036


@@ -1,6 +1,7 @@
 import io
 from copy import copy
 from datetime import datetime
+from enum import Enum
 from typing import Any
 from typing import cast
 from typing import Tuple
@@ -36,6 +37,17 @@ logger = setup_logger()
 MINTLIFY_UNWANTED = ["sticky", "hidden"]


+class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):
+    # Given a base site, index everything under that path
+    RECURSIVE = "recursive"
+    # Given a URL, index only the given page
+    SINGLE = "single"
+    # Given a sitemap.xml URL, parse all the pages in it
+    SITEMAP = "sitemap"
+    # Given a file upload where every line is a URL, parse all the URLs provided
+    UPLOAD = "upload"
+
+
 def is_valid_url(url: str) -> bool:
     try:
         result = urlparse(url)
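A note on the str/Enum mixin introduced above: because WEB_CONNECTOR_VALID_SETTINGS subclasses str, each member compares equal to its raw string value, which is why the constructor further down can check web_connector_type either against a member's .value or against the member itself. A minimal, self-contained illustration (not part of the diff):

    from enum import Enum

    class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):
        RECURSIVE = "recursive"
        SITEMAP = "sitemap"

    # Both comparisons hold because the members are also str instances
    assert WEB_CONNECTOR_VALID_SETTINGS.SITEMAP == "sitemap"
    assert WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value == "recursive"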
@@ -90,18 +102,58 @@ def start_playwright() -> Tuple[Playwright, BrowserContext]:
     return playwright, context


+def extract_urls_from_sitemap(sitemap_url: str) -> list[str]:
+    response = requests.get(sitemap_url)
+    response.raise_for_status()
+
+    soup = BeautifulSoup(response.content, "html.parser")
+    urls = [loc_tag.text for loc_tag in soup.find_all("loc")]
+
+    return urls
+
+
+def _ensure_valid_url(url: str) -> str:
+    if "://" not in url:
+        return "https://" + url
+    return url
+
+
+def _read_urls_file(location: str) -> list[str]:
+    with open(location, "r") as f:
+        urls = [_ensure_valid_url(line.strip()) for line in f if line.strip()]
+    return urls
+
+
 class WebConnector(LoadConnector):
     def __init__(
         self,
-        base_url: str,
+        base_url: str,  # Can't change this without disrupting existing users
+        web_connector_type: str = WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value,
         mintlify_cleanup: bool = True,  # Mostly ok to apply to other websites as well
         batch_size: int = INDEX_BATCH_SIZE,
     ) -> None:
-        if "://" not in base_url:
-            base_url = "https://" + base_url
-        self.base_url = base_url
         self.mintlify_cleanup = mintlify_cleanup
         self.batch_size = batch_size
+        self.recursive = False
+
+        if web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value:
+            self.recursive = True
+            self.to_visit_list = [_ensure_valid_url(base_url)]
+            return
+
+        elif web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value:
+            self.to_visit_list = [_ensure_valid_url(base_url)]
+
+        elif web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.SITEMAP:
+            self.to_visit_list = extract_urls_from_sitemap(_ensure_valid_url(base_url))
+
+        elif web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.UPLOAD:
+            self.to_visit_list = _read_urls_file(base_url)
+
+        else:
+            raise ValueError(
+                f"Invalid Web Connector Config, must choose a valid type between: {[e.value for e in WEB_CONNECTOR_VALID_SETTINGS]}"
+            )

     def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
         if credentials:
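To make the four settings concrete, here is a usage sketch. The URLs and file path are invented for illustration, and the import path is only assumed to be the module this diff touches:

    from danswer.connectors.web.connector import WebConnector  # assumed module path

    # Default: crawl everything under the base path (unchanged behavior for existing users)
    recursive = WebConnector("https://docs.example.com")

    # Index exactly the one page given
    single = WebConnector("https://docs.example.com/faq", web_connector_type="single")

    # Parse a sitemap.xml and index every <loc> entry it lists
    sitemap = WebConnector("https://docs.example.com/sitemap.xml", web_connector_type="sitemap")

    # Read newline-separated URLs from a local file; base_url is treated as a file path here
    upload = WebConnector("/tmp/urls.txt", web_connector_type="upload")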
@@ -112,7 +164,8 @@ class WebConnector(LoadConnector):
         """Traverses through all pages found on the website
         and converts them into documents"""
         visited_links: set[str] = set()
-        to_visit: list[str] = [self.base_url]
+        to_visit: list[str] = self.to_visit_list
+        base_url = to_visit[0]  # For the recursive case
         doc_batch: list[Document] = []

         playwright, context = start_playwright()
@ -165,10 +218,11 @@ class WebConnector(LoadConnector):
content = page.content() content = page.content()
soup = BeautifulSoup(content, "html.parser") soup = BeautifulSoup(content, "html.parser")
internal_links = get_internal_links(self.base_url, current_url, soup) if self.recursive:
for link in internal_links: internal_links = get_internal_links(base_url, current_url, soup)
if link not in visited_links: for link in internal_links:
to_visit.append(link) if link not in visited_links:
to_visit.append(link)
title_tag = soup.find("title") title_tag = soup.find("title")
title = None title = None
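Finally, a sketch of consuming the connector end to end, assuming the LoadConnector interface's load_from_state() generator (the method whose body is edited above) yields batches of Document objects:

    connector = WebConnector(
        "https://docs.example.com/sitemap.xml",  # hypothetical sitemap URL
        web_connector_type="sitemap",
    )
    for doc_batch in connector.load_from_state():
        # Each batch holds up to batch_size crawled pages converted to Documents
        print(f"indexed a batch of {len(doc_batch)} documents")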