Add more flexibility to the Web Connector (#462)

Yuhong Sun 2023-09-19 20:18:25 -07:00 committed by GitHub
parent da6dd5b617
commit 4b98e47036


@@ -1,6 +1,7 @@
import io
from copy import copy
from datetime import datetime
+from enum import Enum
from typing import Any
from typing import cast
from typing import Tuple
@@ -36,6 +37,17 @@ logger = setup_logger()
MINTLIFY_UNWANTED = ["sticky", "hidden"]


+class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):
+    # Given a base site, index everything under that path
+    RECURSIVE = "recursive"
+    # Given a URL, index only the given page
+    SINGLE = "single"
+    # Given a sitemap.xml URL, parse all the pages in it
+    SITEMAP = "sitemap"
+    # Given a file upload where every line is a URL, parse all the URLs provided
+    UPLOAD = "upload"
+
+
def is_valid_url(url: str) -> bool:
    try:
        result = urlparse(url)
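
Note for review: WEB_CONNECTOR_VALID_SETTINGS subclasses str, so its members compare equal to their raw string values. That is why the constructor below can compare the incoming web_connector_type string against either a member or its .value without changing behavior. A minimal standalone sketch (not part of the diff) illustrating the property:

from enum import Enum


class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):
    RECURSIVE = "recursive"
    SINGLE = "single"
    SITEMAP = "sitemap"
    UPLOAD = "upload"


# str-based Enum members are equal to their underlying strings, so both
# comparison styles used in WebConnector.__init__ accept plain strings.
assert "sitemap" == WEB_CONNECTOR_VALID_SETTINGS.SITEMAP
assert "sitemap" == WEB_CONNECTOR_VALID_SETTINGS.SITEMAP.value
assert "upload" == WEB_CONNECTOR_VALID_SETTINGS.UPLOAD
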
@@ -90,18 +102,58 @@ def start_playwright() -> Tuple[Playwright, BrowserContext]:
    return playwright, context
+
+
+def extract_urls_from_sitemap(sitemap_url: str) -> list[str]:
+    response = requests.get(sitemap_url)
+    response.raise_for_status()
+
+    soup = BeautifulSoup(response.content, "html.parser")
+    urls = [loc_tag.text for loc_tag in soup.find_all("loc")]
+
+    return urls
+
+
+def _ensure_valid_url(url: str) -> str:
+    if "://" not in url:
+        return "https://" + url
+    return url
+
+
+def _read_urls_file(location: str) -> list[str]:
+    with open(location, "r") as f:
+        urls = [_ensure_valid_url(line.strip()) for line in f if line.strip()]
+    return urls
class WebConnector(LoadConnector):
    def __init__(
        self,
-        base_url: str,
+        base_url: str,  # Can't change this without disrupting existing users
+        web_connector_type: str = WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value,
        mintlify_cleanup: bool = True,  # Mostly ok to apply to other websites as well
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
-        if "://" not in base_url:
-            base_url = "https://" + base_url
-        self.base_url = base_url
        self.mintlify_cleanup = mintlify_cleanup
        self.batch_size = batch_size
+        self.recursive = False
+
+        if web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value:
+            self.recursive = True
+            self.to_visit_list = [_ensure_valid_url(base_url)]
+            return
+
+        elif web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value:
+            self.to_visit_list = [_ensure_valid_url(base_url)]
+
+        elif web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.SITEMAP:
+            self.to_visit_list = extract_urls_from_sitemap(_ensure_valid_url(base_url))
+
+        elif web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.UPLOAD:
+            self.to_visit_list = _read_urls_file(base_url)
+
+        else:
+            raise ValueError(
+                "Invalid Web Connector Config, must choose a valid type between: "
+                f"{[e.value for e in WEB_CONNECTOR_VALID_SETTINGS]}"
+            )
    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        if credentials:
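
For context on how the four modes are expected to be used, a rough usage sketch follows. The import path and example URLs are illustrative assumptions, not part of this diff; the constructor arguments match the signature above.

# Hypothetical usage sketch; import path and URLs are assumptions for illustration.
from danswer.connectors.web.connector import WebConnector

# Default: crawl everything reachable under the base site (unchanged behavior).
recursive = WebConnector(base_url="docs.example.com", web_connector_type="recursive")

# Index only the single given page.
single = WebConnector(base_url="https://docs.example.com/intro", web_connector_type="single")

# Expand a sitemap.xml into its <loc> entries and index each listed page.
sitemap = WebConnector(base_url="https://docs.example.com/sitemap.xml", web_connector_type="sitemap")

# Treat base_url as the path of an uploaded file containing one URL per line.
upload = WebConnector(base_url="/path/to/urls.txt", web_connector_type="upload")
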
@@ -112,7 +164,8 @@ class WebConnector(LoadConnector):
        """Traverses through all pages found on the website
        and converts them into documents"""
        visited_links: set[str] = set()
-        to_visit: list[str] = [self.base_url]
+        to_visit: list[str] = self.to_visit_list
+        base_url = to_visit[0]  # For the recursive case
        doc_batch: list[Document] = []

        playwright, context = start_playwright()
@@ -165,10 +218,11 @@
            content = page.content()
            soup = BeautifulSoup(content, "html.parser")

-            internal_links = get_internal_links(self.base_url, current_url, soup)
-            for link in internal_links:
-                if link not in visited_links:
-                    to_visit.append(link)
+            if self.recursive:
+                internal_links = get_internal_links(base_url, current_url, soup)
+                for link in internal_links:
+                    if link not in visited_links:
+                        to_visit.append(link)

            title_tag = soup.find("title")
            title = None
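
Similarly, a small sketch of the UPLOAD path: _read_urls_file reads one URL per line, skips blank lines, and normalizes schemeless entries via _ensure_valid_url. The import path and file contents below are illustrative assumptions.

import tempfile

from danswer.connectors.web.connector import _read_urls_file  # assumed import path

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
    f.write("docs.example.com/getting-started\n")  # schemeless -> "https://" is prepended
    f.write("https://docs.example.com/faq\n")      # already has a scheme -> kept as-is
    f.write("\n")                                  # blank lines are skipped

print(_read_urls_file(f.name))
# ['https://docs.example.com/getting-started', 'https://docs.example.com/faq']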