from collections.abc import Generator
from typing import Any
from typing import cast
from urllib.parse import urljoin
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.interfaces import PullLoader
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logging import setup_logger
from playwright.sync_api import sync_playwright

logger = setup_logger()

TAG_SEPARATOR = "\n"


def is_valid_url(url: str) -> bool:
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False


def get_internal_links(
    base_url: str, url: str, soup: BeautifulSoup, should_ignore_pound: bool = True
) -> list[str]:
    internal_links = []
    for link in cast(list[dict[str, Any]], soup.find_all("a")):
        href = cast(str | None, link.get("href"))
        if not href:
            continue

        if should_ignore_pound and "#" in href:
            href = href.split("#")[0]

        if not is_valid_url(href):
            # Relative link, resolve it against the current page's URL
            href = urljoin(url, href)

        if urlparse(href).netloc == urlparse(url).netloc and base_url in href:
            internal_links.append(href)
    return internal_links
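
# Example of get_internal_links' filtering (hypothetical values): with
# base_url="https://docs.example.com/" and a page at
# "https://docs.example.com/intro", an <a href="/guide#setup"> link is stripped
# of its fragment, resolved to "https://docs.example.com/guide", and kept,
# while links to other hosts or outside the base URL are dropped.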


class WebLoader(PullLoader):
    def __init__(
        self,
        base_url: str,
        batch_size: int = INDEX_BATCH_SIZE,
    ) -> None:
        self.base_url = base_url
        self.batch_size = batch_size

    def load(self) -> Generator[list[Document], None, None]:
        """Traverses all pages reachable from the base URL
        and converts them into documents"""
        visited_links: set[str] = set()
        to_visit: list[str] = [self.base_url]
        doc_batch: list[Document] = []

        with sync_playwright() as playwright:
            browser = playwright.chromium.launch(headless=True)
            context = browser.new_context()

            # Crawl depth-first from the base URL, skipping already-visited pages
            while to_visit:
                current_url = to_visit.pop()
                if current_url in visited_links:
                    continue
                visited_links.add(current_url)

                try:
                    page = context.new_page()
                    page.goto(current_url)
                    content = page.content()
                    soup = BeautifulSoup(content, "html.parser")

                    title_tag = soup.find("title")
                    title = None
                    if title_tag and title_tag.text:
                        title = title_tag.text

                    # Heuristics based cleaning: strip navigation and layout
                    # elements that rarely contain useful content
                    for undesired_tag in ["nav", "header", "footer", "meta"]:
                        for tag in soup.find_all(undesired_tag):
                            tag.extract()
                    for undesired_div in ["sidebar", "header", "footer"]:
                        for tag in soup.find_all("div", {"class": undesired_div}):
                            tag.extract()

                    page_text = soup.get_text(TAG_SEPARATOR)

                    doc_batch.append(
                        Document(
                            id=current_url,
                            sections=[Section(link=current_url, text=page_text)],
                            source=DocumentSource.WEB,
                            semantic_identifier=title,
                            metadata={},
                        )
                    )

                    # Queue up same-site links that have not been visited yet
                    internal_links = get_internal_links(
                        self.base_url, current_url, soup
                    )
                    for link in internal_links:
                        if link not in visited_links:
                            to_visit.append(link)

                    page.close()
                except Exception as e:
                    logger.error(f"Failed to fetch '{current_url}': {e}")
                    continue

                if len(doc_batch) >= self.batch_size:
                    yield doc_batch
                    doc_batch = []

        if doc_batch:
            yield doc_batch
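

# Illustrative usage sketch (not part of the original connector): assuming the
# danswer models imported above, something like this would crawl a site and
# report how many documents land in each yielded batch. The URL is a
# placeholder, not taken from the source.
if __name__ == "__main__":
    loader = WebLoader(base_url="https://docs.example.com/")
    for batch in loader.load():
        print(f"Loaded batch of {len(batch)} documents")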