mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-07-04 19:50:57 +02:00
DAN-139 web connector recursion (#126)
Now handled by checking final page after redirects
This commit is contained in:
@ -30,8 +30,8 @@ def is_valid_url(url: str) -> bool:
|
|||||||
|
|
||||||
def get_internal_links(
|
def get_internal_links(
|
||||||
base_url: str, url: str, soup: BeautifulSoup, should_ignore_pound: bool = True
|
base_url: str, url: str, soup: BeautifulSoup, should_ignore_pound: bool = True
|
||||||
) -> list[str]:
|
) -> set[str]:
|
||||||
internal_links = []
|
internal_links = set()
|
||||||
for link in cast(list[dict[str, Any]], soup.find_all("a")):
|
for link in cast(list[dict[str, Any]], soup.find_all("a")):
|
||||||
href = cast(str | None, link.get("href"))
|
href = cast(str | None, link.get("href"))
|
||||||
if not href:
|
if not href:
|
||||||
@ -47,7 +47,7 @@ def get_internal_links(
|
|||||||
href = urljoin(url, href)
|
href = urljoin(url, href)
|
||||||
|
|
||||||
if urlparse(href).netloc == urlparse(url).netloc and base_url in href:
|
if urlparse(href).netloc == urlparse(url).netloc and base_url in href:
|
||||||
internal_links.append(href)
|
internal_links.add(href)
|
||||||
return internal_links
|
return internal_links
|
||||||
|
|
||||||
|
|
||||||
@ -74,11 +74,6 @@ class WebConnector(LoadConnector):
|
|||||||
to_visit: list[str] = [self.base_url]
|
to_visit: list[str] = [self.base_url]
|
||||||
doc_batch: list[Document] = []
|
doc_batch: list[Document] = []
|
||||||
|
|
||||||
# Edge case handling user provides HTML without terminating slash (prevents duplicate)
|
|
||||||
# Most sites either redirect no slash to slash or serve same content
|
|
||||||
if self.base_url[-1] != "/":
|
|
||||||
visited_links.add(self.base_url + "/")
|
|
||||||
|
|
||||||
restart_playwright = True
|
restart_playwright = True
|
||||||
while to_visit:
|
while to_visit:
|
||||||
current_url = to_visit.pop()
|
current_url = to_visit.pop()
|
||||||
@ -86,6 +81,8 @@ class WebConnector(LoadConnector):
|
|||||||
continue
|
continue
|
||||||
visited_links.add(current_url)
|
visited_links.add(current_url)
|
||||||
|
|
||||||
|
logger.info(f"Indexing {current_url}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if restart_playwright:
|
if restart_playwright:
|
||||||
playwright = sync_playwright().start()
|
playwright = sync_playwright().start()
|
||||||
@ -114,6 +111,15 @@ class WebConnector(LoadConnector):
|
|||||||
|
|
||||||
page = context.new_page()
|
page = context.new_page()
|
||||||
page.goto(current_url)
|
page.goto(current_url)
|
||||||
|
final_page = page.url
|
||||||
|
if final_page != current_url:
|
||||||
|
logger.info(f"Redirected to {final_page}")
|
||||||
|
current_url = final_page
|
||||||
|
if current_url in visited_links:
|
||||||
|
logger.info(f"Redirected page already indexed")
|
||||||
|
continue
|
||||||
|
visited_links.add(current_url)
|
||||||
|
|
||||||
content = page.content()
|
content = page.content()
|
||||||
soup = BeautifulSoup(content, "html.parser")
|
soup = BeautifulSoup(content, "html.parser")
|
||||||
|
|
||||||
@ -161,6 +167,8 @@ class WebConnector(LoadConnector):
|
|||||||
page.close()
|
page.close()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to fetch '{current_url}': {e}")
|
logger.error(f"Failed to fetch '{current_url}': {e}")
|
||||||
|
playwright.stop()
|
||||||
|
restart_playwright = True
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if len(doc_batch) >= self.batch_size:
|
if len(doc_batch) >= self.batch_size:
|
||||||
|
Reference in New Issue
Block a user