mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-04-10 21:09:51 +02:00
DAN-139 web connector recursion (#126)
Now handled by checking final page after redirects
This commit is contained in:
parent
24e61a646d
commit
381b3719c9
@ -30,8 +30,8 @@ def is_valid_url(url: str) -> bool:
|
||||
|
||||
def get_internal_links(
|
||||
base_url: str, url: str, soup: BeautifulSoup, should_ignore_pound: bool = True
|
||||
) -> list[str]:
|
||||
internal_links = []
|
||||
) -> set[str]:
|
||||
internal_links = set()
|
||||
for link in cast(list[dict[str, Any]], soup.find_all("a")):
|
||||
href = cast(str | None, link.get("href"))
|
||||
if not href:
|
||||
@ -47,7 +47,7 @@ def get_internal_links(
|
||||
href = urljoin(url, href)
|
||||
|
||||
if urlparse(href).netloc == urlparse(url).netloc and base_url in href:
|
||||
internal_links.append(href)
|
||||
internal_links.add(href)
|
||||
return internal_links
|
||||
|
||||
|
||||
@ -74,11 +74,6 @@ class WebConnector(LoadConnector):
|
||||
to_visit: list[str] = [self.base_url]
|
||||
doc_batch: list[Document] = []
|
||||
|
||||
# Edge case handling user provides HTML without terminating slash (prevents duplicate)
|
||||
# Most sites either redirect no slash to slash or serve same content
|
||||
if self.base_url[-1] != "/":
|
||||
visited_links.add(self.base_url + "/")
|
||||
|
||||
restart_playwright = True
|
||||
while to_visit:
|
||||
current_url = to_visit.pop()
|
||||
@ -86,6 +81,8 @@ class WebConnector(LoadConnector):
|
||||
continue
|
||||
visited_links.add(current_url)
|
||||
|
||||
logger.info(f"Indexing {current_url}")
|
||||
|
||||
try:
|
||||
if restart_playwright:
|
||||
playwright = sync_playwright().start()
|
||||
@ -114,6 +111,15 @@ class WebConnector(LoadConnector):
|
||||
|
||||
page = context.new_page()
|
||||
page.goto(current_url)
|
||||
final_page = page.url
|
||||
if final_page != current_url:
|
||||
logger.info(f"Redirected to {final_page}")
|
||||
current_url = final_page
|
||||
if current_url in visited_links:
|
||||
logger.info(f"Redirected page already indexed")
|
||||
continue
|
||||
visited_links.add(current_url)
|
||||
|
||||
content = page.content()
|
||||
soup = BeautifulSoup(content, "html.parser")
|
||||
|
||||
@ -161,6 +167,8 @@ class WebConnector(LoadConnector):
|
||||
page.close()
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to fetch '{current_url}': {e}")
|
||||
playwright.stop()
|
||||
restart_playwright = True
|
||||
continue
|
||||
|
||||
if len(doc_batch) >= self.batch_size:
|
||||
|
Loading…
x
Reference in New Issue
Block a user