DAN-139 web connector recursion (#126)

Now handled by checking final page after redirects
This commit is contained in:
Yuhong Sun 2023-07-02 19:01:13 -07:00 committed by GitHub
parent 24e61a646d
commit 381b3719c9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -30,8 +30,8 @@ def is_valid_url(url: str) -> bool:
def get_internal_links(
base_url: str, url: str, soup: BeautifulSoup, should_ignore_pound: bool = True
) -> list[str]:
internal_links = []
) -> set[str]:
internal_links = set()
for link in cast(list[dict[str, Any]], soup.find_all("a")):
href = cast(str | None, link.get("href"))
if not href:
@ -47,7 +47,7 @@ def get_internal_links(
href = urljoin(url, href)
if urlparse(href).netloc == urlparse(url).netloc and base_url in href:
internal_links.append(href)
internal_links.add(href)
return internal_links
@ -74,11 +74,6 @@ class WebConnector(LoadConnector):
to_visit: list[str] = [self.base_url]
doc_batch: list[Document] = []
# Edge case handling user provides HTML without terminating slash (prevents duplicate)
# Most sites either redirect no slash to slash or serve same content
if self.base_url[-1] != "/":
visited_links.add(self.base_url + "/")
restart_playwright = True
while to_visit:
current_url = to_visit.pop()
@ -86,6 +81,8 @@ class WebConnector(LoadConnector):
continue
visited_links.add(current_url)
logger.info(f"Indexing {current_url}")
try:
if restart_playwright:
playwright = sync_playwright().start()
@ -114,6 +111,15 @@ class WebConnector(LoadConnector):
page = context.new_page()
page.goto(current_url)
final_page = page.url
if final_page != current_url:
logger.info(f"Redirected to {final_page}")
current_url = final_page
if current_url in visited_links:
logger.info(f"Redirected page already indexed")
continue
visited_links.add(current_url)
content = page.content()
soup = BeautifulSoup(content, "html.parser")
@ -161,6 +167,8 @@ class WebConnector(LoadConnector):
page.close()
except Exception as e:
logger.error(f"Failed to fetch '{current_url}': {e}")
playwright.stop()
restart_playwright = True
continue
if len(doc_batch) >= self.batch_size: