Handling of #! sites (#4169)

Yuhong Sun
2025-03-03 08:18:44 -08:00
committed by GitHub
parent e9905a398b
commit 7f0653d173


@@ -16,6 +16,7 @@ from oauthlib.oauth2 import BackendApplicationClient
 from playwright.sync_api import BrowserContext
 from playwright.sync_api import Playwright
 from playwright.sync_api import sync_playwright
+from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
 from requests_oauthlib import OAuth2Session  # type:ignore
 from urllib3.exceptions import MaxRetryError
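A brief note on the aliased import: Playwright's sync API exposes its own `TimeoutError`, which would shadow the builtin name if imported bare, so the commit binds it as `PlaywrightTimeoutError` to keep the later `except` clause unambiguous. A minimal sketch of the distinction (the asserts are illustrative, not from the commit):

```python
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError

# Playwright's timeout error is a plain Exception subclass,
# not the builtin TimeoutError (an OSError subclass).
assert issubclass(PlaywrightTimeoutError, Exception)
assert PlaywrightTimeoutError is not TimeoutError
```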
@@ -293,6 +294,7 @@ class WebConnector(LoadConnector):
         and converts them into documents"""
         visited_links: set[str] = set()
         to_visit: list[str] = self.to_visit_list
+        content_hashes = set()

         if not to_visit:
             raise ValueError("No URLs to visit")
@@ -352,11 +354,19 @@
                 continue
             page = context.new_page()
-            """wait_until="networkidle" is used to wait for the page to load completely which is necessary
-            for the javascript heavy websites"""
-            page_response = page.goto(
-                initial_url, wait_until="networkidle", timeout=60000
-            )
+            # wait_until="networkidle" is used to wait for the page to load completely which is necessary
+            # for the javascript heavy websites
+            try:
+                page_response = page.goto(
+                    initial_url,
+                    wait_until="networkidle",
+                    timeout=30000,  # 30 seconds
+                )
+            except PlaywrightTimeoutError:
+                logger.warning(
+                    f"NetworkIdle timeout for {initial_url}, falling back to default load"
+                )
+                page_response = page.goto(initial_url)

             last_modified = (
                 page_response.header_value("Last-Modified")
                 if page_response
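The new navigation logic first waits for `networkidle` (no network connections for at least 500 ms, which JS-heavy sites need before their rendered content is in the DOM) and, on timeout, retries with Playwright's default `load` wait so pages that never go idle (long-polling, analytics beacons) are still fetched rather than dropped. A minimal standalone sketch of the same pattern, assuming Playwright is installed; the URL and the final `print` are placeholders, not from the commit:

```python
from playwright.sync_api import sync_playwright
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError

url = "https://example.com"  # placeholder

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    try:
        # Wait until the network has been idle for 500 ms; JS-heavy
        # pages often finish rendering only after this point.
        response = page.goto(url, wait_until="networkidle", timeout=30_000)
    except PlaywrightTimeoutError:
        # Fall back to the default "load" wait so a page that never
        # goes network-idle is still fetched.
        response = page.goto(url)
    print(response.status if response else "no response")
    browser.close()
```

Halving the timeout from 60 s to 30 s also bounds the worst case: a stubborn page now costs one 30-second wait plus one default-wait navigation, instead of a full minute with nothing to show for it.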
@@ -424,6 +434,14 @@
             else:
                 parsed_html.cleaned_text += "\n" + document_text

+            # Sometimes pages with #! will serve duplicate content
+            # There are also other ways this can happen
+            hashed_text = hash(parsed_html.cleaned_text)
+            if hashed_text in content_hashes:
+                logger.info(f"Skipping duplicate content for {initial_url}")
+                continue
+            content_hashes.add(hashed_text)
+
             doc_batch.append(
                 Document(
                     id=initial_url,
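The duplicate check exists because `#!` (hashbang) URLs are fragment-based: the part after `#` is never sent to the server, so distinct-looking links can render identical content. A standalone sketch of the dedup idea, with illustrative names (`seen_before` is not from the commit); note that Python's built-in `hash()` is salted per process for strings, so the hashes are comparable within a single crawl run but must not be persisted across runs:

```python
content_hashes: set[int] = set()

def seen_before(cleaned_text: str) -> bool:
    """Return True if this exact page text was already indexed this run."""
    hashed_text = hash(cleaned_text)
    if hashed_text in content_hashes:
        return True
    content_hashes.add(hashed_text)
    return False

# e.g. https://site/#!/a and https://site/#!/b can serve the same HTML
assert seen_before("same rendered body") is False
assert seen_before("same rendered body") is True
```

Storing hashes rather than the cleaned text itself keeps memory proportional to the number of pages crawled, at the cost of a negligible collision risk.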