mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-07-13 22:53:01 +02:00
Handling of #! sites (#4169)
This commit is contained in:
@ -16,6 +16,7 @@ from oauthlib.oauth2 import BackendApplicationClient
|
|||||||
from playwright.sync_api import BrowserContext
|
from playwright.sync_api import BrowserContext
|
||||||
from playwright.sync_api import Playwright
|
from playwright.sync_api import Playwright
|
||||||
from playwright.sync_api import sync_playwright
|
from playwright.sync_api import sync_playwright
|
||||||
|
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
|
||||||
from requests_oauthlib import OAuth2Session # type:ignore
|
from requests_oauthlib import OAuth2Session # type:ignore
|
||||||
from urllib3.exceptions import MaxRetryError
|
from urllib3.exceptions import MaxRetryError
|
||||||
|
|
||||||
@ -293,6 +294,7 @@ class WebConnector(LoadConnector):
|
|||||||
and converts them into documents"""
|
and converts them into documents"""
|
||||||
visited_links: set[str] = set()
|
visited_links: set[str] = set()
|
||||||
to_visit: list[str] = self.to_visit_list
|
to_visit: list[str] = self.to_visit_list
|
||||||
|
content_hashes = set()
|
||||||
|
|
||||||
if not to_visit:
|
if not to_visit:
|
||||||
raise ValueError("No URLs to visit")
|
raise ValueError("No URLs to visit")
|
||||||
@ -352,11 +354,19 @@ class WebConnector(LoadConnector):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
page = context.new_page()
|
page = context.new_page()
|
||||||
"""wait_until="networkidle" is used to wait for the page to load completely which is necessary
|
# wait_until="networkidle" is used to wait for the page to load completely which is necessary
|
||||||
for the javascript heavy websites"""
|
# for the javascript heavy websites
|
||||||
page_response = page.goto(
|
try:
|
||||||
initial_url, wait_until="networkidle", timeout=60000
|
page_response = page.goto(
|
||||||
)
|
initial_url,
|
||||||
|
wait_until="networkidle",
|
||||||
|
timeout=30000, # 30 seconds
|
||||||
|
)
|
||||||
|
except PlaywrightTimeoutError:
|
||||||
|
logger.warning(
|
||||||
|
f"NetworkIdle timeout for {initial_url}, falling back to default load"
|
||||||
|
)
|
||||||
|
page_response = page.goto(initial_url)
|
||||||
last_modified = (
|
last_modified = (
|
||||||
page_response.header_value("Last-Modified")
|
page_response.header_value("Last-Modified")
|
||||||
if page_response
|
if page_response
|
||||||
@ -424,6 +434,14 @@ class WebConnector(LoadConnector):
|
|||||||
else:
|
else:
|
||||||
parsed_html.cleaned_text += "\n" + document_text
|
parsed_html.cleaned_text += "\n" + document_text
|
||||||
|
|
||||||
|
# Sometimes pages with #! will serve duplicate content
|
||||||
|
# There are also just other ways this can happen
|
||||||
|
hashed_text = hash(parsed_html.cleaned_text)
|
||||||
|
if hashed_text in content_hashes:
|
||||||
|
logger.info(f"Skipping duplicate content for {initial_url}")
|
||||||
|
continue
|
||||||
|
content_hashes.add(hashed_text)
|
||||||
|
|
||||||
doc_batch.append(
|
doc_batch.append(
|
||||||
Document(
|
Document(
|
||||||
id=initial_url,
|
id=initial_url,
|
||||||
|
Reference in New Issue
Block a user