mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-25 19:37:29 +02:00
Web Connector - Get doc_updated_at from Last-Modified
header (#1693)
This commit is contained in:
@@ -1,6 +1,8 @@
|
|||||||
import io
|
import io
|
||||||
import ipaddress
|
import ipaddress
|
||||||
import socket
|
import socket
|
||||||
|
from datetime import datetime
|
||||||
|
from datetime import timezone
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from typing import cast
|
from typing import cast
|
||||||
@@ -203,6 +205,15 @@ def _read_urls_file(location: str) -> list[str]:
|
|||||||
return urls
|
return urls
|
||||||
|
|
||||||
|
|
||||||
|
def _get_datetime_from_last_modified_header(last_modified: str) -> datetime | None:
|
||||||
|
try:
|
||||||
|
return datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z").replace(
|
||||||
|
tzinfo=timezone.utc
|
||||||
|
)
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
class WebConnector(LoadConnector):
|
class WebConnector(LoadConnector):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -288,6 +299,7 @@ class WebConnector(LoadConnector):
|
|||||||
page_text, metadata = read_pdf_file(
|
page_text, metadata = read_pdf_file(
|
||||||
file=io.BytesIO(response.content)
|
file=io.BytesIO(response.content)
|
||||||
)
|
)
|
||||||
|
last_modified = response.headers.get("Last-Modified")
|
||||||
|
|
||||||
doc_batch.append(
|
doc_batch.append(
|
||||||
Document(
|
Document(
|
||||||
@@ -296,12 +308,22 @@ class WebConnector(LoadConnector):
|
|||||||
source=DocumentSource.WEB,
|
source=DocumentSource.WEB,
|
||||||
semantic_identifier=current_url.split("/")[-1],
|
semantic_identifier=current_url.split("/")[-1],
|
||||||
metadata=metadata,
|
metadata=metadata,
|
||||||
|
doc_updated_at=_get_datetime_from_last_modified_header(
|
||||||
|
last_modified
|
||||||
|
)
|
||||||
|
if last_modified
|
||||||
|
else None,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
page = context.new_page()
|
page = context.new_page()
|
||||||
page_response = page.goto(current_url)
|
page_response = page.goto(current_url)
|
||||||
|
last_modified = (
|
||||||
|
page_response.header_value("Last-Modified")
|
||||||
|
if page_response
|
||||||
|
else None
|
||||||
|
)
|
||||||
final_page = page.url
|
final_page = page.url
|
||||||
if final_page != current_url:
|
if final_page != current_url:
|
||||||
logger.info(f"Redirected to {final_page}")
|
logger.info(f"Redirected to {final_page}")
|
||||||
@@ -337,6 +359,11 @@ class WebConnector(LoadConnector):
|
|||||||
source=DocumentSource.WEB,
|
source=DocumentSource.WEB,
|
||||||
semantic_identifier=parsed_html.title or current_url,
|
semantic_identifier=parsed_html.title or current_url,
|
||||||
metadata={},
|
metadata={},
|
||||||
|
doc_updated_at=_get_datetime_from_last_modified_header(
|
||||||
|
last_modified
|
||||||
|
)
|
||||||
|
if last_modified
|
||||||
|
else None,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user