mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-20 13:05:49 +02:00
Web Connector - Get doc_updated_at from Last-Modified
header (#1693)
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
import io
|
||||
import ipaddress
|
||||
import socket
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
from typing import cast
|
||||
@@ -203,6 +205,15 @@ def _read_urls_file(location: str) -> list[str]:
|
||||
return urls
|
||||
|
||||
|
||||
def _get_datetime_from_last_modified_header(last_modified: str) -> datetime | None:
|
||||
try:
|
||||
return datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z").replace(
|
||||
tzinfo=timezone.utc
|
||||
)
|
||||
except (ValueError, TypeError):
|
||||
return None
|
||||
|
||||
|
||||
class WebConnector(LoadConnector):
|
||||
def __init__(
|
||||
self,
|
||||
@@ -288,6 +299,7 @@ class WebConnector(LoadConnector):
|
||||
page_text, metadata = read_pdf_file(
|
||||
file=io.BytesIO(response.content)
|
||||
)
|
||||
last_modified = response.headers.get("Last-Modified")
|
||||
|
||||
doc_batch.append(
|
||||
Document(
|
||||
@@ -296,12 +308,22 @@ class WebConnector(LoadConnector):
|
||||
source=DocumentSource.WEB,
|
||||
semantic_identifier=current_url.split("/")[-1],
|
||||
metadata=metadata,
|
||||
doc_updated_at=_get_datetime_from_last_modified_header(
|
||||
last_modified
|
||||
)
|
||||
if last_modified
|
||||
else None,
|
||||
)
|
||||
)
|
||||
continue
|
||||
|
||||
page = context.new_page()
|
||||
page_response = page.goto(current_url)
|
||||
last_modified = (
|
||||
page_response.header_value("Last-Modified")
|
||||
if page_response
|
||||
else None
|
||||
)
|
||||
final_page = page.url
|
||||
if final_page != current_url:
|
||||
logger.info(f"Redirected to {final_page}")
|
||||
@@ -337,6 +359,11 @@ class WebConnector(LoadConnector):
|
||||
source=DocumentSource.WEB,
|
||||
semantic_identifier=parsed_html.title or current_url,
|
||||
metadata={},
|
||||
doc_updated_at=_get_datetime_from_last_modified_header(
|
||||
last_modified
|
||||
)
|
||||
if last_modified
|
||||
else None,
|
||||
)
|
||||
)
|
||||
|
||||
|
Reference in New Issue
Block a user