Web Connector - Get doc_updated_at from Last-Modified header (#1693)

This commit is contained in:
Moshe Zada
2024-09-08 20:05:04 +03:00
committed by GitHub
parent d985cd4352
commit 0c66da17bb

View File

@@ -1,6 +1,8 @@
import io
import ipaddress
import socket
from datetime import datetime
from datetime import timezone
from enum import Enum
from typing import Any
from typing import cast
@@ -203,6 +205,15 @@ def _read_urls_file(location: str) -> list[str]:
return urls
def _get_datetime_from_last_modified_header(last_modified: str) -> datetime | None:
try:
return datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z").replace(
tzinfo=timezone.utc
)
except (ValueError, TypeError):
return None
class WebConnector(LoadConnector):
def __init__(
self,
@@ -288,6 +299,7 @@ class WebConnector(LoadConnector):
page_text, metadata = read_pdf_file(
file=io.BytesIO(response.content)
)
last_modified = response.headers.get("Last-Modified")
doc_batch.append(
Document(
@@ -296,12 +308,22 @@ class WebConnector(LoadConnector):
source=DocumentSource.WEB,
semantic_identifier=current_url.split("/")[-1],
metadata=metadata,
doc_updated_at=_get_datetime_from_last_modified_header(
last_modified
)
if last_modified
else None,
)
)
continue
page = context.new_page()
page_response = page.goto(current_url)
last_modified = (
page_response.header_value("Last-Modified")
if page_response
else None
)
final_page = page.url
if final_page != current_url:
logger.info(f"Redirected to {final_page}")
@@ -337,6 +359,11 @@ class WebConnector(LoadConnector):
source=DocumentSource.WEB,
semantic_identifier=parsed_html.title or current_url,
metadata={},
doc_updated_at=_get_datetime_from_last_modified_header(
last_modified
)
if last_modified
else None,
)
)