Web Connector - Get doc_updated_at from Last-Modified header (#1693)

This commit is contained in:
Moshe Zada
2024-09-08 20:05:04 +03:00
committed by GitHub
parent d985cd4352
commit 0c66da17bb

View File

@@ -1,6 +1,8 @@
import io import io
import ipaddress import ipaddress
import socket import socket
from datetime import datetime
from datetime import timezone
from enum import Enum from enum import Enum
from typing import Any from typing import Any
from typing import cast from typing import cast
@@ -203,6 +205,15 @@ def _read_urls_file(location: str) -> list[str]:
return urls return urls
def _get_datetime_from_last_modified_header(last_modified: str) -> datetime | None:
try:
return datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z").replace(
tzinfo=timezone.utc
)
except (ValueError, TypeError):
return None
class WebConnector(LoadConnector): class WebConnector(LoadConnector):
def __init__( def __init__(
self, self,
@@ -288,6 +299,7 @@ class WebConnector(LoadConnector):
page_text, metadata = read_pdf_file( page_text, metadata = read_pdf_file(
file=io.BytesIO(response.content) file=io.BytesIO(response.content)
) )
last_modified = response.headers.get("Last-Modified")
doc_batch.append( doc_batch.append(
Document( Document(
@@ -296,12 +308,22 @@ class WebConnector(LoadConnector):
source=DocumentSource.WEB, source=DocumentSource.WEB,
semantic_identifier=current_url.split("/")[-1], semantic_identifier=current_url.split("/")[-1],
metadata=metadata, metadata=metadata,
doc_updated_at=_get_datetime_from_last_modified_header(
last_modified
)
if last_modified
else None,
) )
) )
continue continue
page = context.new_page() page = context.new_page()
page_response = page.goto(current_url) page_response = page.goto(current_url)
last_modified = (
page_response.header_value("Last-Modified")
if page_response
else None
)
final_page = page.url final_page = page.url
if final_page != current_url: if final_page != current_url:
logger.info(f"Redirected to {final_page}") logger.info(f"Redirected to {final_page}")
@@ -337,6 +359,11 @@ class WebConnector(LoadConnector):
source=DocumentSource.WEB, source=DocumentSource.WEB,
semantic_identifier=parsed_html.title or current_url, semantic_identifier=parsed_html.title or current_url,
metadata={}, metadata={},
doc_updated_at=_get_datetime_from_last_modified_header(
last_modified
)
if last_modified
else None,
) )
) )