diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py index db346fd8b8d1..bb1f64efdfe2 100644 --- a/backend/danswer/connectors/web/connector.py +++ b/backend/danswer/connectors/web/connector.py @@ -1,6 +1,8 @@ import io import ipaddress import socket +from datetime import datetime +from datetime import timezone from enum import Enum from typing import Any from typing import cast @@ -203,6 +205,15 @@ def _read_urls_file(location: str) -> list[str]: return urls +def _get_datetime_from_last_modified_header(last_modified: str) -> datetime | None: + try: + return datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z").replace( + tzinfo=timezone.utc + ) + except (ValueError, TypeError): + return None + + class WebConnector(LoadConnector): def __init__( self, @@ -288,6 +299,7 @@ class WebConnector(LoadConnector): page_text, metadata = read_pdf_file( file=io.BytesIO(response.content) ) + last_modified = response.headers.get("Last-Modified") doc_batch.append( Document( @@ -296,12 +308,22 @@ class WebConnector(LoadConnector): source=DocumentSource.WEB, semantic_identifier=current_url.split("/")[-1], metadata=metadata, + doc_updated_at=_get_datetime_from_last_modified_header( + last_modified + ) + if last_modified + else None, ) ) continue page = context.new_page() page_response = page.goto(current_url) + last_modified = ( + page_response.header_value("Last-Modified") + if page_response + else None + ) final_page = page.url if final_page != current_url: logger.info(f"Redirected to {final_page}") @@ -337,6 +359,11 @@ class WebConnector(LoadConnector): source=DocumentSource.WEB, semantic_identifier=parsed_html.title or current_url, metadata={}, + doc_updated_at=_get_datetime_from_last_modified_header( + last_modified + ) + if last_modified + else None, ) )