From f6982b03b6972d7b867ce36ecf3f2386ba7c7dcc Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Thu, 19 Oct 2023 17:46:13 -0700 Subject: [PATCH] Handle PDF parse failures gracefully (#599) --- .../danswer/connectors/cross_connector_utils/file_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/backend/danswer/connectors/cross_connector_utils/file_utils.py b/backend/danswer/connectors/cross_connector_utils/file_utils.py index 812f089c8..9a841d01e 100644 --- a/backend/danswer/connectors/cross_connector_utils/file_utils.py +++ b/backend/danswer/connectors/cross_connector_utils/file_utils.py @@ -33,7 +33,11 @@ def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> # can be discoverable by title. return "" - return "\n".join(page.extract_text() for page in pdf_reader.pages) + try: + return "\n".join(page.extract_text() for page in pdf_reader.pages) + except Exception: + logger.exception(f"Failed to read PDF {file_name}") + return "" def is_macos_resource_fork_file(file_name: str) -> bool: