Google Drive handle invalid PDFs (#838)

2025-09-27 20:38:32 +02:00 · 2023-12-18 23:39:45 -08:00
parent 2180a40bd3
commit c281859509
1 changed files with 25 additions and 19 deletions
--- a/backend/danswer/connectors/cross_connector_utils/file_utils.py
+++ b/backend/danswer/connectors/cross_connector_utils/file_utils.py
@@ -9,6 +9,7 @@ from typing import IO
 import chardet
 from pypdf import PdfReader
 from pypdf.errors import PdfStreamError
 from danswer.utils.logger import setup_logger
@@ -37,9 +38,10 @@ def extract_metadata(line: str) -> dict | None:
 def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str:
    try:
        pdf_reader = PdfReader(file)
-    # if marked as encrypted and a password is provided, try to decrypt
+        # If marked as encrypted and a password is provided, try to decrypt
        if pdf_reader.is_encrypted and pdf_pass is not None:
            decrypt_success = False
            if pdf_pass is not None:
@@ -55,10 +57,14 @@ def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) ->
                # can be discoverable by title.
                return ""
    try:
        return "\n".join(page.extract_text() for page in pdf_reader.pages)
    except PdfStreamError:
        logger.exception(f"PDF file {file_name} is not a valid PDF")
    except Exception:
        logger.exception(f"Failed to read PDF {file_name}")
    # File is still discoverable by title
    # but the contents are not included as they cannot be parsed
    return ""