Google Drive handle invalid PDFs (#838)

This commit is contained in:
Yuhong Sun
2023-12-18 23:39:45 -08:00
committed by GitHub
parent 2180a40bd3
commit c281859509

View File

@@ -9,6 +9,7 @@ from typing import IO
import chardet import chardet
from pypdf import PdfReader from pypdf import PdfReader
from pypdf.errors import PdfStreamError
from danswer.utils.logger import setup_logger from danswer.utils.logger import setup_logger
@@ -37,9 +38,10 @@ def extract_metadata(line: str) -> dict | None:
def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str: def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str:
try:
pdf_reader = PdfReader(file) pdf_reader = PdfReader(file)
# if marked as encrypted and a password is provided, try to decrypt # If marked as encrypted and a password is provided, try to decrypt
if pdf_reader.is_encrypted and pdf_pass is not None: if pdf_reader.is_encrypted and pdf_pass is not None:
decrypt_success = False decrypt_success = False
if pdf_pass is not None: if pdf_pass is not None:
@@ -55,10 +57,14 @@ def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) ->
# can be discoverable by title. # can be discoverable by title.
return "" return ""
try:
return "\n".join(page.extract_text() for page in pdf_reader.pages) return "\n".join(page.extract_text() for page in pdf_reader.pages)
except PdfStreamError:
logger.exception(f"PDF file {file_name} is not a valid PDF")
except Exception: except Exception:
logger.exception(f"Failed to read PDF {file_name}") logger.exception(f"Failed to read PDF {file_name}")
# File is still discoverable by title
# but the contents are not included as they cannot be parsed
return "" return ""