mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-04-08 03:48:14 +02:00
Google Drive handle invalid PDFs (#838)
This commit is contained in:
parent
2180a40bd3
commit
c281859509
@ -9,6 +9,7 @@ from typing import IO
|
||||
|
||||
import chardet
|
||||
from pypdf import PdfReader
|
||||
from pypdf.errors import PdfStreamError
|
||||
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
@ -37,29 +38,34 @@ def extract_metadata(line: str) -> dict | None:
|
||||
|
||||
|
||||
def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str:
|
||||
pdf_reader = PdfReader(file)
|
||||
|
||||
# if marked as encrypted and a password is provided, try to decrypt
|
||||
if pdf_reader.is_encrypted and pdf_pass is not None:
|
||||
decrypt_success = False
|
||||
if pdf_pass is not None:
|
||||
try:
|
||||
decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
|
||||
except Exception:
|
||||
logger.error(f"Unable to decrypt pdf {file_name}")
|
||||
else:
|
||||
logger.info(f"No Password available to to decrypt pdf {file_name}")
|
||||
|
||||
if not decrypt_success:
|
||||
# By user request, keep files that are unreadable just so they
|
||||
# can be discoverable by title.
|
||||
return ""
|
||||
|
||||
try:
|
||||
pdf_reader = PdfReader(file)
|
||||
|
||||
# If marked as encrypted and a password is provided, try to decrypt
|
||||
if pdf_reader.is_encrypted and pdf_pass is not None:
|
||||
decrypt_success = False
|
||||
if pdf_pass is not None:
|
||||
try:
|
||||
decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
|
||||
except Exception:
|
||||
logger.error(f"Unable to decrypt pdf {file_name}")
|
||||
else:
|
||||
logger.info(f"No Password available to to decrypt pdf {file_name}")
|
||||
|
||||
if not decrypt_success:
|
||||
# By user request, keep files that are unreadable just so they
|
||||
# can be discoverable by title.
|
||||
return ""
|
||||
|
||||
return "\n".join(page.extract_text() for page in pdf_reader.pages)
|
||||
except PdfStreamError:
|
||||
logger.exception(f"PDF file {file_name} is not a valid PDF")
|
||||
except Exception:
|
||||
logger.exception(f"Failed to read PDF {file_name}")
|
||||
return ""
|
||||
|
||||
# File is still discoverable by title
|
||||
# but the contents are not included as they cannot be parsed
|
||||
return ""
|
||||
|
||||
|
||||
def is_macos_resource_fork_file(file_name: str) -> bool:
|
||||
|
Loading…
x
Reference in New Issue
Block a user