mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-27 20:38:32 +02:00
Google Drive handle invalid PDFs (#838)
This commit is contained in:
@@ -9,6 +9,7 @@ from typing import IO
|
|||||||
|
|
||||||
import chardet
|
import chardet
|
||||||
from pypdf import PdfReader
|
from pypdf import PdfReader
|
||||||
|
from pypdf.errors import PdfStreamError
|
||||||
|
|
||||||
from danswer.utils.logger import setup_logger
|
from danswer.utils.logger import setup_logger
|
||||||
|
|
||||||
@@ -37,9 +38,10 @@ def extract_metadata(line: str) -> dict | None:
|
|||||||
|
|
||||||
|
|
||||||
def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str:
|
def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str:
|
||||||
|
try:
|
||||||
pdf_reader = PdfReader(file)
|
pdf_reader = PdfReader(file)
|
||||||
|
|
||||||
# if marked as encrypted and a password is provided, try to decrypt
|
# If marked as encrypted and a password is provided, try to decrypt
|
||||||
if pdf_reader.is_encrypted and pdf_pass is not None:
|
if pdf_reader.is_encrypted and pdf_pass is not None:
|
||||||
decrypt_success = False
|
decrypt_success = False
|
||||||
if pdf_pass is not None:
|
if pdf_pass is not None:
|
||||||
@@ -55,10 +57,14 @@ def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) ->
|
|||||||
# can be discoverable by title.
|
# can be discoverable by title.
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
try:
|
|
||||||
return "\n".join(page.extract_text() for page in pdf_reader.pages)
|
return "\n".join(page.extract_text() for page in pdf_reader.pages)
|
||||||
|
except PdfStreamError:
|
||||||
|
logger.exception(f"PDF file {file_name} is not a valid PDF")
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.exception(f"Failed to read PDF {file_name}")
|
logger.exception(f"Failed to read PDF {file_name}")
|
||||||
|
|
||||||
|
# File is still discoverable by title
|
||||||
|
# but the contents are not included as they cannot be parsed
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user