From 7c347446552012eef16bceeb0c3a8cdec308f950 Mon Sep 17 00:00:00 2001 From: Yuhong Sun Date: Thu, 19 Oct 2023 17:01:55 -0700 Subject: [PATCH] Use shared PDF utility function to not error on encrypted PDFs (#596) --- .../cross_connector_utils/file_utils.py | 27 +++++++++++++++++ backend/danswer/connectors/file/connector.py | 30 ++++--------------- .../connectors/google_drive/connector.py | 14 ++------- backend/danswer/connectors/web/connector.py | 9 +++--- 4 files changed, 39 insertions(+), 41 deletions(-) diff --git a/backend/danswer/connectors/cross_connector_utils/file_utils.py b/backend/danswer/connectors/cross_connector_utils/file_utils.py index 89360f39d..812f089c8 100644 --- a/backend/danswer/connectors/cross_connector_utils/file_utils.py +++ b/backend/danswer/connectors/cross_connector_utils/file_utils.py @@ -6,9 +6,36 @@ from pathlib import Path from typing import Any from typing import IO +from pypdf import PdfReader + +from danswer.utils.logger import setup_logger + + +logger = setup_logger() + _METADATA_FLAG = "#DANSWER_METADATA=" +def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str: + pdf_reader = PdfReader(file) + if pdf_reader.is_encrypted: + decrypt_success = False + if pdf_pass is not None: + try: + decrypt_success = pdf_reader.decrypt(pdf_pass) != 0 + except Exception: + logger.error(f"Unable to decrypt pdf {file_name}") + else: + logger.info(f"No Password available to to decrypt pdf {file_name}") + + if not decrypt_success: + # By user request, keep files that are unreadable just so they + # can be discoverable by title. + return "" + + return "\n".join(page.extract_text() for page in pdf_reader.pages) + + def is_macos_resource_fork_file(file_name: str) -> bool: return os.path.basename(file_name).startswith("._") and file_name.startswith( "__MACOSX" diff --git a/backend/danswer/connectors/file/connector.py b/backend/danswer/connectors/file/connector.py index 846b3e884..b1a299ecc 100644 --- a/backend/danswer/connectors/file/connector.py +++ b/backend/danswer/connectors/file/connector.py @@ -3,12 +3,11 @@ from pathlib import Path from typing import Any from typing import IO -from pypdf import PdfReader - from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip from danswer.connectors.cross_connector_utils.file_utils import read_file +from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file from danswer.connectors.file.utils import check_file_ext_is_valid from danswer.connectors.file.utils import get_file_ext from danswer.connectors.interfaces import GenerateDocumentsOutput @@ -50,30 +49,11 @@ def _process_file( return [] metadata: dict[str, Any] = {} + if extension == ".pdf": - pdf_reader = PdfReader(file) - if pdf_reader.is_encrypted: - decrypt_success = False - if pdf_pass is not None: - try: - decrypt_success = pdf_reader.decrypt(pdf_pass) != 0 - except Exception: - logger.error(f"Unable to decrypt pdf {file_name}") - if not decrypt_success: - # By user request, keep files that are unreadable just so they - # can be discoverable by title. - return [ - Document( - id=file_name, - sections=[Section(link=metadata.get("link", ""), text="")], - source=DocumentSource.FILE, - semantic_identifier=file_name, - metadata={}, - ) - ] - - file_content_raw = "\n".join(page.extract_text() for page in pdf_reader.pages) - + file_content_raw = read_pdf_file( + file=file, file_name=file_name, pdf_pass=pdf_pass + ) else: file_content_raw, metadata = read_file(file) diff --git a/backend/danswer/connectors/google_drive/connector.py b/backend/danswer/connectors/google_drive/connector.py index 6b3b78ccb..6310d168d 100644 --- a/backend/danswer/connectors/google_drive/connector.py +++ b/backend/danswer/connectors/google_drive/connector.py @@ -12,7 +12,6 @@ import docx2txt # type:ignore from google.auth.credentials import Credentials # type: ignore from googleapiclient import discovery # type: ignore from googleapiclient.errors import HttpError # type: ignore -from pypdf import PdfReader from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE from danswer.configs.app_configs import GOOGLE_DRIVE_FOLLOW_SHORTCUTS @@ -20,6 +19,7 @@ from danswer.configs.app_configs import GOOGLE_DRIVE_INCLUDE_SHARED from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource from danswer.configs.constants import IGNORE_FOR_QA +from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file from danswer.connectors.google_drive.connector_auth import ( get_google_drive_creds_for_authorized_user, ) @@ -313,16 +313,8 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str: return docx2txt.process(temp_path) elif mime_type == GDriveMimeType.PDF.value: response = service.files().get_media(fileId=file["id"]).execute() - pdf_stream = io.BytesIO(response) - pdf_reader = PdfReader(pdf_stream) - - if pdf_reader.is_encrypted: - logger.warning( - f"Google drive file: {file['name']} is encrypted - Danswer will ignore it's content" - ) - return "" - - return "\n".join(page.extract_text() for page in pdf_reader.pages) + file_contents = read_pdf_file(file=io.BytesIO(response), file_name=file["name"]) + return file_contents return UNSUPPORTED_FILE_TYPE_CONTENT diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py index b1b495afe..dfb1a878e 100644 --- a/backend/danswer/connectors/web/connector.py +++ b/backend/danswer/connectors/web/connector.py @@ -13,7 +13,6 @@ from oauthlib.oauth2 import BackendApplicationClient from playwright.sync_api import BrowserContext from playwright.sync_api import Playwright from playwright.sync_api import sync_playwright -from pypdf import PdfReader from requests_oauthlib import OAuth2Session # type:ignore from danswer.configs.app_configs import INDEX_BATCH_SIZE @@ -21,6 +20,7 @@ from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_ID from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL from danswer.configs.constants import DocumentSource +from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector @@ -182,10 +182,9 @@ class WebConnector(LoadConnector): if current_url.split(".")[-1] == "pdf": # PDF files are not checked for links response = requests.get(current_url) - pdf_reader = PdfReader(io.BytesIO(response.content)) - page_text = "" - for pdf_page in pdf_reader.pages: - page_text += pdf_page.extract_text() + page_text = read_pdf_file( + file=io.BytesIO(response.content), file_name=current_url + ) doc_batch.append( Document(