From 2037e1149588aa31f4516f3298055b601d6c91f1 Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Thu, 19 Oct 2023 16:12:31 -0700
Subject: [PATCH] Replace PyPDF2 with pypdf (#595)

---
 backend/danswer/connectors/file/connector.py | 52 +++++++++++++++----
 .../connectors/google_drive/connector.py     |  2 +-
 backend/danswer/connectors/web/connector.py  |  2 +-
 backend/requirements/default.txt             |  3 +-
 4 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/backend/danswer/connectors/file/connector.py b/backend/danswer/connectors/file/connector.py
index d3097249f9..846b3e8847 100644
--- a/backend/danswer/connectors/file/connector.py
+++ b/backend/danswer/connectors/file/connector.py
@@ -1,10 +1,9 @@
-import os
 from collections.abc import Generator
 from pathlib import Path
 from typing import Any
 from typing import IO
 
-from PyPDF2 import PdfReader
+from pypdf import PdfReader
 
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
@@ -40,20 +39,41 @@ def _open_files_at_location(
         logger.warning(f"Skipping file '{file_path}' with extension '{extension}'")
 
 
-def _process_file(file_name: str, file: IO[Any]) -> list[Document]:
+def _process_file(
+    file_name: str,
+    file: IO[Any],
+    pdf_pass: str | None = None,
+) -> list[Document]:
     extension = get_file_ext(file_name)
     if not check_file_ext_is_valid(extension):
         logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
         return []
 
     metadata: dict[str, Any] = {}
-    file_content_raw = ""
     if extension == ".pdf":
         pdf_reader = PdfReader(file)
-        if not pdf_reader.is_encrypted:
-            file_content_raw = "\n".join(
-                page.extract_text() for page in pdf_reader.pages
-            )
+        if pdf_reader.is_encrypted:
+            decrypt_success = False
+            if pdf_pass is not None:
+                try:
+                    decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
+                except Exception:
+                    logger.error(f"Unable to decrypt pdf {file_name}")
+            if not decrypt_success:
+                # By user request, keep files that are unreadable just so they
+                # can be discoverable by title.
+                return [
+                    Document(
+                        id=file_name,
+                        sections=[Section(link=metadata.get("link", ""), text="")],
+                        source=DocumentSource.FILE,
+                        semantic_identifier=file_name,
+                        metadata={},
+                    )
+                ]
+
+        file_content_raw = "\n".join(page.extract_text() for page in pdf_reader.pages)
+
     else:
         file_content_raw, metadata = read_file(file)
 
@@ -76,9 +96,11 @@ class LocalFileConnector(LoadConnector):
     ) -> None:
         self.file_locations = [Path(file_location) for file_location in file_locations]
         self.batch_size = batch_size
+        self.pdf_pass: str | None = None
 
     def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
-        pass
+        self.pdf_pass = credentials.get("pdf_password")
+        return None
 
     def load_from_state(self) -> GenerateDocumentsOutput:
         documents: list[Document] = []
@@ -86,7 +108,7 @@ class LocalFileConnector(LoadConnector):
             files = _open_files_at_location(file_location)
 
             for file_name, file in files:
-                documents.extend(_process_file(file_name, file))
+                documents.extend(_process_file(file_name, file, self.pdf_pass))
 
                 if len(documents) >= self.batch_size:
                     yield documents
@@ -94,3 +116,13 @@ class LocalFileConnector(LoadConnector):
 
         if documents:
             yield documents
+
+
+if __name__ == "__main__":
+    import os
+
+    connector = LocalFileConnector(file_locations=[os.environ["TEST_FILE"]])
+    connector.load_credentials({"pdf_password": os.environ["PDF_PASSWORD"]})
+
+    document_batches = connector.load_from_state()
+    print(next(document_batches))
diff --git a/backend/danswer/connectors/google_drive/connector.py b/backend/danswer/connectors/google_drive/connector.py
index 24a2db5f11..6b3b78ccb4 100644
--- a/backend/danswer/connectors/google_drive/connector.py
+++ b/backend/danswer/connectors/google_drive/connector.py
@@ -12,7 +12,7 @@ import docx2txt  # type:ignore
 from google.auth.credentials import Credentials  # type: ignore
 from googleapiclient import discovery  # type: ignore
 from googleapiclient.errors import HttpError  # type: ignore
-from PyPDF2 import PdfReader
+from pypdf import PdfReader
 
 from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
 from danswer.configs.app_configs import GOOGLE_DRIVE_FOLLOW_SHORTCUTS
diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py
index ddb02490c6..b1b495afe8 100644
--- a/backend/danswer/connectors/web/connector.py
+++ b/backend/danswer/connectors/web/connector.py
@@ -13,7 +13,7 @@ from oauthlib.oauth2 import BackendApplicationClient
 from playwright.sync_api import BrowserContext
 from playwright.sync_api import Playwright
 from playwright.sync_api import sync_playwright
-from PyPDF2 import PdfReader
+from pypdf import PdfReader
 from requests_oauthlib import OAuth2Session  # type:ignore
 
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
diff --git a/backend/requirements/default.txt b/backend/requirements/default.txt
index 11fee09b07..706500034e 100644
--- a/backend/requirements/default.txt
+++ b/backend/requirements/default.txt
@@ -31,9 +31,10 @@ oauthlib==3.2.2
 playwright==1.37.0
 psycopg2==2.9.6
 psycopg2-binary==2.9.6
+pycryptodome==3.19.0
 pydantic==1.10.7
 PyGithub==1.58.2
-PyPDF2==3.0.1
+pypdf==3.16.4
 pytest-playwright==0.3.2
 python-multipart==0.0.6
 qdrant-client==1.2.0