Replace PyPDF2 with pypdf (#595)

This commit is contained in:
Yuhong Sun
2023-10-19 16:12:31 -07:00
committed by GitHub
parent 6a449f1fb1
commit 2037e11495
4 changed files with 46 additions and 13 deletions

View File

@ -1,10 +1,9 @@
import os
from collections.abc import Generator
from pathlib import Path
from typing import Any
from typing import IO
from PyPDF2 import PdfReader
from pypdf import PdfReader
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
@ -40,20 +39,41 @@ def _open_files_at_location(
logger.warning(f"Skipping file '{file_path}' with extension '{extension}'")
def _process_file(file_name: str, file: IO[Any]) -> list[Document]:
def _process_file(
file_name: str,
file: IO[Any],
pdf_pass: str | None = None,
) -> list[Document]:
extension = get_file_ext(file_name)
if not check_file_ext_is_valid(extension):
logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
return []
metadata: dict[str, Any] = {}
file_content_raw = ""
if extension == ".pdf":
pdf_reader = PdfReader(file)
if not pdf_reader.is_encrypted:
file_content_raw = "\n".join(
page.extract_text() for page in pdf_reader.pages
)
if pdf_reader.is_encrypted:
decrypt_success = False
if pdf_pass is not None:
try:
decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
except Exception:
logger.error(f"Unable to decrypt pdf {file_name}")
if not decrypt_success:
# By user request, keep files that are unreadable just so they
# can be discoverable by title.
return [
Document(
id=file_name,
sections=[Section(link=metadata.get("link", ""), text="")],
source=DocumentSource.FILE,
semantic_identifier=file_name,
metadata={},
)
]
file_content_raw = "\n".join(page.extract_text() for page in pdf_reader.pages)
else:
file_content_raw, metadata = read_file(file)
@ -76,9 +96,11 @@ class LocalFileConnector(LoadConnector):
) -> None:
self.file_locations = [Path(file_location) for file_location in file_locations]
self.batch_size = batch_size
self.pdf_pass: str | None = None
# Record the optional PDF password supplied with the connector credentials.
# The value is read with dict.get, so when the "pdf_password" key is absent
# self.pdf_pass is set to None. The method always returns None.
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
# NOTE(review): this `pass` looks like the pre-change body kept by the diff
# view alongside the new lines below; as a statement it is a no-op.
pass
self.pdf_pass = credentials.get("pdf_password")
return None
def load_from_state(self) -> GenerateDocumentsOutput:
documents: list[Document] = []
@ -86,7 +108,7 @@ class LocalFileConnector(LoadConnector):
files = _open_files_at_location(file_location)
for file_name, file in files:
documents.extend(_process_file(file_name, file))
documents.extend(_process_file(file_name, file, self.pdf_pass))
if len(documents) >= self.batch_size:
yield documents
@ -94,3 +116,13 @@ class LocalFileConnector(LoadConnector):
if documents:
yield documents
# Manual smoke test: build a connector for the path named by TEST_FILE, hand it
# the password from PDF_PASSWORD, and print the first batch it produces.
if __name__ == "__main__":
    import os

    # Both environment variables are looked up with os.environ[...], so a
    # missing variable raises KeyError rather than falling back to a default.
    file_connector = LocalFileConnector(file_locations=[os.environ["TEST_FILE"]])
    pdf_credentials = {"pdf_password": os.environ["PDF_PASSWORD"]}
    file_connector.load_credentials(pdf_credentials)

    # load_from_state is a generator; only its first yielded batch is consumed.
    first_batch = next(file_connector.load_from_state())
    print(first_batch)

View File

@ -12,7 +12,7 @@ import docx2txt # type:ignore
from google.auth.credentials import Credentials # type: ignore
from googleapiclient import discovery # type: ignore
from googleapiclient.errors import HttpError # type: ignore
from PyPDF2 import PdfReader
from pypdf import PdfReader
from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from danswer.configs.app_configs import GOOGLE_DRIVE_FOLLOW_SHORTCUTS

View File

@ -13,7 +13,7 @@ from oauthlib.oauth2 import BackendApplicationClient
from playwright.sync_api import BrowserContext
from playwright.sync_api import Playwright
from playwright.sync_api import sync_playwright
from PyPDF2 import PdfReader
from pypdf import PdfReader
from requests_oauthlib import OAuth2Session # type:ignore
from danswer.configs.app_configs import INDEX_BATCH_SIZE

View File

@ -31,9 +31,10 @@ oauthlib==3.2.2
playwright==1.37.0
psycopg2==2.9.6
psycopg2-binary==2.9.6
pycryptodome==3.19.0
pydantic==1.10.7
PyGithub==1.58.2
PyPDF2==3.0.1
pypdf==3.16.4
pytest-playwright==0.3.2
python-multipart==0.0.6
qdrant-client==1.2.0