Use shared PDF utility function to not error on encrypted PDFs (#596)

This commit is contained in:
Yuhong Sun 2023-10-19 17:01:55 -07:00 committed by GitHub
parent 2037e11495
commit 7c34744655
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 39 additions and 41 deletions

View File

@ -6,9 +6,36 @@ from pathlib import Path
from typing import Any
from typing import IO
from pypdf import PdfReader
from danswer.utils.logger import setup_logger
logger = setup_logger()
_METADATA_FLAG = "#DANSWER_METADATA="
def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str:
    """Extract the text of a PDF without failing on encrypted files.

    Args:
        file: File-like object holding the PDF data; passed to ``PdfReader``.
        file_name: Name of the PDF, used only in log messages.
        pdf_pass: Optional password to try when the PDF is encrypted.

    Returns:
        The extracted text of every page joined by newlines, or an empty
        string when the PDF is encrypted and could not be decrypted.
    """
    pdf_reader = PdfReader(file)

    if pdf_reader.is_encrypted:
        decrypt_success = False
        if pdf_pass is None:
            logger.info(f"No Password available to decrypt pdf {file_name}")
        else:
            try:
                # A result of 0 from decrypt() is treated as "not decrypted".
                decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
            except Exception:
                logger.error(f"Unable to decrypt pdf {file_name}")
            else:
                if not decrypt_success:
                    # Previously this case was silent, which made empty
                    # content for a password-protected file hard to diagnose.
                    logger.error(
                        f"Provided password did not decrypt pdf {file_name}"
                    )

        if not decrypt_success:
            # By user request, keep files that are unreadable just so they
            # can be discoverable by title.
            return ""

    return "\n".join(page.extract_text() for page in pdf_reader.pages)
def is_macos_resource_fork_file(file_name: str) -> bool:
return os.path.basename(file_name).startswith("._") and file_name.startswith(
"__MACOSX"

View File

@ -3,12 +3,11 @@ from pathlib import Path
from typing import Any
from typing import IO
from pypdf import PdfReader
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
from danswer.connectors.cross_connector_utils.file_utils import read_file
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
from danswer.connectors.file.utils import check_file_ext_is_valid
from danswer.connectors.file.utils import get_file_ext
from danswer.connectors.interfaces import GenerateDocumentsOutput
@ -50,30 +49,11 @@ def _process_file(
return []
metadata: dict[str, Any] = {}
if extension == ".pdf":
pdf_reader = PdfReader(file)
if pdf_reader.is_encrypted:
decrypt_success = False
if pdf_pass is not None:
try:
decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
except Exception:
logger.error(f"Unable to decrypt pdf {file_name}")
if not decrypt_success:
# By user request, keep files that are unreadable just so they
# can be discoverable by title.
return [
Document(
id=file_name,
sections=[Section(link=metadata.get("link", ""), text="")],
source=DocumentSource.FILE,
semantic_identifier=file_name,
metadata={},
)
]
file_content_raw = "\n".join(page.extract_text() for page in pdf_reader.pages)
file_content_raw = read_pdf_file(
file=file, file_name=file_name, pdf_pass=pdf_pass
)
else:
file_content_raw, metadata = read_file(file)

View File

@ -12,7 +12,6 @@ import docx2txt # type:ignore
from google.auth.credentials import Credentials # type: ignore
from googleapiclient import discovery # type: ignore
from googleapiclient.errors import HttpError # type: ignore
from pypdf import PdfReader
from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from danswer.configs.app_configs import GOOGLE_DRIVE_FOLLOW_SHORTCUTS
@ -20,6 +19,7 @@ from danswer.configs.app_configs import GOOGLE_DRIVE_INCLUDE_SHARED
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import IGNORE_FOR_QA
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
from danswer.connectors.google_drive.connector_auth import (
get_google_drive_creds_for_authorized_user,
)
@ -313,16 +313,8 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
return docx2txt.process(temp_path)
elif mime_type == GDriveMimeType.PDF.value:
response = service.files().get_media(fileId=file["id"]).execute()
pdf_stream = io.BytesIO(response)
pdf_reader = PdfReader(pdf_stream)
if pdf_reader.is_encrypted:
logger.warning(
f"Google drive file: {file['name']} is encrypted - Danswer will ignore it's content"
)
return ""
return "\n".join(page.extract_text() for page in pdf_reader.pages)
file_contents = read_pdf_file(file=io.BytesIO(response), file_name=file["name"])
return file_contents
return UNSUPPORTED_FILE_TYPE_CONTENT

View File

@ -13,7 +13,6 @@ from oauthlib.oauth2 import BackendApplicationClient
from playwright.sync_api import BrowserContext
from playwright.sync_api import Playwright
from playwright.sync_api import sync_playwright
from pypdf import PdfReader
from requests_oauthlib import OAuth2Session # type:ignore
from danswer.configs.app_configs import INDEX_BATCH_SIZE
@ -21,6 +20,7 @@ from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_ID
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
@ -182,10 +182,9 @@ class WebConnector(LoadConnector):
if current_url.split(".")[-1] == "pdf":
# PDF files are not checked for links
response = requests.get(current_url)
pdf_reader = PdfReader(io.BytesIO(response.content))
page_text = ""
for pdf_page in pdf_reader.pages:
page_text += pdf_page.extract_text()
page_text = read_pdf_file(
file=io.BytesIO(response.content), file_name=current_url
)
doc_batch.append(
Document(