mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-05-19 00:00:37 +02:00
Use shared PDF utility function to not error on encrypted PDFs (#596)
This commit is contained in:
parent
2037e11495
commit
7c34744655
@ -6,9 +6,36 @@ from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import IO
|
||||
|
||||
from pypdf import PdfReader
|
||||
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
_METADATA_FLAG = "#DANSWER_METADATA="
|
||||
|
||||
|
||||
def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str:
    """Extract the text of every page of a PDF, tolerating encrypted files.

    If the PDF is encrypted, an attempt is made to decrypt it with
    ``pdf_pass`` (when provided). If decryption fails or no password is
    available, an empty string is returned instead of raising, so the file
    remains indexable by title alone.

    Args:
        file: A binary file-like object containing the PDF data.
        file_name: Name of the file, used only for log messages.
        pdf_pass: Optional password for encrypted PDFs.

    Returns:
        The concatenated text of all pages joined by newlines, or "" when
        the PDF is encrypted and could not be decrypted.
    """
    pdf_reader = PdfReader(file)
    if pdf_reader.is_encrypted:
        decrypt_success = False
        if pdf_pass is not None:
            try:
                # pypdf's decrypt() returns a nonzero result on success.
                decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
            except Exception:
                # logger.exception preserves the traceback, unlike
                # logger.error, which aids debugging decryption failures.
                logger.exception(f"Unable to decrypt pdf {file_name}")
        else:
            # Fixed duplicated word ("to to") in the original log message.
            logger.info(f"No Password available to decrypt pdf {file_name}")

        if not decrypt_success:
            # By user request, keep files that are unreadable just so they
            # can be discoverable by title.
            return ""

    return "\n".join(page.extract_text() for page in pdf_reader.pages)
|
||||
|
||||
|
||||
def is_macos_resource_fork_file(file_name: str) -> bool:
|
||||
return os.path.basename(file_name).startswith("._") and file_name.startswith(
|
||||
"__MACOSX"
|
||||
|
@ -3,12 +3,11 @@ from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import IO
|
||||
|
||||
from pypdf import PdfReader
|
||||
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
|
||||
from danswer.connectors.cross_connector_utils.file_utils import read_file
|
||||
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
|
||||
from danswer.connectors.file.utils import check_file_ext_is_valid
|
||||
from danswer.connectors.file.utils import get_file_ext
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
@ -50,30 +49,11 @@ def _process_file(
|
||||
return []
|
||||
|
||||
metadata: dict[str, Any] = {}
|
||||
|
||||
if extension == ".pdf":
|
||||
pdf_reader = PdfReader(file)
|
||||
if pdf_reader.is_encrypted:
|
||||
decrypt_success = False
|
||||
if pdf_pass is not None:
|
||||
try:
|
||||
decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
|
||||
except Exception:
|
||||
logger.error(f"Unable to decrypt pdf {file_name}")
|
||||
if not decrypt_success:
|
||||
# By user request, keep files that are unreadable just so they
|
||||
# can be discoverable by title.
|
||||
return [
|
||||
Document(
|
||||
id=file_name,
|
||||
sections=[Section(link=metadata.get("link", ""), text="")],
|
||||
source=DocumentSource.FILE,
|
||||
semantic_identifier=file_name,
|
||||
metadata={},
|
||||
)
|
||||
]
|
||||
|
||||
file_content_raw = "\n".join(page.extract_text() for page in pdf_reader.pages)
|
||||
|
||||
file_content_raw = read_pdf_file(
|
||||
file=file, file_name=file_name, pdf_pass=pdf_pass
|
||||
)
|
||||
else:
|
||||
file_content_raw, metadata = read_file(file)
|
||||
|
||||
|
@ -12,7 +12,6 @@ import docx2txt # type:ignore
|
||||
from google.auth.credentials import Credentials # type: ignore
|
||||
from googleapiclient import discovery # type: ignore
|
||||
from googleapiclient.errors import HttpError # type: ignore
|
||||
from pypdf import PdfReader
|
||||
|
||||
from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
|
||||
from danswer.configs.app_configs import GOOGLE_DRIVE_FOLLOW_SHORTCUTS
|
||||
@ -20,6 +19,7 @@ from danswer.configs.app_configs import GOOGLE_DRIVE_INCLUDE_SHARED
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.configs.constants import IGNORE_FOR_QA
|
||||
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
|
||||
from danswer.connectors.google_drive.connector_auth import (
|
||||
get_google_drive_creds_for_authorized_user,
|
||||
)
|
||||
@ -313,16 +313,8 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
|
||||
return docx2txt.process(temp_path)
|
||||
elif mime_type == GDriveMimeType.PDF.value:
|
||||
response = service.files().get_media(fileId=file["id"]).execute()
|
||||
pdf_stream = io.BytesIO(response)
|
||||
pdf_reader = PdfReader(pdf_stream)
|
||||
|
||||
if pdf_reader.is_encrypted:
|
||||
logger.warning(
|
||||
f"Google drive file: {file['name']} is encrypted - Danswer will ignore it's content"
|
||||
)
|
||||
return ""
|
||||
|
||||
return "\n".join(page.extract_text() for page in pdf_reader.pages)
|
||||
file_contents = read_pdf_file(file=io.BytesIO(response), file_name=file["name"])
|
||||
return file_contents
|
||||
|
||||
return UNSUPPORTED_FILE_TYPE_CONTENT
|
||||
|
||||
|
@ -13,7 +13,6 @@ from oauthlib.oauth2 import BackendApplicationClient
|
||||
from playwright.sync_api import BrowserContext
|
||||
from playwright.sync_api import Playwright
|
||||
from playwright.sync_api import sync_playwright
|
||||
from pypdf import PdfReader
|
||||
from requests_oauthlib import OAuth2Session # type:ignore
|
||||
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
@ -21,6 +20,7 @@ from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_ID
|
||||
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
|
||||
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
|
||||
from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
from danswer.connectors.interfaces import LoadConnector
|
||||
@ -182,10 +182,9 @@ class WebConnector(LoadConnector):
|
||||
if current_url.split(".")[-1] == "pdf":
|
||||
# PDF files are not checked for links
|
||||
response = requests.get(current_url)
|
||||
pdf_reader = PdfReader(io.BytesIO(response.content))
|
||||
page_text = ""
|
||||
for pdf_page in pdf_reader.pages:
|
||||
page_text += pdf_page.extract_text()
|
||||
page_text = read_pdf_file(
|
||||
file=io.BytesIO(response.content), file_name=current_url
|
||||
)
|
||||
|
||||
doc_batch.append(
|
||||
Document(
|
||||
|
Loading…
x
Reference in New Issue
Block a user