mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-05-20 00:30:10 +02:00
Use shared PDF utility function to not error on encrypted PDFs (#596)
This commit is contained in:
parent
2037e11495
commit
7c34744655
@ -6,9 +6,36 @@ from pathlib import Path
|
|||||||
from typing import Any
|
from typing import Any
|
||||||
from typing import IO
|
from typing import IO
|
||||||
|
|
||||||
|
from pypdf import PdfReader
|
||||||
|
|
||||||
|
from danswer.utils.logger import setup_logger
|
||||||
|
|
||||||
|
|
||||||
|
logger = setup_logger()
|
||||||
|
|
||||||
_METADATA_FLAG = "#DANSWER_METADATA="
|
_METADATA_FLAG = "#DANSWER_METADATA="
|
||||||
|
|
||||||
|
|
||||||
|
def read_pdf_file(file: IO[Any], file_name: str, pdf_pass: str | None = None) -> str:
    """Extract the text of a PDF, tolerating encrypted files.

    Args:
        file: Binary file-like object positioned at the start of the PDF.
        file_name: Name used only in log messages to identify the file.
        pdf_pass: Optional password to try if the PDF is encrypted.

    Returns:
        The text of every page joined with newlines, or an empty string when
        the PDF is encrypted and could not be decrypted (no password supplied,
        password rejected, or decryption raised).
    """
    pdf_reader = PdfReader(file)

    if pdf_reader.is_encrypted:
        decrypt_success = False
        if pdf_pass is None:
            logger.info(f"No Password available to decrypt pdf {file_name}")
        else:
            try:
                # A zero result from decrypt() is treated as "password rejected".
                decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
            except Exception:
                # Best-effort: keep going, but record the traceback so the
                # cause (e.g. unsupported cipher) is not lost.
                logger.exception(f"Unable to decrypt pdf {file_name}")
            else:
                if not decrypt_success:
                    logger.warning(
                        f"Provided password did not decrypt pdf {file_name}"
                    )

        if not decrypt_success:
            # By user request, keep files that are unreadable just so they
            # can be discoverable by title.
            return ""

    return "\n".join(page.extract_text() for page in pdf_reader.pages)
|
||||||
|
|
||||||
|
|
||||||
def is_macos_resource_fork_file(file_name: str) -> bool:
|
def is_macos_resource_fork_file(file_name: str) -> bool:
|
||||||
return os.path.basename(file_name).startswith("._") and file_name.startswith(
|
return os.path.basename(file_name).startswith("._") and file_name.startswith(
|
||||||
"__MACOSX"
|
"__MACOSX"
|
||||||
|
@ -3,12 +3,11 @@ from pathlib import Path
|
|||||||
from typing import Any
|
from typing import Any
|
||||||
from typing import IO
|
from typing import IO
|
||||||
|
|
||||||
from pypdf import PdfReader
|
|
||||||
|
|
||||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||||
from danswer.configs.constants import DocumentSource
|
from danswer.configs.constants import DocumentSource
|
||||||
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
|
from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
|
||||||
from danswer.connectors.cross_connector_utils.file_utils import read_file
|
from danswer.connectors.cross_connector_utils.file_utils import read_file
|
||||||
|
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
|
||||||
from danswer.connectors.file.utils import check_file_ext_is_valid
|
from danswer.connectors.file.utils import check_file_ext_is_valid
|
||||||
from danswer.connectors.file.utils import get_file_ext
|
from danswer.connectors.file.utils import get_file_ext
|
||||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||||
@ -50,30 +49,11 @@ def _process_file(
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
metadata: dict[str, Any] = {}
|
metadata: dict[str, Any] = {}
|
||||||
|
|
||||||
if extension == ".pdf":
|
if extension == ".pdf":
|
||||||
pdf_reader = PdfReader(file)
|
file_content_raw = read_pdf_file(
|
||||||
if pdf_reader.is_encrypted:
|
file=file, file_name=file_name, pdf_pass=pdf_pass
|
||||||
decrypt_success = False
|
)
|
||||||
if pdf_pass is not None:
|
|
||||||
try:
|
|
||||||
decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
|
|
||||||
except Exception:
|
|
||||||
logger.error(f"Unable to decrypt pdf {file_name}")
|
|
||||||
if not decrypt_success:
|
|
||||||
# By user request, keep files that are unreadable just so they
|
|
||||||
# can be discoverable by title.
|
|
||||||
return [
|
|
||||||
Document(
|
|
||||||
id=file_name,
|
|
||||||
sections=[Section(link=metadata.get("link", ""), text="")],
|
|
||||||
source=DocumentSource.FILE,
|
|
||||||
semantic_identifier=file_name,
|
|
||||||
metadata={},
|
|
||||||
)
|
|
||||||
]
|
|
||||||
|
|
||||||
file_content_raw = "\n".join(page.extract_text() for page in pdf_reader.pages)
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
file_content_raw, metadata = read_file(file)
|
file_content_raw, metadata = read_file(file)
|
||||||
|
|
||||||
|
@ -12,7 +12,6 @@ import docx2txt # type:ignore
|
|||||||
from google.auth.credentials import Credentials # type: ignore
|
from google.auth.credentials import Credentials # type: ignore
|
||||||
from googleapiclient import discovery # type: ignore
|
from googleapiclient import discovery # type: ignore
|
||||||
from googleapiclient.errors import HttpError # type: ignore
|
from googleapiclient.errors import HttpError # type: ignore
|
||||||
from pypdf import PdfReader
|
|
||||||
|
|
||||||
from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
|
from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
|
||||||
from danswer.configs.app_configs import GOOGLE_DRIVE_FOLLOW_SHORTCUTS
|
from danswer.configs.app_configs import GOOGLE_DRIVE_FOLLOW_SHORTCUTS
|
||||||
@ -20,6 +19,7 @@ from danswer.configs.app_configs import GOOGLE_DRIVE_INCLUDE_SHARED
|
|||||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||||
from danswer.configs.constants import DocumentSource
|
from danswer.configs.constants import DocumentSource
|
||||||
from danswer.configs.constants import IGNORE_FOR_QA
|
from danswer.configs.constants import IGNORE_FOR_QA
|
||||||
|
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
|
||||||
from danswer.connectors.google_drive.connector_auth import (
|
from danswer.connectors.google_drive.connector_auth import (
|
||||||
get_google_drive_creds_for_authorized_user,
|
get_google_drive_creds_for_authorized_user,
|
||||||
)
|
)
|
||||||
@ -313,16 +313,8 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
|
|||||||
return docx2txt.process(temp_path)
|
return docx2txt.process(temp_path)
|
||||||
elif mime_type == GDriveMimeType.PDF.value:
|
elif mime_type == GDriveMimeType.PDF.value:
|
||||||
response = service.files().get_media(fileId=file["id"]).execute()
|
response = service.files().get_media(fileId=file["id"]).execute()
|
||||||
pdf_stream = io.BytesIO(response)
|
file_contents = read_pdf_file(file=io.BytesIO(response), file_name=file["name"])
|
||||||
pdf_reader = PdfReader(pdf_stream)
|
return file_contents
|
||||||
|
|
||||||
if pdf_reader.is_encrypted:
|
|
||||||
logger.warning(
|
|
||||||
f"Google drive file: {file['name']} is encrypted - Danswer will ignore it's content"
|
|
||||||
)
|
|
||||||
return ""
|
|
||||||
|
|
||||||
return "\n".join(page.extract_text() for page in pdf_reader.pages)
|
|
||||||
|
|
||||||
return UNSUPPORTED_FILE_TYPE_CONTENT
|
return UNSUPPORTED_FILE_TYPE_CONTENT
|
||||||
|
|
||||||
|
@ -13,7 +13,6 @@ from oauthlib.oauth2 import BackendApplicationClient
|
|||||||
from playwright.sync_api import BrowserContext
|
from playwright.sync_api import BrowserContext
|
||||||
from playwright.sync_api import Playwright
|
from playwright.sync_api import Playwright
|
||||||
from playwright.sync_api import sync_playwright
|
from playwright.sync_api import sync_playwright
|
||||||
from pypdf import PdfReader
|
|
||||||
from requests_oauthlib import OAuth2Session # type:ignore
|
from requests_oauthlib import OAuth2Session # type:ignore
|
||||||
|
|
||||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||||
@ -21,6 +20,7 @@ from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_ID
|
|||||||
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
|
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
|
||||||
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
|
from danswer.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
|
||||||
from danswer.configs.constants import DocumentSource
|
from danswer.configs.constants import DocumentSource
|
||||||
|
from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
|
||||||
from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup
|
from danswer.connectors.cross_connector_utils.html_utils import web_html_cleanup
|
||||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||||
from danswer.connectors.interfaces import LoadConnector
|
from danswer.connectors.interfaces import LoadConnector
|
||||||
@ -182,10 +182,9 @@ class WebConnector(LoadConnector):
|
|||||||
if current_url.split(".")[-1] == "pdf":
|
if current_url.split(".")[-1] == "pdf":
|
||||||
# PDF files are not checked for links
|
# PDF files are not checked for links
|
||||||
response = requests.get(current_url)
|
response = requests.get(current_url)
|
||||||
pdf_reader = PdfReader(io.BytesIO(response.content))
|
page_text = read_pdf_file(
|
||||||
page_text = ""
|
file=io.BytesIO(response.content), file_name=current_url
|
||||||
for pdf_page in pdf_reader.pages:
|
)
|
||||||
page_text += pdf_page.extract_text()
|
|
||||||
|
|
||||||
doc_batch.append(
|
doc_batch.append(
|
||||||
Document(
|
Document(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user