From d6e87df548afa19e016251488a798d43b31712e3 Mon Sep 17 00:00:00 2001 From: Yohann Fabri <11197007+YoranSys@users.noreply.github.com> Date: Fri, 1 Sep 2023 01:57:08 +0200 Subject: [PATCH] gdrive connector ignore encrypted pdf file (#353) (#362) Thanks for your contribution! --- backend/danswer/connectors/google_drive/connector.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/backend/danswer/connectors/google_drive/connector.py b/backend/danswer/connectors/google_drive/connector.py index 7131157c6..e219c2217 100644 --- a/backend/danswer/connectors/google_drive/connector.py +++ b/backend/danswer/connectors/google_drive/connector.py @@ -308,7 +308,10 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str: response = service.files().get_media(fileId=file["id"]).execute() pdf_stream = io.BytesIO(response) pdf_reader = PdfReader(pdf_stream) - return "\n".join(page.extract_text() for page in pdf_reader.pages) + if pdf_reader.is_encrypted: + logger.warning(f"Google drive file: {file['name']} is encrypted danswer will ignore it's content") + else: + return "\n".join(page.extract_text() for page in pdf_reader.pages) class GoogleDriveConnector(LoadConnector, PollConnector):