Return empty string for encrypted PDF (#369)

This commit is contained in:
Yuhong Sun 2023-08-31 16:59:28 -07:00 committed by GitHub
parent d6e87df548
commit 80a08bbf0c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -308,10 +308,14 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
response = service.files().get_media(fileId=file["id"]).execute()
pdf_stream = io.BytesIO(response)
pdf_reader = PdfReader(pdf_stream)
if pdf_reader.is_encrypted:
logger.warning(f"Google drive file: {file['name']} is encrypted danswer will ignore it's content")
else:
return "\n".join(page.extract_text() for page in pdf_reader.pages)
logger.warning(
f"Google drive file: {file['name']} is encrypted - Danswer will ignore it's content"
)
return ""
return "\n".join(page.extract_text() for page in pdf_reader.pages)
class GoogleDriveConnector(LoadConnector, PollConnector):