From 5800c7158e92e341ef39c6675e48c1179fa44caa Mon Sep 17 00:00:00 2001 From: pablodanswer Date: Fri, 30 Aug 2024 17:16:56 -0700 Subject: [PATCH] Add typing to pdf extraction (#2280) --- .../file_processing/extract_file_text.py | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/backend/danswer/file_processing/extract_file_text.py b/backend/danswer/file_processing/extract_file_text.py index eeb7b5ae8ba3..3fcb2a99e30f 100644 --- a/backend/danswer/file_processing/extract_file_text.py +++ b/backend/danswer/file_processing/extract_file_text.py @@ -8,6 +8,7 @@ from collections.abc import Iterator from email.parser import Parser as EmailParser from pathlib import Path from typing import Any +from typing import Dict from typing import IO import chardet @@ -177,11 +178,18 @@ def read_text_file( return file_content_raw, metadata +def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str: + """Extract text from a PDF file.""" + # Return only the extracted text from read_pdf_file + text, _ = read_pdf_file(file, pdf_pass) + return text + + def read_pdf_file( file: IO[Any], pdf_pass: str | None = None, -) -> str: - metadata = {} +) -> tuple[str, dict]: + metadata: Dict[str, Any] = {} try: pdf_reader = PdfReader(file) @@ -199,13 +207,16 @@ def read_pdf_file( if not decrypt_success: # By user request, keep files that are unreadable just so they # can be discoverable by title. - return "" + return "", metadata # Extract metadata from the PDF, removing leading '/' from keys if present # This standardizes the metadata keys for consistency - metadata = { - k[1:] if k.startswith("/") else k: v for k, v in pdf_reader.metadata.items() - } + metadata = {} + if pdf_reader.metadata is not None: + metadata = { + k[1:] if k.startswith("/") else k: v + for k, v in pdf_reader.metadata.items() + } return ( TEXT_SECTION_SEPARATOR.join( page.extract_text() for page in pdf_reader.pages @@ -285,7 +296,7 @@ def extract_file_text( break_on_unprocessable: bool = True, ) -> str: extension_to_function: dict[str, Callable[[IO[Any]], str]] = { - ".pdf": read_pdf_file, + ".pdf": pdf_to_text, ".docx": docx_to_text, ".pptx": pptx_to_text, ".xlsx": xlsx_to_text,