Add typing to pdf extraction (#2280)

This commit is contained in:
pablodanswer
2024-08-30 17:16:56 -07:00
committed by GitHub
parent 21af852073
commit 5800c7158e

View File

@@ -8,6 +8,7 @@ from collections.abc import Iterator
from email.parser import Parser as EmailParser
from pathlib import Path
from typing import Any
from typing import Dict
from typing import IO
import chardet
@@ -177,11 +178,18 @@ def read_text_file(
return file_content_raw, metadata
def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
    """Return only the text content of a PDF, dropping its metadata.

    Delegates to ``read_pdf_file``, which produces a ``(text, metadata)``
    pair, and discards the metadata half so the result is a plain string.

    Args:
        file: Open file object holding the PDF data.
        pdf_pass: Optional password, forwarded unchanged to
            ``read_pdf_file``.

    Returns:
        The text portion of the ``read_pdf_file`` result.
    """
    # Unpack the two-element result; the metadata is intentionally unused.
    extracted_text, _unused_metadata = read_pdf_file(file, pdf_pass)
    return extracted_text
def read_pdf_file(
file: IO[Any],
pdf_pass: str | None = None,
) -> str:
metadata = {}
) -> tuple[str, dict]:
metadata: Dict[str, Any] = {}
try:
pdf_reader = PdfReader(file)
@@ -199,13 +207,16 @@ def read_pdf_file(
if not decrypt_success:
# By user request, keep files that are unreadable just so they
# can be discoverable by title.
return ""
return "", metadata
# Extract metadata from the PDF, removing leading '/' from keys if present
# This standardizes the metadata keys for consistency
metadata = {
k[1:] if k.startswith("/") else k: v for k, v in pdf_reader.metadata.items()
}
metadata = {}
if pdf_reader.metadata is not None:
metadata = {
k[1:] if k.startswith("/") else k: v
for k, v in pdf_reader.metadata.items()
}
return (
TEXT_SECTION_SEPARATOR.join(
page.extract_text() for page in pdf_reader.pages
@@ -285,7 +296,7 @@ def extract_file_text(
break_on_unprocessable: bool = True,
) -> str:
extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
".pdf": read_pdf_file,
".pdf": pdf_to_text,
".docx": docx_to_text,
".pptx": pptx_to_text,
".xlsx": xlsx_to_text,