Add typing to pdf extraction (#2280)

This commit is contained in:
pablodanswer
2024-08-30 17:16:56 -07:00
committed by GitHub
parent 21af852073
commit 5800c7158e

View File

@@ -8,6 +8,7 @@ from collections.abc import Iterator
from email.parser import Parser as EmailParser from email.parser import Parser as EmailParser
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
from typing import Dict
from typing import IO from typing import IO
import chardet import chardet
@@ -177,11 +178,18 @@ def read_text_file(
return file_content_raw, metadata return file_content_raw, metadata
def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
    """Return the text content of a PDF, discarding its metadata.

    Thin adapter over ``read_pdf_file`` for callers that only want the
    extracted text as a plain ``str``.

    Args:
        file: File-like object passed straight through to ``read_pdf_file``.
        pdf_pass: Optional password forwarded to ``read_pdf_file``.

    Returns:
        The first element of the ``(text, metadata)`` pair produced by
        ``read_pdf_file``.
    """
    # read_pdf_file yields a (text, metadata) pair; only the text is kept.
    extracted_text, _metadata = read_pdf_file(file, pdf_pass)
    return extracted_text
def read_pdf_file( def read_pdf_file(
file: IO[Any], file: IO[Any],
pdf_pass: str | None = None, pdf_pass: str | None = None,
) -> str: ) -> tuple[str, dict]:
metadata = {} metadata: Dict[str, Any] = {}
try: try:
pdf_reader = PdfReader(file) pdf_reader = PdfReader(file)
@@ -199,12 +207,15 @@ def read_pdf_file(
if not decrypt_success: if not decrypt_success:
# By user request, keep files that are unreadable just so they # By user request, keep files that are unreadable just so they
# can be discoverable by title. # can be discoverable by title.
return "" return "", metadata
# Extract metadata from the PDF, removing leading '/' from keys if present # Extract metadata from the PDF, removing leading '/' from keys if present
# This standardizes the metadata keys for consistency # This standardizes the metadata keys for consistency
metadata = {}
if pdf_reader.metadata is not None:
metadata = { metadata = {
k[1:] if k.startswith("/") else k: v for k, v in pdf_reader.metadata.items() k[1:] if k.startswith("/") else k: v
for k, v in pdf_reader.metadata.items()
} }
return ( return (
TEXT_SECTION_SEPARATOR.join( TEXT_SECTION_SEPARATOR.join(
@@ -285,7 +296,7 @@ def extract_file_text(
break_on_unprocessable: bool = True, break_on_unprocessable: bool = True,
) -> str: ) -> str:
extension_to_function: dict[str, Callable[[IO[Any]], str]] = { extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
".pdf": read_pdf_file, ".pdf": pdf_to_text,
".docx": docx_to_text, ".docx": docx_to_text,
".pptx": pptx_to_text, ".pptx": pptx_to_text,
".xlsx": xlsx_to_text, ".xlsx": xlsx_to_text,