mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-10-10 21:26:01 +02:00
Add typing to pdf extraction (#2280)
This commit is contained in:
@@ -8,6 +8,7 @@ from collections.abc import Iterator
|
|||||||
from email.parser import Parser as EmailParser
|
from email.parser import Parser as EmailParser
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
from typing import Dict
|
||||||
from typing import IO
|
from typing import IO
|
||||||
|
|
||||||
import chardet
|
import chardet
|
||||||
@@ -177,11 +178,18 @@ def read_text_file(
|
|||||||
return file_content_raw, metadata
|
return file_content_raw, metadata
|
||||||
|
|
||||||
|
|
||||||
|
def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
|
||||||
|
"""Extract text from a PDF file."""
|
||||||
|
# Return only the extracted text from read_pdf_file
|
||||||
|
text, _ = read_pdf_file(file, pdf_pass)
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
def read_pdf_file(
|
def read_pdf_file(
|
||||||
file: IO[Any],
|
file: IO[Any],
|
||||||
pdf_pass: str | None = None,
|
pdf_pass: str | None = None,
|
||||||
) -> str:
|
) -> tuple[str, dict]:
|
||||||
metadata = {}
|
metadata: Dict[str, Any] = {}
|
||||||
try:
|
try:
|
||||||
pdf_reader = PdfReader(file)
|
pdf_reader = PdfReader(file)
|
||||||
|
|
||||||
@@ -199,12 +207,15 @@ def read_pdf_file(
|
|||||||
if not decrypt_success:
|
if not decrypt_success:
|
||||||
# By user request, keep files that are unreadable just so they
|
# By user request, keep files that are unreadable just so they
|
||||||
# can be discoverable by title.
|
# can be discoverable by title.
|
||||||
return ""
|
return "", metadata
|
||||||
|
|
||||||
# Extract metadata from the PDF, removing leading '/' from keys if present
|
# Extract metadata from the PDF, removing leading '/' from keys if present
|
||||||
# This standardizes the metadata keys for consistency
|
# This standardizes the metadata keys for consistency
|
||||||
|
metadata = {}
|
||||||
|
if pdf_reader.metadata is not None:
|
||||||
metadata = {
|
metadata = {
|
||||||
k[1:] if k.startswith("/") else k: v for k, v in pdf_reader.metadata.items()
|
k[1:] if k.startswith("/") else k: v
|
||||||
|
for k, v in pdf_reader.metadata.items()
|
||||||
}
|
}
|
||||||
return (
|
return (
|
||||||
TEXT_SECTION_SEPARATOR.join(
|
TEXT_SECTION_SEPARATOR.join(
|
||||||
@@ -285,7 +296,7 @@ def extract_file_text(
|
|||||||
break_on_unprocessable: bool = True,
|
break_on_unprocessable: bool = True,
|
||||||
) -> str:
|
) -> str:
|
||||||
extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
|
extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
|
||||||
".pdf": read_pdf_file,
|
".pdf": pdf_to_text,
|
||||||
".docx": docx_to_text,
|
".docx": docx_to_text,
|
||||||
".pptx": pptx_to_text,
|
".pptx": pptx_to_text,
|
||||||
".xlsx": xlsx_to_text,
|
".xlsx": xlsx_to_text,
|
||||||
|
Reference in New Issue
Block a user