mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-10-10 21:26:01 +02:00
add metadata to pdf extraction (#2278)
This commit is contained in:
@@ -23,7 +23,7 @@ from danswer.file_processing.extract_file_text import extract_file_text
|
|||||||
from danswer.file_processing.extract_file_text import get_file_ext
|
from danswer.file_processing.extract_file_text import get_file_ext
|
||||||
from danswer.file_processing.extract_file_text import is_text_file_extension
|
from danswer.file_processing.extract_file_text import is_text_file_extension
|
||||||
from danswer.file_processing.extract_file_text import load_files_from_zip
|
from danswer.file_processing.extract_file_text import load_files_from_zip
|
||||||
from danswer.file_processing.extract_file_text import pdf_to_text
|
from danswer.file_processing.extract_file_text import read_pdf_file
|
||||||
from danswer.file_processing.extract_file_text import read_text_file
|
from danswer.file_processing.extract_file_text import read_text_file
|
||||||
from danswer.file_store.file_store import get_default_file_store
|
from danswer.file_store.file_store import get_default_file_store
|
||||||
from danswer.utils.logger import setup_logger
|
from danswer.utils.logger import setup_logger
|
||||||
@@ -75,7 +75,7 @@ def _process_file(
|
|||||||
|
|
||||||
# Using the PDF reader function directly to pass in password cleanly
|
# Using the PDF reader function directly to pass in password cleanly
|
||||||
elif extension == ".pdf":
|
elif extension == ".pdf":
|
||||||
file_content_raw = pdf_to_text(file=file, pdf_pass=pdf_pass)
|
file_content_raw, file_metadata = read_pdf_file(file=file, pdf_pass=pdf_pass)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
file_content_raw = extract_file_text(
|
file_content_raw = extract_file_text(
|
||||||
|
@@ -41,8 +41,8 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch
|
|||||||
from danswer.connectors.models import Document
|
from danswer.connectors.models import Document
|
||||||
from danswer.connectors.models import Section
|
from danswer.connectors.models import Section
|
||||||
from danswer.file_processing.extract_file_text import docx_to_text
|
from danswer.file_processing.extract_file_text import docx_to_text
|
||||||
from danswer.file_processing.extract_file_text import pdf_to_text
|
|
||||||
from danswer.file_processing.extract_file_text import pptx_to_text
|
from danswer.file_processing.extract_file_text import pptx_to_text
|
||||||
|
from danswer.file_processing.extract_file_text import read_pdf_file
|
||||||
from danswer.utils.batching import batch_generator
|
from danswer.utils.batching import batch_generator
|
||||||
from danswer.utils.logger import setup_logger
|
from danswer.utils.logger import setup_logger
|
||||||
|
|
||||||
@@ -334,7 +334,8 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
|
|||||||
return docx_to_text(file=io.BytesIO(response))
|
return docx_to_text(file=io.BytesIO(response))
|
||||||
elif mime_type == GDriveMimeType.PDF.value:
|
elif mime_type == GDriveMimeType.PDF.value:
|
||||||
response = service.files().get_media(fileId=file["id"]).execute()
|
response = service.files().get_media(fileId=file["id"]).execute()
|
||||||
return pdf_to_text(file=io.BytesIO(response))
|
text, _ = read_pdf_file(file=io.BytesIO(response))
|
||||||
|
return text
|
||||||
elif mime_type == GDriveMimeType.POWERPOINT.value:
|
elif mime_type == GDriveMimeType.POWERPOINT.value:
|
||||||
response = service.files().get_media(fileId=file["id"]).execute()
|
response = service.files().get_media(fileId=file["id"]).execute()
|
||||||
return pptx_to_text(file=io.BytesIO(response))
|
return pptx_to_text(file=io.BytesIO(response))
|
||||||
|
@@ -27,7 +27,7 @@ from danswer.connectors.interfaces import GenerateDocumentsOutput
|
|||||||
from danswer.connectors.interfaces import LoadConnector
|
from danswer.connectors.interfaces import LoadConnector
|
||||||
from danswer.connectors.models import Document
|
from danswer.connectors.models import Document
|
||||||
from danswer.connectors.models import Section
|
from danswer.connectors.models import Section
|
||||||
from danswer.file_processing.extract_file_text import pdf_to_text
|
from danswer.file_processing.extract_file_text import read_pdf_file
|
||||||
from danswer.file_processing.html_utils import web_html_cleanup
|
from danswer.file_processing.html_utils import web_html_cleanup
|
||||||
from danswer.utils.logger import setup_logger
|
from danswer.utils.logger import setup_logger
|
||||||
from danswer.utils.sitemap import list_pages_for_site
|
from danswer.utils.sitemap import list_pages_for_site
|
||||||
@@ -284,7 +284,9 @@ class WebConnector(LoadConnector):
|
|||||||
if current_url.split(".")[-1] == "pdf":
|
if current_url.split(".")[-1] == "pdf":
|
||||||
# PDF files are not checked for links
|
# PDF files are not checked for links
|
||||||
response = requests.get(current_url)
|
response = requests.get(current_url)
|
||||||
page_text = pdf_to_text(file=io.BytesIO(response.content))
|
page_text, metadata = read_pdf_file(
|
||||||
|
file=io.BytesIO(response.content)
|
||||||
|
)
|
||||||
|
|
||||||
doc_batch.append(
|
doc_batch.append(
|
||||||
Document(
|
Document(
|
||||||
@@ -292,7 +294,7 @@ class WebConnector(LoadConnector):
|
|||||||
sections=[Section(link=current_url, text=page_text)],
|
sections=[Section(link=current_url, text=page_text)],
|
||||||
source=DocumentSource.WEB,
|
source=DocumentSource.WEB,
|
||||||
semantic_identifier=current_url.split("/")[-1],
|
semantic_identifier=current_url.split("/")[-1],
|
||||||
metadata={},
|
metadata=metadata,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
|
@@ -177,7 +177,11 @@ def read_text_file(
|
|||||||
return file_content_raw, metadata
|
return file_content_raw, metadata
|
||||||
|
|
||||||
|
|
||||||
def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
|
def read_pdf_file(
|
||||||
|
file: IO[Any],
|
||||||
|
pdf_pass: str | None = None,
|
||||||
|
) -> str:
|
||||||
|
metadata = {}
|
||||||
try:
|
try:
|
||||||
pdf_reader = PdfReader(file)
|
pdf_reader = PdfReader(file)
|
||||||
|
|
||||||
@@ -197,8 +201,16 @@ def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
|
|||||||
# can be discoverable by title.
|
# can be discoverable by title.
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
return TEXT_SECTION_SEPARATOR.join(
|
# Extract metadata from the PDF, removing leading '/' from keys if present
|
||||||
page.extract_text() for page in pdf_reader.pages
|
# This standardizes the metadata keys for consistency
|
||||||
|
metadata = {
|
||||||
|
k[1:] if k.startswith("/") else k: v for k, v in pdf_reader.metadata.items()
|
||||||
|
}
|
||||||
|
return (
|
||||||
|
TEXT_SECTION_SEPARATOR.join(
|
||||||
|
page.extract_text() for page in pdf_reader.pages
|
||||||
|
),
|
||||||
|
metadata,
|
||||||
)
|
)
|
||||||
except PdfStreamError:
|
except PdfStreamError:
|
||||||
logger.exception("PDF file is not a valid PDF")
|
logger.exception("PDF file is not a valid PDF")
|
||||||
@@ -207,7 +219,7 @@ def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
|
|||||||
|
|
||||||
# File is still discoverable by title
|
# File is still discoverable by title
|
||||||
# but the contents are not included as they cannot be parsed
|
# but the contents are not included as they cannot be parsed
|
||||||
return ""
|
return "", metadata
|
||||||
|
|
||||||
|
|
||||||
def docx_to_text(file: IO[Any]) -> str:
|
def docx_to_text(file: IO[Any]) -> str:
|
||||||
@@ -273,7 +285,7 @@ def extract_file_text(
|
|||||||
break_on_unprocessable: bool = True,
|
break_on_unprocessable: bool = True,
|
||||||
) -> str:
|
) -> str:
|
||||||
extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
|
extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
|
||||||
".pdf": pdf_to_text,
|
".pdf": read_pdf_file,
|
||||||
".docx": docx_to_text,
|
".docx": docx_to_text,
|
||||||
".pptx": pptx_to_text,
|
".pptx": pptx_to_text,
|
||||||
".xlsx": xlsx_to_text,
|
".xlsx": xlsx_to_text,
|
||||||
|
Reference in New Issue
Block a user