add metadata to pdf extraction (#2278)

Author: pablodanswer
Date: 2024-08-30 15:14:02 -07:00
Committed by: GitHub
Parent: 44c45cbf2a
Commit: 4181124e7a

4 changed files with 27 additions and 12 deletions
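
In short, the commit renames pdf_to_text to read_pdf_file and has it return the extracted text together with the PDF's document-info metadata. A minimal sketch of the new call pattern, assuming only the import path and signature shown in the hunks below (the sample file name is illustrative):

    import io

    from danswer.file_processing.extract_file_text import read_pdf_file

    # read_pdf_file replaces pdf_to_text and returns (text, metadata)
    # instead of a bare string.
    with open("example.pdf", "rb") as f:  # hypothetical local file
        text, metadata = read_pdf_file(file=io.BytesIO(f.read()), pdf_pass=None)

    # metadata holds the PDF document-info entries with any leading "/"
    # stripped from the keys (e.g. "/Author" becomes "Author").
    print(metadata.get("Author"), len(text))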

File 1 of 4

@@ -23,7 +23,7 @@ from danswer.file_processing.extract_file_text import extract_file_text
 from danswer.file_processing.extract_file_text import get_file_ext
 from danswer.file_processing.extract_file_text import is_text_file_extension
 from danswer.file_processing.extract_file_text import load_files_from_zip
-from danswer.file_processing.extract_file_text import pdf_to_text
+from danswer.file_processing.extract_file_text import read_pdf_file
 from danswer.file_processing.extract_file_text import read_text_file
 from danswer.file_store.file_store import get_default_file_store
 from danswer.utils.logger import setup_logger
@@ -75,7 +75,7 @@ def _process_file(
     # Using the PDF reader function directly to pass in password cleanly
     elif extension == ".pdf":
-        file_content_raw = pdf_to_text(file=file, pdf_pass=pdf_pass)
+        file_content_raw, file_metadata = read_pdf_file(file=file, pdf_pass=pdf_pass)
     else:
         file_content_raw = extract_file_text(

File 2 of 4

@@ -41,8 +41,8 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
 from danswer.file_processing.extract_file_text import docx_to_text
-from danswer.file_processing.extract_file_text import pdf_to_text
 from danswer.file_processing.extract_file_text import pptx_to_text
+from danswer.file_processing.extract_file_text import read_pdf_file
 from danswer.utils.batching import batch_generator
 from danswer.utils.logger import setup_logger
@@ -334,7 +334,8 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
         return docx_to_text(file=io.BytesIO(response))
     elif mime_type == GDriveMimeType.PDF.value:
         response = service.files().get_media(fileId=file["id"]).execute()
-        return pdf_to_text(file=io.BytesIO(response))
+        text, _ = read_pdf_file(file=io.BytesIO(response))
+        return text
     elif mime_type == GDriveMimeType.POWERPOINT.value:
         response = service.files().get_media(fileId=file["id"]).execute()
         return pptx_to_text(file=io.BytesIO(response))

File 3 of 4

@@ -27,7 +27,7 @@ from danswer.connectors.interfaces import GenerateDocumentsOutput
 from danswer.connectors.interfaces import LoadConnector
 from danswer.connectors.models import Document
 from danswer.connectors.models import Section
-from danswer.file_processing.extract_file_text import pdf_to_text
+from danswer.file_processing.extract_file_text import read_pdf_file
 from danswer.file_processing.html_utils import web_html_cleanup
 from danswer.utils.logger import setup_logger
 from danswer.utils.sitemap import list_pages_for_site
@@ -284,7 +284,9 @@ class WebConnector(LoadConnector):
             if current_url.split(".")[-1] == "pdf":
                 # PDF files are not checked for links
                 response = requests.get(current_url)
-                page_text = pdf_to_text(file=io.BytesIO(response.content))
+                page_text, metadata = read_pdf_file(
+                    file=io.BytesIO(response.content)
+                )

                 doc_batch.append(
                     Document(
@@ -292,7 +294,7 @@ class WebConnector(LoadConnector):
                         sections=[Section(link=current_url, text=page_text)],
                         source=DocumentSource.WEB,
                         semantic_identifier=current_url.split("/")[-1],
-                        metadata={},
+                        metadata=metadata,
                     )
                 )
                 continue
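
Taken together, these two hunks mean a crawled PDF's document info now reaches the stored Document instead of an empty dict. A rough sketch of that flow, using only the names visible in this diff (the id field and the DocumentSource import path are assumptions, and the URL is made up):

    import io

    import requests

    from danswer.configs.constants import DocumentSource  # assumed import path
    from danswer.connectors.models import Document
    from danswer.connectors.models import Section
    from danswer.file_processing.extract_file_text import read_pdf_file

    pdf_url = "https://example.com/whitepaper.pdf"  # hypothetical URL
    response = requests.get(pdf_url)
    page_text, metadata = read_pdf_file(file=io.BytesIO(response.content))

    doc = Document(
        id=pdf_url,  # assumed field; not shown in the visible hunk context
        sections=[Section(link=pdf_url, text=page_text)],
        source=DocumentSource.WEB,
        semantic_identifier=pdf_url.split("/")[-1],
        metadata=metadata,  # previously hard-coded to {}
    )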

File 4 of 4

@@ -177,7 +177,11 @@ def read_text_file(
     return file_content_raw, metadata


-def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
+def read_pdf_file(
+    file: IO[Any],
+    pdf_pass: str | None = None,
+) -> str:
+    metadata = {}
     try:
         pdf_reader = PdfReader(file)
@@ -197,8 +201,16 @@ def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
                 # can be discoverable by title.
                 return ""

-        return TEXT_SECTION_SEPARATOR.join(
-            page.extract_text() for page in pdf_reader.pages
+        # Extract metadata from the PDF, removing leading '/' from keys if present
+        # This standardizes the metadata keys for consistency
+        metadata = {
+            k[1:] if k.startswith("/") else k: v for k, v in pdf_reader.metadata.items()
+        }
+        return (
+            TEXT_SECTION_SEPARATOR.join(
+                page.extract_text() for page in pdf_reader.pages
+            ),
+            metadata,
         )
     except PdfStreamError:
         logger.exception("PDF file is not a valid PDF")
@@ -207,7 +219,7 @@ def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
     # File is still discoverable by title
     # but the contents are not included as they cannot be parsed
-    return ""
+    return "", metadata


 def docx_to_text(file: IO[Any]) -> str:
@@ -273,7 +285,7 @@ def extract_file_text(
     break_on_unprocessable: bool = True,
 ) -> str:
     extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
-        ".pdf": pdf_to_text,
+        ".pdf": read_pdf_file,
         ".docx": docx_to_text,
         ".pptx": pptx_to_text,
         ".xlsx": xlsx_to_text,