Allow PDF files in File Connector (#488)

This commit is contained in:
Jignesh Solanki 2023-10-02 11:24:40 +05:30 committed by GitHub
parent 2d06008f6f
commit a808c733b8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 26 additions and 13 deletions

View File

@ -6,6 +6,8 @@ from pathlib import Path
from typing import Any
from typing import IO
from PyPDF2 import PdfReader
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.file.utils import check_file_ext_is_valid
@ -38,8 +40,11 @@ def _open_files_at_location(
if extension == ".zip":
yield from _get_files_from_zip(file_path)
elif extension == ".txt":
with open(file_path, "r") as file:
elif extension == ".txt" or extension == ".pdf":
mode = "r"
if extension == ".pdf":
mode = "rb"
with open(file_path, mode) as file:
yield os.path.basename(file_path), file
else:
logger.warning(f"Skipping file '{file_path}' with extension '{extension}'")
@ -53,15 +58,22 @@ def _process_file(file_name: str, file: IO[Any]) -> list[Document]:
metadata = {}
file_content_raw = ""
for ind, line in enumerate(file):
if isinstance(line, bytes):
line = line.decode("utf-8")
line = str(line)
if extension == ".pdf":
pdf_reader = PdfReader(file)
if not pdf_reader.is_encrypted:
file_content_raw = "\n".join(
page.extract_text() for page in pdf_reader.pages
)
else:
for ind, line in enumerate(file):
if isinstance(line, bytes):
line = line.decode("utf-8")
line = str(line)
if ind == 0 and line.startswith(_METADATA_FLAG):
metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip())
else:
file_content_raw += line
if ind == 0 and line.startswith(_METADATA_FLAG):
metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip())
else:
file_content_raw += line
return [
Document(

View File

@ -9,7 +9,7 @@ from typing import IO
from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH
_FILE_AGE_CLEANUP_THRESHOLD_HOURS = 24 * 7 # 1 week
_VALID_FILE_EXTENSIONS = [".txt", ".zip"]
_VALID_FILE_EXTENSIONS = [".txt", ".zip", ".pdf"]
def get_file_ext(file_path_or_name: str | Path) -> str:

View File

@ -55,8 +55,9 @@ const Main = () => {
{filesAreUploading && <Spinner />}
<p className="text-sm mb-2">
Specify files below, click the <b>Upload</b> button, and the contents of
these files will be searchable via Danswer! Currently only <i>.txt</i>{" "}
and <i>.zip</i> files (containing only <i>.txt</i> files) are supported.
these files will be searchable via Danswer! Currently only <i>.txt</i>,{" "}
<i>.pdf</i> and <i>.zip</i> files (containing only <i>.txt</i> files)
are supported.
</p>
<div className="text-sm mb-3">
<b>NOTE:</b> if the original document is accessible via a link, you can