mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-26 20:08:38 +02:00
allow pdf file in File Connector (#488)
This commit is contained in:
@@ -6,6 +6,8 @@ from pathlib import Path
|
|||||||
from typing import Any
|
from typing import Any
|
||||||
from typing import IO
|
from typing import IO
|
||||||
|
|
||||||
|
from PyPDF2 import PdfReader
|
||||||
|
|
||||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||||
from danswer.configs.constants import DocumentSource
|
from danswer.configs.constants import DocumentSource
|
||||||
from danswer.connectors.file.utils import check_file_ext_is_valid
|
from danswer.connectors.file.utils import check_file_ext_is_valid
|
||||||
@@ -38,8 +40,11 @@ def _open_files_at_location(
|
|||||||
|
|
||||||
if extension == ".zip":
|
if extension == ".zip":
|
||||||
yield from _get_files_from_zip(file_path)
|
yield from _get_files_from_zip(file_path)
|
||||||
elif extension == ".txt":
|
elif extension == ".txt" or extension == ".pdf":
|
||||||
with open(file_path, "r") as file:
|
mode = "r"
|
||||||
|
if extension == ".pdf":
|
||||||
|
mode = "rb"
|
||||||
|
with open(file_path, mode) as file:
|
||||||
yield os.path.basename(file_path), file
|
yield os.path.basename(file_path), file
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Skipping file '{file_path}' with extension '{extension}'")
|
logger.warning(f"Skipping file '{file_path}' with extension '{extension}'")
|
||||||
@@ -53,6 +58,13 @@ def _process_file(file_name: str, file: IO[Any]) -> list[Document]:
|
|||||||
|
|
||||||
metadata = {}
|
metadata = {}
|
||||||
file_content_raw = ""
|
file_content_raw = ""
|
||||||
|
if extension == ".pdf":
|
||||||
|
pdf_reader = PdfReader(file)
|
||||||
|
if not pdf_reader.is_encrypted:
|
||||||
|
file_content_raw = "\n".join(
|
||||||
|
page.extract_text() for page in pdf_reader.pages
|
||||||
|
)
|
||||||
|
else:
|
||||||
for ind, line in enumerate(file):
|
for ind, line in enumerate(file):
|
||||||
if isinstance(line, bytes):
|
if isinstance(line, bytes):
|
||||||
line = line.decode("utf-8")
|
line = line.decode("utf-8")
|
||||||
|
@@ -9,7 +9,7 @@ from typing import IO
|
|||||||
from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH
|
from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH
|
||||||
|
|
||||||
_FILE_AGE_CLEANUP_THRESHOLD_HOURS = 24 * 7 # 1 week
|
_FILE_AGE_CLEANUP_THRESHOLD_HOURS = 24 * 7 # 1 week
|
||||||
_VALID_FILE_EXTENSIONS = [".txt", ".zip"]
|
_VALID_FILE_EXTENSIONS = [".txt", ".zip", ".pdf"]
|
||||||
|
|
||||||
|
|
||||||
def get_file_ext(file_path_or_name: str | Path) -> str:
|
def get_file_ext(file_path_or_name: str | Path) -> str:
|
||||||
|
@@ -55,8 +55,9 @@ const Main = () => {
|
|||||||
{filesAreUploading && <Spinner />}
|
{filesAreUploading && <Spinner />}
|
||||||
<p className="text-sm mb-2">
|
<p className="text-sm mb-2">
|
||||||
Specify files below, click the <b>Upload</b> button, and the contents of
|
Specify files below, click the <b>Upload</b> button, and the contents of
|
||||||
these files will be searchable via Danswer! Currently only <i>.txt</i>{" "}
|
these files will be searchable via Danswer! Currently only <i>.txt</i>,{" "}
|
||||||
and <i>.zip</i> files (containing only <i>.txt</i> files) are supported.
|
<i>.pdf</i> and <i>.zip</i> files (containing only <i>.txt</i> files)
|
||||||
|
are supported.
|
||||||
</p>
|
</p>
|
||||||
<div className="text-sm mb-3">
|
<div className="text-sm mb-3">
|
||||||
<b>NOTE:</b> if the original document is accessible via a link, you can
|
<b>NOTE:</b> if the original document is accessible via a link, you can
|
||||||
|
Reference in New Issue
Block a user