Allow PDF files in File Connector (#488)

2025-05-30 17:50:27 +02:00 · 2023-10-02 11:24:40 +05:30 · 2023-10-02 11:24:40 +05:30 · a808c733b8
commit a808c733b8
parent 2d06008f6f
3 changed files with 26 additions and 13 deletions
--- a/backend/danswer/connectors/file/connector.py
+++ b/backend/danswer/connectors/file/connector.py
@@ -6,6 +6,8 @@ from pathlib import Path
 from typing import Any
 from typing import IO

+from PyPDF2 import PdfReader
+
 from danswer.configs.app_configs import INDEX_BATCH_SIZE
 from danswer.configs.constants import DocumentSource
 from danswer.connectors.file.utils import check_file_ext_is_valid
@@ -38,8 +40,11 @@ def _open_files_at_location(

    if extension == ".zip":
        yield from _get_files_from_zip(file_path)
-    elif extension == ".txt":
-        with open(file_path, "r") as file:
+    elif extension == ".txt" or extension == ".pdf":
+        mode = "r"
+        if extension == ".pdf":
+            mode = "rb"
+        with open(file_path, mode) as file:
            yield os.path.basename(file_path), file
    else:
        logger.warning(f"Skipping file '{file_path}' with extension '{extension}'")
@@ -53,15 +58,22 @@ def _process_file(file_name: str, file: IO[Any]) -> list[Document]:

    metadata = {}
    file_content_raw = ""
-    for ind, line in enumerate(file):
-        if isinstance(line, bytes):
-            line = line.decode("utf-8")
-        line = str(line)
+    if extension == ".pdf":
+        pdf_reader = PdfReader(file)
+        if not pdf_reader.is_encrypted:
+            file_content_raw = "\n".join(
+                page.extract_text() for page in pdf_reader.pages
+            )
+    else:
+        for ind, line in enumerate(file):
+            if isinstance(line, bytes):
+                line = line.decode("utf-8")
+            line = str(line)

-        if ind == 0 and line.startswith(_METADATA_FLAG):
-            metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip())
-        else:
-            file_content_raw += line
+            if ind == 0 and line.startswith(_METADATA_FLAG):
+                metadata = json.loads(line.replace(_METADATA_FLAG, "", 1).strip())
+            else:
+                file_content_raw += line

    return [
        Document(
--- a/backend/danswer/connectors/file/utils.py
+++ b/backend/danswer/connectors/file/utils.py
@@ -9,7 +9,7 @@ from typing import IO
 from danswer.configs.app_configs import FILE_CONNECTOR_TMP_STORAGE_PATH

 _FILE_AGE_CLEANUP_THRESHOLD_HOURS = 24 * 7  # 1 week
-_VALID_FILE_EXTENSIONS = [".txt", ".zip"]
+_VALID_FILE_EXTENSIONS = [".txt", ".zip", ".pdf"]


 def get_file_ext(file_path_or_name: str | Path) -> str:
--- a/web/src/app/admin/connectors/file/page.tsx
+++ b/web/src/app/admin/connectors/file/page.tsx
@@ -55,8 +55,9 @@ const Main = () => {
      {filesAreUploading && <Spinner />}
      <p className="text-sm mb-2">
        Specify files below, click the <b>Upload</b> button, and the contents of
-        these files will be searchable via Danswer! Currently only <i>.txt</i>{" "}
-        and <i>.zip</i> files (containing only <i>.txt</i> files) are supported.
+        these files will be searchable via Danswer! Currently only <i>.txt</i>,{" "}
+        <i>.pdf</i> and <i>.zip</i> files (containing only <i>.txt</i> files)
+        are supported.
      </p>
      <div className="text-sm mb-3">
        <b>NOTE:</b> if the original document is accessible via a link, you can