refactor file extension checking and add test for blob s3 (#4369)

* refactor file extension checking and add test for blob s3 * code review * fix checking ext --------- Co-authored-by: Richard Kuo (Onyx) <rkuo@onyx.app>
2025-04-09 20:39:29 +02:00 · 2025-03-27 11:57:44 -07:00 · 2025-03-27 11:57:44 -07:00 · f08fa878a6
commit f08fa878a6
parent d307534781
7 changed files with 134 additions and 14 deletions
--- a/.github/workflows/pr-python-connector-tests.yml
+++ b/.github/workflows/pr-python-connector-tests.yml
@ -9,6 +9,10 @@ on:
    - cron: "0 16 * * *"

 env:
+  # AWS
+  AWS_ACCESS_KEY_ID_DAILY_CONNECTOR_TESTS: ${{ secrets.AWS_ACCESS_KEY_ID_DAILY_CONNECTOR_TESTS }}
+  AWS_SECRET_ACCESS_KEY_DAILY_CONNECTOR_TESTS: ${{ secrets.AWS_SECRET_ACCESS_KEY_DAILY_CONNECTOR_TESTS }}
+  
  # Confluence
  CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
  CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }}
--- a/backend/onyx/connectors/blob/connector.py
+++ b/backend/onyx/connectors/blob/connector.py
@ -87,7 +87,7 @@ class BlobStorageConnector(LoadConnector, PollConnector):
                credentials.get(key)
                for key in ["aws_access_key_id", "aws_secret_access_key"]
            ):
-                raise ConnectorMissingCredentialError("Google Cloud Storage")
+                raise ConnectorMissingCredentialError("Amazon S3")

            session = boto3.Session(
                aws_access_key_id=credentials["aws_access_key_id"],
--- a/backend/onyx/connectors/egnyte/connector.py
+++ b/backend/onyx/connectors/egnyte/connector.py
@ -28,8 +28,9 @@ from onyx.connectors.models import TextSection
 from onyx.file_processing.extract_file_text import detect_encoding
 from onyx.file_processing.extract_file_text import extract_file_text
 from onyx.file_processing.extract_file_text import get_file_ext
+from onyx.file_processing.extract_file_text import is_accepted_file_ext
 from onyx.file_processing.extract_file_text import is_text_file_extension
-from onyx.file_processing.extract_file_text import is_valid_file_ext
+from onyx.file_processing.extract_file_text import OnyxExtensionType
 from onyx.file_processing.extract_file_text import read_text_file
 from onyx.utils.logger import setup_logger
 from onyx.utils.retry_wrapper import request_with_retries
@ -69,7 +70,9 @@ def _process_egnyte_file(

    file_name = file_metadata["name"]
    extension = get_file_ext(file_name)
-    if not is_valid_file_ext(extension):
+    if not is_accepted_file_ext(
+        extension, OnyxExtensionType.Plain | OnyxExtensionType.Document
+    ):
        logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
        return None

--- a/backend/onyx/connectors/file/connector.py
+++ b/backend/onyx/connectors/file/connector.py
@ -22,8 +22,9 @@ from onyx.db.engine import get_session_with_current_tenant
 from onyx.db.pg_file_store import get_pgfilestore_by_file_name
 from onyx.file_processing.extract_file_text import extract_text_and_images
 from onyx.file_processing.extract_file_text import get_file_ext
-from onyx.file_processing.extract_file_text import is_valid_file_ext
+from onyx.file_processing.extract_file_text import is_accepted_file_ext
 from onyx.file_processing.extract_file_text import load_files_from_zip
+from onyx.file_processing.extract_file_text import OnyxExtensionType
 from onyx.file_processing.image_utils import store_image_and_create_section
 from onyx.file_store.file_store import get_default_file_store
 from onyx.utils.logger import setup_logger
@ -51,7 +52,7 @@ def _read_files_and_metadata(
            file_content, ignore_dirs=True
        ):
            yield os.path.join(directory_path, file_info.filename), subfile, metadata
-    elif is_valid_file_ext(extension):
+    elif is_accepted_file_ext(extension, OnyxExtensionType.All):
        yield file_name, file_content, metadata
    else:
        logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
@ -122,7 +123,7 @@ def _process_file(
        logger.warning(f"No file record found for '{file_name}' in PG; skipping.")
        return []

-    if not is_valid_file_ext(extension):
+    if not is_accepted_file_ext(extension, OnyxExtensionType.All):
        logger.warning(
            f"Skipping file '{file_name}' with unrecognized extension '{extension}'"
        )
--- a/backend/onyx/connectors/highspot/connector.py
+++ b/backend/onyx/connectors/highspot/connector.py
@ -20,8 +20,8 @@ from onyx.connectors.models import ConnectorMissingCredentialError
 from onyx.connectors.models import Document
 from onyx.connectors.models import SlimDocument
 from onyx.connectors.models import TextSection
+from onyx.file_processing.extract_file_text import ALL_ACCEPTED_FILE_EXTENSIONS
 from onyx.file_processing.extract_file_text import extract_file_text
-from onyx.file_processing.extract_file_text import VALID_FILE_EXTENSIONS
 from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
 from onyx.utils.logger import setup_logger

@ -298,7 +298,7 @@ class HighspotConnector(LoadConnector, PollConnector, SlimConnector):

            elif (
                is_valid_format
-                and file_extension in VALID_FILE_EXTENSIONS
+                and file_extension in ALL_ACCEPTED_FILE_EXTENSIONS
                and can_download
            ):
                # For documents, try to get the text content
--- a/backend/onyx/file_processing/extract_file_text.py
+++ b/backend/onyx/file_processing/extract_file_text.py
@ -7,6 +7,8 @@ from collections.abc import Callable
 from collections.abc import Iterator
 from collections.abc import Sequence
 from email.parser import Parser as EmailParser
+from enum import auto
+from enum import IntFlag
 from io import BytesIO
 from pathlib import Path
 from typing import Any
@ -35,7 +37,7 @@ logger = setup_logger()

 TEXT_SECTION_SEPARATOR = "\n\n"

-PLAIN_TEXT_FILE_EXTENSIONS = [
+ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS = [
    ".txt",
    ".md",
    ".mdx",
@ -49,7 +51,7 @@ PLAIN_TEXT_FILE_EXTENSIONS = [
    ".yaml",
 ]

-VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
+ACCEPTED_DOCUMENT_FILE_EXTENSIONS = [
    ".pdf",
    ".docx",
    ".pptx",
@ -57,12 +59,21 @@ VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
    ".eml",
    ".epub",
    ".html",
+]
+
+ACCEPTED_IMAGE_FILE_EXTENSIONS = [
    ".png",
    ".jpg",
    ".jpeg",
    ".webp",
 ]

+ALL_ACCEPTED_FILE_EXTENSIONS = (
+    ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS
+    + ACCEPTED_DOCUMENT_FILE_EXTENSIONS
+    + ACCEPTED_IMAGE_FILE_EXTENSIONS
+)
+
 IMAGE_MEDIA_TYPES = [
    "image/png",
    "image/jpeg",
@ -70,8 +81,15 @@ IMAGE_MEDIA_TYPES = [
 ]


+class OnyxExtensionType(IntFlag):
+    Plain = auto()
+    Document = auto()
+    Multimedia = auto()
+    All = Plain | Document | Multimedia
+
+
 def is_text_file_extension(file_name: str) -> bool:
-    return any(file_name.endswith(ext) for ext in PLAIN_TEXT_FILE_EXTENSIONS)
+    return any(file_name.endswith(ext) for ext in ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS)


 def get_file_ext(file_path_or_name: str | Path) -> str:
@ -83,8 +101,20 @@ def is_valid_media_type(media_type: str) -> bool:
    return media_type in IMAGE_MEDIA_TYPES


-def is_valid_file_ext(ext: str) -> bool:
-    return ext in VALID_FILE_EXTENSIONS
+def is_accepted_file_ext(ext: str, ext_type: OnyxExtensionType) -> bool:
+    if ext_type & OnyxExtensionType.Plain:
+        if ext in ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS:
+            return True
+
+    if ext_type & OnyxExtensionType.Document:
+        if ext in ACCEPTED_DOCUMENT_FILE_EXTENSIONS:
+            return True
+
+    if ext_type & OnyxExtensionType.Multimedia:
+        if ext in ACCEPTED_IMAGE_FILE_EXTENSIONS:
+            return True
+
+    return False


 def is_text_file(file: IO[bytes]) -> bool:
@ -382,6 +412,9 @@ def extract_file_text(
    """
    Legacy function that returns *only text*, ignoring embedded images.
    For backward-compatibility in code that only wants text.
+
+    NOTE: Ignoring seems to be defined as returning an empty string for files it can't
+    handle (such as images).
    """
    extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
        ".pdf": pdf_to_text,
@ -405,7 +438,9 @@ def extract_file_text(
        if extension is None:
            extension = get_file_ext(file_name)

-        if is_valid_file_ext(extension):
+        if is_accepted_file_ext(
+            extension, OnyxExtensionType.Plain | OnyxExtensionType.Document
+        ):
            func = extension_to_function.get(extension, file_io_to_text)
            file.seek(0)
            return func(file)
--- a/backend/tests/daily/connectors/blob/test_blob_connector.py
+++ b/backend/tests/daily/connectors/blob/test_blob_connector.py
@ -0,0 +1,77 @@
+import os
+from unittest.mock import MagicMock
+from unittest.mock import patch
+
+import pytest
+
+from onyx.configs.constants import BlobType
+from onyx.connectors.blob.connector import BlobStorageConnector
+from onyx.connectors.models import Document
+from onyx.connectors.models import TextSection
+from onyx.file_processing.extract_file_text import ACCEPTED_DOCUMENT_FILE_EXTENSIONS
+from onyx.file_processing.extract_file_text import ACCEPTED_IMAGE_FILE_EXTENSIONS
+from onyx.file_processing.extract_file_text import ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS
+from onyx.file_processing.extract_file_text import get_file_ext
+
+
+@pytest.fixture
+def blob_connector(request: pytest.FixtureRequest) -> BlobStorageConnector:
+    connector = BlobStorageConnector(
+        bucket_type=BlobType.S3, bucket_name="onyx-connector-tests"
+    )
+
+    connector.load_credentials(
+        {
+            "aws_access_key_id": os.environ["AWS_ACCESS_KEY_ID_DAILY_CONNECTOR_TESTS"],
+            "aws_secret_access_key": os.environ[
+                "AWS_SECRET_ACCESS_KEY_DAILY_CONNECTOR_TESTS"
+            ],
+        }
+    )
+
+    return connector
+
+
+@patch(
+    "onyx.file_processing.extract_file_text.get_unstructured_api_key",
+    return_value=None,
+)
+def test_blob_s3_connector(
+    mock_get_api_key: MagicMock, blob_connector: BlobStorageConnector
+) -> None:
+    """
+    Plain and document file types should be fully indexed.
+
+    Multimedia and unknown file types will be indexed by title only with one empty section.
+
+    This is intentional in order to allow searching by just the title even if we can't
+    index the file content.
+    """
+    all_docs: list[Document] = []
+    document_batches = blob_connector.load_from_state()
+    for doc_batch in document_batches:
+        for doc in doc_batch:
+            all_docs.append(doc)
+
+    #
+    assert len(all_docs) == 19
+
+    for doc in all_docs:
+        section = doc.sections[0]
+        assert isinstance(section, TextSection)
+
+        file_extension = get_file_ext(doc.semantic_identifier)
+        if file_extension in ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS:
+            assert len(section.text) > 0
+            continue
+
+        if file_extension in ACCEPTED_DOCUMENT_FILE_EXTENSIONS:
+            assert len(section.text) > 0
+            continue
+
+        if file_extension in ACCEPTED_IMAGE_FILE_EXTENSIONS:
+            assert len(section.text) == 0
+            continue
+
+        # unknown extension
+        assert len(section.text) == 0