diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7f3d54513..dea74adb6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,4 @@ - + # Contributing to Onyx diff --git a/README.md b/README.md index edb32ec67..1b82f3b0e 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ - + diff --git a/backend/alembic/README.md b/backend/alembic/README.md index d0db92766..b7d294dd4 100644 --- a/backend/alembic/README.md +++ b/backend/alembic/README.md @@ -1,4 +1,4 @@ - + # Alembic DB Migrations diff --git a/backend/alembic/versions/6a804aeb4830_duplicated_no_harm_user_file_migration.py b/backend/alembic/versions/6a804aeb4830_duplicated_no_harm_user_file_migration.py index 0ace7836a..ad803edd0 100644 --- a/backend/alembic/versions/6a804aeb4830_duplicated_no_harm_user_file_migration.py +++ b/backend/alembic/versions/6a804aeb4830_duplicated_no_harm_user_file_migration.py @@ -6,12 +6,6 @@ Create Date: 2025-04-01 07:26:10.539362 """ -from alembic import op -import sqlalchemy as sa -from sqlalchemy import inspect -import datetime - - # revision identifiers, used by Alembic. revision = "6a804aeb4830" down_revision = "8e1ac4f39a9f" @@ -19,99 +13,10 @@ branch_labels = None depends_on = None +# Leaving this around only because some people might be on this migration +# originally was a duplicate of the user files migration def upgrade() -> None: - # Check if user_file table already exists - conn = op.get_bind() - inspector = inspect(conn) - - if not inspector.has_table("user_file"): - # Create user_folder table without parent_id - op.create_table( - "user_folder", - sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True), - sa.Column("user_id", sa.UUID(), sa.ForeignKey("user.id"), nullable=True), - sa.Column("name", sa.String(length=255), nullable=True), - sa.Column("description", sa.String(length=255), nullable=True), - sa.Column("display_priority", sa.Integer(), nullable=True, default=0), - sa.Column( - "created_at", sa.DateTime(timezone=True), server_default=sa.func.now() - ), - ) - - # Create user_file table with folder_id instead of parent_folder_id - op.create_table( - "user_file", - sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True), - sa.Column("user_id", sa.UUID(), sa.ForeignKey("user.id"), nullable=True), - sa.Column( - "folder_id", - sa.Integer(), - sa.ForeignKey("user_folder.id"), - nullable=True, - ), - sa.Column("link_url", sa.String(), nullable=True), - sa.Column("token_count", sa.Integer(), nullable=True), - sa.Column("file_type", sa.String(), nullable=True), - sa.Column("file_id", sa.String(length=255), nullable=False), - sa.Column("document_id", sa.String(length=255), nullable=False), - sa.Column("name", sa.String(length=255), nullable=False), - sa.Column( - "created_at", - sa.DateTime(), - default=datetime.datetime.utcnow, - ), - sa.Column( - "cc_pair_id", - sa.Integer(), - sa.ForeignKey("connector_credential_pair.id"), - nullable=True, - unique=True, - ), - ) - - # Create persona__user_file table - op.create_table( - "persona__user_file", - sa.Column( - "persona_id", - sa.Integer(), - sa.ForeignKey("persona.id"), - primary_key=True, - ), - sa.Column( - "user_file_id", - sa.Integer(), - sa.ForeignKey("user_file.id"), - primary_key=True, - ), - ) - - # Create persona__user_folder table - op.create_table( - "persona__user_folder", - sa.Column( - "persona_id", - sa.Integer(), - sa.ForeignKey("persona.id"), - primary_key=True, - ), - sa.Column( - "user_folder_id", - sa.Integer(), - sa.ForeignKey("user_folder.id"), - primary_key=True, - ), - ) - - 
op.add_column( - "connector_credential_pair", - sa.Column("is_user_file", sa.Boolean(), nullable=True, default=False), - ) - - # Update existing records to have is_user_file=False instead of NULL - op.execute( - "UPDATE connector_credential_pair SET is_user_file = FALSE WHERE is_user_file IS NULL" - ) + pass def downgrade() -> None: diff --git a/backend/alembic/versions/9aadf32dfeb4_add_user_files.py b/backend/alembic/versions/9aadf32dfeb4_add_user_files.py index 01b3633c4..ce5ec4eed 100644 --- a/backend/alembic/versions/9aadf32dfeb4_add_user_files.py +++ b/backend/alembic/versions/9aadf32dfeb4_add_user_files.py @@ -103,6 +103,7 @@ def upgrade() -> None: def downgrade() -> None: + op.drop_column("connector_credential_pair", "is_user_file") # Drop the persona__user_folder table op.drop_table("persona__user_folder") # Drop the persona__user_file table @@ -111,4 +112,3 @@ def downgrade() -> None: op.drop_table("user_file") # Drop the user_folder table op.drop_table("user_folder") - op.drop_column("connector_credential_pair", "is_user_file") diff --git a/backend/onyx/connectors/README.md b/backend/onyx/connectors/README.md index b97e7afe6..fc9619a7f 100644 --- a/backend/onyx/connectors/README.md +++ b/backend/onyx/connectors/README.md @@ -1,4 +1,4 @@ - + # Writing a new Onyx Connector diff --git a/backend/onyx/connectors/file/connector.py b/backend/onyx/connectors/file/connector.py index 13ff1fc25..5e1d819d2 100644 --- a/backend/onyx/connectors/file/connector.py +++ b/backend/onyx/connectors/file/connector.py @@ -30,7 +30,7 @@ from onyx.utils.logger import setup_logger logger = setup_logger() -def _read_file_from_postgres( +def _read_file_from_filestore( file_name: str, db_session: Session, ) -> IO | None: @@ -307,7 +307,7 @@ class LocalFileConnector(LoadConnector): for file_path in self.file_locations: current_datetime = datetime.now(timezone.utc) - file_io = _read_file_from_postgres( + file_io = _read_file_from_filestore( file_name=file_path, db_session=db_session, ) diff --git a/backend/onyx/file_processing/extract_file_text.py b/backend/onyx/file_processing/extract_file_text.py index 153c61962..febdf54b2 100644 --- a/backend/onyx/file_processing/extract_file_text.py +++ b/backend/onyx/file_processing/extract_file_text.py @@ -148,7 +148,7 @@ def load_files_from_zip( ignore_dirs: bool = True, ) -> Iterator[tuple[zipfile.ZipInfo, IO[Any]]]: """ - If there's a .onyx_metadata.json in the zip, attach those metadata to each subfile. + Iterates through files in a zip archive, yielding (ZipInfo, file handle) pairs. 
""" with zipfile.ZipFile(zip_file_io, "r") as zip_file: for file_info in zip_file.infolist(): diff --git a/backend/tests/integration/common_utils/managers/file.py b/backend/tests/integration/common_utils/managers/file.py index 29f4bedd6..85ebdafc7 100644 --- a/backend/tests/integration/common_utils/managers/file.py +++ b/backend/tests/integration/common_utils/managers/file.py @@ -66,7 +66,10 @@ class FileManager: @staticmethod def upload_file_for_connector( - file_path: str, file_name: str, user_performing_action: DATestUser + file_path: str, + file_name: str, + user_performing_action: DATestUser, + content_type: str = "application/octet-stream", ) -> dict: # Read the file content with open(file_path, "rb") as f: @@ -76,7 +79,7 @@ class FileManager: file_obj = io.BytesIO(file_content) # The 'files' form field expects a list of files - files = [("files", (file_name, file_obj, "application/octet-stream"))] + files = [("files", (file_name, file_obj, content_type))] # Use the user's headers but without Content-Type # as requests will set the correct multipart/form-data Content-Type for us diff --git a/backend/tests/integration/tests/indexing/file_connector/test_file_connector_zip_metadata.py b/backend/tests/integration/tests/indexing/file_connector/test_file_connector_zip_metadata.py new file mode 100644 index 000000000..d3bb4150a --- /dev/null +++ b/backend/tests/integration/tests/indexing/file_connector/test_file_connector_zip_metadata.py @@ -0,0 +1,118 @@ +import json +import os +from datetime import datetime +from datetime import timezone + +import pytest + +from onyx.connectors.models import InputType +from onyx.db.document import get_documents_for_cc_pair +from onyx.db.engine import get_session_context_manager +from onyx.db.enums import AccessType +from onyx.server.documents.models import DocumentSource +from tests.integration.common_utils.managers.cc_pair import CCPairManager +from tests.integration.common_utils.managers.connector import ConnectorManager +from tests.integration.common_utils.managers.credential import CredentialManager +from tests.integration.common_utils.managers.file import FileManager +from tests.integration.common_utils.managers.user import UserManager +from tests.integration.common_utils.test_models import DATestUser +from tests.integration.common_utils.vespa import vespa_fixture + + +# This is a placeholder - you'll need to create this zip file with actual test files +TEST_FILES_BASE = "tests/integration/tests/indexing/file_connector/test_files" +TEST_META_ZIP_PATH = f"{TEST_FILES_BASE}/with_meta.zip" +TEST_NO_META_ZIP_PATH = f"{TEST_FILES_BASE}/without_meta.zip" +TEST_METADATA_FILE = f"{TEST_FILES_BASE}/.onyx_metadata.json" + + +@pytest.mark.parametrize( + "zip_path, has_metadata", + [ + (TEST_META_ZIP_PATH, True), + (TEST_NO_META_ZIP_PATH, False), + ], +) +def test_zip_metadata_handling( + reset: None, + vespa_client: vespa_fixture, + zip_path: str, + has_metadata: bool, +) -> None: + before = datetime.now(timezone.utc) + # Create an admin user + admin_user: DATestUser = UserManager.create( + email="admin@onyx-test.com", + ) + + # Upload the test zip file (simulate this happening from frontend) + upload_response = FileManager.upload_file_for_connector( + file_path=zip_path, + file_name=os.path.basename(zip_path), + user_performing_action=admin_user, + content_type="application/zip", + ) + + file_paths = upload_response.get("file_paths", []) + assert file_paths, "File upload failed - no file paths returned" + if has_metadata: + metadata = 
upload_response.get("zip_metadata", {}) + assert metadata, "Metadata should be present" + else: + metadata = {} + + # Create a dummy credential for the file connector + credential = CredentialManager.create( + source=DocumentSource.FILE, + credential_json={}, + user_performing_action=admin_user, + ) + + # Create the connector + connector_name = f"FileConnector-{int(datetime.now().timestamp())}" + connector = ConnectorManager.create( + name=connector_name, + source=DocumentSource.FILE, + input_type=InputType.LOAD_STATE, + connector_specific_config={ + "file_locations": file_paths, + "zip_metadata": metadata, + }, + access_type=AccessType.PUBLIC, + groups=[], + user_performing_action=admin_user, + ) + + # Link the credential to the connector + cc_pair = CCPairManager.create( + credential_id=credential.id, + connector_id=connector.id, + access_type=AccessType.PUBLIC, + user_performing_action=admin_user, + ) + + # Run the connector to index the files + CCPairManager.run_once( + cc_pair, from_beginning=True, user_performing_action=admin_user + ) + CCPairManager.wait_for_indexing_completion( + cc_pair=cc_pair, after=before, user_performing_action=admin_user + ) + + # Get the indexed documents + with get_session_context_manager() as db_session: + documents = get_documents_for_cc_pair(db_session, cc_pair.id) + + # Expected metadata from the .onyx_metadata.json file + with open(TEST_METADATA_FILE, "r") as f: + expected_metadata = json.load(f) + + # Verify each document has the correct metadata + for doc in documents: + filename = doc.semantic_id + if filename in expected_metadata: + expected = expected_metadata[filename] + assert ( + doc.semantic_id == expected["display_name"] + ), f"Display name mismatch for {filename}" + assert doc.link == expected["link"], f"Link mismatch for {filename}" diff --git a/backend/tests/integration/tests/indexing/file_connector/test_files/.onyx_metadata.json b/backend/tests/integration/tests/indexing/file_connector/test_files/.onyx_metadata.json new file mode 100644 index 000000000..8006261f3 --- /dev/null +++ b/backend/tests/integration/tests/indexing/file_connector/test_files/.onyx_metadata.json @@ -0,0 +1,16 @@ +[ + { + "filename": "sample1.txt", + "link": "https://www.google.com", + "file_display_name": "Basically Google", + "primary_owners": ["evan@onyx.app"], + "status": "bingle bongle" + }, + { + "filename": "sample2.txt", + "link": "https://www.youtube.com", + "file_display_name": "Pretty much youtube", + "primary_owners": ["chris@onyx.app"], + "status": "not bingle bongle" + } +] \ No newline at end of file diff --git a/backend/tests/integration/tests/indexing/file_connector/test_files/sample1.txt b/backend/tests/integration/tests/indexing/file_connector/test_files/sample1.txt new file mode 100644 index 000000000..2e8c561bd --- /dev/null +++ b/backend/tests/integration/tests/indexing/file_connector/test_files/sample1.txt @@ -0,0 +1,19 @@ +The following contains some excerpts from our docs. + +The File Connector indexes user uploaded files. Currently supports .txt, .pdf, .docx, .pptx, .xlsx, .csv, .md, .mdx, .conf, .log, .json, .tsv, .xml, .yml, .yaml, .eml, and .epub files. +You can also upload a .zip containing these files - If there are other file types in the zip, the other file types are ignored. +There is also an optional metadata line that supports links, document owners, and time updated as metadata for Onyx’s retrieval and AI Answer. 
+ +The metadata line should be placed at the very top of the file and can take one of two formats: + +#ONYX_METADATA={"link": ""} + +Where ONYX_METADATA= is followed by a json. The valid json keys are: + +link +primary_owners +secondary_owners +doc_updated_at +file_display_name +You can also include arbitrary key/value pairs which will be understood as “tags”. +These tags can then be used in the UI as a filter if you want to constrain your search / conversation to only documents with certain tag(s) attached \ No newline at end of file diff --git a/backend/tests/integration/tests/indexing/file_connector/test_files/sample2.txt b/backend/tests/integration/tests/indexing/file_connector/test_files/sample2.txt new file mode 100644 index 000000000..d551c03d5 --- /dev/null +++ b/backend/tests/integration/tests/indexing/file_connector/test_files/sample2.txt @@ -0,0 +1 @@ +Hello, I hope you're having a wonderful day! \ No newline at end of file diff --git a/backend/tests/integration/tests/indexing/file_connector/test_files/with_meta.zip b/backend/tests/integration/tests/indexing/file_connector/test_files/with_meta.zip new file mode 100644 index 000000000..ff573813c Binary files /dev/null and b/backend/tests/integration/tests/indexing/file_connector/test_files/with_meta.zip differ diff --git a/backend/tests/integration/tests/indexing/file_connector/test_files/without_meta.zip b/backend/tests/integration/tests/indexing/file_connector/test_files/without_meta.zip new file mode 100644 index 000000000..f6bc56cc0 Binary files /dev/null and b/backend/tests/integration/tests/indexing/file_connector/test_files/without_meta.zip differ diff --git a/deployment/README.md b/deployment/README.md index d3cbdf4d9..c818bdc5f 100644 --- a/deployment/README.md +++ b/deployment/README.md @@ -1,4 +1,4 @@ - + # Deploying Onyx diff --git a/deployment/docker_compose/README.md b/deployment/docker_compose/README.md index de61e4122..c2a95a37c 100644 --- a/deployment/docker_compose/README.md +++ b/deployment/docker_compose/README.md @@ -1,4 +1,4 @@ - + # Deploying Onyx using Docker Compose diff --git a/web/README.md b/web/README.md index 6644db26b..8a7d4def3 100644 --- a/web/README.md +++ b/web/README.md @@ -1,4 +1,4 @@ - + This is a [Next.js](https://nextjs.org/) project bootstrapped with [`create-next-app`](https://github.com/vercel/next.js/tree/canary/packages/create-next-app).
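
The test fixtures added above define `.onyx_metadata.json` as a JSON list of per-file entries, each keyed by `filename` and carrying fields like `link` and `file_display_name`. For context on what the zip-metadata test is exercising, below is a minimal sketch of reading such a manifest out of an uploaded zip and indexing it by filename. The helper name `read_zip_metadata`, the dict-by-filename return shape, and the in-memory demo are illustrative assumptions for this sketch, not the connector's actual implementation; only the manifest filename and entry keys come from the fixtures in this diff.

```python
import io
import json
import zipfile

# Manifest name taken from the test fixtures in this diff.
ONYX_METADATA_FILENAME = ".onyx_metadata.json"


def read_zip_metadata(zip_bytes: bytes) -> dict[str, dict]:
    """Return {filename: metadata entry} for a zip that may contain a
    .onyx_metadata.json manifest (a JSON list of per-file dicts).

    Illustrative only; not the connector's actual helper.
    """
    with zipfile.ZipFile(io.BytesIO(zip_bytes), "r") as zf:
        if ONYX_METADATA_FILENAME not in zf.namelist():
            # No manifest in the archive (the without_meta.zip case).
            return {}
        with zf.open(ONYX_METADATA_FILENAME) as f:
            entries = json.load(f)
    # Each entry names its target file via "filename"; index by it so
    # per-file lookups are constant time while iterating the archive.
    return {e["filename"]: e for e in entries if "filename" in e}


if __name__ == "__main__":
    # Build a tiny in-memory zip mirroring the with_meta.zip fixture layout.
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as zf:
        zf.writestr("sample1.txt", "The following contains some excerpts from our docs.")
        zf.writestr(
            ONYX_METADATA_FILENAME,
            json.dumps(
                [
                    {
                        "filename": "sample1.txt",
                        "link": "https://www.google.com",
                        "file_display_name": "Basically Google",
                    }
                ]
            ),
        )
    print(read_zip_metadata(buf.getvalue()))
    # -> {'sample1.txt': {'filename': 'sample1.txt', 'link': ..., 'file_display_name': ...}}
```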