diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 7f3d54513..dea74adb6 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,4 +1,4 @@
-
+
# Contributing to Onyx
diff --git a/README.md b/README.md
index edb32ec67..1b82f3b0e 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-
+
diff --git a/backend/alembic/README.md b/backend/alembic/README.md
index d0db92766..b7d294dd4 100644
--- a/backend/alembic/README.md
+++ b/backend/alembic/README.md
@@ -1,4 +1,4 @@
-
+
# Alembic DB Migrations
diff --git a/backend/alembic/versions/6a804aeb4830_duplicated_no_harm_user_file_migration.py b/backend/alembic/versions/6a804aeb4830_duplicated_no_harm_user_file_migration.py
index 0ace7836a..ad803edd0 100644
--- a/backend/alembic/versions/6a804aeb4830_duplicated_no_harm_user_file_migration.py
+++ b/backend/alembic/versions/6a804aeb4830_duplicated_no_harm_user_file_migration.py
@@ -6,12 +6,6 @@ Create Date: 2025-04-01 07:26:10.539362
"""
-from alembic import op
-import sqlalchemy as sa
-from sqlalchemy import inspect
-import datetime
-
-
# revision identifiers, used by Alembic.
revision = "6a804aeb4830"
down_revision = "8e1ac4f39a9f"
@@ -19,99 +13,10 @@ branch_labels = None
depends_on = None
+# Originally a duplicate of the user files migration; kept as a no-op only
+# because some deployments may already be on this revision.
def upgrade() -> None:
- # Check if user_file table already exists
- conn = op.get_bind()
- inspector = inspect(conn)
-
- if not inspector.has_table("user_file"):
- # Create user_folder table without parent_id
- op.create_table(
- "user_folder",
- sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
- sa.Column("user_id", sa.UUID(), sa.ForeignKey("user.id"), nullable=True),
- sa.Column("name", sa.String(length=255), nullable=True),
- sa.Column("description", sa.String(length=255), nullable=True),
- sa.Column("display_priority", sa.Integer(), nullable=True, default=0),
- sa.Column(
- "created_at", sa.DateTime(timezone=True), server_default=sa.func.now()
- ),
- )
-
- # Create user_file table with folder_id instead of parent_folder_id
- op.create_table(
- "user_file",
- sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
- sa.Column("user_id", sa.UUID(), sa.ForeignKey("user.id"), nullable=True),
- sa.Column(
- "folder_id",
- sa.Integer(),
- sa.ForeignKey("user_folder.id"),
- nullable=True,
- ),
- sa.Column("link_url", sa.String(), nullable=True),
- sa.Column("token_count", sa.Integer(), nullable=True),
- sa.Column("file_type", sa.String(), nullable=True),
- sa.Column("file_id", sa.String(length=255), nullable=False),
- sa.Column("document_id", sa.String(length=255), nullable=False),
- sa.Column("name", sa.String(length=255), nullable=False),
- sa.Column(
- "created_at",
- sa.DateTime(),
- default=datetime.datetime.utcnow,
- ),
- sa.Column(
- "cc_pair_id",
- sa.Integer(),
- sa.ForeignKey("connector_credential_pair.id"),
- nullable=True,
- unique=True,
- ),
- )
-
- # Create persona__user_file table
- op.create_table(
- "persona__user_file",
- sa.Column(
- "persona_id",
- sa.Integer(),
- sa.ForeignKey("persona.id"),
- primary_key=True,
- ),
- sa.Column(
- "user_file_id",
- sa.Integer(),
- sa.ForeignKey("user_file.id"),
- primary_key=True,
- ),
- )
-
- # Create persona__user_folder table
- op.create_table(
- "persona__user_folder",
- sa.Column(
- "persona_id",
- sa.Integer(),
- sa.ForeignKey("persona.id"),
- primary_key=True,
- ),
- sa.Column(
- "user_folder_id",
- sa.Integer(),
- sa.ForeignKey("user_folder.id"),
- primary_key=True,
- ),
- )
-
- op.add_column(
- "connector_credential_pair",
- sa.Column("is_user_file", sa.Boolean(), nullable=True, default=False),
- )
-
- # Update existing records to have is_user_file=False instead of NULL
- op.execute(
- "UPDATE connector_credential_pair SET is_user_file = FALSE WHERE is_user_file IS NULL"
- )
+ pass
def downgrade() -> None:
diff --git a/backend/alembic/versions/9aadf32dfeb4_add_user_files.py b/backend/alembic/versions/9aadf32dfeb4_add_user_files.py
index 01b3633c4..ce5ec4eed 100644
--- a/backend/alembic/versions/9aadf32dfeb4_add_user_files.py
+++ b/backend/alembic/versions/9aadf32dfeb4_add_user_files.py
@@ -103,6 +103,7 @@ def upgrade() -> None:
def downgrade() -> None:
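+    # Drop the is_user_file column from connector_credential_pair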
+ op.drop_column("connector_credential_pair", "is_user_file")
# Drop the persona__user_folder table
op.drop_table("persona__user_folder")
# Drop the persona__user_file table
@@ -111,4 +112,3 @@ def downgrade() -> None:
op.drop_table("user_file")
# Drop the user_folder table
op.drop_table("user_folder")
- op.drop_column("connector_credential_pair", "is_user_file")
diff --git a/backend/onyx/connectors/README.md b/backend/onyx/connectors/README.md
index b97e7afe6..fc9619a7f 100644
--- a/backend/onyx/connectors/README.md
+++ b/backend/onyx/connectors/README.md
@@ -1,4 +1,4 @@
-
+
# Writing a new Onyx Connector
diff --git a/backend/onyx/connectors/file/connector.py b/backend/onyx/connectors/file/connector.py
index 13ff1fc25..5e1d819d2 100644
--- a/backend/onyx/connectors/file/connector.py
+++ b/backend/onyx/connectors/file/connector.py
@@ -30,7 +30,7 @@ from onyx.utils.logger import setup_logger
logger = setup_logger()
-def _read_file_from_postgres(
+def _read_file_from_filestore(
file_name: str,
db_session: Session,
) -> IO | None:
@@ -307,7 +307,7 @@ class LocalFileConnector(LoadConnector):
for file_path in self.file_locations:
current_datetime = datetime.now(timezone.utc)
- file_io = _read_file_from_postgres(
+ file_io = _read_file_from_filestore(
file_name=file_path,
db_session=db_session,
)
diff --git a/backend/onyx/file_processing/extract_file_text.py b/backend/onyx/file_processing/extract_file_text.py
index 153c61962..febdf54b2 100644
--- a/backend/onyx/file_processing/extract_file_text.py
+++ b/backend/onyx/file_processing/extract_file_text.py
@@ -148,7 +148,7 @@ def load_files_from_zip(
ignore_dirs: bool = True,
) -> Iterator[tuple[zipfile.ZipInfo, IO[Any]]]:
"""
- If there's a .onyx_metadata.json in the zip, attach those metadata to each subfile.
+ Iterates through files in a zip archive, yielding (ZipInfo, file handle) pairs.
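+
+    A minimal usage sketch (the archive path is illustrative and assumes only the
+    zip file object needs to be passed):
+
+        with open("archive.zip", "rb") as f:
+            for file_info, file_io in load_files_from_zip(f):
+                print(file_info.filename, len(file_io.read()))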
"""
with zipfile.ZipFile(zip_file_io, "r") as zip_file:
for file_info in zip_file.infolist():
diff --git a/backend/tests/integration/common_utils/managers/file.py b/backend/tests/integration/common_utils/managers/file.py
index 29f4bedd6..85ebdafc7 100644
--- a/backend/tests/integration/common_utils/managers/file.py
+++ b/backend/tests/integration/common_utils/managers/file.py
@@ -66,7 +66,10 @@ class FileManager:
@staticmethod
def upload_file_for_connector(
- file_path: str, file_name: str, user_performing_action: DATestUser
+ file_path: str,
+ file_name: str,
+ user_performing_action: DATestUser,
+ content_type: str = "application/octet-stream",
) -> dict:
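+        # content_type sets the MIME type sent for the uploaded file part;
+        # callers can pass e.g. "application/zip" when uploading zip archives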
# Read the file content
with open(file_path, "rb") as f:
@@ -76,7 +79,7 @@ class FileManager:
file_obj = io.BytesIO(file_content)
# The 'files' form field expects a list of files
- files = [("files", (file_name, file_obj, "application/octet-stream"))]
+ files = [("files", (file_name, file_obj, content_type))]
# Use the user's headers but without Content-Type
# as requests will set the correct multipart/form-data Content-Type for us
diff --git a/backend/tests/integration/tests/indexing/file_connector/test_file_connector_zip_metadata.py b/backend/tests/integration/tests/indexing/file_connector/test_file_connector_zip_metadata.py
new file mode 100644
index 000000000..d3bb4150a
--- /dev/null
+++ b/backend/tests/integration/tests/indexing/file_connector/test_file_connector_zip_metadata.py
@@ -0,0 +1,118 @@
+import json
+import os
+from datetime import datetime
+from datetime import timezone
+
+import pytest
+
+from onyx.connectors.models import InputType
+from onyx.db.document import get_documents_for_cc_pair
+from onyx.db.engine import get_session_context_manager
+from onyx.db.enums import AccessType
+from onyx.server.documents.models import DocumentSource
+from tests.integration.common_utils.managers.cc_pair import CCPairManager
+from tests.integration.common_utils.managers.connector import ConnectorManager
+from tests.integration.common_utils.managers.credential import CredentialManager
+from tests.integration.common_utils.managers.file import FileManager
+from tests.integration.common_utils.managers.user import UserManager
+from tests.integration.common_utils.test_models import DATestUser
+from tests.integration.common_utils.vespa import vespa_fixture
+
+
+# Zip fixtures for the parametrized test below: with_meta.zip is built with a
+# .onyx_metadata.json entry, without_meta.zip is not; the standalone
+# .onyx_metadata.json holds the expected values used for verification.
+TEST_FILES_BASE = "tests/integration/tests/indexing/file_connector/test_files"
+TEST_META_ZIP_PATH = f"{TEST_FILES_BASE}/with_meta.zip"
+TEST_NO_META_ZIP_PATH = f"{TEST_FILES_BASE}/without_meta.zip"
+TEST_METADATA_FILE = f"{TEST_FILES_BASE}/.onyx_metadata.json"
+
+
+@pytest.mark.parametrize(
+ "zip_path, has_metadata",
+ [
+ (TEST_META_ZIP_PATH, True),
+ (TEST_NO_META_ZIP_PATH, False),
+ ],
+)
+def test_zip_metadata_handling(
+ reset: None,
+ vespa_client: vespa_fixture,
+ zip_path: str,
+ has_metadata: bool,
+) -> None:
+ before = datetime.now(timezone.utc)
+ # Create an admin user
+ admin_user: DATestUser = UserManager.create(
+ email="admin@onyx-test.com",
+ )
+
+ # Upload the test zip file (simulate this happening from frontend)
+ upload_response = FileManager.upload_file_for_connector(
+ file_path=zip_path,
+ file_name=os.path.basename(zip_path),
+ user_performing_action=admin_user,
+ content_type="application/zip",
+ )
+
+ file_paths = upload_response.get("file_paths", [])
+ assert file_paths, "File upload failed - no file paths returned"
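+
+    # When the zip bundles a .onyx_metadata.json, the upload endpoint is expected
+    # to parse it and return it as zip_metadata in the response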
+ if has_metadata:
+ metadata = upload_response.get("zip_metadata", {})
+ assert metadata, "Metadata should be present"
+ else:
+ metadata = {}
+
+ # Create a dummy credential for the file connector
+ credential = CredentialManager.create(
+ source=DocumentSource.FILE,
+ credential_json={},
+ user_performing_action=admin_user,
+ )
+
+ # Create the connector
+ connector_name = f"FileConnector-{int(datetime.now().timestamp())}"
+ connector = ConnectorManager.create(
+ name=connector_name,
+ source=DocumentSource.FILE,
+ input_type=InputType.LOAD_STATE,
+ connector_specific_config={
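+            # file_locations comes from the upload response; zip_metadata is the
+            # metadata parsed from the zip's .onyx_metadata.json (empty when the
+            # archive did not include one)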
+ "file_locations": file_paths,
+ "zip_metadata": metadata,
+ },
+ access_type=AccessType.PUBLIC,
+ groups=[],
+ user_performing_action=admin_user,
+ )
+
+ # Link the credential to the connector
+ cc_pair = CCPairManager.create(
+ credential_id=credential.id,
+ connector_id=connector.id,
+ access_type=AccessType.PUBLIC,
+ user_performing_action=admin_user,
+ )
+
+ # Run the connector to index the files
+ CCPairManager.run_once(
+ cc_pair, from_beginning=True, user_performing_action=admin_user
+ )
+ CCPairManager.wait_for_indexing_completion(
+ cc_pair=cc_pair, after=before, user_performing_action=admin_user
+ )
+
+ # Get the indexed documents
+ with get_session_context_manager() as db_session:
+ documents = get_documents_for_cc_pair(db_session, cc_pair.id)
+
+ # Expected metadata from the .onyx_metadata.json file
+ with open(TEST_METADATA_FILE, "r") as f:
+ expected_metadata = json.load(f)
+
+    # The metadata file is a list of entries; key them by filename for lookup
+    expected_by_filename = {entry["filename"]: entry for entry in expected_metadata}
+
+    # Verify each document has the correct metadata
+    for doc in documents:
+        filename = doc.semantic_id
+        if filename in expected_by_filename:
+            expected = expected_by_filename[filename]
+            assert (
+                doc.semantic_id == expected["file_display_name"]
+            ), f"Display name mismatch for {filename}"
+            assert doc.link == expected["link"], f"Link mismatch for {filename}"
diff --git a/backend/tests/integration/tests/indexing/file_connector/test_files/.onyx_metadata.json b/backend/tests/integration/tests/indexing/file_connector/test_files/.onyx_metadata.json
new file mode 100644
index 000000000..8006261f3
--- /dev/null
+++ b/backend/tests/integration/tests/indexing/file_connector/test_files/.onyx_metadata.json
@@ -0,0 +1,16 @@
+[
+ {
+ "filename": "sample1.txt",
+ "link": "https://www.google.com",
+ "file_display_name": "Basically Google",
+ "primary_owners": ["evan@onyx.app"],
+ "status": "bingle bongle"
+ },
+ {
+ "filename": "sample2.txt",
+ "link": "https://www.youtube.com",
+ "file_display_name": "Pretty much youtube",
+ "primary_owners": ["chris@onyx.app"],
+ "status": "not bingle bongle"
+ }
+]
\ No newline at end of file
diff --git a/backend/tests/integration/tests/indexing/file_connector/test_files/sample1.txt b/backend/tests/integration/tests/indexing/file_connector/test_files/sample1.txt
new file mode 100644
index 000000000..2e8c561bd
--- /dev/null
+++ b/backend/tests/integration/tests/indexing/file_connector/test_files/sample1.txt
@@ -0,0 +1,19 @@
+The following contains some excerpts from our docs.
+
+The File Connector indexes user uploaded files. Currently supports .txt, .pdf, .docx, .pptx, .xlsx, .csv, .md, .mdx, .conf, .log, .json, .tsv, .xml, .yml, .yaml, .eml, and .epub files.
+You can also upload a .zip containing these files; any other file types in the zip are ignored.
+There is also an optional metadata line that supports links, document owners, and time updated as metadata for Onyx’s retrieval and AI Answer.
+
+The metadata line should be placed at the very top of the file and can take one of two formats:
+
+#ONYX_METADATA={"link": ""}
+
+Where ONYX_METADATA= is followed by a JSON object. The valid JSON keys are:
+
+link
+primary_owners
+secondary_owners
+doc_updated_at
+file_display_name
+You can also include arbitrary key/value pairs which will be understood as “tags”.
+These tags can then be used as a filter in the UI if you want to constrain your search or conversation to only documents with certain tag(s) attached.
\ No newline at end of file
diff --git a/backend/tests/integration/tests/indexing/file_connector/test_files/sample2.txt b/backend/tests/integration/tests/indexing/file_connector/test_files/sample2.txt
new file mode 100644
index 000000000..d551c03d5
--- /dev/null
+++ b/backend/tests/integration/tests/indexing/file_connector/test_files/sample2.txt
@@ -0,0 +1 @@
+Hello, I hope you're having a wonderful day!
\ No newline at end of file
diff --git a/backend/tests/integration/tests/indexing/file_connector/test_files/with_meta.zip b/backend/tests/integration/tests/indexing/file_connector/test_files/with_meta.zip
new file mode 100644
index 000000000..ff573813c
Binary files /dev/null and b/backend/tests/integration/tests/indexing/file_connector/test_files/with_meta.zip differ
diff --git a/backend/tests/integration/tests/indexing/file_connector/test_files/without_meta.zip b/backend/tests/integration/tests/indexing/file_connector/test_files/without_meta.zip
new file mode 100644
index 000000000..f6bc56cc0
Binary files /dev/null and b/backend/tests/integration/tests/indexing/file_connector/test_files/without_meta.zip differ
diff --git a/deployment/README.md b/deployment/README.md
index d3cbdf4d9..c818bdc5f 100644
--- a/deployment/README.md
+++ b/deployment/README.md
@@ -1,4 +1,4 @@
-
+
# Deploying Onyx
diff --git a/deployment/docker_compose/README.md b/deployment/docker_compose/README.md
index de61e4122..c2a95a37c 100644
--- a/deployment/docker_compose/README.md
+++ b/deployment/docker_compose/README.md
@@ -1,4 +1,4 @@
-
+
# Deploying Onyx using Docker Compose
diff --git a/web/README.md b/web/README.md
index 6644db26b..8a7d4def3 100644
--- a/web/README.md
+++ b/web/README.md
@@ -1,4 +1,4 @@
-
+
This is a [Next.js](https://nextjs.org/) project bootstrapped with [`create-next-app`](https://github.com/vercel/next.js/tree/canary/packages/create-next-app).