File connector tests (#4561)

* danswer to onyx plus tests for file connector

* actually add test
This commit is contained in:
evan-danswer 2025-04-19 15:54:30 -07:00 committed by GitHub
parent 5681df9095
commit dc62d83a06
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 173 additions and 111 deletions

View File

@ -1,4 +1,4 @@
<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/CONTRIBUTING.md"} -->
<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/CONTRIBUTING.md"} -->
# Contributing to Onyx

View File

@ -1,4 +1,4 @@
<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/README.md"} -->
<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/README.md"} -->
<a name="readme-top"></a>

View File

@ -1,4 +1,4 @@
<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/backend/alembic/README.md"} -->
<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/backend/alembic/README.md"} -->
# Alembic DB Migrations

View File

@ -6,12 +6,6 @@ Create Date: 2025-04-01 07:26:10.539362
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy import inspect
import datetime
# revision identifiers, used by Alembic.
revision = "6a804aeb4830"
down_revision = "8e1ac4f39a9f"
@ -19,99 +13,10 @@ branch_labels = None
depends_on = None
# Leaving this around only because some people might be on this migration
# originally was a duplicate of the user files migration
def upgrade() -> None:
    """Create the user-file / user-folder tables and the CC-pair user-file flag.

    Reconstructed from a flattened diff: indentation restored and the trailing
    no-op ``pass`` (an artifact of the old/new diff overlay) dropped. The
    grouping of statements under the ``has_table`` guard is inferred — verify
    against repository history before relying on it.
    """
    # Guard against re-running: this migration originally duplicated the
    # user-files migration, so skip everything if user_file already exists.
    conn = op.get_bind()
    inspector = inspect(conn)
    if not inspector.has_table("user_file"):
        # user_folder: flat folders — no parent_id, so no nesting.
        op.create_table(
            "user_folder",
            sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
            sa.Column("user_id", sa.UUID(), sa.ForeignKey("user.id"), nullable=True),
            sa.Column("name", sa.String(length=255), nullable=True),
            sa.Column("description", sa.String(length=255), nullable=True),
            sa.Column("display_priority", sa.Integer(), nullable=True, default=0),
            sa.Column(
                "created_at", sa.DateTime(timezone=True), server_default=sa.func.now()
            ),
        )

        # user_file: one row per uploaded file, linked to a folder (folder_id
        # replaces the older parent_folder_id naming) and — uniquely — to a
        # connector_credential_pair.
        # NOTE(review): created_at here uses a naive client-side
        # datetime.datetime.utcnow default, unlike user_folder's timezone-aware
        # server_default above — confirm this inconsistency is intentional.
        op.create_table(
            "user_file",
            sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
            sa.Column("user_id", sa.UUID(), sa.ForeignKey("user.id"), nullable=True),
            sa.Column(
                "folder_id",
                sa.Integer(),
                sa.ForeignKey("user_folder.id"),
                nullable=True,
            ),
            sa.Column("link_url", sa.String(), nullable=True),
            sa.Column("token_count", sa.Integer(), nullable=True),
            sa.Column("file_type", sa.String(), nullable=True),
            sa.Column("file_id", sa.String(length=255), nullable=False),
            sa.Column("document_id", sa.String(length=255), nullable=False),
            sa.Column("name", sa.String(length=255), nullable=False),
            sa.Column(
                "created_at",
                sa.DateTime(),
                default=datetime.datetime.utcnow,
            ),
            sa.Column(
                "cc_pair_id",
                sa.Integer(),
                sa.ForeignKey("connector_credential_pair.id"),
                nullable=True,
                unique=True,
            ),
        )

        # persona__user_file: many-to-many join between personas and files.
        op.create_table(
            "persona__user_file",
            sa.Column(
                "persona_id",
                sa.Integer(),
                sa.ForeignKey("persona.id"),
                primary_key=True,
            ),
            sa.Column(
                "user_file_id",
                sa.Integer(),
                sa.ForeignKey("user_file.id"),
                primary_key=True,
            ),
        )

        # persona__user_folder: many-to-many join between personas and folders.
        op.create_table(
            "persona__user_folder",
            sa.Column(
                "persona_id",
                sa.Integer(),
                sa.ForeignKey("persona.id"),
                primary_key=True,
            ),
            sa.Column(
                "user_folder_id",
                sa.Integer(),
                sa.ForeignKey("user_folder.id"),
                primary_key=True,
            ),
        )

        # Flag CC-pairs that back user-uploaded files.
        op.add_column(
            "connector_credential_pair",
            sa.Column("is_user_file", sa.Boolean(), nullable=True, default=False),
        )
        # Backfill existing rows so the flag is FALSE rather than NULL.
        op.execute(
            "UPDATE connector_credential_pair SET is_user_file = FALSE WHERE is_user_file IS NULL"
        )
def downgrade() -> None:

View File

@ -103,6 +103,7 @@ def upgrade() -> None:
def downgrade() -> None:
op.drop_column("connector_credential_pair", "is_user_file")
# Drop the persona__user_folder table
op.drop_table("persona__user_folder")
# Drop the persona__user_file table
@ -111,4 +112,3 @@ def downgrade() -> None:
op.drop_table("user_file")
# Drop the user_folder table
op.drop_table("user_folder")
op.drop_column("connector_credential_pair", "is_user_file")

View File

@ -1,4 +1,4 @@
<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/backend/onyx/connectors/README.md"} -->
<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/backend/onyx/connectors/README.md"} -->
# Writing a new Onyx Connector

View File

@ -30,7 +30,7 @@ from onyx.utils.logger import setup_logger
logger = setup_logger()
def _read_file_from_postgres(
def _read_file_from_filestore(
file_name: str,
db_session: Session,
) -> IO | None:
@ -307,7 +307,7 @@ class LocalFileConnector(LoadConnector):
for file_path in self.file_locations:
current_datetime = datetime.now(timezone.utc)
file_io = _read_file_from_postgres(
file_io = _read_file_from_filestore(
file_name=file_path,
db_session=db_session,
)

View File

@ -148,7 +148,7 @@ def load_files_from_zip(
ignore_dirs: bool = True,
) -> Iterator[tuple[zipfile.ZipInfo, IO[Any]]]:
"""
If there's a .onyx_metadata.json in the zip, attach those metadata to each subfile.
Iterates through files in a zip archive, yielding (ZipInfo, file handle) pairs.
"""
with zipfile.ZipFile(zip_file_io, "r") as zip_file:
for file_info in zip_file.infolist():

View File

@ -66,7 +66,10 @@ class FileManager:
@staticmethod
def upload_file_for_connector(
file_path: str, file_name: str, user_performing_action: DATestUser
file_path: str,
file_name: str,
user_performing_action: DATestUser,
content_type: str = "application/octet-stream",
) -> dict:
# Read the file content
with open(file_path, "rb") as f:
@ -76,7 +79,7 @@ class FileManager:
file_obj = io.BytesIO(file_content)
# The 'files' form field expects a list of files
files = [("files", (file_name, file_obj, "application/octet-stream"))]
files = [("files", (file_name, file_obj, content_type))]
# Use the user's headers but without Content-Type
# as requests will set the correct multipart/form-data Content-Type for us

View File

@ -0,0 +1,118 @@
import json
import os
from datetime import datetime
from datetime import timezone
import pytest
from onyx.connectors.models import InputType
from onyx.db.document import get_documents_for_cc_pair
from onyx.db.engine import get_session_context_manager
from onyx.db.enums import AccessType
from onyx.server.documents.models import DocumentSource
from tests.integration.common_utils.managers.cc_pair import CCPairManager
from tests.integration.common_utils.managers.connector import ConnectorManager
from tests.integration.common_utils.managers.credential import CredentialManager
from tests.integration.common_utils.managers.file import FileManager
from tests.integration.common_utils.managers.user import UserManager
from tests.integration.common_utils.test_models import DATestUser
from tests.integration.common_utils.vespa import vespa_fixture
# Test fixtures: zip archives with and without a bundled .onyx_metadata.json,
# plus the standalone metadata file used as the expected-values oracle.
TEST_FILES_BASE = "tests/integration/tests/indexing/file_connector/test_files"
TEST_META_ZIP_PATH = f"{TEST_FILES_BASE}/with_meta.zip"
TEST_NO_META_ZIP_PATH = f"{TEST_FILES_BASE}/without_meta.zip"
TEST_METADATA_FILE = f"{TEST_FILES_BASE}/.onyx_metadata.json"


@pytest.mark.parametrize(
    "zip_path, has_metadata",
    [
        (TEST_META_ZIP_PATH, True),
        (TEST_NO_META_ZIP_PATH, False),
    ],
)
def test_zip_metadata_handling(
    reset: None,
    vespa_client: vespa_fixture,
    zip_path: str,
    has_metadata: bool,
) -> None:
    """Upload a zip through the file connector and index it end to end.

    Verifies that metadata from a bundled ``.onyx_metadata.json`` is returned
    by the upload endpoint (when present), threaded through the connector
    config, and reflected on the indexed documents.
    """
    before = datetime.now(timezone.utc)

    # Create an admin user to perform all actions.
    admin_user: DATestUser = UserManager.create(
        email="admin@onyx-test.com",
    )

    # Upload the test zip file (simulates the frontend upload flow).
    upload_response = FileManager.upload_file_for_connector(
        file_path=zip_path,
        file_name=os.path.basename(zip_path),
        user_performing_action=admin_user,
        content_type="application/zip",
    )

    file_paths = upload_response.get("file_paths", [])
    assert file_paths, "File upload failed - no file paths returned"

    # The upload endpoint surfaces metadata extracted from the zip; only the
    # with_meta archive is expected to produce any.
    if has_metadata:
        metadata = upload_response.get("zip_metadata", {})
        assert metadata, "Metadata should be present"
    else:
        metadata = {}

    # The file connector needs no credentials, but a CC-pair requires one.
    credential = CredentialManager.create(
        source=DocumentSource.FILE,
        credential_json={},
        user_performing_action=admin_user,
    )

    # Create the connector pointing at the uploaded file locations.
    connector_name = f"FileConnector-{int(datetime.now().timestamp())}"
    connector = ConnectorManager.create(
        name=connector_name,
        source=DocumentSource.FILE,
        input_type=InputType.LOAD_STATE,
        connector_specific_config={
            "file_locations": file_paths,
            "zip_metadata": metadata,
        },
        access_type=AccessType.PUBLIC,
        groups=[],
        user_performing_action=admin_user,
    )

    # Link the credential to the connector.
    cc_pair = CCPairManager.create(
        credential_id=credential.id,
        connector_id=connector.id,
        access_type=AccessType.PUBLIC,
        user_performing_action=admin_user,
    )

    # Run the connector once and wait for indexing to complete.
    CCPairManager.run_once(
        cc_pair, from_beginning=True, user_performing_action=admin_user
    )
    CCPairManager.wait_for_indexing_completion(
        cc_pair=cc_pair, after=before, user_performing_action=admin_user
    )

    # Fetch the documents indexed for this CC-pair.
    with get_session_context_manager() as db_session:
        documents = get_documents_for_cc_pair(db_session, cc_pair.id)

    # Expected metadata from the .onyx_metadata.json file.
    # NOTE(review): the fixture file appears to be a JSON *list* of entries
    # keyed by "filename" (with "file_display_name"/"link" values), but the
    # checks below index it like a dict keyed by filename with a
    # "display_name" key — if so, `filename in expected_metadata` is always
    # False and these assertions never execute. Confirm the fixture shape.
    with open(TEST_METADATA_FILE, "r") as f:
        expected_metadata = json.load(f)

    # Verify each document carries the expected metadata.
    for doc in documents:
        filename = doc.semantic_id
        if filename in expected_metadata:
            expected = expected_metadata[filename]
            assert (
                doc.semantic_id == expected["display_name"]
            ), f"Display name mismatch for {filename}"
            assert doc.link == expected["link"], f"Link mismatch for {filename}"

View File

@ -0,0 +1,16 @@
[
{
"filename": "sample1.txt",
"link": "https://www.google.com",
"file_display_name": "Basically Google",
"primary_owners": ["evan@onyx.app"],
"status": "bingle bongle"
},
{
"filename": "sample2.txt",
"link": "https://www.youtube.com",
"file_display_name": "Pretty much youtube",
"primary_owners": ["chris@onyx.app"],
"status": "not bingle bongle"
}
]

View File

@ -0,0 +1,19 @@
The following contains some excerpts from our docs.
The File Connector indexes user uploaded files. Currently supports .txt, .pdf, .docx, .pptx, .xlsx, .csv, .md, .mdx, .conf, .log, .json, .tsv, .xml, .yml, .yaml, .eml, and .epub files.
You can also upload a .zip containing these files - any other file types inside the zip are ignored.
There is also an optional metadata line that supports links, document owners, and time updated as metadata for Onyx's retrieval and AI Answer.
The metadata line should be placed at the very top of the file and can take one of two formats:
#ONYX_METADATA={"link": "<LINK>"}
<!-- ONYX_METADATA={"link": "<LINK>"} -->
Where ONYX_METADATA= is followed by a json. The valid json keys are:
link
primary_owners
secondary_owners
doc_updated_at
file_display_name
You can also include arbitrary key/value pairs which will be understood as “tags”.
These tags can then be used in the UI as a filter if you want to constrain your search / conversation to only documents with certain tag(s) attached

View File

@ -0,0 +1 @@
Hello, I hope you're having a wonderful day!

View File

@ -1,4 +1,4 @@
<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/deployment/README.md"} -->
<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/deployment/README.md"} -->
# Deploying Onyx

View File

@ -1,4 +1,4 @@
<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/deployment/docker_compose/README.md"} -->
<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/deployment/docker_compose/README.md"} -->
# Deploying Onyx using Docker Compose

View File

@ -1,4 +1,4 @@
<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/web/README.md"} -->
<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/web/README.md"} -->
This is a [Next.js](https://nextjs.org/) project bootstrapped with [`create-next-app`](https://github.com/vercel/next.js/tree/canary/packages/create-next-app).