Mirror of https://github.com/danswer-ai/danswer.git (synced 2025-05-30 01:30:21 +02:00)

File connector tests (#4561)

* danswer to onyx plus tests for file connector
* actually add test

Parent: 5681df9095
Commit: dc62d83a06
@@ -1,4 +1,4 @@
-<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/CONTRIBUTING.md"} -->
+<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/CONTRIBUTING.md"} -->

# Contributing to Onyx

@@ -1,4 +1,4 @@
-<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/README.md"} -->
+<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/README.md"} -->

<a name="readme-top"></a>

@@ -1,4 +1,4 @@
-<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/backend/alembic/README.md"} -->
+<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/backend/alembic/README.md"} -->

# Alembic DB Migrations

@@ -6,12 +6,6 @@ Create Date: 2025-04-01 07:26:10.539362

"""

from alembic import op
-import sqlalchemy as sa
-from sqlalchemy import inspect
-import datetime


# revision identifiers, used by Alembic.
revision = "6a804aeb4830"
down_revision = "8e1ac4f39a9f"
@@ -19,99 +13,10 @@ branch_labels = None
depends_on = None


+# Leaving this around only because some people might be on this migration
+# originally was a duplicate of the user files migration
def upgrade() -> None:
-    # Check if user_file table already exists
-    conn = op.get_bind()
-    inspector = inspect(conn)
-
-    if not inspector.has_table("user_file"):
-        # Create user_folder table without parent_id
-        op.create_table(
-            "user_folder",
-            sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
-            sa.Column("user_id", sa.UUID(), sa.ForeignKey("user.id"), nullable=True),
-            sa.Column("name", sa.String(length=255), nullable=True),
-            sa.Column("description", sa.String(length=255), nullable=True),
-            sa.Column("display_priority", sa.Integer(), nullable=True, default=0),
-            sa.Column(
-                "created_at", sa.DateTime(timezone=True), server_default=sa.func.now()
-            ),
-        )
-
-        # Create user_file table with folder_id instead of parent_folder_id
-        op.create_table(
-            "user_file",
-            sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
-            sa.Column("user_id", sa.UUID(), sa.ForeignKey("user.id"), nullable=True),
-            sa.Column(
-                "folder_id",
-                sa.Integer(),
-                sa.ForeignKey("user_folder.id"),
-                nullable=True,
-            ),
-            sa.Column("link_url", sa.String(), nullable=True),
-            sa.Column("token_count", sa.Integer(), nullable=True),
-            sa.Column("file_type", sa.String(), nullable=True),
-            sa.Column("file_id", sa.String(length=255), nullable=False),
-            sa.Column("document_id", sa.String(length=255), nullable=False),
-            sa.Column("name", sa.String(length=255), nullable=False),
-            sa.Column(
-                "created_at",
-                sa.DateTime(),
-                default=datetime.datetime.utcnow,
-            ),
-            sa.Column(
-                "cc_pair_id",
-                sa.Integer(),
-                sa.ForeignKey("connector_credential_pair.id"),
-                nullable=True,
-                unique=True,
-            ),
-        )
-
-        # Create persona__user_file table
-        op.create_table(
-            "persona__user_file",
-            sa.Column(
-                "persona_id",
-                sa.Integer(),
-                sa.ForeignKey("persona.id"),
-                primary_key=True,
-            ),
-            sa.Column(
-                "user_file_id",
-                sa.Integer(),
-                sa.ForeignKey("user_file.id"),
-                primary_key=True,
-            ),
-        )
-
-        # Create persona__user_folder table
-        op.create_table(
-            "persona__user_folder",
-            sa.Column(
-                "persona_id",
-                sa.Integer(),
-                sa.ForeignKey("persona.id"),
-                primary_key=True,
-            ),
-            sa.Column(
-                "user_folder_id",
-                sa.Integer(),
-                sa.ForeignKey("user_folder.id"),
-                primary_key=True,
-            ),
-        )
-
-        op.add_column(
-            "connector_credential_pair",
-            sa.Column("is_user_file", sa.Boolean(), nullable=True, default=False),
-        )
-
-        # Update existing records to have is_user_file=False instead of NULL
-        op.execute(
-            "UPDATE connector_credential_pair SET is_user_file = FALSE WHERE is_user_file IS NULL"
-        )
+    pass


def downgrade() -> None:
@@ -103,6 +103,7 @@ def upgrade() -> None:


def downgrade() -> None:
+    op.drop_column("connector_credential_pair", "is_user_file")
    # Drop the persona__user_folder table
    op.drop_table("persona__user_folder")
    # Drop the persona__user_file table
@@ -111,4 +112,3 @@ def downgrade() -> None:
    op.drop_table("user_file")
    # Drop the user_folder table
    op.drop_table("user_folder")
-    op.drop_column("connector_credential_pair", "is_user_file")
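
Taken together, the two hunks above move the is_user_file column drop from the end of downgrade() to the start. A minimal sketch of the resulting function, assuming the lines between the two hunks keep following the comments shown in the diff (this is an illustration, not the literal file contents):

from alembic import op


def downgrade() -> None:
    # Drop the added column first, then the association tables,
    # then the user_file / user_folder tables themselves.
    op.drop_column("connector_credential_pair", "is_user_file")
    op.drop_table("persona__user_folder")
    op.drop_table("persona__user_file")
    op.drop_table("user_file")
    op.drop_table("user_folder")
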
@@ -1,4 +1,4 @@
-<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/backend/onyx/connectors/README.md"} -->
+<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/backend/onyx/connectors/README.md"} -->

# Writing a new Onyx Connector

@@ -30,7 +30,7 @@ from onyx.utils.logger import setup_logger
logger = setup_logger()


-def _read_file_from_postgres(
+def _read_file_from_filestore(
    file_name: str,
    db_session: Session,
) -> IO | None:
@@ -307,7 +307,7 @@ class LocalFileConnector(LoadConnector):
        for file_path in self.file_locations:
            current_datetime = datetime.now(timezone.utc)

-            file_io = _read_file_from_postgres(
+            file_io = _read_file_from_filestore(
                file_name=file_path,
                db_session=db_session,
            )
@@ -148,7 +148,7 @@ def load_files_from_zip(
    ignore_dirs: bool = True,
) -> Iterator[tuple[zipfile.ZipInfo, IO[Any]]]:
    """
    If there's a .onyx_metadata.json in the zip, attach those metadata to each subfile.
    Iterates through files in a zip archive, yielding (ZipInfo, file handle) pairs.
    """
    with zipfile.ZipFile(zip_file_io, "r") as zip_file:
        for file_info in zip_file.infolist():
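
The docstring above describes attaching entries from a .onyx_metadata.json inside the zip to each subfile. As a standalone illustration (not the Onyx implementation), one way such a per-file lookup could be built, assuming the JSON is a list of objects with a "filename" key as in the test fixture added later in this commit:

import io
import json
import zipfile
from typing import Any

ONYX_METADATA_FILENAME = ".onyx_metadata.json"  # assumed name, matching the test fixture


def build_zip_metadata_lookup(zip_file_io: io.BytesIO) -> dict[str, dict[str, Any]]:
    """Map each entry's "filename" to its metadata dict, if the zip ships a metadata file."""
    with zipfile.ZipFile(zip_file_io, "r") as zip_file:
        if ONYX_METADATA_FILENAME not in zip_file.namelist():
            return {}
        with zip_file.open(ONYX_METADATA_FILENAME) as f:
            entries = json.load(f)
    # The fixture in this commit is a list of objects, each carrying a "filename" key.
    return {entry["filename"]: entry for entry in entries if "filename" in entry}
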
@@ -66,7 +66,10 @@ class FileManager:

    @staticmethod
    def upload_file_for_connector(
-        file_path: str, file_name: str, user_performing_action: DATestUser
+        file_path: str,
+        file_name: str,
+        user_performing_action: DATestUser,
+        content_type: str = "application/octet-stream",
    ) -> dict:
        # Read the file content
        with open(file_path, "rb") as f:

@@ -76,7 +79,7 @@ class FileManager:
        file_obj = io.BytesIO(file_content)

        # The 'files' form field expects a list of files
-        files = [("files", (file_name, file_obj, "application/octet-stream"))]
+        files = [("files", (file_name, file_obj, content_type))]

        # Use the user's headers but without Content-Type
        # as requests will set the correct multipart/form-data Content-Type for us
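
The two comments above capture the multipart details: the 'files' field is a list of (field_name, (filename, file_object, content_type)) tuples, and Content-Type is left out of the headers so requests can set the multipart boundary itself. A minimal generic requests sketch of that pattern (the URL and auth header are placeholders, not the actual Onyx endpoint):

import io

import requests

UPLOAD_URL = "http://localhost:8080/upload"  # placeholder endpoint

file_obj = io.BytesIO(b"Hello, I hope you're having a wonderful day!")
# Each tuple is (form_field_name, (filename, file_object, content_type)).
files = [("files", ("sample1.txt", file_obj, "text/plain"))]

# Omit Content-Type from the headers: requests generates the correct
# multipart/form-data header, including the boundary, on its own.
response = requests.post(
    UPLOAD_URL,
    files=files,
    headers={"Authorization": "Bearer <token>"},  # placeholder auth header
)
response.raise_for_status()
print(response.json())
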
@@ -0,0 +1,118 @@
import json
import os
from datetime import datetime
from datetime import timezone

import pytest

from onyx.connectors.models import InputType
from onyx.db.document import get_documents_for_cc_pair
from onyx.db.engine import get_session_context_manager
from onyx.db.enums import AccessType
from onyx.server.documents.models import DocumentSource
from tests.integration.common_utils.managers.cc_pair import CCPairManager
from tests.integration.common_utils.managers.connector import ConnectorManager
from tests.integration.common_utils.managers.credential import CredentialManager
from tests.integration.common_utils.managers.file import FileManager
from tests.integration.common_utils.managers.user import UserManager
from tests.integration.common_utils.test_models import DATestUser
from tests.integration.common_utils.vespa import vespa_fixture


# This is a placeholder - you'll need to create this zip file with actual test files
TEST_FILES_BASE = "tests/integration/tests/indexing/file_connector/test_files"
TEST_META_ZIP_PATH = f"{TEST_FILES_BASE}/with_meta.zip"
TEST_NO_META_ZIP_PATH = f"{TEST_FILES_BASE}/without_meta.zip"
TEST_METADATA_FILE = f"{TEST_FILES_BASE}/.onyx_metadata.json"


@pytest.mark.parametrize(
    "zip_path, has_metadata",
    [
        (TEST_META_ZIP_PATH, True),
        (TEST_NO_META_ZIP_PATH, False),
    ],
)
def test_zip_metadata_handling(
    reset: None,
    vespa_client: vespa_fixture,
    zip_path: str,
    has_metadata: bool,
) -> None:
    before = datetime.now(timezone.utc)
    # Create an admin user
    admin_user: DATestUser = UserManager.create(
        email="admin@onyx-test.com",
    )

    # Upload the test zip file (simulate this happening from frontend)
    upload_response = FileManager.upload_file_for_connector(
        file_path=zip_path,
        file_name=os.path.basename(zip_path),
        user_performing_action=admin_user,
        content_type="application/zip",
    )

    file_paths = upload_response.get("file_paths", [])
    assert file_paths, "File upload failed - no file paths returned"
    if has_metadata:
        metadata = upload_response.get("zip_metadata", {})
        assert metadata, "Metadata should be present"
    else:
        metadata = {}

    # Create a dummy credential for the file connector
    credential = CredentialManager.create(
        source=DocumentSource.FILE,
        credential_json={},
        user_performing_action=admin_user,
    )

    # Create the connector
    connector_name = f"FileConnector-{int(datetime.now().timestamp())}"
    connector = ConnectorManager.create(
        name=connector_name,
        source=DocumentSource.FILE,
        input_type=InputType.LOAD_STATE,
        connector_specific_config={
            "file_locations": file_paths,
            "zip_metadata": metadata,
        },
        access_type=AccessType.PUBLIC,
        groups=[],
        user_performing_action=admin_user,
    )

    # Link the credential to the connector
    cc_pair = CCPairManager.create(
        credential_id=credential.id,
        connector_id=connector.id,
        access_type=AccessType.PUBLIC,
        user_performing_action=admin_user,
    )

    # Run the connector to index the files
    CCPairManager.run_once(
        cc_pair, from_beginning=True, user_performing_action=admin_user
    )
    CCPairManager.wait_for_indexing_completion(
        cc_pair=cc_pair, after=before, user_performing_action=admin_user
    )

    # Get the indexed documents
    with get_session_context_manager() as db_session:
        documents = get_documents_for_cc_pair(db_session, cc_pair.id)

    # Expected metadata from the .onyx_metadata.json file
    with open(TEST_METADATA_FILE, "r") as f:
        expected_metadata = json.load(f)

    # Verify each document has the correct metadata
    for doc in documents:
        filename = doc.semantic_id
        if filename in expected_metadata:
            expected = expected_metadata[filename]
            assert (
                doc.semantic_id == expected["display_name"]
            ), f"Display name mismatch for {filename}"
            assert doc.link == expected["link"], f"Link mismatch for {filename}"
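
The placeholder comment near the top of the test notes that the zip fixtures have to be produced from real files. One plausible way to build them from the test_files directory (a sketch; file names are taken from the .onyx_metadata.json fixture below, and the exact contents of the committed zips are not shown in this diff):

import zipfile
from pathlib import Path

TEST_FILES_BASE = Path("tests/integration/tests/indexing/file_connector/test_files")

# with_meta.zip bundles the sample files together with .onyx_metadata.json;
# without_meta.zip bundles only the sample files.
with zipfile.ZipFile(TEST_FILES_BASE / "with_meta.zip", "w") as zf:
    for name in ("sample1.txt", "sample2.txt", ".onyx_metadata.json"):
        zf.write(TEST_FILES_BASE / name, arcname=name)

with zipfile.ZipFile(TEST_FILES_BASE / "without_meta.zip", "w") as zf:
    for name in ("sample1.txt", "sample2.txt"):
        zf.write(TEST_FILES_BASE / name, arcname=name)
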
@@ -0,0 +1,16 @@
[
    {
        "filename": "sample1.txt",
        "link": "https://www.google.com",
        "file_display_name": "Basically Google",
        "primary_owners": ["evan@onyx.app"],
        "status": "bingle bongle"
    },
    {
        "filename": "sample2.txt",
        "link": "https://www.youtube.com",
        "file_display_name": "Pretty much youtube",
        "primary_owners": ["chris@onyx.app"],
        "status": "not bingle bongle"
    }
]
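
A small illustrative sketch of how a fixture like this might be keyed by filename, with keys outside the documented set treated as tags (mirroring the docs excerpt in the next file; this is not the connector's actual code):

import json
from typing import Any

KNOWN_KEYS = {
    "filename",
    "link",
    "primary_owners",
    "secondary_owners",
    "doc_updated_at",
    "file_display_name",
}

with open("tests/integration/tests/indexing/file_connector/test_files/.onyx_metadata.json") as f:
    entries: list[dict[str, Any]] = json.load(f)

metadata_by_file = {entry["filename"]: entry for entry in entries}

# Keys outside the documented set (e.g. "status") can be treated as tags.
tags_for_sample1 = {
    k: v for k, v in metadata_by_file["sample1.txt"].items() if k not in KNOWN_KEYS
}
print(tags_for_sample1)  # {'status': 'bingle bongle'}
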
@@ -0,0 +1,19 @@
The following contains some excerpts from our docs.

The File Connector indexes user uploaded files. Currently supports .txt, .pdf, .docx, .pptx, .xlsx, .csv, .md, .mdx, .conf, .log, .json, .tsv, .xml, .yml, .yaml, .eml, and .epub files.
You can also upload a .zip containing these files - If there are other file types in the zip, the other file types are ignored.
There is also an optional metadata line that supports links, document owners, and time updated as metadata for Onyx’s retrieval and AI Answer.

The metadata line should be placed at the very top of the file and can take one of two formats:

#ONYX_METADATA={"link": "<LINK>"}
<!-- ONYX_METADATA={"link": "<LINK>"} -->
Where ONYX_METADATA= is followed by a json. The valid json keys are:

link
primary_owners
secondary_owners
doc_updated_at
file_display_name
You can also include arbitrary key/value pairs which will be understood as “tags”.
These tags can then be used in the UI as a filter if you want to constrain your search / conversation to only documents with certain tag(s) attached
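
For illustration, a sketch of reading the hash form of the metadata line described above (values borrowed from the .onyx_metadata.json fixture in this commit; not the connector's actual parser):

import json

# First line of an uploaded .txt file, using the "#ONYX_METADATA=" form.
first_line = '#ONYX_METADATA={"link": "https://www.google.com", "file_display_name": "Basically Google"}'

if first_line.startswith("#ONYX_METADATA="):
    metadata = json.loads(first_line.removeprefix("#ONYX_METADATA="))
    print(metadata["link"])  # https://www.google.com
    print(metadata["file_display_name"])  # Basically Google
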
@@ -0,0 +1 @@
Hello, I hope you're having a wonderful day!
Binary file not shown.
Binary file not shown.
@@ -1,4 +1,4 @@
-<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/deployment/README.md"} -->
+<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/deployment/README.md"} -->

# Deploying Onyx

@@ -1,4 +1,4 @@
-<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/deployment/docker_compose/README.md"} -->
+<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/deployment/docker_compose/README.md"} -->

# Deploying Onyx using Docker Compose

@@ -1,4 +1,4 @@
-<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/web/README.md"} -->
+<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/web/README.md"} -->

This is a [Next.js](https://nextjs.org/) project bootstrapped with [`create-next-app`](https://github.com/vercel/next.js/tree/canary/packages/create-next-app).
