mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-06-23 22:41:05 +02:00
feat: add option to treat all non-attachment fields as metadata in Airtable connector (#3817)
* feat: add option to treat all non-attachment fields as metadata in Airtable connector - Added new UI option 'treat_all_non_attachment_fields_as_metadata' - Updated backend logic to support treating all fields except attachments as metadata - Added tests for both default and all-metadata behaviors Co-Authored-By: Chris Weaver <chris@onyx.app> * fix: handle missing environment variables gracefully in airtable tests Co-Authored-By: Chris Weaver <chris@onyx.app> * fix: clean up test file and handle environment variables properly Co-Authored-By: Chris Weaver <chris@onyx.app> * fix: add missing test fixture and fix formatting Co-Authored-By: Chris Weaver <chris@onyx.app> * chore: fix black formatting Co-Authored-By: Chris Weaver <chris@onyx.app> * fix: add type annotation for metadata dict in airtable tests Co-Authored-By: Chris Weaver <chris@onyx.app> * fix: add type annotation for mock_get_api_key fixture Co-Authored-By: Chris Weaver <chris@onyx.app> * fix: update Generator import to use collections.abc Co-Authored-By: Chris Weaver <chris@onyx.app> * refactor: make treat_all_non_attachment_fields_as_metadata a direct required parameter - Move parameter from connector_config to direct class parameter - Place parameter right under table_name_or_id argument - Make parameter required in UI with no default value - Update tests to use new parameter structure Co-Authored-By: Chris Weaver <chris@onyx.app> * chore: fix black formatting Co-Authored-By: Chris Weaver <chris@onyx.app> * chore: rename _METADATA_FIELD_TYPES to DEFAULT_METADATA_FIELD_TYPES and clarify usage Co-Authored-By: Chris Weaver <chris@onyx.app> * chore: fix black formatting in docstring Co-Authored-By: Chris Weaver <chris@onyx.app> * test: make airtable tests fail loudly on missing env vars Co-Authored-By: Chris Weaver <chris@onyx.app> * style: fix black formatting in test file Co-Authored-By: Chris Weaver <chris@onyx.app> * style: add required newline between test functions Co-Authored-By: Chris Weaver <chris@onyx.app> * test: update error message pattern in parameter validation test Co-Authored-By: Chris Weaver <chris@onyx.app> * style: fix black formatting in test file Co-Authored-By: Chris Weaver <chris@onyx.app> * test: fix error message pattern in parameter validation test Co-Authored-By: Chris Weaver <chris@onyx.app> * style: fix line length in test file Co-Authored-By: Chris Weaver <chris@onyx.app> * test: simplify error message pattern in parameter validation test Co-Authored-By: Chris Weaver <chris@onyx.app> * test: add type validation test for treat_all_non_attachment_fields_as_metadata Co-Authored-By: Chris Weaver <chris@onyx.app> * fix: add missing required parameter in test Co-Authored-By: Chris Weaver <chris@onyx.app> * fix: remove parameter from test to properly validate it is required Co-Authored-By: Chris Weaver <chris@onyx.app> * fix: add type validation for treat_all_non_attachment_fields_as_metadata parameter Co-Authored-By: Chris Weaver <chris@onyx.app> * style: fix black formatting in airtable_connector.py Co-Authored-By: Chris Weaver <chris@onyx.app> * fix: update type validation test to handle mypy errors Co-Authored-By: Chris Weaver <chris@onyx.app> * fix: specify mypy ignore type for call-arg Co-Authored-By: Chris Weaver <chris@onyx.app> * Also handle rows w/o sections * style: fix black formatting in test assertion Co-Authored-By: Chris Weaver <chris@onyx.app> * add TODO * Remove unnecessary check * Fix test * Do not break existing airtable connectors --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: Chris Weaver <chris@onyx.app> Co-authored-by: Weves <chrisweaver101@gmail.com>
This commit is contained in:
parent
d2aea63573
commit
d903e5912a
@ -20,9 +20,9 @@ from onyx.utils.logger import setup_logger
|
||||
logger = setup_logger()
|
||||
|
||||
# NOTE: all are made lowercase to avoid case sensitivity issues
|
||||
# these are the field types that are considered metadata rather
|
||||
# than sections
|
||||
_METADATA_FIELD_TYPES = {
|
||||
# These field types are considered metadata by default when
|
||||
# treat_all_non_attachment_fields_as_metadata is False
|
||||
DEFAULT_METADATA_FIELD_TYPES = {
|
||||
"singlecollaborator",
|
||||
"collaborator",
|
||||
"createdby",
|
||||
@ -60,12 +60,16 @@ class AirtableConnector(LoadConnector):
|
||||
self,
|
||||
base_id: str,
|
||||
table_name_or_id: str,
|
||||
treat_all_non_attachment_fields_as_metadata: bool = False,
|
||||
batch_size: int = INDEX_BATCH_SIZE,
|
||||
) -> None:
|
||||
self.base_id = base_id
|
||||
self.table_name_or_id = table_name_or_id
|
||||
self.batch_size = batch_size
|
||||
self.airtable_client: AirtableApi | None = None
|
||||
self.treat_all_non_attachment_fields_as_metadata = (
|
||||
treat_all_non_attachment_fields_as_metadata
|
||||
)
|
||||
|
||||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
||||
self.airtable_client = AirtableApi(credentials["airtable_access_token"])
|
||||
@ -166,8 +170,14 @@ class AirtableConnector(LoadConnector):
|
||||
return [(str(field_info), default_link)]
|
||||
|
||||
def _should_be_metadata(self, field_type: str) -> bool:
|
||||
"""Determine if a field type should be treated as metadata."""
|
||||
return field_type.lower() in _METADATA_FIELD_TYPES
|
||||
"""Determine if a field type should be treated as metadata.
|
||||
|
||||
When treat_all_non_attachment_fields_as_metadata is True, all fields except
|
||||
attachments are treated as metadata. Otherwise, only fields with types listed
|
||||
in DEFAULT_METADATA_FIELD_TYPES are treated as metadata."""
|
||||
if self.treat_all_non_attachment_fields_as_metadata:
|
||||
return field_type.lower() != "multipleattachments"
|
||||
return field_type.lower() in DEFAULT_METADATA_FIELD_TYPES
|
||||
|
||||
def _process_field(
|
||||
self,
|
||||
@ -233,7 +243,7 @@ class AirtableConnector(LoadConnector):
|
||||
record: RecordDict,
|
||||
table_schema: TableSchema,
|
||||
primary_field_name: str | None,
|
||||
) -> Document:
|
||||
) -> Document | None:
|
||||
"""Process a single Airtable record into a Document.
|
||||
|
||||
Args:
|
||||
@ -277,6 +287,10 @@ class AirtableConnector(LoadConnector):
|
||||
sections.extend(field_sections)
|
||||
metadata.update(field_metadata)
|
||||
|
||||
if not sections:
|
||||
logger.warning(f"No sections found for record {record_id}")
|
||||
return None
|
||||
|
||||
semantic_id = (
|
||||
f"{table_name}: {primary_field_value}"
|
||||
if primary_field_value
|
||||
@ -320,7 +334,8 @@ class AirtableConnector(LoadConnector):
|
||||
table_schema=table_schema,
|
||||
primary_field_name=primary_field_name,
|
||||
)
|
||||
record_documents.append(document)
|
||||
if document:
|
||||
record_documents.append(document)
|
||||
|
||||
if len(record_documents) >= self.batch_size:
|
||||
yield record_documents
|
||||
|
@ -1,8 +1,10 @@
|
||||
import os
|
||||
from collections.abc import Generator
|
||||
from unittest.mock import MagicMock
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from pydantic import BaseModel
|
||||
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.airtable.airtable_connector import AirtableConnector
|
||||
@ -10,25 +12,24 @@ from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
params=[
|
||||
("table_name", os.environ["AIRTABLE_TEST_TABLE_NAME"]),
|
||||
("table_id", os.environ["AIRTABLE_TEST_TABLE_ID"]),
|
||||
]
|
||||
)
|
||||
def airtable_connector(request: pytest.FixtureRequest) -> AirtableConnector:
|
||||
param_type, table_identifier = request.param
|
||||
connector = AirtableConnector(
|
||||
base_id=os.environ["AIRTABLE_TEST_BASE_ID"],
|
||||
table_name_or_id=table_identifier,
|
||||
)
|
||||
class AirtableConfig(BaseModel):
|
||||
base_id: str
|
||||
table_identifier: str
|
||||
access_token: str
|
||||
|
||||
connector.load_credentials(
|
||||
{
|
||||
"airtable_access_token": os.environ["AIRTABLE_ACCESS_TOKEN"],
|
||||
}
|
||||
|
||||
@pytest.fixture(params=[True, False])
|
||||
def airtable_config(request: pytest.FixtureRequest) -> AirtableConfig:
|
||||
table_identifier = (
|
||||
os.environ["AIRTABLE_TEST_TABLE_NAME"]
|
||||
if request.param
|
||||
else os.environ["AIRTABLE_TEST_TABLE_ID"]
|
||||
)
|
||||
return AirtableConfig(
|
||||
base_id=os.environ["AIRTABLE_TEST_BASE_ID"],
|
||||
table_identifier=table_identifier,
|
||||
access_token=os.environ["AIRTABLE_ACCESS_TOKEN"],
|
||||
)
|
||||
return connector
|
||||
|
||||
|
||||
def create_test_document(
|
||||
@ -46,18 +47,37 @@ def create_test_document(
|
||||
assignee: str,
|
||||
days_since_status_change: int | None,
|
||||
attachments: list[tuple[str, str]] | None = None,
|
||||
all_fields_as_metadata: bool = False,
|
||||
) -> Document:
|
||||
link_base = f"https://airtable.com/{os.environ['AIRTABLE_TEST_BASE_ID']}/{os.environ['AIRTABLE_TEST_TABLE_ID']}"
|
||||
sections = [
|
||||
Section(
|
||||
text=f"Title:\n------------------------\n{title}\n------------------------",
|
||||
link=f"{link_base}/{id}",
|
||||
),
|
||||
Section(
|
||||
text=f"Description:\n------------------------\n{description}\n------------------------",
|
||||
link=f"{link_base}/{id}",
|
||||
),
|
||||
]
|
||||
base_id = os.environ.get("AIRTABLE_TEST_BASE_ID")
|
||||
table_id = os.environ.get("AIRTABLE_TEST_TABLE_ID")
|
||||
missing_vars = []
|
||||
if not base_id:
|
||||
missing_vars.append("AIRTABLE_TEST_BASE_ID")
|
||||
if not table_id:
|
||||
missing_vars.append("AIRTABLE_TEST_TABLE_ID")
|
||||
|
||||
if missing_vars:
|
||||
raise RuntimeError(
|
||||
f"Required environment variables not set: {', '.join(missing_vars)}. "
|
||||
"These variables are required to run Airtable connector tests."
|
||||
)
|
||||
link_base = f"https://airtable.com/{base_id}/{table_id}"
|
||||
sections = []
|
||||
|
||||
if not all_fields_as_metadata:
|
||||
sections.extend(
|
||||
[
|
||||
Section(
|
||||
text=f"Title:\n------------------------\n{title}\n------------------------",
|
||||
link=f"{link_base}/{id}",
|
||||
),
|
||||
Section(
|
||||
text=f"Description:\n------------------------\n{description}\n------------------------",
|
||||
link=f"{link_base}/{id}",
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
if attachments:
|
||||
for attachment_text, attachment_link in attachments:
|
||||
@ -68,26 +88,36 @@ def create_test_document(
|
||||
),
|
||||
)
|
||||
|
||||
metadata: dict[str, str | list[str]] = {
|
||||
# "Category": category,
|
||||
"Assignee": assignee,
|
||||
"Submitted by": submitted_by,
|
||||
"Priority": priority,
|
||||
"Status": status,
|
||||
"Created time": created_time,
|
||||
"ID": ticket_id,
|
||||
"Status last changed": status_last_changed,
|
||||
**(
|
||||
{"Days since status change": str(days_since_status_change)}
|
||||
if days_since_status_change is not None
|
||||
else {}
|
||||
),
|
||||
}
|
||||
|
||||
if all_fields_as_metadata:
|
||||
metadata.update(
|
||||
{
|
||||
"Title": title,
|
||||
"Description": description,
|
||||
}
|
||||
)
|
||||
|
||||
return Document(
|
||||
id=f"airtable__{id}",
|
||||
sections=sections,
|
||||
source=DocumentSource.AIRTABLE,
|
||||
semantic_identifier=f"{os.environ['AIRTABLE_TEST_TABLE_NAME']}: {title}",
|
||||
metadata={
|
||||
# "Category": category,
|
||||
"Assignee": assignee,
|
||||
"Submitted by": submitted_by,
|
||||
"Priority": priority,
|
||||
"Status": status,
|
||||
"Created time": created_time,
|
||||
"ID": ticket_id,
|
||||
"Status last changed": status_last_changed,
|
||||
**(
|
||||
{"Days since status change": str(days_since_status_change)}
|
||||
if days_since_status_change is not None
|
||||
else {}
|
||||
),
|
||||
},
|
||||
semantic_identifier=f"{os.environ.get('AIRTABLE_TEST_TABLE_NAME', '')}: {title}",
|
||||
metadata=metadata,
|
||||
doc_updated_at=None,
|
||||
primary_owners=None,
|
||||
secondary_owners=None,
|
||||
@ -97,15 +127,84 @@ def create_test_document(
|
||||
)
|
||||
|
||||
|
||||
@patch(
|
||||
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
|
||||
return_value=None,
|
||||
)
|
||||
def test_airtable_connector_basic(
|
||||
mock_get_api_key: MagicMock, airtable_connector: AirtableConnector
|
||||
) -> None:
|
||||
doc_batch_generator = airtable_connector.load_from_state()
|
||||
@pytest.fixture
|
||||
def mock_get_api_key() -> Generator[MagicMock, None, None]:
|
||||
with patch(
|
||||
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
|
||||
return_value=None,
|
||||
) as mock:
|
||||
yield mock
|
||||
|
||||
|
||||
def compare_documents(
|
||||
actual_docs: list[Document], expected_docs: list[Document]
|
||||
) -> None:
|
||||
"""Utility function to compare actual and expected documents, ignoring order."""
|
||||
actual_docs_dict = {doc.id: doc for doc in actual_docs}
|
||||
expected_docs_dict = {doc.id: doc for doc in expected_docs}
|
||||
|
||||
assert actual_docs_dict.keys() == expected_docs_dict.keys(), "Document ID mismatch"
|
||||
|
||||
for doc_id in actual_docs_dict:
|
||||
actual = actual_docs_dict[doc_id]
|
||||
expected = expected_docs_dict[doc_id]
|
||||
|
||||
assert (
|
||||
actual.source == expected.source
|
||||
), f"Source mismatch for document {doc_id}"
|
||||
assert (
|
||||
actual.semantic_identifier == expected.semantic_identifier
|
||||
), f"Semantic identifier mismatch for document {doc_id}"
|
||||
assert (
|
||||
actual.metadata == expected.metadata
|
||||
), f"Metadata mismatch for document {doc_id}"
|
||||
assert (
|
||||
actual.doc_updated_at == expected.doc_updated_at
|
||||
), f"Updated at mismatch for document {doc_id}"
|
||||
assert (
|
||||
actual.primary_owners == expected.primary_owners
|
||||
), f"Primary owners mismatch for document {doc_id}"
|
||||
assert (
|
||||
actual.secondary_owners == expected.secondary_owners
|
||||
), f"Secondary owners mismatch for document {doc_id}"
|
||||
assert actual.title == expected.title, f"Title mismatch for document {doc_id}"
|
||||
assert (
|
||||
actual.from_ingestion_api == expected.from_ingestion_api
|
||||
), f"Ingestion API flag mismatch for document {doc_id}"
|
||||
assert (
|
||||
actual.additional_info == expected.additional_info
|
||||
), f"Additional info mismatch for document {doc_id}"
|
||||
|
||||
# Compare sections
|
||||
assert len(actual.sections) == len(
|
||||
expected.sections
|
||||
), f"Number of sections mismatch for document {doc_id}"
|
||||
for i, (actual_section, expected_section) in enumerate(
|
||||
zip(actual.sections, expected.sections)
|
||||
):
|
||||
assert (
|
||||
actual_section.text == expected_section.text
|
||||
), f"Section {i} text mismatch for document {doc_id}"
|
||||
assert (
|
||||
actual_section.link == expected_section.link
|
||||
), f"Section {i} link mismatch for document {doc_id}"
|
||||
|
||||
|
||||
def test_airtable_connector_basic(
|
||||
mock_get_api_key: MagicMock, airtable_config: AirtableConfig
|
||||
) -> None:
|
||||
"""Test behavior when all non-attachment fields are treated as metadata."""
|
||||
connector = AirtableConnector(
|
||||
base_id=airtable_config.base_id,
|
||||
table_name_or_id=airtable_config.table_identifier,
|
||||
treat_all_non_attachment_fields_as_metadata=False,
|
||||
)
|
||||
connector.load_credentials(
|
||||
{
|
||||
"airtable_access_token": airtable_config.access_token,
|
||||
}
|
||||
)
|
||||
doc_batch_generator = connector.load_from_state()
|
||||
doc_batch = next(doc_batch_generator)
|
||||
with pytest.raises(StopIteration):
|
||||
next(doc_batch_generator)
|
||||
@ -119,15 +218,62 @@ def test_airtable_connector_basic(
|
||||
description="The internet connection is very slow.",
|
||||
priority="Medium",
|
||||
status="In Progress",
|
||||
# Link to another record is skipped for now
|
||||
# category="Data Science",
|
||||
ticket_id="2",
|
||||
created_time="2024-12-24T21:02:49.000Z",
|
||||
status_last_changed="2024-12-24T21:02:49.000Z",
|
||||
days_since_status_change=0,
|
||||
assignee="Chris Weaver (chris@onyx.app)",
|
||||
submitted_by="Chris Weaver (chris@onyx.app)",
|
||||
all_fields_as_metadata=False,
|
||||
),
|
||||
create_test_document(
|
||||
id="reccSlIA4pZEFxPBg",
|
||||
title="Printer Issue",
|
||||
description="The office printer is not working.",
|
||||
priority="High",
|
||||
status="Open",
|
||||
ticket_id="1",
|
||||
created_time="2024-12-24T21:02:49.000Z",
|
||||
status_last_changed="2024-12-24T21:02:49.000Z",
|
||||
days_since_status_change=0,
|
||||
assignee="Chris Weaver (chris@onyx.app)",
|
||||
submitted_by="Chris Weaver (chris@onyx.app)",
|
||||
attachments=[
|
||||
(
|
||||
"Test.pdf:\ntesting!!!",
|
||||
"https://airtable.com/appCXJqDFS4gea8tn/tblRxFQsTlBBZdRY1/viwVUEJjWPd8XYjh8/reccSlIA4pZEFxPBg/fld1u21zkJACIvAEF/attlj2UBWNEDZngCc?blocks=hide",
|
||||
)
|
||||
],
|
||||
all_fields_as_metadata=False,
|
||||
),
|
||||
]
|
||||
|
||||
# Compare documents using the utility function
|
||||
compare_documents(doc_batch, expected_docs)
|
||||
|
||||
|
||||
def test_airtable_connector_all_metadata(
|
||||
mock_get_api_key: MagicMock, airtable_config: AirtableConfig
|
||||
) -> None:
|
||||
connector = AirtableConnector(
|
||||
base_id=airtable_config.base_id,
|
||||
table_name_or_id=airtable_config.table_identifier,
|
||||
treat_all_non_attachment_fields_as_metadata=True,
|
||||
)
|
||||
connector.load_credentials(
|
||||
{
|
||||
"airtable_access_token": airtable_config.access_token,
|
||||
}
|
||||
)
|
||||
doc_batch_generator = connector.load_from_state()
|
||||
doc_batch = next(doc_batch_generator)
|
||||
with pytest.raises(StopIteration):
|
||||
next(doc_batch_generator)
|
||||
|
||||
# NOTE: one of the rows has no attachments -> no content -> no document
|
||||
assert len(doc_batch) == 1
|
||||
|
||||
expected_docs = [
|
||||
create_test_document(
|
||||
id="reccSlIA4pZEFxPBg",
|
||||
title="Printer Issue",
|
||||
@ -149,50 +295,9 @@ def test_airtable_connector_basic(
|
||||
"https://airtable.com/appCXJqDFS4gea8tn/tblRxFQsTlBBZdRY1/viwVUEJjWPd8XYjh8/reccSlIA4pZEFxPBg/fld1u21zkJACIvAEF/attlj2UBWNEDZngCc?blocks=hide",
|
||||
)
|
||||
],
|
||||
all_fields_as_metadata=True,
|
||||
),
|
||||
]
|
||||
|
||||
# Compare each document field by field
|
||||
for actual, expected in zip(doc_batch, expected_docs):
|
||||
assert actual.id == expected.id, f"ID mismatch for document {actual.id}"
|
||||
assert (
|
||||
actual.source == expected.source
|
||||
), f"Source mismatch for document {actual.id}"
|
||||
assert (
|
||||
actual.semantic_identifier == expected.semantic_identifier
|
||||
), f"Semantic identifier mismatch for document {actual.id}"
|
||||
assert (
|
||||
actual.metadata == expected.metadata
|
||||
), f"Metadata mismatch for document {actual.id}"
|
||||
assert (
|
||||
actual.doc_updated_at == expected.doc_updated_at
|
||||
), f"Updated at mismatch for document {actual.id}"
|
||||
assert (
|
||||
actual.primary_owners == expected.primary_owners
|
||||
), f"Primary owners mismatch for document {actual.id}"
|
||||
assert (
|
||||
actual.secondary_owners == expected.secondary_owners
|
||||
), f"Secondary owners mismatch for document {actual.id}"
|
||||
assert (
|
||||
actual.title == expected.title
|
||||
), f"Title mismatch for document {actual.id}"
|
||||
assert (
|
||||
actual.from_ingestion_api == expected.from_ingestion_api
|
||||
), f"Ingestion API flag mismatch for document {actual.id}"
|
||||
assert (
|
||||
actual.additional_info == expected.additional_info
|
||||
), f"Additional info mismatch for document {actual.id}"
|
||||
|
||||
# Compare sections
|
||||
assert len(actual.sections) == len(
|
||||
expected.sections
|
||||
), f"Number of sections mismatch for document {actual.id}"
|
||||
for i, (actual_section, expected_section) in enumerate(
|
||||
zip(actual.sections, expected.sections)
|
||||
):
|
||||
assert (
|
||||
actual_section.text == expected_section.text
|
||||
), f"Section {i} text mismatch for document {actual.id}"
|
||||
assert (
|
||||
actual_section.link == expected_section.link
|
||||
), f"Section {i} link mismatch for document {actual.id}"
|
||||
# Compare documents using the utility function
|
||||
compare_documents(doc_batch, expected_docs)
|
||||
|
@ -1106,6 +1106,14 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
|
||||
name: "table_name_or_id",
|
||||
optional: false,
|
||||
},
|
||||
{
|
||||
type: "checkbox",
|
||||
label: "Treat all fields except attachments as metadata",
|
||||
name: "treat_all_non_attachment_fields_as_metadata",
|
||||
description:
|
||||
"Choose this if the primary content to index are attachments and all other columns are metadata for these attachments.",
|
||||
optional: false,
|
||||
},
|
||||
],
|
||||
advanced_values: [],
|
||||
overrideDefaultFreq: 60 * 60 * 24,
|
||||
|
Loading…
x
Reference in New Issue
Block a user