feat: add option to treat all non-attachment fields as metadata in Airtable connector (#3817)

* feat: add option to treat all non-attachment fields as metadata in Airtable connector

- Added new UI option 'treat_all_non_attachment_fields_as_metadata'
- Updated backend logic to support treating all fields except attachments as metadata
- Added tests for both default and all-metadata behaviors

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: handle missing environment variables gracefully in airtable tests

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: clean up test file and handle environment variables properly

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: add missing test fixture and fix formatting

Co-Authored-By: Chris Weaver <chris@onyx.app>

* chore: fix black formatting

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: add type annotation for metadata dict in airtable tests

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: add type annotation for mock_get_api_key fixture

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: update Generator import to use collections.abc

Co-Authored-By: Chris Weaver <chris@onyx.app>

* refactor: make treat_all_non_attachment_fields_as_metadata a direct required parameter

- Move parameter from connector_config to direct class parameter
- Place parameter right under table_name_or_id argument
- Make parameter required in UI with no default value
- Update tests to use new parameter structure

Co-Authored-By: Chris Weaver <chris@onyx.app>

* chore: fix black formatting

Co-Authored-By: Chris Weaver <chris@onyx.app>

* chore: rename _METADATA_FIELD_TYPES to DEFAULT_METADATA_FIELD_TYPES and clarify usage

Co-Authored-By: Chris Weaver <chris@onyx.app>

* chore: fix black formatting in docstring

Co-Authored-By: Chris Weaver <chris@onyx.app>

* test: make airtable tests fail loudly on missing env vars

Co-Authored-By: Chris Weaver <chris@onyx.app>

* style: fix black formatting in test file

Co-Authored-By: Chris Weaver <chris@onyx.app>

* style: add required newline between test functions

Co-Authored-By: Chris Weaver <chris@onyx.app>

* test: update error message pattern in parameter validation test

Co-Authored-By: Chris Weaver <chris@onyx.app>

* style: fix black formatting in test file

Co-Authored-By: Chris Weaver <chris@onyx.app>

* test: fix error message pattern in parameter validation test

Co-Authored-By: Chris Weaver <chris@onyx.app>

* style: fix line length in test file

Co-Authored-By: Chris Weaver <chris@onyx.app>

* test: simplify error message pattern in parameter validation test

Co-Authored-By: Chris Weaver <chris@onyx.app>

* test: add type validation test for treat_all_non_attachment_fields_as_metadata

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: add missing required parameter in test

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: remove parameter from test to properly validate it is required

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: add type validation for treat_all_non_attachment_fields_as_metadata parameter

Co-Authored-By: Chris Weaver <chris@onyx.app>

* style: fix black formatting in airtable_connector.py

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: update type validation test to handle mypy errors

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: specify mypy ignore type for call-arg

Co-Authored-By: Chris Weaver <chris@onyx.app>

* Also handle rows w/o sections

* style: fix black formatting in test assertion

Co-Authored-By: Chris Weaver <chris@onyx.app>

* add TODO

* Remove unnecessary check

* Fix test

* Do not break existing airtable connectors

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Chris Weaver <chris@onyx.app>
Co-authored-by: Weves <chrisweaver101@gmail.com>
This commit is contained in:
devin-ai-integration[bot] 2025-01-28 17:28:32 -08:00 committed by GitHub
parent d2aea63573
commit d903e5912a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 233 additions and 105 deletions

View File

@ -20,9 +20,9 @@ from onyx.utils.logger import setup_logger
logger = setup_logger()
# NOTE: all are made lowercase to avoid case sensitivity issues
# these are the field types that are considered metadata rather
# than sections
_METADATA_FIELD_TYPES = {
# These field types are considered metadata by default when
# treat_all_non_attachment_fields_as_metadata is False
DEFAULT_METADATA_FIELD_TYPES = {
"singlecollaborator",
"collaborator",
"createdby",
@ -60,12 +60,16 @@ class AirtableConnector(LoadConnector):
self,
base_id: str,
table_name_or_id: str,
treat_all_non_attachment_fields_as_metadata: bool = False,
batch_size: int = INDEX_BATCH_SIZE,
) -> None:
self.base_id = base_id
self.table_name_or_id = table_name_or_id
self.batch_size = batch_size
self.airtable_client: AirtableApi | None = None
self.treat_all_non_attachment_fields_as_metadata = (
treat_all_non_attachment_fields_as_metadata
)
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
self.airtable_client = AirtableApi(credentials["airtable_access_token"])
@ -166,8 +170,14 @@ class AirtableConnector(LoadConnector):
return [(str(field_info), default_link)]
def _should_be_metadata(self, field_type: str) -> bool:
"""Determine if a field type should be treated as metadata."""
return field_type.lower() in _METADATA_FIELD_TYPES
"""Determine if a field type should be treated as metadata.
When treat_all_non_attachment_fields_as_metadata is True, all fields except
attachments are treated as metadata. Otherwise, only fields with types listed
in DEFAULT_METADATA_FIELD_TYPES are treated as metadata."""
if self.treat_all_non_attachment_fields_as_metadata:
return field_type.lower() != "multipleattachments"
return field_type.lower() in DEFAULT_METADATA_FIELD_TYPES
def _process_field(
self,
@ -233,7 +243,7 @@ class AirtableConnector(LoadConnector):
record: RecordDict,
table_schema: TableSchema,
primary_field_name: str | None,
) -> Document:
) -> Document | None:
"""Process a single Airtable record into a Document.
Args:
@ -277,6 +287,10 @@ class AirtableConnector(LoadConnector):
sections.extend(field_sections)
metadata.update(field_metadata)
if not sections:
logger.warning(f"No sections found for record {record_id}")
return None
semantic_id = (
f"{table_name}: {primary_field_value}"
if primary_field_value
@ -320,7 +334,8 @@ class AirtableConnector(LoadConnector):
table_schema=table_schema,
primary_field_name=primary_field_name,
)
record_documents.append(document)
if document:
record_documents.append(document)
if len(record_documents) >= self.batch_size:
yield record_documents

View File

@ -1,8 +1,10 @@
import os
from collections.abc import Generator
from unittest.mock import MagicMock
from unittest.mock import patch
import pytest
from pydantic import BaseModel
from onyx.configs.constants import DocumentSource
from onyx.connectors.airtable.airtable_connector import AirtableConnector
@ -10,25 +12,24 @@ from onyx.connectors.models import Document
from onyx.connectors.models import Section
@pytest.fixture(
params=[
("table_name", os.environ["AIRTABLE_TEST_TABLE_NAME"]),
("table_id", os.environ["AIRTABLE_TEST_TABLE_ID"]),
]
)
def airtable_connector(request: pytest.FixtureRequest) -> AirtableConnector:
param_type, table_identifier = request.param
connector = AirtableConnector(
base_id=os.environ["AIRTABLE_TEST_BASE_ID"],
table_name_or_id=table_identifier,
)
class AirtableConfig(BaseModel):
base_id: str
table_identifier: str
access_token: str
connector.load_credentials(
{
"airtable_access_token": os.environ["AIRTABLE_ACCESS_TOKEN"],
}
@pytest.fixture(params=[True, False])
def airtable_config(request: pytest.FixtureRequest) -> AirtableConfig:
table_identifier = (
os.environ["AIRTABLE_TEST_TABLE_NAME"]
if request.param
else os.environ["AIRTABLE_TEST_TABLE_ID"]
)
return AirtableConfig(
base_id=os.environ["AIRTABLE_TEST_BASE_ID"],
table_identifier=table_identifier,
access_token=os.environ["AIRTABLE_ACCESS_TOKEN"],
)
return connector
def create_test_document(
@ -46,18 +47,37 @@ def create_test_document(
assignee: str,
days_since_status_change: int | None,
attachments: list[tuple[str, str]] | None = None,
all_fields_as_metadata: bool = False,
) -> Document:
link_base = f"https://airtable.com/{os.environ['AIRTABLE_TEST_BASE_ID']}/{os.environ['AIRTABLE_TEST_TABLE_ID']}"
sections = [
Section(
text=f"Title:\n------------------------\n{title}\n------------------------",
link=f"{link_base}/{id}",
),
Section(
text=f"Description:\n------------------------\n{description}\n------------------------",
link=f"{link_base}/{id}",
),
]
base_id = os.environ.get("AIRTABLE_TEST_BASE_ID")
table_id = os.environ.get("AIRTABLE_TEST_TABLE_ID")
missing_vars = []
if not base_id:
missing_vars.append("AIRTABLE_TEST_BASE_ID")
if not table_id:
missing_vars.append("AIRTABLE_TEST_TABLE_ID")
if missing_vars:
raise RuntimeError(
f"Required environment variables not set: {', '.join(missing_vars)}. "
"These variables are required to run Airtable connector tests."
)
link_base = f"https://airtable.com/{base_id}/{table_id}"
sections = []
if not all_fields_as_metadata:
sections.extend(
[
Section(
text=f"Title:\n------------------------\n{title}\n------------------------",
link=f"{link_base}/{id}",
),
Section(
text=f"Description:\n------------------------\n{description}\n------------------------",
link=f"{link_base}/{id}",
),
]
)
if attachments:
for attachment_text, attachment_link in attachments:
@ -68,26 +88,36 @@ def create_test_document(
),
)
metadata: dict[str, str | list[str]] = {
# "Category": category,
"Assignee": assignee,
"Submitted by": submitted_by,
"Priority": priority,
"Status": status,
"Created time": created_time,
"ID": ticket_id,
"Status last changed": status_last_changed,
**(
{"Days since status change": str(days_since_status_change)}
if days_since_status_change is not None
else {}
),
}
if all_fields_as_metadata:
metadata.update(
{
"Title": title,
"Description": description,
}
)
return Document(
id=f"airtable__{id}",
sections=sections,
source=DocumentSource.AIRTABLE,
semantic_identifier=f"{os.environ['AIRTABLE_TEST_TABLE_NAME']}: {title}",
metadata={
# "Category": category,
"Assignee": assignee,
"Submitted by": submitted_by,
"Priority": priority,
"Status": status,
"Created time": created_time,
"ID": ticket_id,
"Status last changed": status_last_changed,
**(
{"Days since status change": str(days_since_status_change)}
if days_since_status_change is not None
else {}
),
},
semantic_identifier=f"{os.environ.get('AIRTABLE_TEST_TABLE_NAME', '')}: {title}",
metadata=metadata,
doc_updated_at=None,
primary_owners=None,
secondary_owners=None,
@ -97,15 +127,84 @@ def create_test_document(
)
@patch(
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
return_value=None,
)
def test_airtable_connector_basic(
mock_get_api_key: MagicMock, airtable_connector: AirtableConnector
) -> None:
doc_batch_generator = airtable_connector.load_from_state()
@pytest.fixture
def mock_get_api_key() -> Generator[MagicMock, None, None]:
with patch(
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
return_value=None,
) as mock:
yield mock
def compare_documents(
actual_docs: list[Document], expected_docs: list[Document]
) -> None:
"""Utility function to compare actual and expected documents, ignoring order."""
actual_docs_dict = {doc.id: doc for doc in actual_docs}
expected_docs_dict = {doc.id: doc for doc in expected_docs}
assert actual_docs_dict.keys() == expected_docs_dict.keys(), "Document ID mismatch"
for doc_id in actual_docs_dict:
actual = actual_docs_dict[doc_id]
expected = expected_docs_dict[doc_id]
assert (
actual.source == expected.source
), f"Source mismatch for document {doc_id}"
assert (
actual.semantic_identifier == expected.semantic_identifier
), f"Semantic identifier mismatch for document {doc_id}"
assert (
actual.metadata == expected.metadata
), f"Metadata mismatch for document {doc_id}"
assert (
actual.doc_updated_at == expected.doc_updated_at
), f"Updated at mismatch for document {doc_id}"
assert (
actual.primary_owners == expected.primary_owners
), f"Primary owners mismatch for document {doc_id}"
assert (
actual.secondary_owners == expected.secondary_owners
), f"Secondary owners mismatch for document {doc_id}"
assert actual.title == expected.title, f"Title mismatch for document {doc_id}"
assert (
actual.from_ingestion_api == expected.from_ingestion_api
), f"Ingestion API flag mismatch for document {doc_id}"
assert (
actual.additional_info == expected.additional_info
), f"Additional info mismatch for document {doc_id}"
# Compare sections
assert len(actual.sections) == len(
expected.sections
), f"Number of sections mismatch for document {doc_id}"
for i, (actual_section, expected_section) in enumerate(
zip(actual.sections, expected.sections)
):
assert (
actual_section.text == expected_section.text
), f"Section {i} text mismatch for document {doc_id}"
assert (
actual_section.link == expected_section.link
), f"Section {i} link mismatch for document {doc_id}"
def test_airtable_connector_basic(
mock_get_api_key: MagicMock, airtable_config: AirtableConfig
) -> None:
"""Test behavior when all non-attachment fields are treated as metadata."""
connector = AirtableConnector(
base_id=airtable_config.base_id,
table_name_or_id=airtable_config.table_identifier,
treat_all_non_attachment_fields_as_metadata=False,
)
connector.load_credentials(
{
"airtable_access_token": airtable_config.access_token,
}
)
doc_batch_generator = connector.load_from_state()
doc_batch = next(doc_batch_generator)
with pytest.raises(StopIteration):
next(doc_batch_generator)
@ -119,15 +218,62 @@ def test_airtable_connector_basic(
description="The internet connection is very slow.",
priority="Medium",
status="In Progress",
# Link to another record is skipped for now
# category="Data Science",
ticket_id="2",
created_time="2024-12-24T21:02:49.000Z",
status_last_changed="2024-12-24T21:02:49.000Z",
days_since_status_change=0,
assignee="Chris Weaver (chris@onyx.app)",
submitted_by="Chris Weaver (chris@onyx.app)",
all_fields_as_metadata=False,
),
create_test_document(
id="reccSlIA4pZEFxPBg",
title="Printer Issue",
description="The office printer is not working.",
priority="High",
status="Open",
ticket_id="1",
created_time="2024-12-24T21:02:49.000Z",
status_last_changed="2024-12-24T21:02:49.000Z",
days_since_status_change=0,
assignee="Chris Weaver (chris@onyx.app)",
submitted_by="Chris Weaver (chris@onyx.app)",
attachments=[
(
"Test.pdf:\ntesting!!!",
"https://airtable.com/appCXJqDFS4gea8tn/tblRxFQsTlBBZdRY1/viwVUEJjWPd8XYjh8/reccSlIA4pZEFxPBg/fld1u21zkJACIvAEF/attlj2UBWNEDZngCc?blocks=hide",
)
],
all_fields_as_metadata=False,
),
]
# Compare documents using the utility function
compare_documents(doc_batch, expected_docs)
def test_airtable_connector_all_metadata(
mock_get_api_key: MagicMock, airtable_config: AirtableConfig
) -> None:
connector = AirtableConnector(
base_id=airtable_config.base_id,
table_name_or_id=airtable_config.table_identifier,
treat_all_non_attachment_fields_as_metadata=True,
)
connector.load_credentials(
{
"airtable_access_token": airtable_config.access_token,
}
)
doc_batch_generator = connector.load_from_state()
doc_batch = next(doc_batch_generator)
with pytest.raises(StopIteration):
next(doc_batch_generator)
# NOTE: one of the rows has no attachments -> no content -> no document
assert len(doc_batch) == 1
expected_docs = [
create_test_document(
id="reccSlIA4pZEFxPBg",
title="Printer Issue",
@ -149,50 +295,9 @@ def test_airtable_connector_basic(
"https://airtable.com/appCXJqDFS4gea8tn/tblRxFQsTlBBZdRY1/viwVUEJjWPd8XYjh8/reccSlIA4pZEFxPBg/fld1u21zkJACIvAEF/attlj2UBWNEDZngCc?blocks=hide",
)
],
all_fields_as_metadata=True,
),
]
# Compare each document field by field
for actual, expected in zip(doc_batch, expected_docs):
assert actual.id == expected.id, f"ID mismatch for document {actual.id}"
assert (
actual.source == expected.source
), f"Source mismatch for document {actual.id}"
assert (
actual.semantic_identifier == expected.semantic_identifier
), f"Semantic identifier mismatch for document {actual.id}"
assert (
actual.metadata == expected.metadata
), f"Metadata mismatch for document {actual.id}"
assert (
actual.doc_updated_at == expected.doc_updated_at
), f"Updated at mismatch for document {actual.id}"
assert (
actual.primary_owners == expected.primary_owners
), f"Primary owners mismatch for document {actual.id}"
assert (
actual.secondary_owners == expected.secondary_owners
), f"Secondary owners mismatch for document {actual.id}"
assert (
actual.title == expected.title
), f"Title mismatch for document {actual.id}"
assert (
actual.from_ingestion_api == expected.from_ingestion_api
), f"Ingestion API flag mismatch for document {actual.id}"
assert (
actual.additional_info == expected.additional_info
), f"Additional info mismatch for document {actual.id}"
# Compare sections
assert len(actual.sections) == len(
expected.sections
), f"Number of sections mismatch for document {actual.id}"
for i, (actual_section, expected_section) in enumerate(
zip(actual.sections, expected.sections)
):
assert (
actual_section.text == expected_section.text
), f"Section {i} text mismatch for document {actual.id}"
assert (
actual_section.link == expected_section.link
), f"Section {i} link mismatch for document {actual.id}"
# Compare documents using the utility function
compare_documents(doc_batch, expected_docs)

View File

@ -1106,6 +1106,14 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
name: "table_name_or_id",
optional: false,
},
{
type: "checkbox",
label: "Treat all fields except attachments as metadata",
name: "treat_all_non_attachment_fields_as_metadata",
description:
"Choose this if the primary content to index are attachments and all other columns are metadata for these attachments.",
optional: false,
},
],
advanced_values: [],
overrideDefaultFreq: 60 * 60 * 24,