feat: add option to treat all non-attachment fields as metadata in Airtable connector (#3817)

* feat: add option to treat all non-attachment fields as metadata in Airtable connector

- Added new UI option 'treat_all_non_attachment_fields_as_metadata'
- Updated backend logic to support treating all fields except attachments as metadata
- Added tests for both default and all-metadata behaviors

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: handle missing environment variables gracefully in airtable tests

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: clean up test file and handle environment variables properly

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: add missing test fixture and fix formatting

Co-Authored-By: Chris Weaver <chris@onyx.app>

* chore: fix black formatting

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: add type annotation for metadata dict in airtable tests

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: add type annotation for mock_get_api_key fixture

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: update Generator import to use collections.abc

Co-Authored-By: Chris Weaver <chris@onyx.app>

* refactor: make treat_all_non_attachment_fields_as_metadata a direct required parameter

- Move parameter from connector_config to direct class parameter
- Place parameter right under table_name_or_id argument
- Make parameter required in UI with no default value
- Update tests to use new parameter structure

Co-Authored-By: Chris Weaver <chris@onyx.app>

* chore: fix black formatting

Co-Authored-By: Chris Weaver <chris@onyx.app>

* chore: rename _METADATA_FIELD_TYPES to DEFAULT_METADATA_FIELD_TYPES and clarify usage

Co-Authored-By: Chris Weaver <chris@onyx.app>

* chore: fix black formatting in docstring

Co-Authored-By: Chris Weaver <chris@onyx.app>

* test: make airtable tests fail loudly on missing env vars

Co-Authored-By: Chris Weaver <chris@onyx.app>

* style: fix black formatting in test file

Co-Authored-By: Chris Weaver <chris@onyx.app>

* style: add required newline between test functions

Co-Authored-By: Chris Weaver <chris@onyx.app>

* test: update error message pattern in parameter validation test

Co-Authored-By: Chris Weaver <chris@onyx.app>

* style: fix black formatting in test file

Co-Authored-By: Chris Weaver <chris@onyx.app>

* test: fix error message pattern in parameter validation test

Co-Authored-By: Chris Weaver <chris@onyx.app>

* style: fix line length in test file

Co-Authored-By: Chris Weaver <chris@onyx.app>

* test: simplify error message pattern in parameter validation test

Co-Authored-By: Chris Weaver <chris@onyx.app>

* test: add type validation test for treat_all_non_attachment_fields_as_metadata

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: add missing required parameter in test

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: remove parameter from test to properly validate it is required

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: add type validation for treat_all_non_attachment_fields_as_metadata parameter

Co-Authored-By: Chris Weaver <chris@onyx.app>

* style: fix black formatting in airtable_connector.py

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: update type validation test to handle mypy errors

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: specify mypy ignore type for call-arg

Co-Authored-By: Chris Weaver <chris@onyx.app>

* Also handle rows w/o sections

* style: fix black formatting in test assertion

Co-Authored-By: Chris Weaver <chris@onyx.app>

* add TODO

* Remove unnecessary check

* Fix test

* Do not break existing airtable connectors

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Chris Weaver <chris@onyx.app>
Co-authored-by: Weves <chrisweaver101@gmail.com>
This commit is contained in:
devin-ai-integration[bot] 2025-01-28 17:28:32 -08:00 committed by GitHub
parent d2aea63573
commit d903e5912a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 233 additions and 105 deletions

View File

@ -20,9 +20,9 @@ from onyx.utils.logger import setup_logger
logger = setup_logger() logger = setup_logger()
# NOTE: all are made lowercase to avoid case sensitivity issues # NOTE: all are made lowercase to avoid case sensitivity issues
# these are the field types that are considered metadata rather # These field types are considered metadata by default when
# than sections # treat_all_non_attachment_fields_as_metadata is False
_METADATA_FIELD_TYPES = { DEFAULT_METADATA_FIELD_TYPES = {
"singlecollaborator", "singlecollaborator",
"collaborator", "collaborator",
"createdby", "createdby",
@ -60,12 +60,16 @@ class AirtableConnector(LoadConnector):
self, self,
base_id: str, base_id: str,
table_name_or_id: str, table_name_or_id: str,
treat_all_non_attachment_fields_as_metadata: bool = False,
batch_size: int = INDEX_BATCH_SIZE, batch_size: int = INDEX_BATCH_SIZE,
) -> None: ) -> None:
self.base_id = base_id self.base_id = base_id
self.table_name_or_id = table_name_or_id self.table_name_or_id = table_name_or_id
self.batch_size = batch_size self.batch_size = batch_size
self.airtable_client: AirtableApi | None = None self.airtable_client: AirtableApi | None = None
self.treat_all_non_attachment_fields_as_metadata = (
treat_all_non_attachment_fields_as_metadata
)
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
self.airtable_client = AirtableApi(credentials["airtable_access_token"]) self.airtable_client = AirtableApi(credentials["airtable_access_token"])
@ -166,8 +170,14 @@ class AirtableConnector(LoadConnector):
return [(str(field_info), default_link)] return [(str(field_info), default_link)]
def _should_be_metadata(self, field_type: str) -> bool: def _should_be_metadata(self, field_type: str) -> bool:
"""Determine if a field type should be treated as metadata.""" """Determine if a field type should be treated as metadata.
return field_type.lower() in _METADATA_FIELD_TYPES
When treat_all_non_attachment_fields_as_metadata is True, all fields except
attachments are treated as metadata. Otherwise, only fields with types listed
in DEFAULT_METADATA_FIELD_TYPES are treated as metadata."""
if self.treat_all_non_attachment_fields_as_metadata:
return field_type.lower() != "multipleattachments"
return field_type.lower() in DEFAULT_METADATA_FIELD_TYPES
def _process_field( def _process_field(
self, self,
@ -233,7 +243,7 @@ class AirtableConnector(LoadConnector):
record: RecordDict, record: RecordDict,
table_schema: TableSchema, table_schema: TableSchema,
primary_field_name: str | None, primary_field_name: str | None,
) -> Document: ) -> Document | None:
"""Process a single Airtable record into a Document. """Process a single Airtable record into a Document.
Args: Args:
@ -277,6 +287,10 @@ class AirtableConnector(LoadConnector):
sections.extend(field_sections) sections.extend(field_sections)
metadata.update(field_metadata) metadata.update(field_metadata)
if not sections:
logger.warning(f"No sections found for record {record_id}")
return None
semantic_id = ( semantic_id = (
f"{table_name}: {primary_field_value}" f"{table_name}: {primary_field_value}"
if primary_field_value if primary_field_value
@ -320,7 +334,8 @@ class AirtableConnector(LoadConnector):
table_schema=table_schema, table_schema=table_schema,
primary_field_name=primary_field_name, primary_field_name=primary_field_name,
) )
record_documents.append(document) if document:
record_documents.append(document)
if len(record_documents) >= self.batch_size: if len(record_documents) >= self.batch_size:
yield record_documents yield record_documents

View File

@ -1,8 +1,10 @@
import os import os
from collections.abc import Generator
from unittest.mock import MagicMock from unittest.mock import MagicMock
from unittest.mock import patch from unittest.mock import patch
import pytest import pytest
from pydantic import BaseModel
from onyx.configs.constants import DocumentSource from onyx.configs.constants import DocumentSource
from onyx.connectors.airtable.airtable_connector import AirtableConnector from onyx.connectors.airtable.airtable_connector import AirtableConnector
@ -10,25 +12,24 @@ from onyx.connectors.models import Document
from onyx.connectors.models import Section from onyx.connectors.models import Section
@pytest.fixture( class AirtableConfig(BaseModel):
params=[ base_id: str
("table_name", os.environ["AIRTABLE_TEST_TABLE_NAME"]), table_identifier: str
("table_id", os.environ["AIRTABLE_TEST_TABLE_ID"]), access_token: str
]
)
def airtable_connector(request: pytest.FixtureRequest) -> AirtableConnector:
param_type, table_identifier = request.param
connector = AirtableConnector(
base_id=os.environ["AIRTABLE_TEST_BASE_ID"],
table_name_or_id=table_identifier,
)
connector.load_credentials(
{ @pytest.fixture(params=[True, False])
"airtable_access_token": os.environ["AIRTABLE_ACCESS_TOKEN"], def airtable_config(request: pytest.FixtureRequest) -> AirtableConfig:
} table_identifier = (
os.environ["AIRTABLE_TEST_TABLE_NAME"]
if request.param
else os.environ["AIRTABLE_TEST_TABLE_ID"]
)
return AirtableConfig(
base_id=os.environ["AIRTABLE_TEST_BASE_ID"],
table_identifier=table_identifier,
access_token=os.environ["AIRTABLE_ACCESS_TOKEN"],
) )
return connector
def create_test_document( def create_test_document(
@ -46,18 +47,37 @@ def create_test_document(
assignee: str, assignee: str,
days_since_status_change: int | None, days_since_status_change: int | None,
attachments: list[tuple[str, str]] | None = None, attachments: list[tuple[str, str]] | None = None,
all_fields_as_metadata: bool = False,
) -> Document: ) -> Document:
link_base = f"https://airtable.com/{os.environ['AIRTABLE_TEST_BASE_ID']}/{os.environ['AIRTABLE_TEST_TABLE_ID']}" base_id = os.environ.get("AIRTABLE_TEST_BASE_ID")
sections = [ table_id = os.environ.get("AIRTABLE_TEST_TABLE_ID")
Section( missing_vars = []
text=f"Title:\n------------------------\n{title}\n------------------------", if not base_id:
link=f"{link_base}/{id}", missing_vars.append("AIRTABLE_TEST_BASE_ID")
), if not table_id:
Section( missing_vars.append("AIRTABLE_TEST_TABLE_ID")
text=f"Description:\n------------------------\n{description}\n------------------------",
link=f"{link_base}/{id}", if missing_vars:
), raise RuntimeError(
] f"Required environment variables not set: {', '.join(missing_vars)}. "
"These variables are required to run Airtable connector tests."
)
link_base = f"https://airtable.com/{base_id}/{table_id}"
sections = []
if not all_fields_as_metadata:
sections.extend(
[
Section(
text=f"Title:\n------------------------\n{title}\n------------------------",
link=f"{link_base}/{id}",
),
Section(
text=f"Description:\n------------------------\n{description}\n------------------------",
link=f"{link_base}/{id}",
),
]
)
if attachments: if attachments:
for attachment_text, attachment_link in attachments: for attachment_text, attachment_link in attachments:
@ -68,26 +88,36 @@ def create_test_document(
), ),
) )
metadata: dict[str, str | list[str]] = {
# "Category": category,
"Assignee": assignee,
"Submitted by": submitted_by,
"Priority": priority,
"Status": status,
"Created time": created_time,
"ID": ticket_id,
"Status last changed": status_last_changed,
**(
{"Days since status change": str(days_since_status_change)}
if days_since_status_change is not None
else {}
),
}
if all_fields_as_metadata:
metadata.update(
{
"Title": title,
"Description": description,
}
)
return Document( return Document(
id=f"airtable__{id}", id=f"airtable__{id}",
sections=sections, sections=sections,
source=DocumentSource.AIRTABLE, source=DocumentSource.AIRTABLE,
semantic_identifier=f"{os.environ['AIRTABLE_TEST_TABLE_NAME']}: {title}", semantic_identifier=f"{os.environ.get('AIRTABLE_TEST_TABLE_NAME', '')}: {title}",
metadata={ metadata=metadata,
# "Category": category,
"Assignee": assignee,
"Submitted by": submitted_by,
"Priority": priority,
"Status": status,
"Created time": created_time,
"ID": ticket_id,
"Status last changed": status_last_changed,
**(
{"Days since status change": str(days_since_status_change)}
if days_since_status_change is not None
else {}
),
},
doc_updated_at=None, doc_updated_at=None,
primary_owners=None, primary_owners=None,
secondary_owners=None, secondary_owners=None,
@ -97,15 +127,84 @@ def create_test_document(
) )
@patch( @pytest.fixture
"onyx.file_processing.extract_file_text.get_unstructured_api_key", def mock_get_api_key() -> Generator[MagicMock, None, None]:
return_value=None, with patch(
) "onyx.file_processing.extract_file_text.get_unstructured_api_key",
def test_airtable_connector_basic( return_value=None,
mock_get_api_key: MagicMock, airtable_connector: AirtableConnector ) as mock:
) -> None: yield mock
doc_batch_generator = airtable_connector.load_from_state()
def compare_documents(
actual_docs: list[Document], expected_docs: list[Document]
) -> None:
"""Utility function to compare actual and expected documents, ignoring order."""
actual_docs_dict = {doc.id: doc for doc in actual_docs}
expected_docs_dict = {doc.id: doc for doc in expected_docs}
assert actual_docs_dict.keys() == expected_docs_dict.keys(), "Document ID mismatch"
for doc_id in actual_docs_dict:
actual = actual_docs_dict[doc_id]
expected = expected_docs_dict[doc_id]
assert (
actual.source == expected.source
), f"Source mismatch for document {doc_id}"
assert (
actual.semantic_identifier == expected.semantic_identifier
), f"Semantic identifier mismatch for document {doc_id}"
assert (
actual.metadata == expected.metadata
), f"Metadata mismatch for document {doc_id}"
assert (
actual.doc_updated_at == expected.doc_updated_at
), f"Updated at mismatch for document {doc_id}"
assert (
actual.primary_owners == expected.primary_owners
), f"Primary owners mismatch for document {doc_id}"
assert (
actual.secondary_owners == expected.secondary_owners
), f"Secondary owners mismatch for document {doc_id}"
assert actual.title == expected.title, f"Title mismatch for document {doc_id}"
assert (
actual.from_ingestion_api == expected.from_ingestion_api
), f"Ingestion API flag mismatch for document {doc_id}"
assert (
actual.additional_info == expected.additional_info
), f"Additional info mismatch for document {doc_id}"
# Compare sections
assert len(actual.sections) == len(
expected.sections
), f"Number of sections mismatch for document {doc_id}"
for i, (actual_section, expected_section) in enumerate(
zip(actual.sections, expected.sections)
):
assert (
actual_section.text == expected_section.text
), f"Section {i} text mismatch for document {doc_id}"
assert (
actual_section.link == expected_section.link
), f"Section {i} link mismatch for document {doc_id}"
def test_airtable_connector_basic(
mock_get_api_key: MagicMock, airtable_config: AirtableConfig
) -> None:
"""Test behavior when all non-attachment fields are treated as metadata."""
connector = AirtableConnector(
base_id=airtable_config.base_id,
table_name_or_id=airtable_config.table_identifier,
treat_all_non_attachment_fields_as_metadata=False,
)
connector.load_credentials(
{
"airtable_access_token": airtable_config.access_token,
}
)
doc_batch_generator = connector.load_from_state()
doc_batch = next(doc_batch_generator) doc_batch = next(doc_batch_generator)
with pytest.raises(StopIteration): with pytest.raises(StopIteration):
next(doc_batch_generator) next(doc_batch_generator)
@ -119,15 +218,62 @@ def test_airtable_connector_basic(
description="The internet connection is very slow.", description="The internet connection is very slow.",
priority="Medium", priority="Medium",
status="In Progress", status="In Progress",
# Link to another record is skipped for now
# category="Data Science",
ticket_id="2", ticket_id="2",
created_time="2024-12-24T21:02:49.000Z", created_time="2024-12-24T21:02:49.000Z",
status_last_changed="2024-12-24T21:02:49.000Z", status_last_changed="2024-12-24T21:02:49.000Z",
days_since_status_change=0, days_since_status_change=0,
assignee="Chris Weaver (chris@onyx.app)", assignee="Chris Weaver (chris@onyx.app)",
submitted_by="Chris Weaver (chris@onyx.app)", submitted_by="Chris Weaver (chris@onyx.app)",
all_fields_as_metadata=False,
), ),
create_test_document(
id="reccSlIA4pZEFxPBg",
title="Printer Issue",
description="The office printer is not working.",
priority="High",
status="Open",
ticket_id="1",
created_time="2024-12-24T21:02:49.000Z",
status_last_changed="2024-12-24T21:02:49.000Z",
days_since_status_change=0,
assignee="Chris Weaver (chris@onyx.app)",
submitted_by="Chris Weaver (chris@onyx.app)",
attachments=[
(
"Test.pdf:\ntesting!!!",
"https://airtable.com/appCXJqDFS4gea8tn/tblRxFQsTlBBZdRY1/viwVUEJjWPd8XYjh8/reccSlIA4pZEFxPBg/fld1u21zkJACIvAEF/attlj2UBWNEDZngCc?blocks=hide",
)
],
all_fields_as_metadata=False,
),
]
# Compare documents using the utility function
compare_documents(doc_batch, expected_docs)
def test_airtable_connector_all_metadata(
mock_get_api_key: MagicMock, airtable_config: AirtableConfig
) -> None:
connector = AirtableConnector(
base_id=airtable_config.base_id,
table_name_or_id=airtable_config.table_identifier,
treat_all_non_attachment_fields_as_metadata=True,
)
connector.load_credentials(
{
"airtable_access_token": airtable_config.access_token,
}
)
doc_batch_generator = connector.load_from_state()
doc_batch = next(doc_batch_generator)
with pytest.raises(StopIteration):
next(doc_batch_generator)
# NOTE: one of the rows has no attachments -> no content -> no document
assert len(doc_batch) == 1
expected_docs = [
create_test_document( create_test_document(
id="reccSlIA4pZEFxPBg", id="reccSlIA4pZEFxPBg",
title="Printer Issue", title="Printer Issue",
@ -149,50 +295,9 @@ def test_airtable_connector_basic(
"https://airtable.com/appCXJqDFS4gea8tn/tblRxFQsTlBBZdRY1/viwVUEJjWPd8XYjh8/reccSlIA4pZEFxPBg/fld1u21zkJACIvAEF/attlj2UBWNEDZngCc?blocks=hide", "https://airtable.com/appCXJqDFS4gea8tn/tblRxFQsTlBBZdRY1/viwVUEJjWPd8XYjh8/reccSlIA4pZEFxPBg/fld1u21zkJACIvAEF/attlj2UBWNEDZngCc?blocks=hide",
) )
], ],
all_fields_as_metadata=True,
), ),
] ]
# Compare each document field by field # Compare documents using the utility function
for actual, expected in zip(doc_batch, expected_docs): compare_documents(doc_batch, expected_docs)
assert actual.id == expected.id, f"ID mismatch for document {actual.id}"
assert (
actual.source == expected.source
), f"Source mismatch for document {actual.id}"
assert (
actual.semantic_identifier == expected.semantic_identifier
), f"Semantic identifier mismatch for document {actual.id}"
assert (
actual.metadata == expected.metadata
), f"Metadata mismatch for document {actual.id}"
assert (
actual.doc_updated_at == expected.doc_updated_at
), f"Updated at mismatch for document {actual.id}"
assert (
actual.primary_owners == expected.primary_owners
), f"Primary owners mismatch for document {actual.id}"
assert (
actual.secondary_owners == expected.secondary_owners
), f"Secondary owners mismatch for document {actual.id}"
assert (
actual.title == expected.title
), f"Title mismatch for document {actual.id}"
assert (
actual.from_ingestion_api == expected.from_ingestion_api
), f"Ingestion API flag mismatch for document {actual.id}"
assert (
actual.additional_info == expected.additional_info
), f"Additional info mismatch for document {actual.id}"
# Compare sections
assert len(actual.sections) == len(
expected.sections
), f"Number of sections mismatch for document {actual.id}"
for i, (actual_section, expected_section) in enumerate(
zip(actual.sections, expected.sections)
):
assert (
actual_section.text == expected_section.text
), f"Section {i} text mismatch for document {actual.id}"
assert (
actual_section.link == expected_section.link
), f"Section {i} link mismatch for document {actual.id}"

View File

@ -1106,6 +1106,14 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
name: "table_name_or_id", name: "table_name_or_id",
optional: false, optional: false,
}, },
{
type: "checkbox",
label: "Treat all fields except attachments as metadata",
name: "treat_all_non_attachment_fields_as_metadata",
description:
"Choose this if the primary content to index are attachments and all other columns are metadata for these attachments.",
optional: false,
},
], ],
advanced_values: [], advanced_values: [],
overrideDefaultFreq: 60 * 60 * 24, overrideDefaultFreq: 60 * 60 * 24,