feat: add option to treat all non-attachment fields as metadata in Airtable connector (#3817)

* feat: add option to treat all non-attachment fields as metadata in Airtable connector

- Added new UI option 'treat_all_non_attachment_fields_as_metadata'
- Updated backend logic to support treating all fields except attachments as metadata
- Added tests for both default and all-metadata behaviors

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: handle missing environment variables gracefully in airtable tests

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: clean up test file and handle environment variables properly

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: add missing test fixture and fix formatting

Co-Authored-By: Chris Weaver <chris@onyx.app>

* chore: fix black formatting

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: add type annotation for metadata dict in airtable tests

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: add type annotation for mock_get_api_key fixture

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: update Generator import to use collections.abc

Co-Authored-By: Chris Weaver <chris@onyx.app>

* refactor: make treat_all_non_attachment_fields_as_metadata a direct required parameter

- Move parameter from connector_config to direct class parameter
- Place parameter right under table_name_or_id argument
- Make parameter required in UI with no default value
- Update tests to use new parameter structure

Co-Authored-By: Chris Weaver <chris@onyx.app>

* chore: fix black formatting

Co-Authored-By: Chris Weaver <chris@onyx.app>

* chore: rename _METADATA_FIELD_TYPES to DEFAULT_METADATA_FIELD_TYPES and clarify usage

Co-Authored-By: Chris Weaver <chris@onyx.app>

* chore: fix black formatting in docstring

Co-Authored-By: Chris Weaver <chris@onyx.app>

* test: make airtable tests fail loudly on missing env vars

Co-Authored-By: Chris Weaver <chris@onyx.app>

* style: fix black formatting in test file

Co-Authored-By: Chris Weaver <chris@onyx.app>

* style: add required newline between test functions

Co-Authored-By: Chris Weaver <chris@onyx.app>

* test: update error message pattern in parameter validation test

Co-Authored-By: Chris Weaver <chris@onyx.app>

* style: fix black formatting in test file

Co-Authored-By: Chris Weaver <chris@onyx.app>

* test: fix error message pattern in parameter validation test

Co-Authored-By: Chris Weaver <chris@onyx.app>

* style: fix line length in test file

Co-Authored-By: Chris Weaver <chris@onyx.app>

* test: simplify error message pattern in parameter validation test

Co-Authored-By: Chris Weaver <chris@onyx.app>

* test: add type validation test for treat_all_non_attachment_fields_as_metadata

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: add missing required parameter in test

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: remove parameter from test to properly validate it is required

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: add type validation for treat_all_non_attachment_fields_as_metadata parameter

Co-Authored-By: Chris Weaver <chris@onyx.app>

* style: fix black formatting in airtable_connector.py

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: update type validation test to handle mypy errors

Co-Authored-By: Chris Weaver <chris@onyx.app>

* fix: specify mypy ignore type for call-arg

Co-Authored-By: Chris Weaver <chris@onyx.app>

* Also handle rows w/o sections

* style: fix black formatting in test assertion

Co-Authored-By: Chris Weaver <chris@onyx.app>

* add TODO

* Remove unnecessary check

* Fix test

* Do not break existing airtable connectors

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Chris Weaver <chris@onyx.app>
Co-authored-by: Weves <chrisweaver101@gmail.com>
This commit is contained in:
devin-ai-integration[bot]
2025-01-28 17:28:32 -08:00
committed by GitHub
parent d2aea63573
commit d903e5912a
3 changed files with 233 additions and 105 deletions

View File

@@ -20,9 +20,9 @@ from onyx.utils.logger import setup_logger
logger = setup_logger()
# NOTE: all are made lowercase to avoid case sensitivity issues
# these are the field types that are considered metadata rather
# than sections
_METADATA_FIELD_TYPES = {
# These field types are considered metadata by default when
# treat_all_non_attachment_fields_as_metadata is False
DEFAULT_METADATA_FIELD_TYPES = {
"singlecollaborator",
"collaborator",
"createdby",
@@ -60,12 +60,16 @@ class AirtableConnector(LoadConnector):
self,
base_id: str,
table_name_or_id: str,
treat_all_non_attachment_fields_as_metadata: bool = False,
batch_size: int = INDEX_BATCH_SIZE,
) -> None:
self.base_id = base_id
self.table_name_or_id = table_name_or_id
self.batch_size = batch_size
self.airtable_client: AirtableApi | None = None
self.treat_all_non_attachment_fields_as_metadata = (
treat_all_non_attachment_fields_as_metadata
)
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
self.airtable_client = AirtableApi(credentials["airtable_access_token"])
@@ -166,8 +170,14 @@ class AirtableConnector(LoadConnector):
return [(str(field_info), default_link)]
def _should_be_metadata(self, field_type: str) -> bool:
"""Determine if a field type should be treated as metadata."""
return field_type.lower() in _METADATA_FIELD_TYPES
"""Determine if a field type should be treated as metadata.
When treat_all_non_attachment_fields_as_metadata is True, all fields except
attachments are treated as metadata. Otherwise, only fields with types listed
in DEFAULT_METADATA_FIELD_TYPES are treated as metadata."""
if self.treat_all_non_attachment_fields_as_metadata:
return field_type.lower() != "multipleattachments"
return field_type.lower() in DEFAULT_METADATA_FIELD_TYPES
def _process_field(
self,
@@ -233,7 +243,7 @@ class AirtableConnector(LoadConnector):
record: RecordDict,
table_schema: TableSchema,
primary_field_name: str | None,
) -> Document:
) -> Document | None:
"""Process a single Airtable record into a Document.
Args:
@@ -277,6 +287,10 @@ class AirtableConnector(LoadConnector):
sections.extend(field_sections)
metadata.update(field_metadata)
if not sections:
logger.warning(f"No sections found for record {record_id}")
return None
semantic_id = (
f"{table_name}: {primary_field_value}"
if primary_field_value
@@ -320,7 +334,8 @@ class AirtableConnector(LoadConnector):
table_schema=table_schema,
primary_field_name=primary_field_name,
)
record_documents.append(document)
if document:
record_documents.append(document)
if len(record_documents) >= self.batch_size:
yield record_documents