diff --git a/backend/onyx/connectors/airtable/airtable_connector.py b/backend/onyx/connectors/airtable/airtable_connector.py index 898fb0f311..777f2137fc 100644 --- a/backend/onyx/connectors/airtable/airtable_connector.py +++ b/backend/onyx/connectors/airtable/airtable_connector.py @@ -20,9 +20,9 @@ from onyx.utils.logger import setup_logger logger = setup_logger() # NOTE: all are made lowercase to avoid case sensitivity issues -# these are the field types that are considered metadata rather -# than sections -_METADATA_FIELD_TYPES = { +# These field types are considered metadata by default when +# treat_all_non_attachment_fields_as_metadata is False +DEFAULT_METADATA_FIELD_TYPES = { "singlecollaborator", "collaborator", "createdby", @@ -60,12 +60,16 @@ class AirtableConnector(LoadConnector): self, base_id: str, table_name_or_id: str, + treat_all_non_attachment_fields_as_metadata: bool = False, batch_size: int = INDEX_BATCH_SIZE, ) -> None: self.base_id = base_id self.table_name_or_id = table_name_or_id self.batch_size = batch_size self.airtable_client: AirtableApi | None = None + self.treat_all_non_attachment_fields_as_metadata = ( + treat_all_non_attachment_fields_as_metadata + ) def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: self.airtable_client = AirtableApi(credentials["airtable_access_token"]) @@ -166,8 +170,14 @@ class AirtableConnector(LoadConnector): return [(str(field_info), default_link)] def _should_be_metadata(self, field_type: str) -> bool: - """Determine if a field type should be treated as metadata.""" - return field_type.lower() in _METADATA_FIELD_TYPES + """Determine if a field type should be treated as metadata. + + When treat_all_non_attachment_fields_as_metadata is True, all fields except + attachments are treated as metadata. Otherwise, only fields with types listed + in DEFAULT_METADATA_FIELD_TYPES are treated as metadata.""" + if self.treat_all_non_attachment_fields_as_metadata: + return field_type.lower() != "multipleattachments" + return field_type.lower() in DEFAULT_METADATA_FIELD_TYPES def _process_field( self, @@ -233,7 +243,7 @@ class AirtableConnector(LoadConnector): record: RecordDict, table_schema: TableSchema, primary_field_name: str | None, - ) -> Document: + ) -> Document | None: """Process a single Airtable record into a Document. Args: @@ -277,6 +287,10 @@ class AirtableConnector(LoadConnector): sections.extend(field_sections) metadata.update(field_metadata) + if not sections: + logger.warning(f"No sections found for record {record_id}") + return None + semantic_id = ( f"{table_name}: {primary_field_value}" if primary_field_value @@ -320,7 +334,8 @@ class AirtableConnector(LoadConnector): table_schema=table_schema, primary_field_name=primary_field_name, ) - record_documents.append(document) + if document: + record_documents.append(document) if len(record_documents) >= self.batch_size: yield record_documents diff --git a/backend/tests/daily/connectors/airtable/test_airtable_basic.py b/backend/tests/daily/connectors/airtable/test_airtable_basic.py index 078c7206ea..bb5e312f13 100644 --- a/backend/tests/daily/connectors/airtable/test_airtable_basic.py +++ b/backend/tests/daily/connectors/airtable/test_airtable_basic.py @@ -1,8 +1,10 @@ import os +from collections.abc import Generator from unittest.mock import MagicMock from unittest.mock import patch import pytest +from pydantic import BaseModel from onyx.configs.constants import DocumentSource from onyx.connectors.airtable.airtable_connector import AirtableConnector @@ -10,25 +12,24 @@ from onyx.connectors.models import Document from onyx.connectors.models import Section -@pytest.fixture( - params=[ - ("table_name", os.environ["AIRTABLE_TEST_TABLE_NAME"]), - ("table_id", os.environ["AIRTABLE_TEST_TABLE_ID"]), - ] -) -def airtable_connector(request: pytest.FixtureRequest) -> AirtableConnector: - param_type, table_identifier = request.param - connector = AirtableConnector( - base_id=os.environ["AIRTABLE_TEST_BASE_ID"], - table_name_or_id=table_identifier, - ) +class AirtableConfig(BaseModel): + base_id: str + table_identifier: str + access_token: str - connector.load_credentials( - { - "airtable_access_token": os.environ["AIRTABLE_ACCESS_TOKEN"], - } + +@pytest.fixture(params=[True, False]) +def airtable_config(request: pytest.FixtureRequest) -> AirtableConfig: + table_identifier = ( + os.environ["AIRTABLE_TEST_TABLE_NAME"] + if request.param + else os.environ["AIRTABLE_TEST_TABLE_ID"] + ) + return AirtableConfig( + base_id=os.environ["AIRTABLE_TEST_BASE_ID"], + table_identifier=table_identifier, + access_token=os.environ["AIRTABLE_ACCESS_TOKEN"], ) - return connector def create_test_document( @@ -46,18 +47,37 @@ def create_test_document( assignee: str, days_since_status_change: int | None, attachments: list[tuple[str, str]] | None = None, + all_fields_as_metadata: bool = False, ) -> Document: - link_base = f"https://airtable.com/{os.environ['AIRTABLE_TEST_BASE_ID']}/{os.environ['AIRTABLE_TEST_TABLE_ID']}" - sections = [ - Section( - text=f"Title:\n------------------------\n{title}\n------------------------", - link=f"{link_base}/{id}", - ), - Section( - text=f"Description:\n------------------------\n{description}\n------------------------", - link=f"{link_base}/{id}", - ), - ] + base_id = os.environ.get("AIRTABLE_TEST_BASE_ID") + table_id = os.environ.get("AIRTABLE_TEST_TABLE_ID") + missing_vars = [] + if not base_id: + missing_vars.append("AIRTABLE_TEST_BASE_ID") + if not table_id: + missing_vars.append("AIRTABLE_TEST_TABLE_ID") + + if missing_vars: + raise RuntimeError( + f"Required environment variables not set: {', '.join(missing_vars)}. " + "These variables are required to run Airtable connector tests." + ) + link_base = f"https://airtable.com/{base_id}/{table_id}" + sections = [] + + if not all_fields_as_metadata: + sections.extend( + [ + Section( + text=f"Title:\n------------------------\n{title}\n------------------------", + link=f"{link_base}/{id}", + ), + Section( + text=f"Description:\n------------------------\n{description}\n------------------------", + link=f"{link_base}/{id}", + ), + ] + ) if attachments: for attachment_text, attachment_link in attachments: @@ -68,26 +88,36 @@ def create_test_document( ), ) + metadata: dict[str, str | list[str]] = { + # "Category": category, + "Assignee": assignee, + "Submitted by": submitted_by, + "Priority": priority, + "Status": status, + "Created time": created_time, + "ID": ticket_id, + "Status last changed": status_last_changed, + **( + {"Days since status change": str(days_since_status_change)} + if days_since_status_change is not None + else {} + ), + } + + if all_fields_as_metadata: + metadata.update( + { + "Title": title, + "Description": description, + } + ) + return Document( id=f"airtable__{id}", sections=sections, source=DocumentSource.AIRTABLE, - semantic_identifier=f"{os.environ['AIRTABLE_TEST_TABLE_NAME']}: {title}", - metadata={ - # "Category": category, - "Assignee": assignee, - "Submitted by": submitted_by, - "Priority": priority, - "Status": status, - "Created time": created_time, - "ID": ticket_id, - "Status last changed": status_last_changed, - **( - {"Days since status change": str(days_since_status_change)} - if days_since_status_change is not None - else {} - ), - }, + semantic_identifier=f"{os.environ.get('AIRTABLE_TEST_TABLE_NAME', '')}: {title}", + metadata=metadata, doc_updated_at=None, primary_owners=None, secondary_owners=None, @@ -97,15 +127,84 @@ def create_test_document( ) -@patch( - "onyx.file_processing.extract_file_text.get_unstructured_api_key", - return_value=None, -) -def test_airtable_connector_basic( - mock_get_api_key: MagicMock, airtable_connector: AirtableConnector -) -> None: - doc_batch_generator = airtable_connector.load_from_state() +@pytest.fixture +def mock_get_api_key() -> Generator[MagicMock, None, None]: + with patch( + "onyx.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, + ) as mock: + yield mock + +def compare_documents( + actual_docs: list[Document], expected_docs: list[Document] +) -> None: + """Utility function to compare actual and expected documents, ignoring order.""" + actual_docs_dict = {doc.id: doc for doc in actual_docs} + expected_docs_dict = {doc.id: doc for doc in expected_docs} + + assert actual_docs_dict.keys() == expected_docs_dict.keys(), "Document ID mismatch" + + for doc_id in actual_docs_dict: + actual = actual_docs_dict[doc_id] + expected = expected_docs_dict[doc_id] + + assert ( + actual.source == expected.source + ), f"Source mismatch for document {doc_id}" + assert ( + actual.semantic_identifier == expected.semantic_identifier + ), f"Semantic identifier mismatch for document {doc_id}" + assert ( + actual.metadata == expected.metadata + ), f"Metadata mismatch for document {doc_id}" + assert ( + actual.doc_updated_at == expected.doc_updated_at + ), f"Updated at mismatch for document {doc_id}" + assert ( + actual.primary_owners == expected.primary_owners + ), f"Primary owners mismatch for document {doc_id}" + assert ( + actual.secondary_owners == expected.secondary_owners + ), f"Secondary owners mismatch for document {doc_id}" + assert actual.title == expected.title, f"Title mismatch for document {doc_id}" + assert ( + actual.from_ingestion_api == expected.from_ingestion_api + ), f"Ingestion API flag mismatch for document {doc_id}" + assert ( + actual.additional_info == expected.additional_info + ), f"Additional info mismatch for document {doc_id}" + + # Compare sections + assert len(actual.sections) == len( + expected.sections + ), f"Number of sections mismatch for document {doc_id}" + for i, (actual_section, expected_section) in enumerate( + zip(actual.sections, expected.sections) + ): + assert ( + actual_section.text == expected_section.text + ), f"Section {i} text mismatch for document {doc_id}" + assert ( + actual_section.link == expected_section.link + ), f"Section {i} link mismatch for document {doc_id}" + + +def test_airtable_connector_basic( + mock_get_api_key: MagicMock, airtable_config: AirtableConfig +) -> None: + """Test behavior when all non-attachment fields are treated as metadata.""" + connector = AirtableConnector( + base_id=airtable_config.base_id, + table_name_or_id=airtable_config.table_identifier, + treat_all_non_attachment_fields_as_metadata=False, + ) + connector.load_credentials( + { + "airtable_access_token": airtable_config.access_token, + } + ) + doc_batch_generator = connector.load_from_state() doc_batch = next(doc_batch_generator) with pytest.raises(StopIteration): next(doc_batch_generator) @@ -119,15 +218,62 @@ def test_airtable_connector_basic( description="The internet connection is very slow.", priority="Medium", status="In Progress", - # Link to another record is skipped for now - # category="Data Science", ticket_id="2", created_time="2024-12-24T21:02:49.000Z", status_last_changed="2024-12-24T21:02:49.000Z", days_since_status_change=0, assignee="Chris Weaver (chris@onyx.app)", submitted_by="Chris Weaver (chris@onyx.app)", + all_fields_as_metadata=False, ), + create_test_document( + id="reccSlIA4pZEFxPBg", + title="Printer Issue", + description="The office printer is not working.", + priority="High", + status="Open", + ticket_id="1", + created_time="2024-12-24T21:02:49.000Z", + status_last_changed="2024-12-24T21:02:49.000Z", + days_since_status_change=0, + assignee="Chris Weaver (chris@onyx.app)", + submitted_by="Chris Weaver (chris@onyx.app)", + attachments=[ + ( + "Test.pdf:\ntesting!!!", + "https://airtable.com/appCXJqDFS4gea8tn/tblRxFQsTlBBZdRY1/viwVUEJjWPd8XYjh8/reccSlIA4pZEFxPBg/fld1u21zkJACIvAEF/attlj2UBWNEDZngCc?blocks=hide", + ) + ], + all_fields_as_metadata=False, + ), + ] + + # Compare documents using the utility function + compare_documents(doc_batch, expected_docs) + + +def test_airtable_connector_all_metadata( + mock_get_api_key: MagicMock, airtable_config: AirtableConfig +) -> None: + connector = AirtableConnector( + base_id=airtable_config.base_id, + table_name_or_id=airtable_config.table_identifier, + treat_all_non_attachment_fields_as_metadata=True, + ) + connector.load_credentials( + { + "airtable_access_token": airtable_config.access_token, + } + ) + doc_batch_generator = connector.load_from_state() + doc_batch = next(doc_batch_generator) + with pytest.raises(StopIteration): + next(doc_batch_generator) + + # NOTE: one of the rows has no attachments -> no content -> no document + assert len(doc_batch) == 1 + + expected_docs = [ create_test_document( id="reccSlIA4pZEFxPBg", title="Printer Issue", @@ -149,50 +295,9 @@ def test_airtable_connector_basic( "https://airtable.com/appCXJqDFS4gea8tn/tblRxFQsTlBBZdRY1/viwVUEJjWPd8XYjh8/reccSlIA4pZEFxPBg/fld1u21zkJACIvAEF/attlj2UBWNEDZngCc?blocks=hide", ) ], + all_fields_as_metadata=True, ), ] - # Compare each document field by field - for actual, expected in zip(doc_batch, expected_docs): - assert actual.id == expected.id, f"ID mismatch for document {actual.id}" - assert ( - actual.source == expected.source - ), f"Source mismatch for document {actual.id}" - assert ( - actual.semantic_identifier == expected.semantic_identifier - ), f"Semantic identifier mismatch for document {actual.id}" - assert ( - actual.metadata == expected.metadata - ), f"Metadata mismatch for document {actual.id}" - assert ( - actual.doc_updated_at == expected.doc_updated_at - ), f"Updated at mismatch for document {actual.id}" - assert ( - actual.primary_owners == expected.primary_owners - ), f"Primary owners mismatch for document {actual.id}" - assert ( - actual.secondary_owners == expected.secondary_owners - ), f"Secondary owners mismatch for document {actual.id}" - assert ( - actual.title == expected.title - ), f"Title mismatch for document {actual.id}" - assert ( - actual.from_ingestion_api == expected.from_ingestion_api - ), f"Ingestion API flag mismatch for document {actual.id}" - assert ( - actual.additional_info == expected.additional_info - ), f"Additional info mismatch for document {actual.id}" - - # Compare sections - assert len(actual.sections) == len( - expected.sections - ), f"Number of sections mismatch for document {actual.id}" - for i, (actual_section, expected_section) in enumerate( - zip(actual.sections, expected.sections) - ): - assert ( - actual_section.text == expected_section.text - ), f"Section {i} text mismatch for document {actual.id}" - assert ( - actual_section.link == expected_section.link - ), f"Section {i} link mismatch for document {actual.id}" + # Compare documents using the utility function + compare_documents(doc_batch, expected_docs) diff --git a/web/src/lib/connectors/connectors.tsx b/web/src/lib/connectors/connectors.tsx index 7f6f0050b9..7153104687 100644 --- a/web/src/lib/connectors/connectors.tsx +++ b/web/src/lib/connectors/connectors.tsx @@ -1106,6 +1106,14 @@ For example, specifying .*-support.* as a "channel" will cause the connector to name: "table_name_or_id", optional: false, }, + { + type: "checkbox", + label: "Treat all fields except attachments as metadata", + name: "treat_all_non_attachment_fields_as_metadata", + description: + "Choose this if the primary content to index are attachments and all other columns are metadata for these attachments.", + optional: false, + }, ], advanced_values: [], overrideDefaultFreq: 60 * 60 * 24,