Add more airtable logging (#3862)

* Add more airtable logging

* Add multithreading

* Remove empty comment
Chris Weaver
2025-01-30 17:33:42 -08:00
committed by GitHub
parent 5e21dc6cb3
commit 288daa4e90
6 changed files with 124 additions and 15 deletions


@@ -1,3 +1,5 @@
+from concurrent.futures import as_completed
+from concurrent.futures import ThreadPoolExecutor
 from io import BytesIO
 from typing import Any
@@ -274,6 +276,11 @@ class AirtableConnector(LoadConnector):
             field_val = fields.get(field_name)
             field_type = field_schema.type
+            logger.debug(
+                f"Processing field '{field_name}' of type '{field_type}' "
+                f"for record '{record_id}'."
+            )
             field_sections, field_metadata = self._process_field(
                 field_id=field_schema.id,
                 field_name=field_name,
@@ -327,19 +334,45 @@ class AirtableConnector(LoadConnector):
                 primary_field_name = field.name
                 break

-        record_documents: list[Document] = []
-        for record in records:
-            document = self._process_record(
-                record=record,
-                table_schema=table_schema,
-                primary_field_name=primary_field_name,
-            )
-            if document:
-                record_documents.append(document)
+        logger.info(f"Starting to process Airtable records for {table.name}.")
+
+        # Process records in parallel batches using ThreadPoolExecutor
+        PARALLEL_BATCH_SIZE = 16
+        max_workers = min(PARALLEL_BATCH_SIZE, len(records))
+
+        # Process records in batches
+        for i in range(0, len(records), PARALLEL_BATCH_SIZE):
+            batch_records = records[i : i + PARALLEL_BATCH_SIZE]
+            record_documents: list[Document] = []
+
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                # Submit batch tasks
+                future_to_record = {
+                    executor.submit(
+                        self._process_record,
+                        record=record,
+                        table_schema=table_schema,
+                        primary_field_name=primary_field_name,
+                    ): record
+                    for record in batch_records
+                }
+
+                # Wait for all tasks in this batch to complete
+                for future in as_completed(future_to_record):
+                    record = future_to_record[future]
+                    try:
+                        document = future.result()
+                        if document:
+                            record_documents.append(document)
+                    except Exception as e:
+                        logger.exception(f"Failed to process record {record['id']}")
+                        raise e
+
+            # After batch is complete, yield if we've hit the batch size
+            if len(record_documents) >= self.batch_size:
+                yield record_documents
+                record_documents = []
+
+        # Yield any remaining records
+        if record_documents:
+            yield record_documents
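
For context, the pattern this hunk introduces, distilled into a minimal standalone sketch (names such as process_fn and items are illustrative, not from the repo): records are split into fixed-size batches, each batch is fanned out to a thread pool, and results are collected as the futures complete.

# Illustrative sketch of the batched ThreadPoolExecutor pattern; not repo code.
from concurrent.futures import ThreadPoolExecutor, as_completed


def process_in_batches(items, process_fn, batch_size=16):
    """Yield one list of results per batch, processing each batch in parallel."""
    for i in range(0, len(items), batch_size):
        batch = items[i : i + batch_size]
        results = []
        with ThreadPoolExecutor(max_workers=min(batch_size, len(batch))) as executor:
            future_to_item = {executor.submit(process_fn, item): item for item in batch}
            for future in as_completed(future_to_item):
                results.append(future.result())
        yield results

Unlike this sketch, the diff computes max_workers once from the full record count, yields according to self.batch_size rather than once per batch, and logs failures with logger.exception before re-raising so a bad record aborts the load instead of being silently dropped.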


@@ -1,4 +1,5 @@
 import sys
+import time
 from datetime import datetime
 from onyx.connectors.interfaces import BaseConnector
@@ -45,7 +46,17 @@ class ConnectorRunner:
     def run(self) -> GenerateDocumentsOutput:
         """Adds additional exception logging to the connector."""
         try:
-            yield from self.doc_batch_generator
+            start = time.monotonic()
+            for batch in self.doc_batch_generator:
+                # to know how long connector is taking
+                logger.debug(
+                    f"Connector took {time.monotonic() - start} seconds to build a batch."
+                )
+
+                yield batch
+
+                start = time.monotonic()
         except Exception:
             exc_type, _, exc_traceback = sys.exc_info()
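
The timing logic in isolation, as a generic sketch (not the repo's ConnectorRunner): the clock is restarted only after the consumer requests the next batch, so time the consumer spends processing a yielded batch is excluded from the per-batch measurement.

# Generic sketch of per-batch timing around a generator; not repo code.
import logging
import time
from collections.abc import Iterator

logger = logging.getLogger(__name__)


def timed_batches(batch_generator: Iterator[list]) -> Iterator[list]:
    """Log how long each batch took to produce, excluding consumer time."""
    start = time.monotonic()
    for batch in batch_generator:
        logger.debug(f"Batch built in {time.monotonic() - start:.2f} seconds.")
        yield batch
        # Execution resumes here on the next request, so the consumer's own
        # processing time between batches does not count toward the next batch.
        start = time.monotonic()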


@@ -150,6 +150,16 @@ class Document(DocumentBase):
     id: str  # This must be unique or during indexing/reindexing, chunks will be overwritten
     source: DocumentSource

+    def get_total_char_length(self) -> int:
+        """Calculate the total character length of the document including sections, metadata, and identifiers."""
+        section_length = sum(len(section.text) for section in self.sections)
+        identifier_length = len(self.semantic_identifier) + len(self.title or "")
+        metadata_length = sum(
+            len(k) + len(v) if isinstance(v, str) else len(k) + sum(len(x) for x in v)
+            for k, v in self.metadata.items()
+        )
+        return section_length + identifier_length + metadata_length
+
     def to_short_descriptor(self) -> str:
         """Used when logging the identity of a document"""
         return f"ID: '{self.id}'; Semantic ID: '{self.semantic_identifier}'"
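
The metadata term of get_total_char_length handles values that are either a string or a list of strings; a small worked example with made-up data:

# Illustrative only: mirrors the metadata length accounting above.
metadata = {"author": "alice", "tags": ["airtable", "connector"]}
metadata_length = sum(
    len(k) + len(v) if isinstance(v, str) else len(k) + sum(len(x) for x in v)
    for k, v in metadata.items()
)
# "author" (6) + "alice" (5) + "tags" (4) + "airtable" (8) + "connector" (9) = 32
print(metadata_length)  # 32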