Add more airtable logging (#3862)

* Add more airtable logging

* Add multithreading

* Remove empty comment
Chris Weaver
2025-01-30 17:33:42 -08:00
committed by GitHub
parent 5e21dc6cb3
commit 288daa4e90
6 changed files with 124 additions and 15 deletions


@@ -1,3 +1,5 @@
+from concurrent.futures import as_completed
+from concurrent.futures import ThreadPoolExecutor
 from io import BytesIO
 from typing import Any
@@ -274,6 +276,11 @@ class AirtableConnector(LoadConnector):
             field_val = fields.get(field_name)
             field_type = field_schema.type
+            logger.debug(
+                f"Processing field '{field_name}' of type '{field_type}' "
+                f"for record '{record_id}'."
+            )
             field_sections, field_metadata = self._process_field(
                 field_id=field_schema.id,
                 field_name=field_name,
@@ -327,19 +334,45 @@ class AirtableConnector(LoadConnector):
                 primary_field_name = field.name
                 break

-        record_documents: list[Document] = []
-        for record in records:
-            document = self._process_record(
-                record=record,
-                table_schema=table_schema,
-                primary_field_name=primary_field_name,
-            )
-            if document:
-                record_documents.append(document)
+        logger.info(f"Starting to process Airtable records for {table.name}.")
+
+        # Process records in parallel batches using ThreadPoolExecutor
+        PARALLEL_BATCH_SIZE = 16
+        max_workers = min(PARALLEL_BATCH_SIZE, len(records))
+
+        # Process records in batches
+        for i in range(0, len(records), PARALLEL_BATCH_SIZE):
+            batch_records = records[i : i + PARALLEL_BATCH_SIZE]
+            record_documents: list[Document] = []
+
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                # Submit batch tasks
+                future_to_record = {
+                    executor.submit(
+                        self._process_record,
+                        record=record,
+                        table_schema=table_schema,
+                        primary_field_name=primary_field_name,
+                    ): record
+                    for record in batch_records
+                }
+
+                # Wait for all tasks in this batch to complete
+                for future in as_completed(future_to_record):
+                    record = future_to_record[future]
+                    try:
+                        document = future.result()
+                        if document:
+                            record_documents.append(document)
+                    except Exception as e:
+                        logger.exception(f"Failed to process record {record['id']}")
+                        raise e
+
+            # After batch is complete, yield if we've hit the batch size
+            if len(record_documents) >= self.batch_size:
+                yield record_documents
+                record_documents = []
+
+        # Yield any remaining records
+        if record_documents:
+            yield record_documents
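
For context, the pattern this hunk introduces, distilled into a minimal standalone sketch (names such as process_fn and items are illustrative, not from the repo): records are split into fixed-size batches, each batch is fanned out to a thread pool, and results are collected as the futures complete.

# Illustrative sketch of the batched ThreadPoolExecutor pattern; not repo code.
from concurrent.futures import ThreadPoolExecutor, as_completed


def process_in_batches(items, process_fn, batch_size=16):
    """Yield one list of results per batch, processing each batch in parallel."""
    for i in range(0, len(items), batch_size):
        batch = items[i : i + batch_size]
        results = []
        with ThreadPoolExecutor(max_workers=min(batch_size, len(batch))) as executor:
            future_to_item = {executor.submit(process_fn, item): item for item in batch}
            for future in as_completed(future_to_item):
                results.append(future.result())
        yield results

Unlike this sketch, the diff computes max_workers once from the full record count, yields according to self.batch_size rather than once per batch, and logs failures with logger.exception before re-raising so a bad record aborts the load instead of being silently dropped.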


@@ -1,4 +1,5 @@
 import sys
+import time
 from datetime import datetime
 from onyx.connectors.interfaces import BaseConnector
@@ -45,7 +46,17 @@ class ConnectorRunner:
     def run(self) -> GenerateDocumentsOutput:
         """Adds additional exception logging to the connector."""
         try:
-            yield from self.doc_batch_generator
+            start = time.monotonic()
+            for batch in self.doc_batch_generator:
+                # to know how long connector is taking
+                logger.debug(
+                    f"Connector took {time.monotonic() - start} seconds to build a batch."
+                )
+
+                yield batch
+
+                start = time.monotonic()
         except Exception:
             exc_type, _, exc_traceback = sys.exc_info()
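
The timing logic in isolation, as a generic sketch (not the repo's ConnectorRunner): the clock is restarted only after the consumer requests the next batch, so time the consumer spends processing a yielded batch is excluded from the per-batch measurement.

# Generic sketch of per-batch timing around a generator; not repo code.
import logging
import time
from collections.abc import Iterator

logger = logging.getLogger(__name__)


def timed_batches(batch_generator: Iterator[list]) -> Iterator[list]:
    """Log how long each batch took to produce, excluding consumer time."""
    start = time.monotonic()
    for batch in batch_generator:
        logger.debug(f"Batch built in {time.monotonic() - start:.2f} seconds.")
        yield batch
        # Execution resumes here on the next request, so the consumer's own
        # processing time between batches does not count toward the next batch.
        start = time.monotonic()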


@@ -150,6 +150,16 @@ class Document(DocumentBase):
     id: str  # This must be unique or during indexing/reindexing, chunks will be overwritten
     source: DocumentSource

+    def get_total_char_length(self) -> int:
+        """Calculate the total character length of the document including sections, metadata, and identifiers."""
+        section_length = sum(len(section.text) for section in self.sections)
+        identifier_length = len(self.semantic_identifier) + len(self.title or "")
+        metadata_length = sum(
+            len(k) + len(v) if isinstance(v, str) else len(k) + sum(len(x) for x in v)
+            for k, v in self.metadata.items()
+        )
+        return section_length + identifier_length + metadata_length
+
     def to_short_descriptor(self) -> str:
         """Used when logging the identity of a document"""
         return f"ID: '{self.id}'; Semantic ID: '{self.semantic_identifier}'"
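
The metadata term of get_total_char_length handles values that are either a string or a list of strings; a small worked example with made-up data:

# Illustrative only: mirrors the metadata length accounting above.
metadata = {"author": "alice", "tags": ["airtable", "connector"]}
metadata_length = sum(
    len(k) + len(v) if isinstance(v, str) else len(k) + sum(len(x) for x in v)
    for k, v in metadata.items()
)
# "author" (6) + "alice" (5) + "tags" (4) + "airtable" (8) + "connector" (9) = 32
print(metadata_length)  # 32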