mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-09-27 20:38:32 +02:00
Add more airtable logging (#3862)
* Add more airtable logging
* Add multithreading
* Remove empty comment
@@ -478,6 +478,12 @@ INDEXING_SIZE_WARNING_THRESHOLD = int(
 # 0 disables this behavior and is the default.
 INDEXING_TRACER_INTERVAL = int(os.environ.get("INDEXING_TRACER_INTERVAL") or 0)
 
+# Enable multi-threaded embedding model calls for parallel processing
+# Note: only applies for API-based embedding models
+INDEXING_EMBEDDING_MODEL_NUM_THREADS = int(
+    os.environ.get("INDEXING_EMBEDDING_MODEL_NUM_THREADS") or 1
+)
+
 # During an indexing attempt, specifies the number of batches which are allowed to
 # exception without aborting the attempt.
 INDEXING_EXCEPTION_LIMIT = int(os.environ.get("INDEXING_EXCEPTION_LIMIT") or 0)

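The new setting follows the same idiom as the neighbouring config values: read the environment variable and fall back to a default when it is unset or empty. A minimal standalone sketch of that pattern; the shell invocation in the comment is illustrative, not from the commit:

import os

# Same fallback idiom as the diff: an unset or empty variable yields the default.
INDEXING_EMBEDDING_MODEL_NUM_THREADS = int(
    os.environ.get("INDEXING_EMBEDDING_MODEL_NUM_THREADS") or 1
)

if __name__ == "__main__":
    # e.g. run with: INDEXING_EMBEDDING_MODEL_NUM_THREADS=8 python config_sketch.py
    print(f"embedding threads: {INDEXING_EMBEDDING_MODEL_NUM_THREADS}")
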
@@ -1,3 +1,5 @@
+from concurrent.futures import as_completed
+from concurrent.futures import ThreadPoolExecutor
 from io import BytesIO
 from typing import Any
 

@@ -274,6 +276,11 @@ class AirtableConnector(LoadConnector):
             field_val = fields.get(field_name)
             field_type = field_schema.type
 
+            logger.debug(
+                f"Processing field '{field_name}' of type '{field_type}' "
+                f"for record '{record_id}'."
+            )
+
             field_sections, field_metadata = self._process_field(
                 field_id=field_schema.id,
                 field_name=field_name,

@@ -327,19 +334,45 @@ class AirtableConnector(LoadConnector):
                 primary_field_name = field.name
                 break
 
-        record_documents: list[Document] = []
-        for record in records:
-            document = self._process_record(
-                record=record,
-                table_schema=table_schema,
-                primary_field_name=primary_field_name,
-            )
-            if document:
-                record_documents.append(document)
+        logger.info(f"Starting to process Airtable records for {table.name}.")
 
-            if len(record_documents) >= self.batch_size:
-                yield record_documents
-                record_documents = []
+        # Process records in parallel batches using ThreadPoolExecutor
+        PARALLEL_BATCH_SIZE = 16
+        max_workers = min(PARALLEL_BATCH_SIZE, len(records))
+
+        # Process records in batches
+        for i in range(0, len(records), PARALLEL_BATCH_SIZE):
+            batch_records = records[i : i + PARALLEL_BATCH_SIZE]
+            record_documents: list[Document] = []
+
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                # Submit batch tasks
+                future_to_record = {
+                    executor.submit(
+                        self._process_record,
+                        record=record,
+                        table_schema=table_schema,
+                        primary_field_name=primary_field_name,
+                    ): record
+                    for record in batch_records
+                }
+
+                # Wait for all tasks in this batch to complete
+                for future in as_completed(future_to_record):
+                    record = future_to_record[future]
+                    try:
+                        document = future.result()
+                        if document:
+                            record_documents.append(document)
+                    except Exception as e:
+                        logger.exception(f"Failed to process record {record['id']}")
+                        raise e
+
+            # After batch is complete, yield if we've hit the batch size
+            if len(record_documents) >= self.batch_size:
+                yield record_documents
+                record_documents = []
 
+        # Yield any remaining records
         if record_documents:
             yield record_documents

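Read in isolation, the change above is a fixed-size fan-out: slice the records into groups of PARALLEL_BATCH_SIZE, submit each group to a ThreadPoolExecutor, and collect results with as_completed before moving to the next slice. A self-contained sketch of the same pattern, with process_item and the integer inputs standing in for the connector's real record processing:

from concurrent.futures import ThreadPoolExecutor, as_completed

PARALLEL_BATCH_SIZE = 16  # same cap as the connector change


def process_item(item: int) -> int:
    # Placeholder for per-record work such as building a document from an Airtable record.
    return item * item


def process_in_batches(items: list[int]) -> list[int]:
    results: list[int] = []
    max_workers = min(PARALLEL_BATCH_SIZE, len(items)) or 1
    for i in range(0, len(items), PARALLEL_BATCH_SIZE):
        batch = items[i : i + PARALLEL_BATCH_SIZE]
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_item = {executor.submit(process_item, item): item for item in batch}
            for future in as_completed(future_to_item):
                # result() re-raises any worker exception; within a batch,
                # completion order (not submission order) determines result order.
                results.append(future.result())
    return results


if __name__ == "__main__":
    print(process_in_batches(list(range(40))))
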
@@ -1,4 +1,5 @@
 import sys
+import time
 from datetime import datetime
 
 from onyx.connectors.interfaces import BaseConnector

@@ -45,7 +46,17 @@ class ConnectorRunner:
     def run(self) -> GenerateDocumentsOutput:
         """Adds additional exception logging to the connector."""
         try:
-            yield from self.doc_batch_generator
+            start = time.monotonic()
+            for batch in self.doc_batch_generator:
+                # to know how long connector is taking
+                logger.debug(
+                    f"Connector took {time.monotonic() - start} seconds to build a batch."
+                )
+
+                yield batch
+
+                start = time.monotonic()
+
         except Exception:
             exc_type, _, exc_traceback = sys.exc_info()

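The runner now wraps the underlying generator so the time spent building each batch is logged before the batch is yielded onward. A minimal sketch of that wrapping using the same time.monotonic() approach; slow_batches and the logging setup are placeholders, not the real connector plumbing:

import logging
import time
from collections.abc import Iterator

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


def slow_batches() -> Iterator[list[int]]:
    # Stand-in for a connector's document batch generator.
    for n in range(3):
        time.sleep(0.1)
        yield [n] * 4


def timed(generator: Iterator[list[int]]) -> Iterator[list[int]]:
    start = time.monotonic()
    for batch in generator:
        # Same idea as the ConnectorRunner change: measure build time per batch.
        logger.debug(f"Batch took {time.monotonic() - start:.3f} seconds to build.")
        yield batch
        start = time.monotonic()


if __name__ == "__main__":
    for batch in timed(slow_batches()):
        print(batch)
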
@@ -150,6 +150,16 @@ class Document(DocumentBase):
     id: str  # This must be unique or during indexing/reindexing, chunks will be overwritten
     source: DocumentSource
 
+    def get_total_char_length(self) -> int:
+        """Calculate the total character length of the document including sections, metadata, and identifiers."""
+        section_length = sum(len(section.text) for section in self.sections)
+        identifier_length = len(self.semantic_identifier) + len(self.title or "")
+        metadata_length = sum(
+            len(k) + len(v) if isinstance(v, str) else len(k) + sum(len(x) for x in v)
+            for k, v in self.metadata.items()
+        )
+        return section_length + identifier_length + metadata_length
+
     def to_short_descriptor(self) -> str:
         """Used when logging the identity of a document"""
         return f"ID: '{self.id}'; Semantic ID: '{self.semantic_identifier}'"

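get_total_char_length adds up section text, the identifiers, and the metadata, where a metadata value can be either a string or a list of strings. A simplified, self-contained version of the same arithmetic; the Section and Doc classes here are stand-ins for the real Pydantic models:

from dataclasses import dataclass, field


@dataclass
class Section:
    text: str


@dataclass
class Doc:
    semantic_identifier: str
    title: str | None
    sections: list[Section]
    metadata: dict[str, str | list[str]] = field(default_factory=dict)

    def get_total_char_length(self) -> int:
        section_length = sum(len(section.text) for section in self.sections)
        identifier_length = len(self.semantic_identifier) + len(self.title or "")
        # String values count their own length; list values count the sum of their items.
        metadata_length = sum(
            len(k) + len(v) if isinstance(v, str) else len(k) + sum(len(x) for x in v)
            for k, v in self.metadata.items()
        )
        return section_length + identifier_length + metadata_length


if __name__ == "__main__":
    doc = Doc(
        semantic_identifier="Example",
        title=None,
        sections=[Section("hello"), Section("world")],
        metadata={"tags": ["a", "bb"], "owner": "alice"},
    )
    print(doc.get_total_char_length())  # 10 + 7 + (4 + 3) + (5 + 5) = 34
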
@@ -380,6 +380,15 @@ def index_doc_batch(
             new_docs=0, total_docs=len(filtered_documents), total_chunks=0
         )
 
+    doc_descriptors = [
+        {
+            "doc_id": doc.id,
+            "doc_length": doc.get_total_char_length(),
+        }
+        for doc in ctx.updatable_docs
+    ]
+    logger.debug(f"Starting indexing process for documents: {doc_descriptors}")
+
     logger.debug("Starting chunking")
     chunks: list[DocAwareChunk] = chunker.chunk(ctx.updatable_docs)

@@ -1,6 +1,8 @@
 import threading
 import time
 from collections.abc import Callable
+from concurrent.futures import as_completed
+from concurrent.futures import ThreadPoolExecutor
 from functools import wraps
 from typing import Any
 

@@ -11,6 +13,7 @@ from requests import RequestException
 from requests import Response
 from retry import retry
 
+from onyx.configs.app_configs import INDEXING_EMBEDDING_MODEL_NUM_THREADS
 from onyx.configs.app_configs import LARGE_CHUNK_RATIO
 from onyx.configs.app_configs import SKIP_WARM_UP
 from onyx.configs.model_configs import BATCH_SIZE_ENCODE_CHUNKS

@@ -155,6 +158,7 @@ class EmbeddingModel:
         text_type: EmbedTextType,
         batch_size: int,
         max_seq_length: int,
+        num_threads: int = INDEXING_EMBEDDING_MODEL_NUM_THREADS,
     ) -> list[Embedding]:
         text_batches = batch_list(texts, batch_size)
 

@@ -163,12 +167,15 @@
         )
 
         embeddings: list[Embedding] = []
-        for idx, text_batch in enumerate(text_batches, start=1):
+
+        def process_batch(
+            batch_idx: int, text_batch: list[str]
+        ) -> tuple[int, list[Embedding]]:
             if self.callback:
                 if self.callback.should_stop():
                     raise RuntimeError("_batch_encode_texts detected stop signal")
 
-            logger.debug(f"Encoding batch {idx} of {len(text_batches)}")
+            logger.debug(f"Encoding batch {batch_idx} of {len(text_batches)}")
             embed_request = EmbedRequest(
                 model_name=self.model_name,
                 texts=text_batch,

@@ -185,10 +192,43 @@
             )
 
             response = self._make_model_server_request(embed_request)
-            embeddings.extend(response.embeddings)
-
-            if self.callback:
-                self.callback.progress("_batch_encode_texts", 1)
+            return batch_idx, response.embeddings
+
+        # only multi thread if:
+        # 1. num_threads is greater than 1
+        # 2. we are using an API-based embedding model (provider_type is not None)
+        # 3. there are more than 1 batch (no point in threading if only 1)
+        if num_threads >= 1 and self.provider_type and len(text_batches) > 1:
+            with ThreadPoolExecutor(max_workers=num_threads) as executor:
+                future_to_batch = {
+                    executor.submit(process_batch, idx, batch): idx
+                    for idx, batch in enumerate(text_batches, start=1)
+                }
+
+                # Collect results in order
+                batch_results: list[tuple[int, list[Embedding]]] = []
+                for future in as_completed(future_to_batch):
+                    try:
+                        result = future.result()
+                        batch_results.append(result)
+                        if self.callback:
+                            self.callback.progress("_batch_encode_texts", 1)
+                    except Exception as e:
+                        logger.exception("Embedding model failed to process batch")
+                        raise e
+
+                # Sort by batch index and extend embeddings
+                batch_results.sort(key=lambda x: x[0])
+                for _, batch_embeddings in batch_results:
+                    embeddings.extend(batch_embeddings)
+        else:
+            # Original sequential processing
+            for idx, text_batch in enumerate(text_batches, start=1):
+                _, batch_embeddings = process_batch(idx, text_batch)
+                embeddings.extend(batch_embeddings)
+
+                if self.callback:
+                    self.callback.progress("_batch_encode_texts", 1)
 
         return embeddings
 
     def encode(

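The threaded path has to preserve batch order even though as_completed yields futures as they finish, so each worker returns its batch index, results are sorted by that index before being concatenated, and the original sequential loop is kept as the fallback for local models or a single batch. A stripped-down sketch of that flow, with fake_embed standing in for the model-server request:

from concurrent.futures import ThreadPoolExecutor, as_completed

NUM_THREADS = 4  # stands in for INDEXING_EMBEDDING_MODEL_NUM_THREADS


def fake_embed(batch_idx: int, texts: list[str]) -> tuple[int, list[list[float]]]:
    # Placeholder for the model-server call; returns one vector per text.
    return batch_idx, [[float(len(t))] for t in texts]


def encode(text_batches: list[list[str]], use_threads: bool) -> list[list[float]]:
    embeddings: list[list[float]] = []
    if use_threads and len(text_batches) > 1:
        with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
            futures = {
                executor.submit(fake_embed, idx, batch): idx
                for idx, batch in enumerate(text_batches, start=1)
            }
            results = [future.result() for future in as_completed(futures)]
        # Restore the original batch order before concatenating.
        results.sort(key=lambda pair: pair[0])
        for _, batch_embeddings in results:
            embeddings.extend(batch_embeddings)
    else:
        # Sequential fallback, as in the diff when threading is not applicable.
        for idx, batch in enumerate(text_batches, start=1):
            _, batch_embeddings = fake_embed(idx, batch)
            embeddings.extend(batch_embeddings)
    return embeddings


if __name__ == "__main__":
    batches = [["a", "bb"], ["ccc"], ["dddd", "e"]]
    print(encode(batches, use_threads=True))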