remove tenant id logs (#3063)

This commit is contained in:
pablonyx 2024-12-16 12:24:09 -08:00 committed by GitHub
parent d954914a0a
commit cbbf10f450
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 111 additions and 65 deletions

View File

@ -8,6 +8,8 @@ import sentry_sdk
from celery import Task from celery import Task
from celery.app import trace from celery.app import trace
from celery.exceptions import WorkerShutdown from celery.exceptions import WorkerShutdown
from celery.signals import task_postrun
from celery.signals import task_prerun
from celery.states import READY_STATES from celery.states import READY_STATES
from celery.utils.log import get_task_logger from celery.utils.log import get_task_logger
from celery.worker import strategy # type: ignore from celery.worker import strategy # type: ignore
@ -34,8 +36,11 @@ from onyx.redis.redis_usergroup import RedisUserGroup
from onyx.utils.logger import ColoredFormatter from onyx.utils.logger import ColoredFormatter
from onyx.utils.logger import PlainFormatter from onyx.utils.logger import PlainFormatter
from onyx.utils.logger import setup_logger from onyx.utils.logger import setup_logger
from shared_configs.configs import MULTI_TENANT
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
from shared_configs.configs import SENTRY_DSN from shared_configs.configs import SENTRY_DSN
from shared_configs.configs import TENANT_ID_PREFIX
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
logger = setup_logger() logger = setup_logger()
@ -56,8 +61,8 @@ def on_task_prerun(
sender: Any | None = None, sender: Any | None = None,
task_id: str | None = None, task_id: str | None = None,
task: Task | None = None, task: Task | None = None,
args: tuple | None = None, args: tuple[Any, ...] | None = None,
kwargs: dict | None = None, kwargs: dict[str, Any] | None = None,
**kwds: Any, **kwds: Any,
) -> None: ) -> None:
pass pass
@ -346,26 +351,36 @@ def on_worker_shutdown(sender: Any, **kwargs: Any) -> None:
def on_setup_logging( def on_setup_logging(
loglevel: Any, logfile: Any, format: Any, colorize: Any, **kwargs: Any loglevel: int,
logfile: str | None,
format: str,
colorize: bool,
**kwargs: Any,
) -> None: ) -> None:
# TODO: could unhardcode format and colorize and accept these as options from # TODO: could unhardcode format and colorize and accept these as options from
# celery's config # celery's config
# reformats the root logger
root_logger = logging.getLogger() root_logger = logging.getLogger()
root_logger.handlers = []
root_handler = logging.StreamHandler() # Set up a handler for the root logger # Define the log format
log_format = (
"%(levelname)-8s %(asctime)s %(filename)15s:%(lineno)-4d: %(name)s %(message)s"
)
# Set up the root handler
root_handler = logging.StreamHandler()
root_formatter = ColoredFormatter( root_formatter = ColoredFormatter(
"%(asctime)s %(filename)30s %(lineno)4s: %(message)s", log_format,
datefmt="%m/%d/%Y %I:%M:%S %p", datefmt="%m/%d/%Y %I:%M:%S %p",
) )
root_handler.setFormatter(root_formatter) root_handler.setFormatter(root_formatter)
root_logger.addHandler(root_handler) # Apply the handler to the root logger root_logger.addHandler(root_handler)
if logfile: if logfile:
root_file_handler = logging.FileHandler(logfile) root_file_handler = logging.FileHandler(logfile)
root_file_formatter = PlainFormatter( root_file_formatter = PlainFormatter(
"%(asctime)s %(filename)30s %(lineno)4s: %(message)s", log_format,
datefmt="%m/%d/%Y %I:%M:%S %p", datefmt="%m/%d/%Y %I:%M:%S %p",
) )
root_file_handler.setFormatter(root_file_formatter) root_file_handler.setFormatter(root_file_formatter)
@ -373,19 +388,23 @@ def on_setup_logging(
root_logger.setLevel(loglevel) root_logger.setLevel(loglevel)
# reformats celery's task logger # Configure the task logger
task_logger.handlers = []
task_handler = logging.StreamHandler()
task_handler.addFilter(TenantContextFilter())
task_formatter = CeleryTaskColoredFormatter( task_formatter = CeleryTaskColoredFormatter(
"%(asctime)s %(filename)30s %(lineno)4s: %(message)s", log_format,
datefmt="%m/%d/%Y %I:%M:%S %p", datefmt="%m/%d/%Y %I:%M:%S %p",
) )
task_handler = logging.StreamHandler() # Set up a handler for the task logger
task_handler.setFormatter(task_formatter) task_handler.setFormatter(task_formatter)
task_logger.addHandler(task_handler) # Apply the handler to the task logger task_logger.addHandler(task_handler)
if logfile: if logfile:
task_file_handler = logging.FileHandler(logfile) task_file_handler = logging.FileHandler(logfile)
task_file_handler.addFilter(TenantContextFilter())
task_file_formatter = CeleryTaskPlainFormatter( task_file_formatter = CeleryTaskPlainFormatter(
"%(asctime)s %(filename)30s %(lineno)4s: %(message)s", log_format,
datefmt="%m/%d/%Y %I:%M:%S %p", datefmt="%m/%d/%Y %I:%M:%S %p",
) )
task_file_handler.setFormatter(task_file_formatter) task_file_handler.setFormatter(task_file_formatter)
@ -394,10 +413,55 @@ def on_setup_logging(
task_logger.setLevel(loglevel) task_logger.setLevel(loglevel)
task_logger.propagate = False task_logger.propagate = False
# hide celery task received spam # Hide celery task received and succeeded/failed messages
# e.g. "Task check_for_pruning[a1e96171-0ba8-4e00-887b-9fbf7442eab3] received"
strategy.logger.setLevel(logging.WARNING) strategy.logger.setLevel(logging.WARNING)
# hide celery task succeeded/failed spam
# e.g. "Task check_for_pruning[a1e96171-0ba8-4e00-887b-9fbf7442eab3] succeeded in 0.03137450001668185s: None"
trace.logger.setLevel(logging.WARNING) trace.logger.setLevel(logging.WARNING)
class TenantContextFilter(logging.Filter):
"""Logging filter to inject tenant ID into the logger's name."""
def filter(self, record: logging.LogRecord) -> bool:
if not MULTI_TENANT:
record.name = ""
return True
tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get()
if tenant_id:
tenant_id = tenant_id.split(TENANT_ID_PREFIX)[-1][:5]
record.name = f"[t:{tenant_id}]"
else:
record.name = ""
return True
@task_prerun.connect
def set_tenant_id(
sender: Any | None = None,
task_id: str | None = None,
task: Task | None = None,
args: tuple[Any, ...] | None = None,
kwargs: dict[str, Any] | None = None,
**other_kwargs: Any,
) -> None:
"""Signal handler to set tenant ID in context var before task starts."""
tenant_id = (
kwargs.get("tenant_id", POSTGRES_DEFAULT_SCHEMA)
if kwargs
else POSTGRES_DEFAULT_SCHEMA
)
CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id)
@task_postrun.connect
def reset_tenant_id(
sender: Any | None = None,
task_id: str | None = None,
task: Task | None = None,
args: tuple[Any, ...] | None = None,
kwargs: dict[str, Any] | None = None,
**other_kwargs: Any,
) -> None:
"""Signal handler to reset tenant ID in context var after task ends."""
CURRENT_TENANT_ID_CONTEXTVAR.set(POSTGRES_DEFAULT_SCHEMA)

View File

@ -44,18 +44,18 @@ class DynamicTenantScheduler(PersistentScheduler):
self._last_reload is None self._last_reload is None
or (now - self._last_reload) > self._reload_interval or (now - self._last_reload) > self._reload_interval
): ):
logger.info("Reload interval reached, initiating tenant task update") logger.info("Reload interval reached, initiating task update")
self._update_tenant_tasks() self._update_tenant_tasks()
self._last_reload = now self._last_reload = now
logger.info("Tenant task update completed, reset reload timer") logger.info("Task update completed, reset reload timer")
return retval return retval
def _update_tenant_tasks(self) -> None: def _update_tenant_tasks(self) -> None:
logger.info("Starting tenant task update process") logger.info("Starting task update process")
try: try:
logger.info("Fetching all tenant IDs") logger.info("Fetching all IDs")
tenant_ids = get_all_tenant_ids() tenant_ids = get_all_tenant_ids()
logger.info(f"Found {len(tenant_ids)} tenants") logger.info(f"Found {len(tenant_ids)} IDs")
logger.info("Fetching tasks to schedule") logger.info("Fetching tasks to schedule")
tasks_to_schedule = fetch_versioned_implementation( tasks_to_schedule = fetch_versioned_implementation(
@ -70,7 +70,7 @@ class DynamicTenantScheduler(PersistentScheduler):
for task_name, _ in current_schedule: for task_name, _ in current_schedule:
if "-" in task_name: if "-" in task_name:
existing_tenants.add(task_name.split("-")[-1]) existing_tenants.add(task_name.split("-")[-1])
logger.info(f"Found {len(existing_tenants)} existing tenants in schedule") logger.info(f"Found {len(existing_tenants)} existing items in schedule")
for tenant_id in tenant_ids: for tenant_id in tenant_ids:
if ( if (
@ -83,7 +83,7 @@ class DynamicTenantScheduler(PersistentScheduler):
continue continue
if tenant_id not in existing_tenants: if tenant_id not in existing_tenants:
logger.info(f"Processing new tenant: {tenant_id}") logger.info(f"Processing new item: {tenant_id}")
for task in tasks_to_schedule(): for task in tasks_to_schedule():
task_name = f"{task['name']}-{tenant_id}" task_name = f"{task['name']}-{tenant_id}"
@ -129,11 +129,10 @@ class DynamicTenantScheduler(PersistentScheduler):
logger.info("Schedule update completed successfully") logger.info("Schedule update completed successfully")
else: else:
logger.info("Schedule is up to date, no changes needed") logger.info("Schedule is up to date, no changes needed")
except (AttributeError, KeyError) as e:
except (AttributeError, KeyError): logger.exception(f"Failed to process task configuration: {str(e)}")
logger.exception("Failed to process task configuration") except Exception as e:
except Exception: logger.exception(f"Unexpected error updating tasks: {str(e)}")
logger.exception("Unexpected error updating tenant tasks")
def _should_update_schedule( def _should_update_schedule(
self, current_schedule: dict, new_schedule: dict self, current_schedule: dict, new_schedule: dict

View File

@ -76,7 +76,7 @@ def check_for_connector_deletion_task(self: Task, *, tenant_id: str | None) -> N
"Soft time limit exceeded, task is being terminated gracefully." "Soft time limit exceeded, task is being terminated gracefully."
) )
except Exception: except Exception:
task_logger.exception(f"Unexpected exception: tenant={tenant_id}") task_logger.exception("Unexpected exception during connector deletion check")
finally: finally:
if lock_beat.owned(): if lock_beat.owned():
lock_beat.release() lock_beat.release()
@ -131,14 +131,14 @@ def try_generate_document_cc_pair_cleanup_tasks(
redis_connector_index = redis_connector.new_index(search_settings.id) redis_connector_index = redis_connector.new_index(search_settings.id)
if redis_connector_index.fenced: if redis_connector_index.fenced:
raise TaskDependencyError( raise TaskDependencyError(
f"Connector deletion - Delayed (indexing in progress): " "Connector deletion - Delayed (indexing in progress): "
f"cc_pair={cc_pair_id} " f"cc_pair={cc_pair_id} "
f"search_settings={search_settings.id}" f"search_settings={search_settings.id}"
) )
if redis_connector.prune.fenced: if redis_connector.prune.fenced:
raise TaskDependencyError( raise TaskDependencyError(
f"Connector deletion - Delayed (pruning in progress): " "Connector deletion - Delayed (pruning in progress): "
f"cc_pair={cc_pair_id}" f"cc_pair={cc_pair_id}"
) )
@ -175,7 +175,7 @@ def try_generate_document_cc_pair_cleanup_tasks(
# return 0 # return 0
task_logger.info( task_logger.info(
f"RedisConnectorDeletion.generate_tasks finished. " "RedisConnectorDeletion.generate_tasks finished. "
f"cc_pair={cc_pair_id} tasks_generated={tasks_generated}" f"cc_pair={cc_pair_id} tasks_generated={tasks_generated}"
) )

View File

@ -309,7 +309,7 @@ def check_for_indexing(self: Task, *, tenant_id: str | None) -> int | None:
"Soft time limit exceeded, task is being terminated gracefully." "Soft time limit exceeded, task is being terminated gracefully."
) )
except Exception: except Exception:
task_logger.exception(f"Unexpected exception: tenant={tenant_id}") task_logger.exception("Unexpected exception during indexing check")
finally: finally:
if locked: if locked:
if lock_beat.owned(): if lock_beat.owned():
@ -508,7 +508,6 @@ def try_creating_indexing_task(
except Exception: except Exception:
task_logger.exception( task_logger.exception(
f"try_creating_indexing_task - Unexpected exception: " f"try_creating_indexing_task - Unexpected exception: "
f"tenant={tenant_id} "
f"cc_pair={cc_pair.id} " f"cc_pair={cc_pair.id} "
f"search_settings={search_settings.id}" f"search_settings={search_settings.id}"
) )
@ -540,7 +539,6 @@ def connector_indexing_proxy_task(
"""celery tasks are forked, but forking is unstable. This proxies work to a spawned task.""" """celery tasks are forked, but forking is unstable. This proxies work to a spawned task."""
task_logger.info( task_logger.info(
f"Indexing watchdog - starting: attempt={index_attempt_id} " f"Indexing watchdog - starting: attempt={index_attempt_id} "
f"tenant={tenant_id} "
f"cc_pair={cc_pair_id} " f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id}" f"search_settings={search_settings_id}"
) )
@ -563,15 +561,14 @@ def connector_indexing_proxy_task(
if not job: if not job:
task_logger.info( task_logger.info(
f"Indexing watchdog - spawn failed: attempt={index_attempt_id} " f"Indexing watchdog - spawn failed: attempt={index_attempt_id} "
f"tenant={tenant_id} "
f"cc_pair={cc_pair_id} " f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id}" f"search_settings={search_settings_id}"
) )
return return
task_logger.info( task_logger.info(
f"Indexing proxy - spawn succeeded: attempt={index_attempt_id} "
f"Indexing watchdog - spawn succeeded: attempt={index_attempt_id} " f"Indexing watchdog - spawn succeeded: attempt={index_attempt_id} "
f"tenant={tenant_id} "
f"cc_pair={cc_pair_id} " f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id}" f"search_settings={search_settings_id}"
) )
@ -586,7 +583,6 @@ def connector_indexing_proxy_task(
task_logger.warning( task_logger.warning(
"Indexing watchdog - termination signal detected: " "Indexing watchdog - termination signal detected: "
f"attempt={index_attempt_id} " f"attempt={index_attempt_id} "
f"tenant={tenant_id} "
f"cc_pair={cc_pair_id} " f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id}" f"search_settings={search_settings_id}"
) )
@ -681,7 +677,6 @@ def connector_indexing_proxy_task(
task_logger.info( task_logger.info(
f"Indexing watchdog - finished: attempt={index_attempt_id} " f"Indexing watchdog - finished: attempt={index_attempt_id} "
f"tenant={tenant_id} "
f"cc_pair={cc_pair_id} " f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id}" f"search_settings={search_settings_id}"
) )
@ -906,7 +901,6 @@ def connector_indexing_task(
logger.info( logger.info(
f"Indexing spawned task finished: attempt={index_attempt_id} " f"Indexing spawned task finished: attempt={index_attempt_id} "
f"tenant={tenant_id} "
f"cc_pair={cc_pair_id} " f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id}" f"search_settings={search_settings_id}"
) )

View File

@ -122,7 +122,7 @@ def check_for_pruning(self: Task, *, tenant_id: str | None) -> None:
"Soft time limit exceeded, task is being terminated gracefully." "Soft time limit exceeded, task is being terminated gracefully."
) )
except Exception: except Exception:
task_logger.exception(f"Unexpected exception: tenant={tenant_id}") task_logger.exception("Unexpected exception during pruning check")
finally: finally:
if lock_beat.owned(): if lock_beat.owned():
lock_beat.release() lock_beat.release()
@ -308,7 +308,7 @@ def connector_pruning_generator_task(
doc_ids_to_remove = list(all_indexed_document_ids - all_connector_doc_ids) doc_ids_to_remove = list(all_indexed_document_ids - all_connector_doc_ids)
task_logger.info( task_logger.info(
f"Pruning set collected: " "Pruning set collected: "
f"cc_pair={cc_pair_id} " f"cc_pair={cc_pair_id} "
f"connector_source={cc_pair.connector.source} " f"connector_source={cc_pair.connector.source} "
f"docs_to_remove={len(doc_ids_to_remove)}" f"docs_to_remove={len(doc_ids_to_remove)}"
@ -324,7 +324,7 @@ def connector_pruning_generator_task(
return None return None
task_logger.info( task_logger.info(
f"RedisConnector.prune.generate_tasks finished. " "RedisConnector.prune.generate_tasks finished. "
f"cc_pair={cc_pair_id} tasks_generated={tasks_generated}" f"cc_pair={cc_pair_id} tasks_generated={tasks_generated}"
) )

View File

@ -60,7 +60,7 @@ def document_by_cc_pair_cleanup_task(
connector / credential pair from the access list connector / credential pair from the access list
(6) delete all relevant entries from postgres (6) delete all relevant entries from postgres
""" """
task_logger.debug(f"Task start: tenant={tenant_id} doc={document_id}") task_logger.debug(f"Task start: doc={document_id}")
try: try:
with get_session_with_tenant(tenant_id) as db_session: with get_session_with_tenant(tenant_id) as db_session:
@ -129,16 +129,13 @@ def document_by_cc_pair_cleanup_task(
db_session.commit() db_session.commit()
task_logger.info( task_logger.info(
f"tenant={tenant_id} "
f"doc={document_id} " f"doc={document_id} "
f"action={action} " f"action={action} "
f"refcount={count} " f"refcount={count} "
f"chunks={chunks_affected}" f"chunks={chunks_affected}"
) )
except SoftTimeLimitExceeded: except SoftTimeLimitExceeded:
task_logger.info( task_logger.info(f"SoftTimeLimitExceeded exception. doc={document_id}")
f"SoftTimeLimitExceeded exception. tenant={tenant_id} doc={document_id}"
)
return False return False
except Exception as ex: except Exception as ex:
if isinstance(ex, RetryError): if isinstance(ex, RetryError):
@ -157,15 +154,12 @@ def document_by_cc_pair_cleanup_task(
if e.response.status_code == HTTPStatus.BAD_REQUEST: if e.response.status_code == HTTPStatus.BAD_REQUEST:
task_logger.exception( task_logger.exception(
f"Non-retryable HTTPStatusError: " f"Non-retryable HTTPStatusError: "
f"tenant={tenant_id} "
f"doc={document_id} " f"doc={document_id} "
f"status={e.response.status_code}" f"status={e.response.status_code}"
) )
return False return False
task_logger.exception( task_logger.exception(f"Unexpected exception: doc={document_id}")
f"Unexpected exception: tenant={tenant_id} doc={document_id}"
)
if self.request.retries < DOCUMENT_BY_CC_PAIR_CLEANUP_MAX_RETRIES: if self.request.retries < DOCUMENT_BY_CC_PAIR_CLEANUP_MAX_RETRIES:
# Still retrying. Exponential backoff from 2^4 to 2^6 ... i.e. 16, 32, 64 # Still retrying. Exponential backoff from 2^4 to 2^6 ... i.e. 16, 32, 64
@ -176,7 +170,7 @@ def document_by_cc_pair_cleanup_task(
# eventually gets fixed out of band via stale document reconciliation # eventually gets fixed out of band via stale document reconciliation
task_logger.warning( task_logger.warning(
f"Max celery task retries reached. Marking doc as dirty for reconciliation: " f"Max celery task retries reached. Marking doc as dirty for reconciliation: "
f"tenant={tenant_id} doc={document_id}" f"doc={document_id}"
) )
with get_session_with_tenant(tenant_id) as db_session: with get_session_with_tenant(tenant_id) as db_session:
# delete the cc pair relationship now and let reconciliation clean it up # delete the cc pair relationship now and let reconciliation clean it up

View File

@ -156,7 +156,7 @@ def check_for_vespa_sync_task(self: Task, *, tenant_id: str | None) -> None:
"Soft time limit exceeded, task is being terminated gracefully." "Soft time limit exceeded, task is being terminated gracefully."
) )
except Exception: except Exception:
task_logger.exception(f"Unexpected exception: tenant={tenant_id}") task_logger.exception("Unexpected exception during vespa metadata sync")
finally: finally:
if lock_beat.owned(): if lock_beat.owned():
lock_beat.release() lock_beat.release()
@ -873,13 +873,9 @@ def vespa_metadata_sync_task(
# the sync might repeat again later # the sync might repeat again later
mark_document_as_synced(document_id, db_session) mark_document_as_synced(document_id, db_session)
task_logger.info( task_logger.info(f"doc={document_id} action=sync chunks={chunks_affected}")
f"tenant={tenant_id} doc={document_id} action=sync chunks={chunks_affected}"
)
except SoftTimeLimitExceeded: except SoftTimeLimitExceeded:
task_logger.info( task_logger.info(f"SoftTimeLimitExceeded exception. doc={document_id}")
f"SoftTimeLimitExceeded exception. tenant={tenant_id} doc={document_id}"
)
except Exception as ex: except Exception as ex:
if isinstance(ex, RetryError): if isinstance(ex, RetryError):
task_logger.warning( task_logger.warning(
@ -897,14 +893,13 @@ def vespa_metadata_sync_task(
if e.response.status_code == HTTPStatus.BAD_REQUEST: if e.response.status_code == HTTPStatus.BAD_REQUEST:
task_logger.exception( task_logger.exception(
f"Non-retryable HTTPStatusError: " f"Non-retryable HTTPStatusError: "
f"tenant={tenant_id} "
f"doc={document_id} " f"doc={document_id} "
f"status={e.response.status_code}" f"status={e.response.status_code}"
) )
return False return False
task_logger.exception( task_logger.exception(
f"Unexpected exception: tenant={tenant_id} doc={document_id}" f"Unexpected exception during vespa metadata sync: doc={document_id}"
) )
# Exponential backoff from 2^4 to 2^6 ... i.e. 16, 32, 64 # Exponential backoff from 2^4 to 2^6 ... i.e. 16, 32, 64