From 0d3c72acbff9fc27001b4415802e4aa70b232ddb Mon Sep 17 00:00:00 2001 From: Chris Weaver <25087905+Weves@users.noreply.github.com> Date: Fri, 7 Mar 2025 19:49:47 -0800 Subject: [PATCH] Add basic memory logging (#4234) * Add basic memory logging * Small tweaks * Switch to monotonic --- .../background/celery/memory_monitoring.py | 60 +++++++++++++++++++ .../background/celery/tasks/indexing/tasks.py | 21 +++++++ backend/supervisord.conf | 1 + .../docker_compose/docker-compose.dev.yml | 4 ++ .../docker_compose/docker-compose.gpu-dev.yml | 4 ++ .../docker-compose.multitenant-dev.yml | 3 + .../docker-compose.prod-cloud.yml | 4 ++ .../docker-compose.prod-no-letsencrypt.yml | 3 + .../docker_compose/docker-compose.prod.yml | 5 ++ .../docker-compose.search-testing.yml | 3 + 10 files changed, 108 insertions(+) create mode 100644 backend/onyx/background/celery/memory_monitoring.py diff --git a/backend/onyx/background/celery/memory_monitoring.py b/backend/onyx/background/celery/memory_monitoring.py new file mode 100644 index 000000000..0ef79b204 --- /dev/null +++ b/backend/onyx/background/celery/memory_monitoring.py @@ -0,0 +1,60 @@ +# backend/onyx/background/celery/memory_monitoring.py +import logging +import os +from logging.handlers import RotatingFileHandler + +import psutil + +from onyx.utils.logger import setup_logger + +# Regular application logger +logger = setup_logger() + +# Set up a dedicated memory monitoring logger +MEMORY_LOG_DIR = "/var/log/persisted-logs/memory" +MEMORY_LOG_FILE = os.path.join(MEMORY_LOG_DIR, "memory_usage.log") +MEMORY_LOG_MAX_BYTES = 10 * 1024 * 1024 # 10MB +MEMORY_LOG_BACKUP_COUNT = 5 # Keep 5 backup files + +# Ensure log directory exists +os.makedirs(MEMORY_LOG_DIR, exist_ok=True) + +# Create a dedicated logger for memory monitoring +memory_logger = logging.getLogger("memory_monitoring") +memory_logger.setLevel(logging.INFO) + +# Create a rotating file handler +memory_handler = RotatingFileHandler( + MEMORY_LOG_FILE, maxBytes=MEMORY_LOG_MAX_BYTES, backupCount=MEMORY_LOG_BACKUP_COUNT +) + +# Create a formatter that includes all relevant information +memory_formatter = logging.Formatter( + "%(asctime)s [%(levelname)s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S" +) +memory_handler.setFormatter(memory_formatter) +memory_logger.addHandler(memory_handler) + + +def emit_process_memory( + pid: int, process_name: str, additional_metadata: dict[str, str | int] +) -> None: + try: + process = psutil.Process(pid) + memory_info = process.memory_info() + cpu_percent = process.cpu_percent(interval=0.1) + + # Build metadata string from additional_metadata dictionary + metadata_str = " ".join( + [f"{key}={value}" for key, value in additional_metadata.items()] + ) + metadata_str = f" {metadata_str}" if metadata_str else "" + + memory_logger.info( + f"PROCESS_MEMORY process_name={process_name} pid={pid} " + f"rss_mb={memory_info.rss / (1024 * 1024):.2f} " + f"vms_mb={memory_info.vms / (1024 * 1024):.2f} " + f"cpu={cpu_percent:.2f}{metadata_str}" + ) + except Exception: + logger.exception("Error monitoring process memory.") diff --git a/backend/onyx/background/celery/tasks/indexing/tasks.py b/backend/onyx/background/celery/tasks/indexing/tasks.py index 72670a2b2..89c5b2e49 100644 --- a/backend/onyx/background/celery/tasks/indexing/tasks.py +++ b/backend/onyx/background/celery/tasks/indexing/tasks.py @@ -23,6 +23,7 @@ from sqlalchemy.orm import Session from onyx.background.celery.apps.app_base import task_logger from onyx.background.celery.celery_utils import httpx_init_vespa_pool +from onyx.background.celery.memory_monitoring import emit_process_memory from onyx.background.celery.tasks.indexing.utils import get_unfenced_index_attempt_ids from onyx.background.celery.tasks.indexing.utils import IndexingCallback from onyx.background.celery.tasks.indexing.utils import should_index @@ -984,6 +985,9 @@ def connector_indexing_proxy_task( redis_connector = RedisConnector(tenant_id, cc_pair_id) redis_connector_index = redis_connector.new_index(search_settings_id) + # Track the last time memory info was emitted + last_memory_emit_time = 0.0 + try: with get_session_with_current_tenant() as db_session: index_attempt = get_index_attempt( @@ -1024,6 +1028,23 @@ def connector_indexing_proxy_task( job.release() break + # log the memory usage for tracking down memory leaks / connector-specific memory issues + pid = job.process.pid + if pid is not None: + # Only emit memory info once per minute (60 seconds) + current_time = time.monotonic() + if current_time - last_memory_emit_time >= 60.0: + emit_process_memory( + pid, + "indexing_worker", + { + "cc_pair_id": cc_pair_id, + "search_settings_id": search_settings_id, + "index_attempt_id": index_attempt_id, + }, + ) + last_memory_emit_time = current_time + # if a termination signal is detected, break (exit point will clean up) if self.request.id and redis_connector_index.terminating(self.request.id): task_logger.warning( diff --git a/backend/supervisord.conf b/backend/supervisord.conf index d42672096..c8f0e4f44 100644 --- a/backend/supervisord.conf +++ b/backend/supervisord.conf @@ -108,6 +108,7 @@ command=tail -qF /var/log/celery_worker_light.log /var/log/celery_worker_heavy.log /var/log/celery_worker_indexing.log + /var/log/celery_worker_monitoring.log /var/log/slack_bot.log stdout_logfile=/dev/stdout stdout_logfile_maxbytes = 0 # must be set to 0 when stdout_logfile=/dev/stdout diff --git a/deployment/docker_compose/docker-compose.dev.yml b/deployment/docker_compose/docker-compose.dev.yml index f36484317..24b58a9c1 100644 --- a/deployment/docker_compose/docker-compose.dev.yml +++ b/deployment/docker_compose/docker-compose.dev.yml @@ -254,6 +254,9 @@ services: # - ./bundle.pem:/app/bundle.pem:ro extra_hosts: - "host.docker.internal:host-gateway" + # optional, only for debugging purposes + volumes: + - log_store:/var/log/persisted-logs logging: driver: json-file options: @@ -431,3 +434,4 @@ volumes: model_cache_huggingface: indexing_huggingface_model_cache: + log_store: # for logs that we don't want to lose on container restarts diff --git a/deployment/docker_compose/docker-compose.gpu-dev.yml b/deployment/docker_compose/docker-compose.gpu-dev.yml index ac0c8e522..cc386ee66 100644 --- a/deployment/docker_compose/docker-compose.gpu-dev.yml +++ b/deployment/docker_compose/docker-compose.gpu-dev.yml @@ -209,6 +209,9 @@ services: # - ./bundle.pem:/app/bundle.pem:ro extra_hosts: - "host.docker.internal:host-gateway" + # optional, only for debugging purposes + volumes: + - log_store:/var/log/persisted-logs logging: driver: json-file options: @@ -384,3 +387,4 @@ volumes: # Created by the container itself model_cache_huggingface: indexing_huggingface_model_cache: + log_store: # for logs that we don't want to lose on container restarts diff --git a/deployment/docker_compose/docker-compose.multitenant-dev.yml b/deployment/docker_compose/docker-compose.multitenant-dev.yml index cba620ba0..1b3c57b75 100644 --- a/deployment/docker_compose/docker-compose.multitenant-dev.yml +++ b/deployment/docker_compose/docker-compose.multitenant-dev.yml @@ -244,6 +244,8 @@ services: # - ./bundle.pem:/app/bundle.pem:ro extra_hosts: - "host.docker.internal:host-gateway" + volumes: + - log_store:/var/log/persisted-logs logging: driver: json-file options: @@ -421,3 +423,4 @@ volumes: model_cache_huggingface: indexing_huggingface_model_cache: + log_store: # for logs that we don't want to lose on container restarts diff --git a/deployment/docker_compose/docker-compose.prod-cloud.yml b/deployment/docker_compose/docker-compose.prod-cloud.yml index 37a032f1d..02bd41ed2 100644 --- a/deployment/docker_compose/docker-compose.prod-cloud.yml +++ b/deployment/docker_compose/docker-compose.prod-cloud.yml @@ -54,6 +54,9 @@ services: - INDEXING_MODEL_SERVER_HOST=${INDEXING_MODEL_SERVER_HOST:-indexing_model_server} extra_hosts: - "host.docker.internal:host-gateway" + # optional, only for debugging purposes + volumes: + - log_store:/var/log/persisted-logs logging: driver: json-file options: @@ -233,3 +236,4 @@ volumes: # Created by the container itself model_cache_huggingface: indexing_huggingface_model_cache: + log_store: # for logs that we don't want to lose on container restarts diff --git a/deployment/docker_compose/docker-compose.prod-no-letsencrypt.yml b/deployment/docker_compose/docker-compose.prod-no-letsencrypt.yml index 021b279c7..18c283bd2 100644 --- a/deployment/docker_compose/docker-compose.prod-no-letsencrypt.yml +++ b/deployment/docker_compose/docker-compose.prod-no-letsencrypt.yml @@ -68,6 +68,8 @@ services: # - ./bundle.pem:/app/bundle.pem:ro extra_hosts: - "host.docker.internal:host-gateway" + volumes: + - log_store:/var/log/persisted-logs logging: driver: json-file options: @@ -229,3 +231,4 @@ volumes: # Created by the container itself model_cache_huggingface: indexing_huggingface_model_cache: + log_store: # for logs that we don't want to lose on container restarts diff --git a/deployment/docker_compose/docker-compose.prod.yml b/deployment/docker_compose/docker-compose.prod.yml index 90a1aa9c1..d59114e09 100644 --- a/deployment/docker_compose/docker-compose.prod.yml +++ b/deployment/docker_compose/docker-compose.prod.yml @@ -32,6 +32,8 @@ services: # - ./bundle.pem:/app/bundle.pem:ro extra_hosts: - "host.docker.internal:host-gateway" + volumes: + - log_store:/var/log/persisted-logs logging: driver: json-file options: @@ -73,6 +75,8 @@ services: # - ./bundle.pem:/app/bundle.pem:ro extra_hosts: - "host.docker.internal:host-gateway" + volumes: + - log_store:/var/log/persisted-logs logging: driver: json-file options: @@ -260,3 +264,4 @@ volumes: # Created by the container itself model_cache_huggingface: indexing_huggingface_model_cache: + log_store: # for logs that we don't want to lose on container restarts diff --git a/deployment/docker_compose/docker-compose.search-testing.yml b/deployment/docker_compose/docker-compose.search-testing.yml index 68a1aaedc..01a6068b7 100644 --- a/deployment/docker_compose/docker-compose.search-testing.yml +++ b/deployment/docker_compose/docker-compose.search-testing.yml @@ -62,6 +62,8 @@ services: - ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=True extra_hosts: - "host.docker.internal:host-gateway" + volumes: + - log_store:/var/log/persisted-logs logging: driver: json-file options: @@ -221,3 +223,4 @@ volumes: type: none o: bind device: ${DANSWER_VESPA_DATA_DIR:-./vespa_data} + log_store: # for logs that we don't want to lose on container restarts