fix: enable LMCache metrics visibility with PROMETHEUS_MULTIPROC_DIR (#4654)

keivenchang · web-flow · commit c6555852ee9f · 2025-12-02T18:47:30.000-08:00
Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>
diff --git a/components/src/dynamo/vllm/main.py b/components/src/dynamo/vllm/main.py
@@ -9,13 +9,15 @@
 from typing import Optional
 
 import uvloop
+from prometheus_client import REGISTRY, CollectorRegistry, multiprocess
 from vllm.distributed.kv_events import ZmqEventPublisher
 from vllm.usage.usage_lib import UsageContext
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
 
 from dynamo.common.config_dump import dump_config
 from dynamo.common.utils.endpoint_types import parse_endpoint_types
+from dynamo.common.utils.prometheus import register_engine_metrics_callback
 from dynamo.llm import (
     ModelInput,
     ModelRuntimeConfig,
@@ -106,6 +108,64 @@ def signal_handler():
     logger.debug("Worker function completed, exiting...")
 
 
+def setup_metrics_collection(config: Config, generate_endpoint, logger):
+    """Set up metrics collection for vLLM and LMCache metrics.
+
+    No-op unless ``config.engine_args.disable_log_stats`` is exactly ``False``.
+
+    In multiprocess mode (PROMETHEUS_MULTIPROC_DIR set), metrics are stored:
+      1. In-memory: Metric objects in global REGISTRY
+      2. On-disk: Metric values in .db files (PROMETHEUS_MULTIPROC_DIR)
+
+    MultiProcessCollector reads from .db files, but adding it to REGISTRY can fail
+    with "Duplicated timeseries" if PROMETHEUS_MULTIPROC_DIR was set before the
+    process started (K8s deployments) because metrics are already in REGISTRY.
+
+    Solution: try adding MultiProcessCollector to REGISTRY. If that fails, use a
+    separate registry for the .db files and register callbacks on both registries
+    so that all metrics (vllm, lmcache, dynamo_component) are collected.
+    """
+    if config.engine_args.disable_log_stats is not False:
+        return
+    if not os.environ.get("PROMETHEUS_MULTIPROC_DIR"):
+        # No multiprocess mode
+        register_engine_metrics_callback(
+            endpoint=generate_endpoint,
+            registry=REGISTRY,
+            metric_prefix_filters=["vllm:", "lmcache:"],
+        )
+        return
+    try:
+        # Guard only this call: a ValueError from callback registration must propagate
+        multiprocess.MultiProcessCollector(REGISTRY)
+    except ValueError as e:
+        # Metrics already in REGISTRY clash with the .db copies; read .db files separately
+        logger.debug(
+            f"Could not add MultiProcessCollector to REGISTRY ({e}), using separate registry"
+        )
+        multiproc_registry = CollectorRegistry()
+        multiprocess.MultiProcessCollector(multiproc_registry)
+        # Global REGISTRY has in-memory metrics (vllm, dynamo_component)
+        register_engine_metrics_callback(
+            endpoint=generate_endpoint,
+            registry=REGISTRY,
+            metric_prefix_filters=["vllm:", "dynamo_component:"],
+        )
+        # Multiproc registry has .db file metrics (lmcache, possibly vllm duplicates)
+        register_engine_metrics_callback(
+            endpoint=generate_endpoint,
+            registry=multiproc_registry,
+            metric_prefix_filters=["vllm:", "lmcache:"],
+        )
+    else:
+        logger.debug("Added MultiProcessCollector to global REGISTRY")
+        register_engine_metrics_callback(
+            endpoint=generate_endpoint,
+            registry=REGISTRY,
+            metric_prefix_filters=["vllm:", "lmcache:"],
+        )
+
+
 def setup_kv_event_publisher(
     config: Config,
     component,
@@ -176,11 +236,9 @@ def setup_kv_event_publisher(
 
 
 def setup_vllm_engine(config, stat_logger=None):
-    # Existing vLLM v0.11.0 bug: vllm/v1/metrics/prometheus.py:79 passes TemporaryDirectory object instead of
-    # the .name string, causing a false error message when vLLM exits. Therefore, always set
-    # PROMETHEUS_MULTIPROC_DIR first, and we'll do the path cleanup.
-
-    # This vLLM bug causes a false error message when vLLM exits.
+    # vLLM v0.11.0 bug: vllm/v1/metrics/prometheus.py:79 passes TemporaryDirectory object
+    # instead of .name string, causing false error on exit. Set PROMETHEUS_MULTIPROC_DIR
+    # ourselves to avoid this and handle cleanup properly.
     prometheus_temp_dir = None
     if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
         prometheus_temp_dir = tempfile.TemporaryDirectory(prefix="vllm_prometheus_")
@@ -356,31 +414,7 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
     if kv_publishers:
         handler.kv_publishers = kv_publishers
 
-    if config.engine_args.disable_log_stats is False:
-        # vLLM v1 registers its metrics with 'vllm:' prefix
-        from prometheus_client import REGISTRY, multiprocess
-
-        from dynamo.common.utils.prometheus import register_engine_metrics_callback
-
-        # Option 1: Try adding MultiProcessCollector to the global REGISTRY
-        # This would make REGISTRY collect from both its registered metrics AND multiprocess files
-        if os.environ.get("PROMETHEUS_MULTIPROC_DIR"):
-            try:
-                # Add MultiProcessCollector to global REGISTRY
-                # This makes REGISTRY collect from .db files in addition to its own metrics
-                multiprocess.MultiProcessCollector(REGISTRY)
-                logger.info("Added MultiProcessCollector to global REGISTRY")
-            except ValueError as e:
-                # Might already be registered or directory issues
-                logger.warning(f"Could not add MultiProcessCollector to REGISTRY: {e}")
-
-        # Register callback with the global REGISTRY
-        # Now it should collect both its own metrics AND multiprocess metrics
-        register_engine_metrics_callback(
-            endpoint=generate_endpoint,
-            registry=REGISTRY,
-            metric_prefix_filters=["vllm:", "lmcache:"],
-        )
+    setup_metrics_collection(config, generate_endpoint, logger)
 
     # Register prefill model with ModelType.Prefill
     if not config.engine_args.data_parallel_rank:  # if rank is 0 or None then register
@@ -493,31 +527,7 @@ async def init(runtime: DistributedRuntime, config: Config):
     if kv_publishers:
         handler.kv_publishers = kv_publishers
 
-    if config.engine_args.disable_log_stats is False:
-        # vLLM v1 registers its metrics with 'vllm:' prefix
-        from prometheus_client import REGISTRY, multiprocess
-
-        from dynamo.common.utils.prometheus import register_engine_metrics_callback
-
-        # Option 1: Try adding MultiProcessCollector to the global REGISTRY
-        # This would make REGISTRY collect from both its registered metrics AND multiprocess files
-        if os.environ.get("PROMETHEUS_MULTIPROC_DIR"):
-            try:
-                # Add MultiProcessCollector to global REGISTRY
-                # This makes REGISTRY collect from .db files in addition to its own metrics
-                multiprocess.MultiProcessCollector(REGISTRY)
-                logger.info("Added MultiProcessCollector to global REGISTRY")
-            except ValueError as e:
-                # Might already be registered or directory issues
-                logger.warning(f"Could not add MultiProcessCollector to REGISTRY: {e}")
-
-        # Register callback with the global REGISTRY
-        # Now it should collect both its own metrics AND multiprocess metrics
-        register_engine_metrics_callback(
-            endpoint=generate_endpoint,
-            registry=REGISTRY,
-            metric_prefix_filters=["vllm:", "lmcache:"],
-        )
+    setup_metrics_collection(config, generate_endpoint, logger)
 
     if not config.engine_args.data_parallel_rank:  # if rank is 0 or None then register
         # Parse endpoint types from --dyn-endpoint-types flag
diff --git a/docs/backends/vllm/LMCache_Integration.md b/docs/backends/vllm/LMCache_Integration.md
@@ -156,6 +156,7 @@ When LMCache is enabled with `--connector lmcache` and `DYN_SYSTEM_PORT` is set,
 **Requirements to access LMCache metrics:**
 - `--connector lmcache` - Enables LMCache
 - `DYN_SYSTEM_PORT=8081` - Enables metrics HTTP endpoint
+- `PROMETHEUS_MULTIPROC_DIR` (optional) - If not set, Dynamo manages it internally. Only set explicitly if you need control over the metrics directory.
 
 For detailed information on LMCache metrics, including the complete list of available metrics and how to access them, see the **[LMCache Metrics section](prometheus.md#lmcache-metrics)** in the vLLM Prometheus Metrics Guide.
 
diff --git a/docs/backends/vllm/prometheus.md b/docs/backends/vllm/prometheus.md
@@ -136,7 +136,7 @@ curl -s localhost:8081/metrics | grep "^lmcache:"
 ## Implementation Details
 
 - vLLM v1 uses multiprocess metrics collection via `prometheus_client.multiprocess`
-- `PROMETHEUS_MULTIPROC_DIR`: vLLM sets this environment variable to a temporary directory where multiprocess metrics are stored as memory-mapped files. Each worker process writes its metrics to separate files in this directory, which are aggregated when `/metrics` is scraped.
+- `PROMETHEUS_MULTIPROC_DIR` (optional): By default, Dynamo automatically manages this environment variable, setting it to a temporary directory where multiprocess metrics are stored as memory-mapped files. Each worker process writes its metrics to separate files in this directory, which are aggregated when `/metrics` is scraped. Set it explicitly only when you need complete control over the metrics directory.
 - Dynamo uses `MultiProcessCollector` to aggregate metrics from all worker processes
 - Metrics are filtered by the `vllm:` and `lmcache:` prefixes before being exposed (when LMCache is enabled)
 - The integration uses Dynamo's `register_engine_metrics_callback()` function with the global `REGISTRY`
diff --git a/examples/backends/vllm/launch/agg_lmcache.sh b/examples/backends/vllm/launch/agg_lmcache.sh
@@ -4,10 +4,13 @@
 set -e
 trap 'echo Cleaning up...; kill 0' EXIT
 
+# Explicitly unset PROMETHEUS_MULTIPROC_DIR to let LMCache or Dynamo manage it internally
+unset PROMETHEUS_MULTIPROC_DIR
+
 # run ingress
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 python -m dynamo.frontend &
 
-# run worker with LMCache enabled
+# run worker with LMCache enabled (without PROMETHEUS_MULTIPROC_DIR set externally)
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
   python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
diff --git a/examples/backends/vllm/launch/agg_lmcache_multiproc.sh b/examples/backends/vllm/launch/agg_lmcache_multiproc.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+set -e
+
+# Explicitly set PROMETHEUS_MULTIPROC_DIR (K8s-style deployment); a caller-provided value wins.
+# The default is unique per run (shell PID + $RANDOM) to avoid conflicts; start from an empty dir.
+export PROMETHEUS_MULTIPROC_DIR=${PROMETHEUS_MULTIPROC_DIR:-/tmp/prometheus_multiproc_$$_$RANDOM}
+rm -rf "$PROMETHEUS_MULTIPROC_DIR"
+mkdir -p "$PROMETHEUS_MULTIPROC_DIR"
+
+# On exit, remove the metrics directory and kill the process group (incl. the background frontend)
+cleanup() {
+    echo "Cleaning up..."
+    rm -rf "$PROMETHEUS_MULTIPROC_DIR"
+    kill 0
+}
+trap cleanup EXIT
+
+# run ingress
+# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
+python -m dynamo.frontend &
+
+# run worker with LMCache enabled; the variable is already exported above and is repeated here for visibility
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
+  PROMETHEUS_MULTIPROC_DIR="$PROMETHEUS_MULTIPROC_DIR" \
+  python -m dynamo.vllm --model Qwen/Qwen3-0.6B --connector lmcache
+
diff --git a/tests/serve/test_vllm.py b/tests/serve/test_vllm.py
@@ -4,6 +4,7 @@
 import base64
 import logging
 import os
+import random
 from dataclasses import dataclass, field
 
 import pytest
@@ -64,6 +65,22 @@ class VLLMConfig(EngineConfig):
             metric_payload_default(min_num_requests=6, backend="lmcache"),
         ],
     ),
+    "aggregated_lmcache_multiproc": VLLMConfig(
+        name="aggregated_lmcache_multiproc",
+        directory=vllm_dir,
+        script_name="agg_lmcache_multiproc.sh",
+        marks=[pytest.mark.gpu_1],
+        model="Qwen/Qwen3-0.6B",
+        env={
+            "PROMETHEUS_MULTIPROC_DIR": f"/tmp/prometheus_multiproc_test_{os.getpid()}_{random.randint(0, 10000)}"
+        },
+        request_payloads=[
+            chat_payload_default(),
+            completion_payload_default(),
+            metric_payload_default(min_num_requests=6, backend="vllm"),
+            metric_payload_default(min_num_requests=6, backend="lmcache"),
+        ],
+    ),
     "agg-request-plane-tcp": VLLMConfig(
         name="agg-request-plane-tcp",
         directory=vllm_dir,