feat: Disable health checks by default; auto-enable in K8s via operator (#4804)

tzulingk · web-flow · commit 7c15166d8236 · 2025-12-09T10:17:07.000-08:00
Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com>
diff --git a/deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller_test.go b/deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller_test.go
@@ -827,6 +827,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
 										Args:    []string{"ray start --head --port=6379 && some dynamo command --tensor-parallel-size 4 --pipeline-parallel-size 1"},
 										Env: []corev1.EnvVar{
 											{Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker},
+											{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "true"},
 											{Name: commonconsts.DynamoNamespaceEnvVar, Value: "default"},
 											{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"},
 											{Name: "DYN_PARENT_DGD_K8S_NAMESPACE", Value: "default"},
@@ -955,6 +956,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
 										Args:    []string{"ray start --address=$LWS_LEADER_ADDRESS:6379 --block"},
 										Env: []corev1.EnvVar{
 											{Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker},
+											{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "true"},
 											{Name: commonconsts.DynamoNamespaceEnvVar, Value: "default"},
 											{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"},
 											{Name: "DYN_PARENT_DGD_K8S_NAMESPACE", Value: "default"},
diff --git a/deploy/cloud/operator/internal/dynamo/component_worker.go b/deploy/cloud/operator/internal/dynamo/component_worker.go
@@ -86,6 +86,10 @@ func (w *WorkerDefaults) GetBaseContainer(context ComponentContext) (corev1.Cont
 			Name:  "DYN_SYSTEM_PORT",
 			Value: fmt.Sprintf("%d", commonconsts.DynamoSystemPort),
 		},
+		{
+			Name:  "DYN_HEALTH_CHECK_ENABLED",
+			Value: "true",
+		},
 	}...)
 
 	return container, nil
diff --git a/deploy/cloud/operator/internal/dynamo/graph_test.go b/deploy/cloud/operator/internal/dynamo/graph_test.go
@@ -1963,6 +1963,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 														Name:  commonconsts.DynamoComponentEnvVar,
 														Value: commonconsts.ComponentTypeWorker,
 													},
+													{
+														Name:  "DYN_HEALTH_CHECK_ENABLED",
+														Value: "true",
+													},
 													{
 														Name:  "DYN_PARENT_DGD_K8S_NAME",
 														Value: "test-dynamo-graph-deployment",
@@ -2140,6 +2144,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 														Name:  commonconsts.DynamoComponentEnvVar,
 														Value: commonconsts.ComponentTypeWorker,
 													},
+													{
+														Name:  "DYN_HEALTH_CHECK_ENABLED",
+														Value: "true",
+													},
 													{
 														Name:  "DYN_PARENT_DGD_K8S_NAME",
 														Value: "test-dynamo-graph-deployment",
@@ -2864,6 +2872,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 														Name:  commonconsts.DynamoComponentEnvVar,
 														Value: commonconsts.ComponentTypeWorker,
 													},
+													{
+														Name:  "DYN_HEALTH_CHECK_ENABLED",
+														Value: "true",
+													},
 													{
 														Name:  "DYN_PARENT_DGD_K8S_NAME",
 														Value: "test-dynamo-graph-deployment",
@@ -3028,6 +3040,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 														Name:  commonconsts.DynamoComponentEnvVar,
 														Value: commonconsts.ComponentTypeWorker,
 													},
+													{
+														Name:  "DYN_HEALTH_CHECK_ENABLED",
+														Value: "true",
+													},
 													{
 														Name:  "DYN_PARENT_DGD_K8S_NAME",
 														Value: "test-dynamo-graph-deployment",
@@ -4989,6 +5005,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
 							{Name: "ANOTHER_COMPONENTENV", Value: "true"},
 							{Name: "ANOTHER_CONTAINER_ENV", Value: "true"},
 							{Name: commonconsts.DynamoComponentEnvVar, Value: "worker"},
+							{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "true"},
 							{Name: commonconsts.DynamoNamespaceEnvVar, Value: ""},
 							{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-deployment"},
 							{Name: "DYN_PARENT_DGD_K8S_NAMESPACE", Value: "default"},
diff --git a/docs/observability/health-checks.md b/docs/observability/health-checks.md
@@ -20,6 +20,9 @@ orchestration frameworks such as Kubernetes.
 | `DYN_SYSTEM_HEALTH_PATH` | Custom health endpoint path | `/health` | `/custom/health` |
 | `DYN_SYSTEM_LIVE_PATH` | Custom liveness endpoint path | `/live` | `/custom/live` |
 | `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` | Endpoints required for ready state | none | `["generate"]` |
+| `DYN_HEALTH_CHECK_ENABLED` | Enable canary health checks | `false` (K8s: `true`) | `true`, `false` |
+| `DYN_CANARY_WAIT_TIME` | Seconds before sending canary health check | `10` | `5`, `30` |
+| `DYN_HEALTH_CHECK_REQUEST_TIMEOUT` | Health check request timeout in seconds | `3` | `5`, `10` |
 
 ## Getting Started Quickly
 
@@ -213,6 +216,127 @@ date: Wed, 03 Sep 2025 13:42:45 GMT
 }
 ```
 
+## Canary Health Checks (Active Monitoring)
+
+In addition to the HTTP endpoints described above, Dynamo includes a **canary health check** system that actively monitors worker endpoints.
+
+### Overview
+
+The canary health check system:
+- **Monitors endpoint health** by sending periodic test requests to worker endpoints
+- **Only activates during idle periods** - if there's ongoing traffic, health checks are skipped to avoid overhead
+- **Automatically enabled in Kubernetes** deployments via the operator
+- **Disabled by default** in local/development environments
+
+### How It Works
+
+1. **Idle Detection**: After no activity on an endpoint for a configurable wait time (default: 10 seconds), a canary health check is triggered
+2. **Health Check Request**: A lightweight test request is sent to the endpoint with a minimal payload (generates 1 token)
+3. **Activity Resets Timer**: If normal requests arrive, the canary timer resets and no health check is sent
+4. **Timeout Handling**: If the endpoint doesn't respond to a health check within the timeout (default: 3 seconds), the endpoint is marked as unhealthy
+
+### Configuration
+
+#### In Kubernetes (Enabled by Default)
+
+Health checks are automatically enabled by the Dynamo operator. No additional configuration is required.
+
+```yaml
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: my-deployment
+spec:
+  services:
+    VllmWorker:
+      componentType: worker
+      replicas: 2
+      # Health checks automatically enabled by operator
+```
+
+#### In Local/Development Environments (Disabled by Default)
+
+To enable health checks locally:
+
+```bash
+# Enable health checks
+export DYN_HEALTH_CHECK_ENABLED=true
+
+# Optional: Customize timing
+export DYN_CANARY_WAIT_TIME=5  # Wait 5 seconds before sending health check
+export DYN_HEALTH_CHECK_REQUEST_TIMEOUT=5  # 5 second timeout
+
+# Start worker
+python -m dynamo.vllm --model Qwen/Qwen3-0.6B
+```
+
+#### Configuration Options
+
+| Environment Variable | Description | Default | Notes |
+|---------------------|-------------|---------|-------|
+| `DYN_HEALTH_CHECK_ENABLED` | Enable/disable canary health checks | `false` (K8s: `true`) | Set to `true` by the operator on worker containers in K8s |
+| `DYN_CANARY_WAIT_TIME` | Seconds to wait (during idle) before sending health check | `10` | Lower values = more frequent checks |
+| `DYN_HEALTH_CHECK_REQUEST_TIMEOUT` | Max seconds to wait for health check response | `3` | Higher values = more tolerance for slow responses |
+
+### Health Check Payloads
+
+Each backend defines its own minimal health check payload:
+
+- **vLLM**: Single token generation with minimal sampling options
+- **TensorRT-LLM**: Single token with BOS token ID
+- **SGLang**: Single token generation request
+
+These payloads are designed to:
+- Complete quickly (< 100ms typically)
+- Minimize GPU overhead
+- Verify the full inference stack is working
+
+### Observing Health Checks
+
+When health checks are enabled, you'll see logs like:
+
+```
+INFO Health check manager started (canary_wait_time: 10s, request_timeout: 3s)
+INFO Spawned health check task for endpoint: generate
+INFO Canary timer expired for generate, sending health check
+INFO Health check successful for generate
+```
+
+If an endpoint fails:
+
+```
+WARN Health check timeout for generate
+ERROR Health check request failed for generate: connection refused
+```
+
+### When to Use Canary Health Checks
+
+**Enable in production (Kubernetes):**
+- ✅ Detect unhealthy workers before they affect user traffic
+- ✅ Enable faster failure detection and recovery
+- ✅ Monitor worker availability continuously
+
+**Disable in development:**
+- ✅ Reduce log noise during debugging
+- ✅ Avoid overhead when not needed
+- ✅ Simplify local testing
+
+### Troubleshooting
+
+**Health checks timing out:**
+- Increase `DYN_HEALTH_CHECK_REQUEST_TIMEOUT`
+- Check worker logs for errors
+- Verify network connectivity
+
+**Too many health check logs:**
+- Increase `DYN_CANARY_WAIT_TIME` to reduce frequency
+- Or disable health checks entirely with `DYN_HEALTH_CHECK_ENABLED=false` (development environments only)
+
+**Health checks not running:**
+- Verify `DYN_HEALTH_CHECK_ENABLED=true` is set
+- Check that `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` includes the endpoint
+- Ensure the worker is serving the endpoint
+
 ## Related Documentation
 
 - [Distributed Runtime Architecture](../design_docs/distributed_runtime.md)
diff --git a/lib/runtime/src/config.rs b/lib/runtime/src/config.rs
@@ -162,7 +162,7 @@ pub struct RuntimeConfig {
 
     /// Enable active health checking with payloads
     /// Set this at runtime with environment variable DYN_HEALTH_CHECK_ENABLED
-    #[builder(default = "true")]
+    #[builder(default = "false")]
     #[builder_field_attr(serde(skip_serializing_if = "Option::is_none"))]
     pub health_check_enabled: bool,
 
@@ -358,7 +358,7 @@ impl RuntimeConfig {
             compute_threads: Some(1),
             compute_stack_size: Some(2 * 1024 * 1024),
             compute_thread_prefix: "compute".to_string(),
-            health_check_enabled: true,
+            health_check_enabled: false,
             canary_wait_time_secs: DEFAULT_CANARY_WAIT_TIME_SECS,
             health_check_request_timeout_secs: DEFAULT_HEALTH_CHECK_REQUEST_TIMEOUT_SECS,
         }
@@ -394,7 +394,7 @@ impl Default for RuntimeConfig {
             compute_threads: None,
             compute_stack_size: Some(2 * 1024 * 1024),
             compute_thread_prefix: "compute".to_string(),
-            health_check_enabled: true,
+            health_check_enabled: false,
             canary_wait_time_secs: DEFAULT_CANARY_WAIT_TIME_SECS,
             health_check_request_timeout_secs: DEFAULT_HEALTH_CHECK_REQUEST_TIMEOUT_SECS,
         }