Skip to content

Commit 7c15166

Browse files
authored
feat: Disable health checks by default; auto-enable in K8s via operator (#4804)
Signed-off-by: [email protected] <[email protected]>
1 parent a473402 commit 7c15166

File tree

5 files changed

+150
-3
lines changed

5 files changed

+150
-3
lines changed

deploy/cloud/operator/internal/controller/dynamocomponentdeployment_controller_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -827,6 +827,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
827827
Args: []string{"ray start --head --port=6379 && some dynamo command --tensor-parallel-size 4 --pipeline-parallel-size 1"},
828828
Env: []corev1.EnvVar{
829829
{Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker},
830+
{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "true"},
830831
{Name: commonconsts.DynamoNamespaceEnvVar, Value: "default"},
831832
{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"},
832833
{Name: "DYN_PARENT_DGD_K8S_NAMESPACE", Value: "default"},
@@ -955,6 +956,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
955956
Args: []string{"ray start --address=$LWS_LEADER_ADDRESS:6379 --block"},
956957
Env: []corev1.EnvVar{
957958
{Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker},
959+
{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "true"},
958960
{Name: commonconsts.DynamoNamespaceEnvVar, Value: "default"},
959961
{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"},
960962
{Name: "DYN_PARENT_DGD_K8S_NAMESPACE", Value: "default"},

deploy/cloud/operator/internal/dynamo/component_worker.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,10 @@ func (w *WorkerDefaults) GetBaseContainer(context ComponentContext) (corev1.Cont
8686
Name: "DYN_SYSTEM_PORT",
8787
Value: fmt.Sprintf("%d", commonconsts.DynamoSystemPort),
8888
},
89+
{
90+
Name: "DYN_HEALTH_CHECK_ENABLED",
91+
Value: "true",
92+
},
8993
}...)
9094

9195
return container, nil

deploy/cloud/operator/internal/dynamo/graph_test.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1963,6 +1963,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
19631963
Name: commonconsts.DynamoComponentEnvVar,
19641964
Value: commonconsts.ComponentTypeWorker,
19651965
},
1966+
{
1967+
Name: "DYN_HEALTH_CHECK_ENABLED",
1968+
Value: "true",
1969+
},
19661970
{
19671971
Name: "DYN_PARENT_DGD_K8S_NAME",
19681972
Value: "test-dynamo-graph-deployment",
@@ -2140,6 +2144,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
21402144
Name: commonconsts.DynamoComponentEnvVar,
21412145
Value: commonconsts.ComponentTypeWorker,
21422146
},
2147+
{
2148+
Name: "DYN_HEALTH_CHECK_ENABLED",
2149+
Value: "true",
2150+
},
21432151
{
21442152
Name: "DYN_PARENT_DGD_K8S_NAME",
21452153
Value: "test-dynamo-graph-deployment",
@@ -2864,6 +2872,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
28642872
Name: commonconsts.DynamoComponentEnvVar,
28652873
Value: commonconsts.ComponentTypeWorker,
28662874
},
2875+
{
2876+
Name: "DYN_HEALTH_CHECK_ENABLED",
2877+
Value: "true",
2878+
},
28672879
{
28682880
Name: "DYN_PARENT_DGD_K8S_NAME",
28692881
Value: "test-dynamo-graph-deployment",
@@ -3028,6 +3040,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
30283040
Name: commonconsts.DynamoComponentEnvVar,
30293041
Value: commonconsts.ComponentTypeWorker,
30303042
},
3043+
{
3044+
Name: "DYN_HEALTH_CHECK_ENABLED",
3045+
Value: "true",
3046+
},
30313047
{
30323048
Name: "DYN_PARENT_DGD_K8S_NAME",
30333049
Value: "test-dynamo-graph-deployment",
@@ -4989,6 +5005,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
49895005
{Name: "ANOTHER_COMPONENTENV", Value: "true"},
49905006
{Name: "ANOTHER_CONTAINER_ENV", Value: "true"},
49915007
{Name: commonconsts.DynamoComponentEnvVar, Value: "worker"},
5008+
{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "true"},
49925009
{Name: commonconsts.DynamoNamespaceEnvVar, Value: ""},
49935010
{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-deployment"},
49945011
{Name: "DYN_PARENT_DGD_K8S_NAMESPACE", Value: "default"},

docs/observability/health-checks.md

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ orchestration frameworks such as Kubernetes.
2020
| `DYN_SYSTEM_HEALTH_PATH` | Custom health endpoint path | `/health` | `/custom/health` |
2121
| `DYN_SYSTEM_LIVE_PATH` | Custom liveness endpoint path | `/live` | `/custom/live` |
2222
| `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` | Endpoints required for ready state | none | `["generate"]` |
23+
| `DYN_HEALTH_CHECK_ENABLED` | Enable canary health checks | `false` (K8s: `true`) | `true`, `false` |
24+
| `DYN_CANARY_WAIT_TIME` | Seconds before sending canary health check | `10` | `5`, `30` |
25+
| `DYN_HEALTH_CHECK_REQUEST_TIMEOUT` | Health check request timeout in seconds | `3` | `5`, `10` |
2326

2427
## Getting Started Quickly
2528

@@ -213,6 +216,127 @@ date: Wed, 03 Sep 2025 13:42:45 GMT
213216
}
214217
```
215218

219+
## Canary Health Checks (Active Monitoring)
220+
221+
In addition to the HTTP endpoints described above, Dynamo includes a **canary health check** system that actively monitors worker endpoints.
222+
223+
### Overview
224+
225+
The canary health check system:
226+
- **Monitors endpoint health** by sending periodic test requests to worker endpoints
227+
- **Only activates during idle periods** - if there's ongoing traffic, health checks are skipped to avoid overhead
228+
- **Automatically enabled in Kubernetes** deployments via the operator
229+
- **Disabled by default** in local/development environments
230+
231+
### How It Works
232+
233+
1. **Idle Detection**: After no activity on an endpoint for a configurable wait time (default: 10 seconds), a canary health check is triggered
234+
2. **Health Check Request**: A lightweight test request is sent to the endpoint with a minimal payload (generates 1 token)
235+
3. **Activity Resets Timer**: If normal requests arrive, the canary timer resets and no health check is sent
236+
4. **Timeout Handling**: If the endpoint doesn't respond to a health check request within the timeout (default: 3 seconds), it is marked as unhealthy
237+
238+
### Configuration
239+
240+
#### In Kubernetes (Enabled by Default)
241+
242+
Health checks are automatically enabled by the Dynamo operator, which sets `DYN_HEALTH_CHECK_ENABLED=true` on worker containers. No additional configuration is required.
243+
244+
```yaml
245+
apiVersion: nvidia.com/v1alpha1
246+
kind: DynamoGraphDeployment
247+
metadata:
248+
name: my-deployment
249+
spec:
250+
services:
251+
VllmWorker:
252+
componentType: worker
253+
replicas: 2
254+
# Health checks automatically enabled by operator
255+
```
256+
257+
#### In Local/Development Environments (Disabled by Default)
258+
259+
To enable health checks locally:
260+
261+
```bash
262+
# Enable health checks
263+
export DYN_HEALTH_CHECK_ENABLED=true
264+
265+
# Optional: Customize timing
266+
export DYN_CANARY_WAIT_TIME=5 # Wait 5 seconds before sending health check
267+
export DYN_HEALTH_CHECK_REQUEST_TIMEOUT=5 # 5 second timeout
268+
269+
# Start worker
270+
python -m dynamo.vllm --model Qwen/Qwen3-0.6B
271+
```
272+
273+
#### Configuration Options
274+
275+
| Environment Variable | Description | Default | Notes |
276+
|---------------------|-------------|---------|-------|
277+
| `DYN_HEALTH_CHECK_ENABLED` | Enable/disable canary health checks | `false` (K8s: `true`) | Automatically set to `true` in K8s |
278+
| `DYN_CANARY_WAIT_TIME` | Seconds to wait (during idle) before sending health check | `10` | Lower values = more frequent checks |
279+
| `DYN_HEALTH_CHECK_REQUEST_TIMEOUT` | Max seconds to wait for health check response | `3` | Higher values = more tolerance for slow responses |
280+
281+
### Health Check Payloads
282+
283+
Each backend defines its own minimal health check payload:
284+
285+
- **vLLM**: Single token generation with minimal sampling options
286+
- **TensorRT-LLM**: Single token with BOS token ID
287+
- **SGLang**: Single token generation request
288+
289+
These payloads are designed to:
290+
- Complete quickly (< 100ms typically)
291+
- Minimize GPU overhead
292+
- Verify the full inference stack is working
293+
294+
### Observing Health Checks
295+
296+
When health checks are enabled, you'll see logs like:
297+
298+
```
299+
INFO Health check manager started (canary_wait_time: 10s, request_timeout: 3s)
300+
INFO Spawned health check task for endpoint: generate
301+
INFO Canary timer expired for generate, sending health check
302+
INFO Health check successful for generate
303+
```
304+
305+
If an endpoint fails:
306+
307+
```
308+
WARN Health check timeout for generate
309+
ERROR Health check request failed for generate: connection refused
310+
```
311+
312+
### When to Use Canary Health Checks
313+
314+
**Enable in production (Kubernetes):**
315+
- ✅ Detect unhealthy workers before they affect user traffic
316+
- ✅ Enable faster failure detection and recovery
317+
- ✅ Monitor worker availability continuously
318+
319+
**Disable in development:**
320+
- ✅ Reduce log noise during debugging
321+
- ✅ Avoid overhead when not needed
322+
- ✅ Simplify local testing
323+
324+
### Troubleshooting
325+
326+
**Health checks timing out:**
327+
- Increase `DYN_HEALTH_CHECK_REQUEST_TIMEOUT`
328+
- Check worker logs for errors
329+
- Verify network connectivity
330+
331+
**Too many health check logs:**
332+
- Increase `DYN_CANARY_WAIT_TIME` to reduce frequency
333+
- Or disable health checks with `DYN_HEALTH_CHECK_ENABLED=false` in development environments
334+
335+
**Health checks not running:**
336+
- Verify `DYN_HEALTH_CHECK_ENABLED=true` is set
337+
- Check that `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` includes the endpoint
338+
- Ensure the worker is serving the endpoint
339+
216340
## Related Documentation
217341

218342
- [Distributed Runtime Architecture](../design_docs/distributed_runtime.md)

lib/runtime/src/config.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ pub struct RuntimeConfig {
162162

163163
/// Enable active health checking with payloads
164164
/// Set this at runtime with environment variable DYN_HEALTH_CHECK_ENABLED
165-
#[builder(default = "true")]
165+
#[builder(default = "false")]
166166
#[builder_field_attr(serde(skip_serializing_if = "Option::is_none"))]
167167
pub health_check_enabled: bool,
168168

@@ -358,7 +358,7 @@ impl RuntimeConfig {
358358
compute_threads: Some(1),
359359
compute_stack_size: Some(2 * 1024 * 1024),
360360
compute_thread_prefix: "compute".to_string(),
361-
health_check_enabled: true,
361+
health_check_enabled: false,
362362
canary_wait_time_secs: DEFAULT_CANARY_WAIT_TIME_SECS,
363363
health_check_request_timeout_secs: DEFAULT_HEALTH_CHECK_REQUEST_TIMEOUT_SECS,
364364
}
@@ -394,7 +394,7 @@ impl Default for RuntimeConfig {
394394
compute_threads: None,
395395
compute_stack_size: Some(2 * 1024 * 1024),
396396
compute_thread_prefix: "compute".to_string(),
397-
health_check_enabled: true,
397+
health_check_enabled: false,
398398
canary_wait_time_secs: DEFAULT_CANARY_WAIT_TIME_SECS,
399399
health_check_request_timeout_secs: DEFAULT_HEALTH_CHECK_REQUEST_TIMEOUT_SECS,
400400
}

0 commit comments

Comments
 (0)