Skip to content

Commit ed905e4

Browse files
authored
Merge branch 'main' into feature/dependency-extraction-DYN-1235
2 parents ab5efbe + 5c69c11 commit ed905e4

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+1121
-1107
lines changed

Cargo.lock

Lines changed: 24 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ members = [
1010
"lib/async-openai",
1111
"lib/parsers",
1212
"lib/bindings/c",
13+
"lib/bindings/python/codegen",
1314
"lib/engines/*",
1415
]
1516
# Exclude certain packages that are slow to build and we don't ship as flagship

components/src/dynamo/planner/utils/prometheus.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from prometheus_api_client import PrometheusConnect
2020
from pydantic import BaseModel, ValidationError
2121

22-
from dynamo._core import prometheus_names
22+
from dynamo import prometheus_names
2323
from dynamo.runtime.logging import configure_dynamo_logging
2424

2525
configure_dynamo_logging()
@@ -94,23 +94,23 @@ def _get_average_metric(
9494

9595
def get_avg_inter_token_latency(self, interval: str, model_name: str):
9696
return self._get_average_metric(
97-
prometheus_names.frontend.inter_token_latency_seconds,
97+
prometheus_names.frontend_service.INTER_TOKEN_LATENCY_SECONDS,
9898
interval,
9999
"avg inter token latency",
100100
model_name,
101101
)
102102

103103
def get_avg_time_to_first_token(self, interval: str, model_name: str):
104104
return self._get_average_metric(
105-
prometheus_names.frontend.time_to_first_token_seconds,
105+
prometheus_names.frontend_service.TIME_TO_FIRST_TOKEN_SECONDS,
106106
interval,
107107
"avg time to first token",
108108
model_name,
109109
)
110110

111111
def get_avg_request_duration(self, interval: str, model_name: str):
112112
return self._get_average_metric(
113-
prometheus_names.frontend.request_duration_seconds,
113+
prometheus_names.frontend_service.REQUEST_DURATION_SECONDS,
114114
interval,
115115
"avg request duration",
116116
model_name,
@@ -119,7 +119,7 @@ def get_avg_request_duration(self, interval: str, model_name: str):
119119
def get_avg_request_count(self, interval: str, model_name: str):
120120
# This function follows a different query pattern than the other metrics
121121
try:
122-
requests_total_metric = prometheus_names.frontend.requests_total
122+
requests_total_metric = prometheus_names.frontend_service.REQUESTS_TOTAL
123123
raw_res = self.prom.custom_query(
124124
query=f"increase({requests_total_metric}[{interval}])"
125125
)
@@ -138,15 +138,15 @@ def get_avg_request_count(self, interval: str, model_name: str):
138138

139139
def get_avg_input_sequence_tokens(self, interval: str, model_name: str):
140140
return self._get_average_metric(
141-
prometheus_names.frontend.input_sequence_tokens,
141+
prometheus_names.frontend_service.INPUT_SEQUENCE_TOKENS,
142142
interval,
143143
"avg input sequence tokens",
144144
model_name,
145145
)
146146

147147
def get_avg_output_sequence_tokens(self, interval: str, model_name: str):
148148
return self._get_average_metric(
149-
prometheus_names.frontend.output_sequence_tokens,
149+
prometheus_names.frontend_service.OUTPUT_SEQUENCE_TOKENS,
150150
interval,
151151
"avg output sequence tokens",
152152
model_name,

container/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ ARG SCCACHE_REGION=""
3939

4040
# NIXL configuration
4141
ARG NIXL_UCX_REF=v1.19.0
42-
ARG NIXL_REF=0.4.1
42+
ARG NIXL_REF=0.6.0
4343

4444
# Python configuration
4545
ARG PYTHON_VERSION=3.12

container/build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
115115
SGLANG_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
116116
SGLANG_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
117117

118-
NIXL_REF=0.4.1
118+
NIXL_REF=0.6.0
119119
NIXL_UCX_REF=v1.19.0
120120
NIXL_UCX_EFA_REF=9d2b88a1f67faf9876f267658bd077b379b8bb76
121121

deploy/metrics/grafana_dashboards/grafana-kvbm-dashboard.json

Lines changed: 14 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
"editable": true,
2020
"fiscalYearStartMonth": 0,
2121
"graphTooltip": 0,
22-
"id": 6,
22+
"id": 1,
2323
"links": [],
2424
"panels": [
2525
{
@@ -209,7 +209,7 @@
209209
"x": 0,
210210
"y": 10
211211
},
212-
"id": 2,
212+
"id": 3,
213213
"options": {
214214
"legend": {
215215
"calcs": [],
@@ -228,7 +228,7 @@
228228
{
229229
"disableTextWrap": false,
230230
"editorMode": "code",
231-
"expr": "kvbm_offload_requests",
231+
"expr": "kvbm_offload_blocks_d2h",
232232
"fullMetaSearch": false,
233233
"includeNullMetadata": true,
234234
"legendFormat": "__auto",
@@ -237,7 +237,7 @@
237237
"useBackend": false
238238
}
239239
],
240-
"title": "Offload Requests",
240+
"title": "Offload Blocks - Device to Host",
241241
"type": "timeseries"
242242
},
243243
{
@@ -305,7 +305,7 @@
305305
"x": 12,
306306
"y": 10
307307
},
308-
"id": 3,
308+
"id": 11,
309309
"options": {
310310
"legend": {
311311
"calcs": [],
@@ -324,7 +324,7 @@
324324
{
325325
"disableTextWrap": false,
326326
"editorMode": "code",
327-
"expr": "kvbm_offload_blocks_d2h",
327+
"expr": "kvbm_offload_blocks_h2d",
328328
"fullMetaSearch": false,
329329
"includeNullMetadata": true,
330330
"legendFormat": "__auto",
@@ -333,7 +333,7 @@
333333
"useBackend": false
334334
}
335335
],
336-
"title": "Offload Blocks - Device to Host",
336+
"title": "Offload Blocks - Host to Disk",
337337
"type": "timeseries"
338338
},
339339
{
@@ -342,7 +342,7 @@
342342
"h": 1,
343343
"w": 24,
344344
"x": 0,
345-
"y": 26
345+
"y": 18
346346
},
347347
"id": 6,
348348
"panels": [],
@@ -412,103 +412,7 @@
412412
"h": 8,
413413
"w": 12,
414414
"x": 0,
415-
"y": 27
416-
},
417-
"id": 9,
418-
"options": {
419-
"legend": {
420-
"calcs": [],
421-
"displayMode": "list",
422-
"placement": "bottom",
423-
"showLegend": true
424-
},
425-
"tooltip": {
426-
"hideZeros": false,
427-
"mode": "single",
428-
"sort": "none"
429-
}
430-
},
431-
"pluginVersion": "12.0.1",
432-
"targets": [
433-
{
434-
"disableTextWrap": false,
435-
"editorMode": "code",
436-
"expr": "kvbm_onboard_requests",
437-
"fullMetaSearch": false,
438-
"includeNullMetadata": true,
439-
"legendFormat": "__auto",
440-
"range": true,
441-
"refId": "A",
442-
"useBackend": false
443-
}
444-
],
445-
"title": "Onboard Requests",
446-
"type": "timeseries"
447-
},
448-
{
449-
"datasource": {
450-
"type": "prometheus",
451-
"uid": "P1809F7CD0C75ACF3"
452-
},
453-
"fieldConfig": {
454-
"defaults": {
455-
"color": {
456-
"mode": "palette-classic"
457-
},
458-
"custom": {
459-
"axisBorderShow": false,
460-
"axisCenteredZero": false,
461-
"axisColorMode": "text",
462-
"axisLabel": "",
463-
"axisPlacement": "auto",
464-
"barAlignment": 0,
465-
"barWidthFactor": 0.6,
466-
"drawStyle": "line",
467-
"fillOpacity": 0,
468-
"gradientMode": "none",
469-
"hideFrom": {
470-
"legend": false,
471-
"tooltip": false,
472-
"viz": false
473-
},
474-
"insertNulls": false,
475-
"lineInterpolation": "linear",
476-
"lineWidth": 1,
477-
"pointSize": 5,
478-
"scaleDistribution": {
479-
"type": "linear"
480-
},
481-
"showPoints": "auto",
482-
"spanNulls": false,
483-
"stacking": {
484-
"group": "A",
485-
"mode": "none"
486-
},
487-
"thresholdsStyle": {
488-
"mode": "off"
489-
}
490-
},
491-
"mappings": [],
492-
"thresholds": {
493-
"mode": "absolute",
494-
"steps": [
495-
{
496-
"color": "green"
497-
},
498-
{
499-
"color": "red",
500-
"value": 80
501-
}
502-
]
503-
}
504-
},
505-
"overrides": []
506-
},
507-
"gridPos": {
508-
"h": 8,
509-
"w": 12,
510-
"x": 12,
511-
"y": 27
415+
"y": 19
512416
},
513417
"id": 4,
514418
"options": {
@@ -603,8 +507,8 @@
603507
"gridPos": {
604508
"h": 8,
605509
"w": 12,
606-
"x": 0,
607-
"y": 35
510+
"x": 12,
511+
"y": 19
608512
},
609513
"id": 8,
610514
"options": {
@@ -639,7 +543,7 @@
639543
}
640544
],
641545
"preload": false,
642-
"refresh": "auto",
546+
"refresh": "5s",
643547
"schemaVersion": 41,
644548
"tags": [],
645549
"templating": {
@@ -653,5 +557,5 @@
653557
"timezone": "browser",
654558
"title": "KVBM Dashboard",
655559
"uid": "3f679257-70a5-402c-92b4-05382337b548",
656-
"version": 7
657-
}
560+
"version": 14
561+
}

docs/guides/run_kvbm_in_trtllm.md

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -83,13 +83,13 @@ python3 -m dynamo.frontend --http-port 8000 &
8383

8484
# [DYNAMO] To serve an LLM model with dynamo
8585
python3 -m dynamo.trtllm \
86-
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
87-
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
86+
--model-path Qwen/Qwen3-0.6B \
87+
--served-model-name Qwen/Qwen3-0.6B \
8888
--extra-engine-args /tmp/kvbm_llm_api_config.yaml &
8989

9090
# make a call to LLM
9191
curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
92-
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
92+
"model": "Qwen/Qwen3-0.6B",
9393
"messages": [
9494
{
9595
"role": "user",
@@ -104,7 +104,7 @@ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json"
104104

105105
Alternatively, you can use "trtllm-serve" with KVBM by replacing the two [DYNAMO] commands above with the following:
106106
```bash
107-
trtllm-serve deepseek-ai/DeepSeek-R1-Distill-Llama-8B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/kvbm_llm_api_config.yaml
107+
trtllm-serve Qwen/Qwen3-0.6B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/kvbm_llm_api_config.yaml
108108
```
109109

110110
## Enable and View KVBM Metrics
@@ -118,8 +118,8 @@ docker compose -f deploy/docker-compose.yml --profile metrics up -d
118118
# Optionally set DYN_KVBM_METRICS_PORT to choose the /metrics port (default: 6880).
119119
DYN_KVBM_METRICS=true \
120120
python3 -m dynamo.trtllm \
121-
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
122-
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
121+
--model-path Qwen/Qwen3-0.6B \
122+
--served-model-name Qwen/Qwen3-0.6B \
123123
--extra-engine-args /tmp/kvbm_llm_api_config.yaml &
124124

125125
# Optional: only needed if a firewall blocks the KVBM metrics ports from serving Prometheus metrics
@@ -138,7 +138,7 @@ git clone https://github.com/LMCache/LMBenchmark.git
138138
# we are passing model, endpoint, output file prefix and qps to the sh script.
139139
cd LMBenchmark/synthetic-multi-round-qa
140140
./long_input_short_output_run.sh \
141-
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B" \
141+
"Qwen/Qwen3-0.6B" \
142142
"http://localhost:8000" \
143143
"benchmark_kvbm" \
144144
1
@@ -160,5 +160,5 @@ kv_cache_config:
160160
EOF
161161

162162
# run trtllm-serve for the baseline for comparison
163-
trtllm-serve deepseek-ai/DeepSeek-R1-Distill-Llama-8B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/llm_api_config.yaml &
163+
trtllm-serve Qwen/Qwen3-0.6B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/llm_api_config.yaml &
164164
```

0 commit comments

Comments
 (0)