ai-dynamo
diff --git a/Cargo.lock
Lines changed: 24 additions & 2 deletions b/Cargo.lock
Lines changed: 24 additions & 2 deletions
diff --git a/Cargo.toml
Lines changed: 1 addition & 0 deletions b/Cargo.toml
Lines changed: 1 addition & 0 deletions
diff --git a/components/src/dynamo/planner/utils/prometheus.py
Lines changed: 7 additions & 7 deletions b/components/src/dynamo/planner/utils/prometheus.py
Lines changed: 7 additions & 7 deletions
diff --git a/container/Dockerfile
Lines changed: 1 addition & 1 deletion b/container/Dockerfile
Lines changed: 1 addition & 1 deletion
diff --git a/container/build.sh
Lines changed: 1 addition & 1 deletion b/container/build.sh
Lines changed: 1 addition & 1 deletion
diff --git a/deploy/metrics/grafana_dashboards/grafana-kvbm-dashboard.json
Lines changed: 14 additions & 110 deletions b/deploy/metrics/grafana_dashboards/grafana-kvbm-dashboard.json
Lines changed: 14 additions & 110 deletions
diff --git a/docs/guides/run_kvbm_in_trtllm.md
Lines changed: 8 additions & 8 deletions b/docs/guides/run_kvbm_in_trtllm.md
Lines changed: 8 additions & 8 deletions
@@ -10,6 +10,7 @@ members = [
     "lib/async-openai",
     "lib/parsers",
     "lib/bindings/c",
+    "lib/bindings/python/codegen",
     "lib/engines/*",
 ]
 # Exclude certain packages that are slow to build and we don't ship as flagship
 
@@ -19,7 +19,7 @@
 from prometheus_api_client import PrometheusConnect
 from pydantic import BaseModel, ValidationError
 
-from dynamo._core import prometheus_names
+from dynamo import prometheus_names
 from dynamo.runtime.logging import configure_dynamo_logging
 
 configure_dynamo_logging()
@@ -94,23 +94,23 @@ def _get_average_metric(
 
     def get_avg_inter_token_latency(self, interval: str, model_name: str):
         return self._get_average_metric(
-            prometheus_names.frontend.inter_token_latency_seconds,
+            prometheus_names.frontend_service.INTER_TOKEN_LATENCY_SECONDS,
             interval,
             "avg inter token latency",
             model_name,
         )
 
     def get_avg_time_to_first_token(self, interval: str, model_name: str):
         return self._get_average_metric(
-            prometheus_names.frontend.time_to_first_token_seconds,
+            prometheus_names.frontend_service.TIME_TO_FIRST_TOKEN_SECONDS,
             interval,
             "avg time to first token",
             model_name,
         )
 
     def get_avg_request_duration(self, interval: str, model_name: str):
         return self._get_average_metric(
-            prometheus_names.frontend.request_duration_seconds,
+            prometheus_names.frontend_service.REQUEST_DURATION_SECONDS,
             interval,
             "avg request duration",
             model_name,
@@ -119,7 +119,7 @@ def get_avg_request_duration(self, interval: str, model_name: str):
     def get_avg_request_count(self, interval: str, model_name: str):
         # This function follows a different query pattern than the other metrics
         try:
-            requests_total_metric = prometheus_names.frontend.requests_total
+            requests_total_metric = prometheus_names.frontend_service.REQUESTS_TOTAL
             raw_res = self.prom.custom_query(
                 query=f"increase({requests_total_metric}[{interval}])"
             )
@@ -138,15 +138,15 @@ def get_avg_request_count(self, interval: str, model_name: str):
 
     def get_avg_input_sequence_tokens(self, interval: str, model_name: str):
         return self._get_average_metric(
-            prometheus_names.frontend.input_sequence_tokens,
+            prometheus_names.frontend_service.INPUT_SEQUENCE_TOKENS,
             interval,
             "avg input sequence tokens",
             model_name,
         )
 
     def get_avg_output_sequence_tokens(self, interval: str, model_name: str):
         return self._get_average_metric(
-            prometheus_names.frontend.output_sequence_tokens,
+            prometheus_names.frontend_service.OUTPUT_SEQUENCE_TOKENS,
             interval,
             "avg output sequence tokens",
             model_name,
 
@@ -39,7 +39,7 @@ ARG SCCACHE_REGION=""
 
 # NIXL configuration
 ARG NIXL_UCX_REF=v1.19.0
-ARG NIXL_REF=0.4.1
+ARG NIXL_REF=0.6.0
 
 # Python configuration
 ARG PYTHON_VERSION=3.12
 
@@ -115,7 +115,7 @@ NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
 SGLANG_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
 SGLANG_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
 
-NIXL_REF=0.4.1
+NIXL_REF=0.6.0
 NIXL_UCX_REF=v1.19.0
 NIXL_UCX_EFA_REF=9d2b88a1f67faf9876f267658bd077b379b8bb76
 
 
@@ -19,7 +19,7 @@
   "editable": true,
   "fiscalYearStartMonth": 0,
   "graphTooltip": 0,
-  "id": 6,
+  "id": 1,
   "links": [],
   "panels": [
     {
@@ -209,7 +209,7 @@
         "x": 0,
         "y": 10
       },
-      "id": 2,
+      "id": 3,
       "options": {
         "legend": {
           "calcs": [],
@@ -228,7 +228,7 @@
         {
           "disableTextWrap": false,
           "editorMode": "code",
-          "expr": "kvbm_offload_requests",
+          "expr": "kvbm_offload_blocks_d2h",
           "fullMetaSearch": false,
           "includeNullMetadata": true,
           "legendFormat": "__auto",
@@ -237,7 +237,7 @@
           "useBackend": false
         }
       ],
-      "title": "Offload Requests",
+      "title": "Offload Blocks - Device to Host",
       "type": "timeseries"
     },
     {
@@ -305,7 +305,7 @@
         "x": 12,
         "y": 10
       },
-      "id": 3,
+      "id": 11,
       "options": {
         "legend": {
           "calcs": [],
@@ -324,7 +324,7 @@
         {
           "disableTextWrap": false,
           "editorMode": "code",
-          "expr": "kvbm_offload_blocks_d2h",
+          "expr": "kvbm_offload_blocks_h2d",
           "fullMetaSearch": false,
           "includeNullMetadata": true,
           "legendFormat": "__auto",
@@ -333,7 +333,7 @@
           "useBackend": false
         }
       ],
-      "title": "Offload Blocks - Device to Host",
+      "title": "Offload Blocks - Host to Disk",
       "type": "timeseries"
     },
     {
@@ -342,7 +342,7 @@
         "h": 1,
         "w": 24,
         "x": 0,
-        "y": 26
+        "y": 18
       },
       "id": 6,
       "panels": [],
@@ -412,103 +412,7 @@
         "h": 8,
         "w": 12,
         "x": 0,
-        "y": 27
-      },
-      "id": 9,
-      "options": {
-        "legend": {
-          "calcs": [],
-          "displayMode": "list",
-          "placement": "bottom",
-          "showLegend": true
-        },
-        "tooltip": {
-          "hideZeros": false,
-          "mode": "single",
-          "sort": "none"
-        }
-      },
-      "pluginVersion": "12.0.1",
-      "targets": [
-        {
-          "disableTextWrap": false,
-          "editorMode": "code",
-          "expr": "kvbm_onboard_requests",
-          "fullMetaSearch": false,
-          "includeNullMetadata": true,
-          "legendFormat": "__auto",
-          "range": true,
-          "refId": "A",
-          "useBackend": false
-        }
-      ],
-      "title": "Onboard Requests",
-      "type": "timeseries"
-    },
-    {
-      "datasource": {
-        "type": "prometheus",
-        "uid": "P1809F7CD0C75ACF3"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisBorderShow": false,
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "barWidthFactor": 0.6,
-            "drawStyle": "line",
-            "fillOpacity": 0,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "insertNulls": false,
-            "lineInterpolation": "linear",
-            "lineWidth": 1,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "auto",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green"
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          }
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 12,
-        "x": 12,
-        "y": 27
+        "y": 19
       },
       "id": 4,
       "options": {
@@ -603,8 +507,8 @@
       "gridPos": {
         "h": 8,
         "w": 12,
-        "x": 0,
-        "y": 35
+        "x": 12,
+        "y": 19
       },
       "id": 8,
       "options": {
@@ -639,7 +543,7 @@
     }
   ],
   "preload": false,
-  "refresh": "auto",
+  "refresh": "5s",
   "schemaVersion": 41,
   "tags": [],
   "templating": {
@@ -653,5 +557,5 @@
   "timezone": "browser",
   "title": "KVBM Dashboard",
   "uid": "3f679257-70a5-402c-92b4-05382337b548",
-  "version": 7
-}
+  "version": 14
+}
@@ -83,13 +83,13 @@ python3 -m dynamo.frontend --http-port 8000 &
 
 # [DYNAMO] To serve an LLM model with dynamo
 python3 -m dynamo.trtllm \
-  --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
-  --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
+  --model-path Qwen/Qwen3-0.6B \
+  --served-model-name Qwen/Qwen3-0.6B \
   --extra-engine-args /tmp/kvbm_llm_api_config.yaml &
 
 # make a call to LLM
 curl localhost:8000/v1/chat/completions   -H "Content-Type: application/json"   -d '{
-    "model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    "model": "Qwen/Qwen3-0.6B",
     "messages": [
     {
         "role": "user",
@@ -104,7 +104,7 @@ curl localhost:8000/v1/chat/completions   -H "Content-Type: application/json"
 
 Alternatively, you can use "trtllm-serve" with KVBM by replacing the two [DYNAMO] commands above with the one below:
 ```bash
-trtllm-serve deepseek-ai/DeepSeek-R1-Distill-Llama-8B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/kvbm_llm_api_config.yaml
+trtllm-serve Qwen/Qwen3-0.6B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/kvbm_llm_api_config.yaml
 ```
 
 ## Enable and View KVBM Metrics
@@ -118,8 +118,8 @@ docker compose -f deploy/docker-compose.yml --profile metrics up -d
 # Optionally set DYN_KVBM_METRICS_PORT to choose the /metrics port (default: 6880).
 DYN_KVBM_METRICS=true \
 python3 -m dynamo.trtllm \
-  --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
-  --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
+  --model-path Qwen/Qwen3-0.6B \
+  --served-model-name Qwen/Qwen3-0.6B \
   --extra-engine-args /tmp/kvbm_llm_api_config.yaml &
 
 # Optional: only needed if a firewall blocks the KVBM metrics ports used to send Prometheus metrics
@@ -138,7 +138,7 @@ git clone https://github.com/LMCache/LMBenchmark.git
 # we are passing model, endpoint, output file prefix and qps to the sh script.
 cd LMBenchmark/synthetic-multi-round-qa
 ./long_input_short_output_run.sh \
-    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" \
+    "Qwen/Qwen3-0.6B" \
     "http://localhost:8000" \
     "benchmark_kvbm" \
     1
@@ -160,5 +160,5 @@ kv_cache_config:
 EOF
 
 # run trtllm-serve for the baseline for comparison
-trtllm-serve deepseek-ai/DeepSeek-R1-Distill-Llama-8B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/llm_api_config.yaml &
+trtllm-serve Qwen/Qwen3-0.6B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/llm_api_config.yaml &
 ```
Original file line number	Diff line number	Diff line change
`@@ -10,6 +10,7 @@ members = [`
`10`	`10`	`"lib/async-openai",`
`11`	`11`	`"lib/parsers",`
`12`	`12`	`"lib/bindings/c",`
	`13`	`+ "lib/bindings/python/codegen",`
`13`	`14`	`"lib/engines/*",`
`14`	`15`	`]`
`15`	`16`	`# Exclude certain packages that are slow to build and we don't ship as flagship`