chore: TRTLLM 1.2.0rc4 (#4836)

dmitry-tokarev-nv · web-flow · commit 525030324e35 · 2025-12-11T03:02:24.000Z
Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>
diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml
@@ -46,7 +46,7 @@ dependencies = [
     "pydantic>=2",
     "tabulate",
     "types-tabulate",
-    # Satisfies vLLM 0.11.0 (>=4.55.2), vLLM 0.11.2 (>=4.56.0,<5), TRT-LLM 1.2.0rc2/rc3 (==4.56.0), SGLang 0.5.6 (==4.57.1)
+    # Satisfies vLLM 0.11.0 (>=4.55.2), vLLM 0.11.2 (>=4.56.0,<5), TRT-LLM 1.2.0rc5 (==4.56.0), SGLang 0.5.6 (==4.57.1)
     "transformers>=4.56.0,<=4.57.1",
     "pytest-mypy",
 ]
diff --git a/components/src/dynamo/trtllm/main.py b/components/src/dynamo/trtllm/main.py
@@ -22,7 +22,6 @@
 import uvloop
 from prometheus_client import REGISTRY
 from tensorrt_llm.llmapi import (
-    BuildConfig,
     CapacitySchedulerPolicy,
     DynamicBatchConfig,
     KvCacheConfig,
@@ -162,13 +161,6 @@ async def init(runtime: DistributedRuntime, config: Config):
     else:
         gpus_per_node = config.gpus_per_node
 
-    build_config = BuildConfig(
-        max_batch_size=config.max_batch_size,
-        max_num_tokens=config.max_num_tokens,
-        max_beam_width=config.max_beam_width,
-        max_seq_len=config.max_seq_len,
-    )
-
     kv_cache_config = KvCacheConfig(
         free_gpu_memory_fraction=config.free_gpu_memory_fraction
     )
@@ -190,7 +182,6 @@ async def init(runtime: DistributedRuntime, config: Config):
         "pipeline_parallel_size": config.pipeline_parallel_size,
         "moe_expert_parallel_size": config.expert_parallel_size,
         "backend": Backend.PYTORCH,
-        "build_config": build_config,
         "kv_cache_config": kv_cache_config,
         "gpus_per_node": gpus_per_node,
         "max_num_tokens": config.max_num_tokens,
diff --git a/container/Dockerfile.trtllm b/container/Dockerfile.trtllm
@@ -314,6 +314,7 @@ COPY components/ /opt/dynamo/components/
 
 # Build dynamo wheels
 ARG ENABLE_KVBM
+ARG USE_SCCACHE
 RUN --mount=type=secret,id=aws-key-id,env=AWS_ACCESS_KEY_ID \
     --mount=type=secret,id=aws-secret-id,env=AWS_SECRET_ACCESS_KEY \
     export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
@@ -453,7 +454,7 @@ RUN if [ "$HAS_TRTLLM_CONTEXT" = "1" ]; then \
         sed -i 's/pip3 install/uv pip install/g' /tmp/install_tensorrt.sh && \
         bash /tmp/install_tensorrt.sh && \
         # Install TensorRT-LLM wheel from the provided index URL, allow dependencies from PyPI
-        # TRTLLM 1.2.0rc2 has issues installing from pypi with uv, installing from direct wheel link works best
+        # TRTLLM 1.2.0rc5 has issues installing from pypi with uv, installing from direct wheel link works best
         # explicitly installing triton 3.5.0 as trtllm only lists triton as dependency on x64_64 for some reason
         if echo "${TENSORRTLLM_PIP_WHEEL}" | grep -q '^tensorrt-llm=='; then \
             TRTLLM_VERSION=$(echo "${TENSORRTLLM_PIP_WHEEL}" | sed -E 's/tensorrt-llm==([0-9a-zA-Z.+-]+).*/\1/'); \
diff --git a/container/build.sh b/container/build.sh
@@ -89,7 +89,7 @@ DEFAULT_TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
 # TensorRT-LLM commit to use for building the trtllm wheel if not provided.
 # Important Note: This commit is not used in our CI pipeline. See the CI
 # variables to learn how to run a pipeline with a specific commit.
-DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="31116825b39f4e6a6a1e127001f5204b73d1dc32" # 1.2.0rc2
+DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="e4c707845ff58fcc0b1d87afb4dd0e64885c780a" # 1.2.0rc5
 TRTLLM_COMMIT=""
 TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
 TRTLLM_GIT_URL=""
@@ -98,7 +98,7 @@ TRTLLM_GIT_URL=""
 DEFAULT_TENSORRTLLM_INDEX_URL="https://pypi.nvidia.com/"
 # TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package.
 # Need to update the Dockerfile.trtllm to use the ai-dynamo[trtllm] package.
-DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.2.0rc3"
+DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.2.0rc5"
 TENSORRTLLM_PIP_WHEEL=""
 
 VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
diff --git a/container/deps/requirements.txt b/container/deps/requirements.txt
@@ -52,7 +52,7 @@ tensorboard==2.19.0
 tensorboardX==2.6.2.2
 # Transformers version constraint for container builds
 # - vLLM 0.11.0: >=4.55.2, vLLM 0.11.2: >=4.56.0,<5
-# - TensorRT-LLM 1.2.0rc2/rc3: ==4.56.0
+# - TensorRT-LLM 1.2.0rc5: ==4.56.0
 # - SGLang 0.5.6: ==4.57.1
 # Using >=4.56.0 and <=4.57.1 to satisfy all frameworks
 transformers>=4.56.0,<=4.57.1
diff --git a/docs/backends/trtllm/multimodal_support.md b/docs/backends/trtllm/multimodal_support.md
@@ -96,15 +96,6 @@ To deploy `Llama-4-Maverick-17B-128E-Instruct` in disaggregated mode, you will n
 
 For high-performance multimodal inference, Dynamo supports pre-computed embeddings with an **Encode-Prefill-Decode (EPD)** flow using **NIXL (RDMA)** for zero-copy tensor transfer.
 
-### Enabling the Feature
-
-This is an experimental feature that requires using a specific TensorRT-LLM commit.
-To enable it build the dynamo container with the `--tensorrtllm-commit` flag:
-
-```bash
-./container/build.sh --framework trtllm --tensorrtllm-git-url https://github.com/NVIDIA/TensorRT-LLM.git --tensorrtllm-commit v1.2.0rc3
-```
-
 ### Supported File Types
 
 - `.pt` - PyTorch tensor files
diff --git a/docs/reference/support-matrix.md b/docs/reference/support-matrix.md
@@ -58,12 +58,12 @@ If you are using a **GPU**, the following GPU models and architectures are suppo
 
 ### Build Dependency
 
-| **Build Dependency** | **Version as of Dynamo v0.7.0**                                                   |
-| :------------------- | :------------------------------------------------------------------------------- |
-| **SGLang**           | 0.5.3.post4                                                                      |
-| **TensorRT-LLM**     | 1.2.0rc2                                                                         |
-| **vLLM**             | 0.11.0                                                                           |
-| **NIXL**             | 0.7.1                                                                            |
+| **Build Dependency** | **Version as of Dynamo v0.7.0** |
+| :------------------- | :------------------------------ |
+| **SGLang**           | 0.5.3.post4                     |
+| **TensorRT-LLM**     | 1.2.0rc5                        |
+| **vLLM**             | 0.11.0                          |
+| **NIXL**             | 0.7.1                           |
 
 
 > [!Important]
diff --git a/pyproject.toml b/pyproject.toml
@@ -50,7 +50,7 @@ Repository = "https://github.com/ai-dynamo/dynamo.git"
 [project.optional-dependencies]
 trtllm =[
     "uvloop",
-    "tensorrt-llm==1.2.0rc3",
+    "tensorrt-llm==1.2.0rc5",
 ]
 
 vllm = [

Original file line number	Diff line number	Diff line change
`@@ -46,7 +46,7 @@ dependencies = [`
`46`	`46`	`"pydantic>=2",`
`47`	`47`	`"tabulate",`
`48`	`48`	`"types-tabulate",`
`49`		`- # Satisfies vLLM 0.11.0 (>=4.55.2), vLLM 0.11.2 (>=4.56.0,<5), TRT-LLM 1.2.0rc2/rc3 (==4.56.0), SGLang 0.5.6 (==4.57.1)`
	`49`	`+ # Satisfies vLLM 0.11.0 (>=4.55.2), vLLM 0.11.2 (>=4.56.0,<5), TRT-LLM 1.2.0rc5 (==4.56.0), SGLang 0.5.6 (==4.57.1)`
`50`	`50`	`"transformers>=4.56.0,<=4.57.1",`
`51`	`51`	`"pytest-mypy",`
`52`	`52`	`]`
Original file line number	Diff line number	Diff line change
`@@ -50,7 +50,7 @@ Repository = "https://github.com/ai-dynamo/dynamo.git"`
`50`	`50`	`[project.optional-dependencies]`
`51`	`51`	`trtllm =[`
`52`	`52`	`"uvloop",`
`53`		`- "tensorrt-llm==1.2.0rc3",`
	`53`	`+ "tensorrt-llm==1.2.0rc5",`
`54`	`54`	`]`
`55`	`55`
`56`	`56`	`vllm = [`