@@ -10,17 +10,17 @@ ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
1010ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
1111ARG ENABLE_KVBM=false
1212ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
13- ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
14- ARG CUDA_VERSION="12.8"
13+ ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04"
14+ ARG CUDA_VERSION="12.9"
1515
1616# Make sure to update the dependency version in pyproject.toml when updating this
17- ARG VLLM_REF="v0.11.0"
18- # FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds
19- ARG FLASHINF_REF="v0.3.1"
20- ARG TORCH_BACKEND="cu128"
17+ ARG VLLM_REF="v0.12.0"
18+ # FlashInfer Ref used to install flashinfer-cubin and flashinfer-jit-cache
19+ ARG FLASHINF_REF="v0.5.3"
2120
2221# If left blank, then we will fallback to vLLM defaults
2322ARG DEEPGEMM_REF=""
23+ ARG LMCACHE_REF="0.3.10"
2424
2525# sccache configuration - inherit from base build
2626ARG USE_SCCACHE
@@ -109,7 +109,7 @@ ARG VLLM_REF
109109ARG VLLM_GIT_URL
110110ARG DEEPGEMM_REF
111111ARG FLASHINF_REF
112- ARG TORCH_BACKEND
112+ ARG LMCACHE_REF
113113ARG CUDA_VERSION
114114
115115ARG MAX_JOBS=16
@@ -143,7 +143,7 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
143143 export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
144144 cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
145145 chmod +x /tmp/install_vllm.sh && \
146- /tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} --torch-backend $TORCH_BACKEND --cuda-version $CUDA_VERSION && \
146+ /tmp/install_vllm.sh --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} ${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"} --cuda-version $CUDA_VERSION && \
147147 /tmp/use-sccache.sh show-stats "vLLM";
148148
149149ENV LD_LIBRARY_PATH=\
@@ -206,7 +206,7 @@ RUN apt-get update && \
206206 # prometheus dependencies
207207 ca-certificates \
208208 # DeepGemm uses 'cuobjdump' which does not come with CUDA image
209- cuda-command-line-tools-12-8 && \
209+ cuda-command-line-tools-12-9 && \
210210 rm -rf /var/lib/apt/lists/*
211211
212212# Copy CUDA development tools (nvcc, headers, dependencies, etc.) from base devel image
@@ -287,8 +287,14 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi
287287 --requirement /tmp/requirements.txt \
288288 --requirement /tmp/requirements.test.txt
289289
290- # Copy benchmarks, examples, and tests for CI with correct ownership
291- COPY --chown=dynamo: . /workspace/
290+ # Copy tests, benchmarks, deploy and components for CI
291+ COPY --chown=dynamo: benchmarks /workspace/benchmarks
292+ COPY --chown=dynamo: tests /workspace/tests
293+ COPY --chown=dynamo: examples /workspace/examples
294+ COPY --chown=dynamo: deploy /workspace/deploy
295+ COPY --chown=dynamo: recipes/ /workspace/recipes/
296+ COPY --chown=dynamo: components/ /workspace/components/
297+ COPY --chown=dynamo: lib/ /workspace/lib/
292298
293299# Copy attribution files
294300COPY --chown=dynamo: ATTRIBUTION* LICENSE /workspace/
@@ -373,6 +379,7 @@ COPY --from=dynamo_base /usr/local/cargo /usr/local/cargo
373379
374380# Install maturin, for maturin develop
375381# Editable install of dynamo
382+ COPY pyproject.toml README.md hatch_build.py /workspace/
376383RUN uv pip install maturin[patchelf] && \
377384 uv pip install --no-deps -e .
378385
0 commit comments