@@ -10,17 +10,17 @@ ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
1010ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
1111ARG ENABLE_KVBM=false
1212ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
13- ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04"
14- ARG CUDA_VERSION="12.9"
13+ ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
14+ ARG CUDA_VERSION="12.8"
1515
1616# Make sure to update the dependency version in pyproject.toml when updating this
17- ARG VLLM_REF="v0.12.0"
18- # FlashInfer Ref used to install flashinfer-cubin and flashinfer-jit-cache
19- ARG FLASHINF_REF="v0.5.3"
17+ ARG VLLM_REF="v0.11.0"
18+ # FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds
19+ ARG FLASHINF_REF="v0.3.1"
20+ ARG TORCH_BACKEND="cu128"
2021
2122# If left blank, then we will fallback to vLLM defaults
2223ARG DEEPGEMM_REF=""
23- ARG LMCACHE_REF="0.3.10"
2424
2525# sccache configuration - inherit from base build
2626ARG USE_SCCACHE
@@ -109,7 +109,7 @@ ARG VLLM_REF
109109ARG VLLM_GIT_URL
110110ARG DEEPGEMM_REF
111111ARG FLASHINF_REF
112- ARG LMCACHE_REF
112+ ARG TORCH_BACKEND
113113ARG CUDA_VERSION
114114
115115ARG MAX_JOBS=16
@@ -143,7 +143,7 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
143143 export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
144144 cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
145145 chmod +x /tmp/install_vllm.sh && \
146- /tmp/install_vllm.sh --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} ${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"} --cuda-version $CUDA_VERSION && \
146+ /tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} --torch-backend $TORCH_BACKEND --cuda-version $CUDA_VERSION && \
147147 /tmp/use-sccache.sh show-stats "vLLM";
148148
149149ENV LD_LIBRARY_PATH=\
@@ -206,7 +206,7 @@ RUN apt-get update && \
206206 # prometheus dependencies
207207 ca-certificates \
208208 # DeepGemm uses 'cuobjdump' which does not come with CUDA image
209- cuda-command-line-tools-12-9 && \
209+ cuda-command-line-tools-12-8 && \
210210 rm -rf /var/lib/apt/lists/*
211211
212212# Copy CUDA development tools (nvcc, headers, dependencies, etc.) from base devel image
@@ -287,14 +287,8 @@ RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requi
287287 --requirement /tmp/requirements.txt \
288288 --requirement /tmp/requirements.test.txt
289289
290- # Copy tests, benchmarks, deploy and components for CI
291- COPY --chown=dynamo: benchmarks /workspace/benchmarks
292- COPY --chown=dynamo: tests /workspace/tests
293- COPY --chown=dynamo: examples /workspace/examples
294- COPY --chown=dynamo: deploy /workspace/deploy
295- COPY --chown=dynamo: recipes/ /workspace/recipes/
296- COPY --chown=dynamo: components/ /workspace/components/
297- COPY --chown=dynamo: lib/ /workspace/lib/
290+ # Copy benchmarks, examples, and tests for CI with correct ownership
291+ COPY --chown=dynamo: . /workspace/
298292
299293# Copy attribution files
300294COPY --chown=dynamo: ATTRIBUTION* LICENSE /workspace/
@@ -379,7 +373,6 @@ COPY --from=dynamo_base /usr/local/cargo /usr/local/cargo
379373
380374# Install maturin, for maturin develop
381375# Editable install of dynamo
382- COPY pyproject.toml README.md hatch_build.py /workspace/
383376RUN uv pip install maturin[patchelf] && \
384377 uv pip install --no-deps -e .
385378
0 commit comments