diff --git a/test/cases/nvidia/manifests/job-hpc-benchmarks.yaml b/test/cases/nvidia/manifests/job-hpc-benchmarks.yaml
index 37a3f93e8..45bd826cb 100644
--- a/test/cases/nvidia/manifests/job-hpc-benchmarks.yaml
+++ b/test/cases/nvidia/manifests/job-hpc-benchmarks.yaml
@@ -53,6 +53,7 @@ spec:
           resources:
             limits:
               nvidia.com/gpu: {{.GpuPerNode}}
+              vpc.amazonaws.com/efa: {{.EfaPerNode}}
           env:
           - name: UCX_TLS
             value: "^sysv"
diff --git a/test/cases/nvidia/unit_test.go b/test/cases/nvidia/unit_test.go
index d703620b1..cc3bbaece 100644
--- a/test/cases/nvidia/unit_test.go
+++ b/test/cases/nvidia/unit_test.go
@@ -36,6 +36,7 @@ type unitTestManifestTplVars struct {
 
 type hpcTestManifestTplVars struct {
 	GpuPerNode int
+	EfaPerNode int
 }
 
 func TestSingleNodeUnitTest(t *testing.T) {
@@ -98,6 +99,7 @@ func TestSingleNodeUnitTest(t *testing.T) {
 			var err error
 			renderedJobHpcBenchmarksSingleNodeManifest, err = fwext.RenderManifests(jobHpcBenchmarksSingleNodeManifest, hpcTestManifestTplVars{
 				GpuPerNode: gpuPerNode,
+				EfaPerNode: efaPerNode,
 			})
 			if err != nil {
 				t.Fatal(err)
@@ -113,7 +115,8 @@ func TestSingleNodeUnitTest(t *testing.T) {
 				ObjectMeta: metav1.ObjectMeta{Name: "hpc-benckmarks-job", Namespace: "default"},
 			}
 			err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job),
-				wait.WithContext(ctx))
+				wait.WithContext(ctx),
+				wait.WithTimeout(20*time.Minute))
 			if err != nil {
 				t.Fatal(err)
 			}
diff --git a/test/images/nvidia-inference/Dockerfile b/test/images/nvidia-inference/Dockerfile
index 267b99f77..d3ded5c49 100644
--- a/test/images/nvidia-inference/Dockerfile
+++ b/test/images/nvidia-inference/Dockerfile
@@ -1,8 +1,8 @@
 ###############################################################################
 # Base image, arguments, and environment
 ###############################################################################
-ARG CUDA_MAJOR_VERSION=12
-ARG CUDA_MINOR_VERSION=8
+ARG CUDA_MAJOR_VERSION=13
+ARG CUDA_MINOR_VERSION=0
 
 FROM nvidia/cuda:$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION.0-devel-ubuntu22.04
 
diff --git a/test/images/nvidia-training/Dockerfile b/test/images/nvidia-training/Dockerfile
index 053a0970c..cdf76a27e 100644
--- a/test/images/nvidia-training/Dockerfile
+++ b/test/images/nvidia-training/Dockerfile
@@ -1,5 +1,5 @@
-ARG CUDA_MAJOR_VERSION=12
-ARG CUDA_MINOR_VERSION=8
+ARG CUDA_MAJOR_VERSION=13
+ARG CUDA_MINOR_VERSION=0
 
 # Use the NVIDIA CUDA runtime as a parent image
 FROM nvidia/cuda:$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION.0-devel-ubuntu22.04
@@ -65,7 +65,7 @@ RUN ln -s /usr/local/bin/pip3 /usr/bin/pip \
     && pip --no-cache-dir install --upgrade pip setuptools
 
 # Install Pytorch from Source
-ARG PYTORCH_BRANCH=v2.6.0
+ARG PYTORCH_BRANCH=v2.9.0
 ARG PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0"
 
 ENV CUDA_HOME=/usr/local/cuda
@@ -73,7 +73,7 @@ ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extra
 ENV PATH=$PATH:$CUDA_HOME/bin
 ENV TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.7;8.9;9.0;10.0;12.0"
 
-RUN pip install typing-extensions sympy pyyaml
+RUN pip install typing-extensions sympy pyyaml cmake
 RUN git clone https://github.com/pytorch/pytorch.git /tmp/pytorch \
     --recursive \
     --branch $PYTORCH_BRANCH \
@@ -111,7 +111,7 @@ RUN curl -sL https://efa-installer.amazonaws.com/aws-efa-installer-$EFA_INSTALLE
     && cd && rm -rf /tmp/aws-efa-installer
 
 # Install NCCL
-ARG LIBNCCL_VERSION=2.27.7-1
+ARG LIBNCCL_VERSION=2.28.3-1
 RUN git clone https://github.com/NVIDIA/nccl.git --branch v$LIBNCCL_VERSION /tmp/nccl \
     && cd /tmp/nccl \
     && make -j $(nproc) \
@@ -119,7 +119,7 @@ RUN git clone https://github.com/NVIDIA/nccl.git --branch v$LIBNCCL_VERSION /tmp
     && cd && rm -rf /tmp/nccl
 
 # Install AWS-OFI-NCCL plugin
-ARG AWS_OFI_NCCL_VERSION=1.16.3
+ARG AWS_OFI_NCCL_VERSION=1.17.1
 RUN curl -sL https://github.com/aws/aws-ofi-nccl/releases/download/v$AWS_OFI_NCCL_VERSION/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION.tar.gz | tar xvz -C /tmp \
     && cd /tmp/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION \
     && ./configure \
diff --git a/test/images/nvidia-training/Dockerfilesimple b/test/images/nvidia-training/Dockerfilesimple
new file mode 100644
index 000000000..ffc23e2c8
--- /dev/null
+++ b/test/images/nvidia-training/Dockerfilesimple
@@ -0,0 +1,84 @@
+FROM nvcr.io/nvidia/pytorch:25.10-py3
+
+# Set environment variables
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install OpenSSH for MPI Operator
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    openssh-server openssh-client && \
+    rm -rf /var/lib/apt/lists/*
+
+# Configure SSH for MPI
+RUN mkdir -p /var/run/sshd && \
+    sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
+    echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
+    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
+
+# Install EFA drivers and libraries for optimal network performance
+# Note: EFA installer will place OpenMPI at /opt/amazon/openmpi
+ARG EFA_INSTALLER_VERSION=latest
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends wget curl libhwloc-dev && \
+    curl -sL https://efa-installer.amazonaws.com/aws-efa-installer-$EFA_INSTALLER_VERSION.tar.gz | tar xvz -C /tmp && \
+    cd /tmp/aws-efa-installer && \
+    ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify && \
+    cd && rm -rf /tmp/aws-efa-installer && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install AWS-OFI-NCCL plugin for EFA optimization
+ARG AWS_OFI_NCCL_VERSION=1.17.1
+RUN curl -sL https://github.com/aws/aws-ofi-nccl/releases/download/v$AWS_OFI_NCCL_VERSION/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION.tar.gz | tar xvz -C /tmp && \
+    cd /tmp/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION && \
+    ./configure \
+        --prefix=/opt/aws-ofi-nccl/install \
+        --with-mpi=/opt/amazon/openmpi \
+        --with-libfabric=/opt/amazon/efa \
+        --with-cuda=/usr/local/cuda \
+        --enable-platform-aws \
+        --disable-tests && \
+    make -j $(nproc) && \
+    make install && \
+    cd && rm -rf /tmp/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION
+
+# Remove HPCX paths from environment to avoid conflicts with EFA OpenMPI
+RUN sed -i '/hpcx/d' /etc/environment || true && \
+    sed -i '/hpcx/d' ~/.bashrc || true
+
+# Update library paths for EFA and AWS-OFI-NCCL
+# Place EFA paths FIRST to override any NVIDIA container defaults
+ENV PATH=/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/lib
+ENV OPAL_PREFIX=/opt/amazon/openmpi
+ENV MPI_ROOT=/opt/amazon/openmpi
+ENV NCCL_PROTO=simple
+
+ENV FI_PROVIDER=efa
+ENV FI_EFA_USE_DEVICE_RDMA=1
+ENV NCCL_DEBUG=INFO
+ENV NCCL_SOCKET_IFNAME=^docker0,lo
+ENV FI_EFA_FORK_SAFE=1
+
+ENV LD_PRELOAD=/opt/aws-ofi-nccl/install/lib/libnccl-net.so
+
+RUN ldconfig
+
+# Set working directory
+WORKDIR /app
+
+# Copy training script and requirements
+COPY train.py /app/
+COPY requirements.txt /app/
+
+# Install additional Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# The base image already includes:
+# - PyTorch with CUDA support
+# - NCCL for multi-GPU communication
+# - OpenMPI for distributed training
+# - EFA support
+# - All necessary CUDA libraries
+
+# Default command (can be overridden)
+# CMD ["python", "train.py"]
diff --git a/test/images/nvidia/Dockerfile b/test/images/nvidia/Dockerfile
index 1e5bfed48..1a7bee23e 100644
--- a/test/images/nvidia/Dockerfile
+++ b/test/images/nvidia/Dockerfile
@@ -1,5 +1,5 @@
-ARG CUDA_MAJOR_VERSION=12
-ARG CUDA_MINOR_VERSION=8
+ARG CUDA_MAJOR_VERSION=13
+ARG CUDA_MINOR_VERSION=0
 
 # Start with the NVIDIA CUDA base image
 FROM nvidia/cuda:$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION.0-devel-ubuntu22.04
@@ -72,7 +72,7 @@ RUN git clone https://github.com/NVIDIA/nvbandwidth.git --branch $NVBANDWIDTH_VE
     && cd && rm -rf /tmp/cuda-samples
 
 # Install NCCL
-ARG LIBNCCL_VERSION=2.27.7-1
+ARG LIBNCCL_VERSION=2.28.3-1
 RUN git clone https://github.com/NVIDIA/nccl.git --branch v$LIBNCCL_VERSION /tmp/nccl \
     && cd /tmp/nccl \
     && make -j $(nproc) \
@@ -80,7 +80,7 @@ RUN git clone https://github.com/NVIDIA/nccl.git --branch v$LIBNCCL_VERSION /tmp
     && cd && rm -rf /tmp/nccl
 
 # Install AWS-OFI-NCCL plugin
-ARG AWS_OFI_NCCL_VERSION=1.16.3
+ARG AWS_OFI_NCCL_VERSION=1.17.1
 RUN curl -sL https://github.com/aws/aws-ofi-nccl/releases/download/v$AWS_OFI_NCCL_VERSION/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION.tar.gz | tar xvz -C /tmp \
     && cd /tmp/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION \
    && ./configure \