Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/build-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@ jobs:
# platforms: linux/amd64
# runner: [self-hosted, linux/amd64, rocm]
# build_args: "NUM_MAKE_JOBS=16"
# ROCm 6.3 image: published as superbench/main:rocm6.3, built for linux/amd64 on a
# self-hosted runner carrying the rocm label, with NUM_MAKE_JOBS=16 passed as a build argument.
# NOTE(review): "rocm6.3.x" presumably resolves to dockerfile/rocm6.3.x.dockerfile in the build step - confirm.
- name: rocm6.3
dockerfile: rocm6.3.x
tags: superbench/main:rocm6.3
platforms: linux/amd64
runner: [self-hosted, linux/amd64, rocm]
build_args: "NUM_MAKE_JOBS=16"
steps:
- name: Checkout
uses: actions/checkout@v2
Expand Down
145 changes: 145 additions & 0 deletions dockerfile/rocm6.3.x.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
# The base image is configurable through the BASE_IMAGE build argument.
# The default below is the image whose pre-installed components are listed in this file.
ARG BASE_IMAGE=rocm/pytorch-training:v25.6

# Everything after this line is layered on top of the selected base image.
FROM ${BASE_IMAGE}

# Base image: rocm/pytorch-training:v25.6
# Pre-installed by base image:
# - Ubuntu: 22.04
# - Python: 3.10
# - ROCm: 6.3.4
# - openmpi: 4.0.7rc2
# - torch: 2.8.0a0+git7d205b2
# - rccl: 2.21.5.60304-76
# - hipblaslt: 0.15.0-8c69191d
# - transformer_engine: 1.14.0+2f85f5f2
# - flash_attention: 3.0.0.post1
# - cmake: 3.18.5
# - rocm-cmake: 0.14.0.60304-76
# - amd-smi: 25.1.0+8dc45db
# Added by this Dockerfile:
# - Docker Client: 27.5.1
# - mlc: v3.12
# - OFED: 24.10-1.1.4.0 LTS

# Fix base image botocore/urllib3 incompatibility:
# Base image ships botocore 1.22.12 (expects urllib3 1.x) with urllib3 2.6.3,
# causing "cannot import name 'DEFAULT_CIPHERS' from 'urllib3.util.ssl_'".
# Install fixed botocore/boto3 versions that are compatible with urllib3 2.x.
# The versions are pinned so that rebuilding the image later yields the same
# packages instead of whatever is newest at build time; --no-cache-dir keeps
# pip's download cache out of the image layer.
RUN python3 -m pip install --no-cache-dir "botocore==1.35.98" "boto3==1.35.98"
Comment on lines +27 to +28
Comment on lines +27 to +28
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This upgrades botocore/boto3 to whatever the latest versions are at build time, which makes the image non-reproducible and can break future builds unexpectedly. Consider pinning to known-good versions that are compatible with urllib3 2.x (and optionally using --no-cache-dir).

Suggested change
# Upgrading botocore/boto3 to versions compatible with urllib3 2.x.
RUN python3 -m pip install --upgrade botocore boto3
# Upgrade botocore/boto3 to specific versions compatible with urllib3 2.x.
RUN python3 -m pip install --no-cache-dir "botocore==1.35.98" "boto3==1.35.98"

Copilot uses AI. Check for mistakes.

LABEL maintainer="SuperBench"

# Run apt in non-interactive mode for the rest of the build.
ENV DEBIAN_FRONTEND=noninteractive

# Refresh the package index, install the system packages below without their
# recommended extras, then clear /tmp.
RUN apt-get update \
    && apt-get install -q -y --no-install-recommends \
        autoconf \
        automake \
        bc \
        build-essential \
        curl \
        dmidecode \
        git \
        hipify-clang \
        iproute2 \
        jq \
        libaio-dev \
        libboost-program-options-dev \
        libcap2 \
        libcurl4-openssl-dev \
        libnuma-dev \
        libpci-dev \
        libssl-dev \
        libtinfo5 \
        libtool \
        lshw \
        net-tools \
        numactl \
        openssh-client \
        openssh-server \
        pciutils \
        python3-mpi4py \
        rsync \
        sudo \
        util-linux \
        vim \
        wget \
    && rm -rf /tmp/*

# Build argument for the number of parallel make jobs (default 64).
# NOTE(review): it is not referenced by name anywhere in this Dockerfile. Build
# arguments are visible to RUN steps as environment variables, so presumably the
# make invocations further down read it from the environment - confirm against
# third_party/Makefile before removing it.
ARG NUM_MAKE_JOBS=64

Comment on lines +69 to +70
# Install Docker
ENV DOCKER_VERSION=27.5.1
RUN cd /tmp && \
Comment on lines +69 to +73
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NUM_MAKE_JOBS is declared as a build arg here but isn’t referenced anywhere in this Dockerfile, so the workflow’s build_args: NUM_MAKE_JOBS=... has no effect. Either remove the ARG or wire it into the build steps (e.g., pass it to make -j / cmake builds) to make parallelism configurable.

Suggested change
ARG NUM_MAKE_JOBS=64
# Install Docker
ENV DOCKER_VERSION=27.5.1
RUN cd /tmp && \
# Install Docker
ENV DOCKER_VERSION=27.5.1
RUN cd /tmp && \
RUN cd /tmp && \

Copilot uses AI. Check for mistakes.
wget -q https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \
tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \
rm docker.tgz

# Update system config:
#   - create root's SSH directory with an empty authorized_keys file and the
#     /var/run/sshd directory,
#   - rewrite sshd_config to permit root login, permit user environments and
#     use port 22,
#   - append soft and hard open-file limits of 1048576 for all users ("*") and
#     for root to limits.conf.
# printf writes the limit entries because whether echo turns "\n" into a line
# break depends on which shell backs /bin/sh; printf '%s\n' always emits one
# entry per line.
RUN mkdir -p /root/.ssh && \
    touch /root/.ssh/authorized_keys && \
    mkdir -p /var/run/sshd && \
    sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
    sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \
    sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \
    printf '%s\n' \
        '* soft nofile 1048576' \
        '* hard nofile 1048576' \
        'root soft nofile 1048576' \
        'root hard nofile 1048576' >> /etc/security/limits.conf
Comment on lines +82 to +86


# Print the Ubuntu release reported by lsb_release into the build log.
# This is informational only: the output is not captured or stored anywhere.
RUN echo "Ubuntu version: $(lsb_release -r -s)"
# Ubuntu release used in the OFED package file name below. It is a build argument
# with a fixed default, not a value derived from the command above, so it must be
# overridden when the base image moves to a different release.
ARG UBUNTU_VERSION=22.04

# Install OFED
ENV OFED_VERSION=24.10-1.1.4.0
# Check if ofed_info is present and has a version
RUN if ! command -v ofed_info >/dev/null 2>&1; then \
echo "OFED not found. Installing OFED..."; \
Comment on lines +89 to +97
cd /tmp && \
# Download the OFED installer over HTTPS so the archive cannot be modified in transit before it is executed.
wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64.tgz && \
Comment on lines +94 to +99
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64.tgz && \
PATH=/usr/bin:${PATH} MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu${UBUNTU_VERSION}-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \
rm -rf MLNX_OFED_LINUX-${OFED_VERSION}* ; \
fi
Comment on lines +99 to +103
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The OFED tarball is downloaded over plain HTTP, which is vulnerable to MITM tampering during the image build. Prefer HTTPS if available and consider verifying the downloaded artifact (checksum/signature) before executing the installer.

Copilot uses AI. Check for mistakes.

# ROCm and MPI settings shared by the steps that follow:
#   ROCM_PATH      - ROCm location, /opt/rocm
#   AMDGPU_TARGETS - target GPU architectures for ROCm builds (space-separated)
#   MPI_HOME       - the OpenMPI pre-installed by the base image at /opt/ompi
ENV ROCM_PATH=/opt/rocm \
    AMDGPU_TARGETS="gfx908 gfx90a gfx942" \
    MPI_HOME=/opt/ompi

# Install Intel MLC: download the v3.12 archive into /tmp, unpack only the
# Linux/mlc binary, copy it into /usr/local/bin and delete the leftovers.
RUN cd /tmp \
    && wget -q -O mlc.tgz https://downloadmirror.intel.com/866182/mlc_v3.12.tgz \
    && tar -xzf mlc.tgz Linux/mlc \
    && cp Linux/mlc /usr/local/bin/ \
    && rm -rf Linux mlc.tgz

# Runtime environment for the image:
#   PATH            - OpenMPI, SuperBench, /usr/local and ROCm binary directories placed ahead of the inherited PATH
#   LD_LIBRARY_PATH - OpenMPI, system, /usr/local and ROCm library directories placed ahead of the inherited value
#   SB_HOME / SB_MICRO_PATH - both set to /opt/superbench
#   ANSIBLE_DEPRECATION_WARNINGS / ANSIBLE_COLLECTIONS_PATH - Ansible settings (warnings set to FALSE, collections path fixed)
ENV PATH="/opt/ompi/bin:/opt/superbench/bin:/usr/local/bin/:/opt/rocm/hip/bin/:/opt/rocm/bin/:${PATH}" \
LD_LIBRARY_PATH="/opt/ompi/lib:/usr/lib/x86_64-linux-gnu/:/usr/local/lib/:/opt/rocm/lib:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \
ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections

# Overwrite /etc/environment with the current PATH, LD_LIBRARY_PATH and
# SB_MICRO_PATH values, one NAME=value entry per line.
# NOTE(review): presumably so that sessions which do not inherit the image's
# ENV (e.g. SSH logins) still see these values - confirm.
RUN { \
        echo PATH="$PATH"; \
        echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH"; \
        echo SB_MICRO_PATH="$SB_MICRO_PATH"; \
    } > /etc/environment

# Upgrade pip and wheel and install setuptools 65.7, then make sure
# pkg_resources can be imported, installing setuptools again if it cannot.
# The import check and its fallback are grouped in braces so the fallback only
# covers a failed import. Without the grouping, "A && B || C" also runs the
# fallback when the first pip command fails, and the build step would then
# succeed even though the upgrade did not happen.
RUN python3 -m pip install --upgrade pip wheel setuptools==65.7 && \
    { python3 -c "import pkg_resources" || python3 -m pip install setuptools; }
Comment on lines +131 to +132

Comment on lines +131 to +133
# Run the remaining steps from the SuperBench home directory (SB_HOME).
WORKDIR ${SB_HOME}

# Copy only third_party at this point; the rest of the sources are added after the build below.
ADD third_party third_party

# Build the "rocm" target of third_party/Makefile with the RCCL location and the
# rocBLAS / hipBLASLt branches for ROCm 6.3. Each "-o <target>" tells make to
# treat that target as old and not remake it, so cpu_hpl, cpu_stream,
# megatron_lm and rocm_megatron_lm are left out of this build.
# NOTE(review): ROCM_VER=rocm-5.5.0 is passed although the base image is ROCm 6.3.4;
# third_party/Makefile uses ROCM_VER as the HIP branch to clone for rocm_bandwidthTest -
# confirm the older tag is intentional.
RUN make RCCL_HOME=/opt/rocm ROCBLAS_BRANCH=release-staging/rocm-rel-6.3 HIPBLASLT_BRANCH=release-staging/rocm-rel-6.3 ROCM_VER=rocm-5.5.0 -C third_party rocm -o cpu_hpl -o cpu_stream -o megatron_lm -o rocm_megatron_lm

# Add the rest of the build context, then install the Python package with the
# amdworker extra, build the C++ sources with hipcc as the compiler and run the
# postinstall target.
ADD . .
ENV USE_HIP_DATATYPE=1
ENV USE_HIPBLAS_COMPUTETYPE=1
RUN python3 -m pip install --no-build-isolation .[amdworker] && \
CXX=/opt/rocm/bin/hipcc make cppbuild && \
make postinstall
Comment on lines +140 to +145
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After ADD . ., this Dockerfile doesn’t remove the .git directory. In this repo the build context includes .git (see existing CUDA/ROCm Dockerfiles which rm -rf .git at the end), so leaving it inflates the final image size and can slow pushes/pulls. Consider deleting .git in the final layer after postinstall.

Suggested change
make postinstall
make postinstall && \
rm -rf .git

Copilot uses AI. Check for mistakes.
9 changes: 9 additions & 0 deletions third_party/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,17 @@ ifneq (,$(wildcard fio/Makefile))
endif

# Build rccl-tests from commit 46375b1 of default branch.
# If AMDGPU_TARGETS is set to a non-blank value (space-separated, e.g. "gfx908 gfx90a gfx942"),
# explicit --offload-arch flags and include paths are used.
# Otherwise, the original build command is used (relies on hipcc auto-detection).
# The condition expands and strips the value instead of using ifdef, because ifdef
# looks at the unexpanded text: a whitespace-only value, or one that merely
# references an empty variable, would otherwise select the HIPCUFLAGS branch
# with no --offload-arch flags at all.
ROCM_OFFLOAD_ARCH_FLAGS := $(foreach arch,$(AMDGPU_TARGETS),--offload-arch=$(arch))
rocm_rccl_tests: sb_micro_path
ifneq (, $(wildcard rccl-tests/Makefile))
ifneq (,$(strip $(AMDGPU_TARGETS)))
	cd ./rccl-tests && make MPI=1 MPI_HOME=$(MPI_HOME) HIPCUFLAGS="-std=c++14 -O3 $(ROCM_OFFLOAD_ARCH_FLAGS) -I$(ROCM_PATH)/include -I$(ROCM_PATH)/include/rccl -I$(ROCM_PATH)/include/hip -DMPI_SUPPORT -I$(MPI_HOME)/include -I$(MPI_HOME)/include/mpi" -j
else
	cd ./rccl-tests && make MPI=1 MPI_HOME=$(MPI_HOME) -j
endif
Comment on lines +143 to +147
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ifdef AMDGPU_TARGETS triggers when the variable is defined even if it’s empty, which can unexpectedly switch to the HIPCUFLAGS path and potentially break the build. Consider switching the condition to a non-empty check (e.g., using ifneq ($(strip $(AMDGPU_TARGETS)),)), so the fallback path is used when it’s unset or blank.

Copilot uses AI. Check for mistakes.
cp -v -r ./rccl-tests/build/* $(SB_MICRO_PATH)/bin/
endif

Expand Down Expand Up @@ -168,6 +176,7 @@ rocm_hipblaslt: sb_micro_path
# Build hipBusBandwidth.
# HIP is released with rocm, like rocm-4.2.0 and so on.
# The version we use is the released tag which is consistent with the rocm version in the environment or docker.

rocm_bandwidthTest: sb_micro_path
git clone -b ${ROCM_VER} https://github.com/ROCm-Developer-Tools/HIP.git
cd ./HIP/samples/1_Utils/hipBusBandwidth/ && mkdir -p build && cd build && cmake .. && make
Expand Down
Loading