Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ Documentation for TransferBench is available at
- Adding NIC_CQ_POLL_BATCH to control CQ poll batch size for NIC transfers
- New "hbm" preset which sweeps and tests local HBM read performance
- Added a new TB_WALLCLOCK_RATE that will override GPU GFX wallclock rate if it returns 0 (debug)
- Adding nica2a preset (NIC all-to-all over GPUs via NIC executors, multi-node): stride/device grouping and NIC planes.
- `STRIDE` — Step size for stride orbits on rank-major devices (`gcd` with total device count); no traffic between different orbits.
- `GROUP_SIZE` — Devices per subgroup inside each stride orbit (natural rank-major order); must divide orbit size (default: all devices per rank × GPUs).
- `NIC_A2A_SCOPE` — `intra`: transfers only within the same device subgroup; `inter`: only between different subgroups (same stride orbit only).
- `NIC_A2A_NO_SAME_RANK` — When non-zero, omit transfers where source and destination are the same rank.
- `NUM_NIC_PLANES` — Split NIC endpoints into this many disjoint planes (rank-major index modulo planes); traffic only between NICs in the same plane.

### Modified
- DMA-BUF support enablement in CMake changed to ENABLE_DMA_BUF to be more similar to other compile-time options
Expand Down
346 changes: 346 additions & 0 deletions src/client/Presets/NicAllToAll.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,346 @@
/*
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <numeric>
#include <string>
#include <vector>

/**
 * NIC all-to-all preset ("nica2a").
 *
 * Generates one Transfer per accepted (srcRank, srcNic) -> (dstRank, dstNic)
 * pair, where acceptance is filtered by:
 *   - STRIDE orbits over rank-major memory devices (gcd structure),
 *   - GROUP_SIZE subgroups within each orbit (NIC_A2A_SCOPE intra/inter),
 *   - NIC_A2A_NO_SAME_RANK (optionally drop same-rank pairs),
 *   - NUM_NIC_PLANES (traffic only between NICs in the same plane),
 * then runs all transfers and prints a per-rank / per-NIC bandwidth table.
 *
 * @param ev                  Environment variable / configuration state.
 * @param numBytesPerTransfer Bytes moved by each generated Transfer.
 * @param presetName          Preset name (unused; part of the common
 *                            preset-function signature).
 * @return 0 on success (including "no transfers generated"), 1 on error.
 */
int NicAllToAllPreset(EnvVars& ev,
                      size_t const numBytesPerTransfer,
                      std::string const presetName)
{
  // Check for single homogenous group: mixed topologies would make the
  // shared per-rank NIC/device counts below meaningless.
  if (Utils::GetNumRankGroups() > 1) {
    Utils::Print("[ERROR] NIC all-to-all preset can only be run across ranks that are homogenous\n");
    Utils::Print("[ERROR] Run ./TransferBench without any args to display topology information\n");
    Utils::Print("[ERROR] TB_NIC_FILTER may also be used to limit NIC visibility to scale-out NICs\n");
    return 1;
  }

  int numRanks       = TransferBench::GetNumRanks();
  int numNicsPerRank = TransferBench::GetNumExecutors(EXE_NIC);
  if (numNicsPerRank == 0) {
    Utils::Print("[ERROR] No NIC detected. This preset requires NIC executors.\n");
    return 1;
  }

  int useCpuMem = EnvVars::GetEnvVar("USE_CPU_MEM", 0);
  // Device count from topology: GFX executors, or CPU executors when USE_CPU_MEM (same pattern as NicRings).
  int numMemDevices = TransferBench::GetNumExecutors(useCpuMem ? EXE_CPU : EXE_GPU_GFX);
  if (numMemDevices == 0) {
    Utils::Print("[ERROR] No %s executors detected for NIC all-to-all.\n", useCpuMem ? "CPU" : "GPU GFX");
    return 1;
  }

  int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 1);
  int showDetails   = EnvVars::GetEnvVar("SHOW_DETAILS", 0);
  int useRdmaRead   = EnvVars::GetEnvVar("USE_RDMA_READ", 0);
  int memTypeIdx    = EnvVars::GetEnvVar("MEM_TYPE", 0);
  int stride        = EnvVars::GetEnvVar("STRIDE"    , 1);
  int groupSize     = EnvVars::GetEnvVar("GROUP_SIZE", numRanks * numMemDevices);
  int noSameRank    = EnvVars::GetEnvVar("NIC_A2A_NO_SAME_RANK", 1);
  int numNicPlanes  = EnvVars::GetEnvVar("NUM_NIC_PLANES"      , 1);

  if (numQueuePairs < 1) {
    Utils::Print("[ERROR] NUM_QUEUE_PAIRS must be >= 1 (got %d)\n", numQueuePairs);
    return 1;
  }
  if (groupSize < 1) {
    Utils::Print("[ERROR] GROUP_SIZE must be >= 1 (got %d)\n", groupSize);
    return 1;
  }

  // NIC_A2A_SCOPE: "intra" (default) keeps only within-subgroup pairs;
  // "inter" keeps only between-subgroup pairs. Any other non-empty value errors.
  bool scopeInter = false;
  {
    char const* scopeStr = getenv("NIC_A2A_SCOPE");
    if (scopeStr && scopeStr[0]) {
      if (!strcmp(scopeStr, "inter") || !strcmp(scopeStr, "INTER"))
        scopeInter = true;
      else if (strcmp(scopeStr, "intra") && strcmp(scopeStr, "INTRA")) {
        Utils::Print("[ERROR] NIC_A2A_SCOPE must be \"intra\" or \"inter\"\n");
        return 1;
      }
    }
  }

  MemType memType       = Utils::GetMemType(memTypeIdx, useCpuMem);
  std::string memTypeStr = Utils::GetMemTypeStr(memTypeIdx, useCpuMem);

  if (numNicPlanes < 1) {
    Utils::Print("[ERROR] NUM_NIC_PLANES must be >= 1\n");
    return 1;
  }

  // Same divisibility check as PodAllToAll (total devices = ranks × memory devices per rank).
  if (numRanks * numMemDevices % groupSize) {
    Utils::Print("[ERROR] Group size %d cannot evenly divide %d total devices from %d ranks.\n",
                 groupSize, numRanks * numMemDevices, numRanks);
    return 1;
  }

  int const M = numRanks * numMemDevices;  // total memory devices, rank-major

  // Stride orbits on devices (rank-major devLin = rank * numMemDevices + memIdx): same gcd structure as PodAllToAll's StrideGenerate,
  // but NIC A2A does not use the permuted slot order for GROUP_SIZE — subgroups follow natural order within each orbit.
  int kNorm = ((stride % M) + M) % M;  // normalize STRIDE into [0, M), handling negatives
  int dCycles;
  if (kNorm == 0)
    dCycles = 1;  // STRIDE multiple of M: treat as a single orbit covering all devices
  else
    dCycles = std::gcd(kNorm, M);

  int const orbitSize = M / dCycles;

  // Within each stride orbit, partition by natural rank-major device index: orbit lists devLin = r, r+d, r+2d, ...
  // (r = devLin %% dCycles). Subgroup id = (index along that list) / GROUP_SIZE.
  if (orbitSize % groupSize != 0) {
    Utils::Print("[ERROR] GROUP_SIZE (%d) must divide stride-cycle size %d (devices M=%d, orbits=%d).\n",
                 groupSize, orbitSize, M, dCycles);
    Utils::Print("[ERROR] With STRIDE=%d there are %d disjoint cycles; use a GROUP_SIZE that divides each cycle's device count,\n",
                 stride, dCycles);
    Utils::Print("[ERROR] or use STRIDE=1 so the cycle size equals total devices (%d).\n", M);
    return 1;
  }

  // deviceSubgroup[devLin] = subgroup id of that device inside its stride orbit.
  std::vector<int> deviceSubgroup(M);
  for (int devLin = 0; devLin < M; devLin++) {
    int const r = devLin % dCycles;          // which orbit this device belongs to
    int const k = (devLin - r) / dCycles;    // 0 .. orbitSize-1 along natural order in this orbit
    deviceSubgroup[devLin] = k / groupSize;
  }

  // Echo the effective configuration (only on the rank that owns output).
  if (Utils::RankDoesOutput()) {
    ev.DisplayEnvVars();
    if (!ev.hideEnv) {
      if (!ev.outputToCsv) printf("[NIC A2A Related]\n");
      ev.Print("USE_CPU_MEM"   , useCpuMem   , "Using closest %s memory", useCpuMem ? "CPU" : "GPU");
      ev.Print("MEM_TYPE"      , memTypeIdx  , "Using %s memory (%s)", memTypeStr.c_str(), Utils::GetAllMemTypeStr(useCpuMem).c_str());
      ev.Print("STRIDE"        , stride      , "Reordering devices by taking %d steps", stride);
      ev.Print("GROUP_SIZE"    , groupSize   , "Dividing all devices into groups of %d for a2a", groupSize);
      ev.Print("NUM_NIC_PLANES", numNicPlanes, "Number of planes on scale-out");
      if (scopeInter)
        ev.Print("NIC_A2A_SCOPE", "inter",
                 "Between-group transfers only. Other value: intra");
      else
        ev.Print("NIC_A2A_SCOPE", "intra",
                 "Within-group transfers only. Other value: inter");
      ev.Print("NIC_A2A_NO_SAME_RANK", noSameRank, "%s transfers where src rank == dst rank",
               noSameRank ? "Excluding" : "Allowing");
      ev.Print("NUM_QUEUE_PAIRS", numQueuePairs, "Using %d queue pairs for NIC transfers", numQueuePairs);
      ev.Print("SHOW_DETAILS"   , showDetails  , "%s full Test details", showDetails ? "Showing" : "Hiding");
      ev.Print("USE_RDMA_READ"  , useRdmaRead  , "Performing RDMA %s", useRdmaRead ? "reads" : "writes");
      printf("\n");
    }
  }

  // For each rank/NIC, closest memory device (GPU or CPU NUMA) — several NICs may share the same device (same subgroup).
  std::vector<std::vector<int>> nicToMem(numRanks, std::vector<int>(numNicsPerRank, -1));
  for (int rank = 0; rank < numRanks; rank++) {
    for (int nic = 0; nic < numNicsPerRank; nic++) {
      int memIdx = useCpuMem ? TransferBench::GetClosestCpuNumaToNic(nic, rank)
                             : TransferBench::GetClosestGpuToNic(nic, rank);
      if (memIdx < 0) {
        Utils::Print("[ERROR] Failed to identify closest %s for Rank %d NIC %d\n",
                     useCpuMem ? "CPU NUMA node" : "GPU", rank, nic);
        return 1;
      }
      if (memIdx >= numMemDevices) {
        Utils::Print("[ERROR] Closest %s index %d for Rank %d NIC %d is out of range [0,%d)\n",
                     useCpuMem ? "CPU" : "GPU", memIdx, rank, nic, numMemDevices);
        return 1;
      }
      nicToMem[rank][nic] = memIdx;
    }
  }

  // Rank-major linear index of a (rank, memory-device) pair.
  auto devLinOf = [&](int rank, int memIdx) -> int { return rank * numMemDevices + memIdx; };

  // NIC plane: independent of STRIDE over memory devices. Global rank-major order over NIC endpoints, round-robin into P planes.
  auto nicPlaneOf = [&](int rank, int nic) -> int {
    int const L = rank * numNicsPerRank + nic;
    return L % numNicPlanes;
  };

  std::vector<Transfer> transfers;
  std::vector<int> srcRanks;  // source rank of transfers[i], for the bandwidth table
  std::vector<int> srcNics;   // source NIC of transfers[i]
  size_t const maxPairs = (size_t)numNicsPerRank * numNicsPerRank * (size_t)numRanks * (size_t)numRanks;
  srcRanks.reserve(maxPairs);
  srcNics.reserve(maxPairs);

  // True when a (src, dst) NIC endpoint pair should generate a transfer:
  // same NIC plane, same stride orbit, same-rank policy, and subgroup scope.
  auto const acceptPair = [&](int srcRank, int srcNic, int dstRank, int dstNic) -> bool {
    if (nicPlaneOf(srcRank, srcNic) != nicPlaneOf(dstRank, dstNic))
      return false;
    int srcDevLin = devLinOf(srcRank, nicToMem[srcRank][srcNic]);
    int dstDevLin = devLinOf(dstRank, nicToMem[dstRank][dstNic]);
    if ((srcDevLin % dCycles) != (dstDevLin % dCycles))
      return false;  // different stride orbits never exchange traffic
    if (noSameRank && srcRank == dstRank)
      return false;
    if (scopeInter)
      return deviceSubgroup[srcDevLin] != deviceSubgroup[dstDevLin];
    return deviceSubgroup[srcDevLin] == deviceSubgroup[dstDevLin];
  };

  for (int srcRank = 0; srcRank < numRanks; srcRank++) {
    for (int srcNic = 0; srcNic < numNicsPerRank; srcNic++) {
      int srcMem = nicToMem[srcRank][srcNic];
      for (int dstRank = 0; dstRank < numRanks; dstRank++) {
        for (int dstNic = 0; dstNic < numNicsPerRank; dstNic++) {
          if (!acceptPair(srcRank, srcNic, dstRank, dstNic)) continue;

          int dstMem = nicToMem[dstRank][dstNic];

          // For RDMA reads the destination-side NIC is the executor and the
          // source NIC becomes the sub-index; for writes it is the reverse.
          TransferBench::Transfer transfer;
          transfer.srcs.push_back({memType, srcMem, srcRank});
          transfer.dsts.push_back({memType, dstMem, dstRank});
          transfer.exeDevice   = {EXE_NIC, useRdmaRead ? dstNic : srcNic, useRdmaRead ? dstRank : srcRank};
          transfer.exeSubIndex = useRdmaRead ? srcNic : dstNic;
          transfer.numSubExecs = numQueuePairs;
          transfer.numBytes    = numBytesPerTransfer;

          transfers.push_back(transfer);
          srcRanks.push_back(srcRank);
          srcNics.push_back(srcNic);
        }
      }
    }
  }

  Utils::Print("NIC All-To-All benchmark\n");
  Utils::Print("========================\n");
  Utils::Print("%s traffic over NIC executors. %d rank-major devices; STRIDE sets gcd-orbits; GROUP_SIZE chunks each orbit in natural order.\n",
               useCpuMem ? "CPU" : "GPU", M);
  Utils::Print("NICs map to devices via closest %s;\n", useCpuMem ? "CPU NUMA node" : "GPU");
  Utils::Print("NIC planes: %d , traffic only between NICs in the same plane. Stride: %d\n",
               numNicPlanes, stride);
  Utils::Print("Using closest %s per NIC endpoint and %s memory.\n",
               useCpuMem ? "CPU NUMA node" : "GPU", memTypeStr.c_str());
  Utils::Print("Visible NICs per rank: %d\n", numNicsPerRank);
  Utils::Print("%d queue pairs per NIC. %lu bytes per Transfer. All numbers are GB/s\n",
               numQueuePairs, numBytesPerTransfer);
  Utils::Print("Total transfers: %lu\n\n", transfers.size());

  // Restrictive filters (e.g. inter scope with one subgroup) can legitimately
  // produce an empty set; treat that as a no-op success, not an error.
  if (transfers.empty()) {
    Utils::Print("[WARN] No transfers were generated for this preset.\n");
    return 0;
  }

  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
  TransferBench::TestResults results;
  if (!TransferBench::RunTransfers(cfg, transfers, results)) {
    for (auto const& err : results.errResults)
      Utils::Print("%s\n", err.errMsg.c_str());
    return 1;
  } else if (showDetails) {
    Utils::PrintResults(ev, 1, transfers, results);
    Utils::Print("\n");
  }

  if (!Utils::RankDoesOutput()) return 0;

  // Results table: 3 header rows + one row per rank + MAX/AVG/MIN footer rows;
  // columns: rank, hostname, one per NIC, and a per-rank TOTAL.
  int numRows = 6 + numRanks;
  int numCols = 3 + numNicsPerRank;
  Utils::TableHelper table(numRows, numCols);

  table.Set(2, 0, " Rank ");
  table.Set(2, 1, " Name ");
  table.Set(1, numCols - 1, " TOTAL ");
  table.Set(2, numCols - 1, " (GB/s) ");
  table.SetColAlignment(1, Utils::TableHelper::ALIGN_LEFT);
  for (int rank = 0; rank < numRanks; rank++) {
    table.Set(3 + rank, 0, " %d ", rank);
    table.Set(3 + rank, 1, " %s ", TransferBench::GetHostname(rank).c_str());
  }
  table.Set(numRows - 3, 1, " MAX (GB/s) ");
  table.Set(numRows - 2, 1, " AVG (GB/s) ");
  table.Set(numRows - 1, 1, " MIN (GB/s) ");
  for (int row = numRows - 3; row < numRows; row++)
    table.SetCellAlignment(row, 1, Utils::TableHelper::ALIGN_RIGHT);
  table.DrawRowBorder(3);
  table.DrawRowBorder(numRows - 3);

  // Accumulate bandwidth per (source rank, NIC column).
  // NOTE(review): for RDMA reads the NIC index is taken from exeDstDevice —
  // presumably this recovers the source-side NIC; confirm against the
  // TransferBench result-reporting convention.
  std::vector<std::vector<double>> bwByRankNic(numRanks, std::vector<double>(numNicsPerRank, 0.0));
  for (size_t i = 0; i < results.tfrResults.size(); i++) {
    int nicIdx = 0;
    if (useRdmaRead) {
      nicIdx = results.tfrResults[i].exeDstDevice.exeIndex;
    } else {
      nicIdx = results.tfrResults[i].exeDevice.exeIndex;
    }
    bwByRankNic[srcRanks[i]][nicIdx] += results.tfrResults[i].avgBandwidthGbPerSec;
  }

  std::vector<double> rankTotal(numRanks, 0.0);
  int colIdx = 2;
  table.DrawColBorder(colIdx);
  for (int nic = 0; nic < numNicsPerRank; nic++) {
    table.Set(0, colIdx, " NIC %02d ", nic);
    // Homogeneous-group check above lets rank 0's NIC->device mapping label all ranks.
    if (useCpuMem) {
      table.Set(1, colIdx, " CPU %02d ", nicToMem[0][nic]);
    } else {
      table.Set(1, colIdx, " GPU %02d ", nicToMem[0][nic]);
    }
    table.Set(2, colIdx, " %s ", TransferBench::GetExecutorName({EXE_NIC, nic}).c_str());

    double nicMin = std::numeric_limits<double>::max();
    double nicAvg = 0.0;
    // lowest(), not min(): for floating-point, min() is the smallest positive
    // value and would be a wrong initializer for a running maximum.
    double nicMax = std::numeric_limits<double>::lowest();
    for (int rank = 0; rank < numRanks; rank++) {
      double bw = bwByRankNic[rank][nic];
      table.Set(3 + rank, colIdx, " %.2f ", bw);
      nicMin = std::min(nicMin, bw);
      nicAvg += bw;
      nicMax = std::max(nicMax, bw);
      rankTotal[rank] += bw;
    }

    table.Set(numRows - 3, colIdx, " %.2f ", nicMax);
    table.Set(numRows - 2, colIdx, " %.2f ", nicAvg / numRanks);
    table.Set(numRows - 1, colIdx, " %.2f ", nicMin);
    colIdx++;
  }
  table.DrawColBorder(colIdx);

  double rankMin = std::numeric_limits<double>::max();
  double rankAvg = 0.0;
  double rankMax = std::numeric_limits<double>::lowest();  // see nicMax note above
  for (int rank = 0; rank < numRanks; rank++) {
    table.Set(3 + rank, numCols - 1, " %.2f ", rankTotal[rank]);
    rankMin = std::min(rankMin, rankTotal[rank]);
    rankAvg += rankTotal[rank];
    rankMax = std::max(rankMax, rankTotal[rank]);
  }
  table.Set(numRows - 3, numCols - 1, " %.2f ", rankMax);
  table.Set(numRows - 2, numCols - 1, " %.2f ", rankAvg / numRanks);
  table.Set(numRows - 1, numCols - 1, " %.2f ", rankMin);

  table.PrintTable(ev.outputToCsv, ev.showBorders);
  Utils::Print("\n");
  Utils::Print("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
  Utils::PrintErrors(results.errResults);

  if (Utils::HasDuplicateHostname()) {
    printf("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n");
  }

  return 0;
}
2 changes: 2 additions & 0 deletions src/client/Presets/Presets.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ THE SOFTWARE.
#include "AllToAllSweep.hpp"
#include "HbmBandwidth.hpp"
#include "HealthCheck.hpp"
#include "NicAllToAll.hpp"
#include "NicRings.hpp"
#include "NicPeerToPeer.hpp"
#include "OneToAll.hpp"
Expand All @@ -51,6 +52,7 @@ std::map<std::string, std::pair<PresetFunc, std::string>> presetFuncMap =
{
{"a2a", {AllToAllPreset, "Tests parallel transfers between all pairs of GPU devices"}},
{"a2a_n", {AllToAllRdmaPreset, "Tests parallel transfers between all pairs of GPU devices using Nearest NIC RDMA transfers"}},
{"nica2a", {NicAllToAllPreset, "All-to-all GPU traffic over NIC transfers using each NIC's closest GPU endpoint"}},
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: spacing

Suggested change
{"nica2a", {NicAllToAllPreset, "All-to-all GPU traffic over NIC transfers using each NIC's closest GPU endpoint"}},
{"nica2a", {NicAllToAllPreset, "All-to-all GPU traffic over NIC transfers using each NIC's closest GPU endpoint"}},

{"a2asweep", {AllToAllSweepPreset, "Test GFX-based all-to-all transfers swept across different CU and GFX unroll counts"}},
{"hbm", {HbmBandwidthPreset, "Tests HBM bandwidth"}},
{"healthcheck", {HealthCheckPreset, "Simple bandwidth health check (MI300X series only)"}},
Expand Down