Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -662,13 +662,28 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *
}

NVTE_CHECK(comm->nvsize <= 8, "CUDA IPC supports only up to 8 GPUs in an NVLink domain.");
cudaIpcMemHandle_t memhndl;
NVTE_CHECK_CUDA(cudaIpcGetMemHandle(&memhndl, *gpubuff));

cudaIpcMemHandle_t *tmp =
reinterpret_cast<cudaIpcMemHandle_t *>(malloc(comm->nvsize * sizeof(cudaIpcMemHandle_t)));
// Use cudaMallocHost (pinned host memory) so these buffers are CPU-accessible (plain memcpy)
// and GPU DMA-accessible, allowing the allgather callback to pass them directly to NCCL
// without additional staging copies. RAII guards ensure the pinned pages are released on
// every exit path, including exceptions thrown by NVTE_CHECK_CUDA / NVTE_ERROR.
struct PinnedDeleter {
void operator()(void *p) const {
if (p) cudaFreeHost(p);
}
};
cudaIpcMemHandle_t *memhndl;
NVTE_CHECK_CUDA(
cudaMallocHost(reinterpret_cast<void **>(&memhndl), sizeof(cudaIpcMemHandle_t)));
std::unique_ptr<void, PinnedDeleter> memhndl_guard(memhndl);
NVTE_CHECK_CUDA(cudaIpcGetMemHandle(memhndl, *gpubuff));

cudaIpcMemHandle_t *tmp;
NVTE_CHECK_CUDA(
cudaMallocHost(reinterpret_cast<void **>(&tmp), comm->nvsize * sizeof(cudaIpcMemHandle_t)));
std::unique_ptr<void, PinnedDeleter> tmp_guard(tmp);
comm->_allgather(reinterpret_cast<void *>(tmp), comm->nvsize * sizeof(cudaIpcMemHandle_t),
reinterpret_cast<void *>(&memhndl), sizeof(cudaIpcMemHandle_t),
reinterpret_cast<void *>(memhndl), sizeof(cudaIpcMemHandle_t),
comm->comm_intra);

// Check for NVLINK support before attempting IPC operations
Expand All @@ -689,7 +704,6 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *
}
}
if (!peer_access_available) {
free(tmp);
NVTE_ERROR(
"No peer-to-peer access available between GPUs. This platform does not support the "
"GPU-to-GPU "
Expand All @@ -712,7 +726,6 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *
comm->peer_ptr[hndl], comm->nvsize * sizeof(void *), cudaMemcpyHostToDevice));

NVTE_CHECK_CUDA(cudaDeviceSynchronize());
free(tmp);
#if CUDART_VERSION >= 12010
}
#endif
Expand Down
Loading