From 661aa52bd10b770b16d10e3e8c20b344f24fb6ff Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Wed, 8 Apr 2026 21:49:28 +0000 Subject: [PATCH 1/4] fixed mem alloc for AG Signed-off-by: Phuong Nguyen --- .../userbuffers/userbuffers-host.cpp | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp index 6ff9d63a2d..f3225e0ae5 100644 --- a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp +++ b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp @@ -662,13 +662,21 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator * } NVTE_CHECK(comm->nvsize <= 8, "CUDA IPC supports only up to 8 GPUs in an NVLink domain."); - cudaIpcMemHandle_t memhndl; - NVTE_CHECK_CUDA(cudaIpcGetMemHandle(&memhndl, *gpubuff)); - cudaIpcMemHandle_t *tmp = - reinterpret_cast(malloc(comm->nvsize * sizeof(cudaIpcMemHandle_t))); + // Use cudaMallocHost (pinned host memory) so these buffers are CPU-accessible (plain memcpy) + // and GPU DMA-accessible, allowing the allgather callback to pass them directly to NCCL + // without additional staging copies. + cudaIpcMemHandle_t *memhndl; + NVTE_CHECK_CUDA( + cudaMallocHost(reinterpret_cast(&memhndl), sizeof(cudaIpcMemHandle_t))); + NVTE_CHECK_CUDA(cudaIpcGetMemHandle(memhndl, *gpubuff)); + + cudaIpcMemHandle_t *tmp; + NVTE_CHECK_CUDA(cudaMallocHost(reinterpret_cast(&tmp), + comm->nvsize * sizeof(cudaIpcMemHandle_t))); + comm->_allgather(reinterpret_cast(tmp), comm->nvsize * sizeof(cudaIpcMemHandle_t), - reinterpret_cast(&memhndl), sizeof(cudaIpcMemHandle_t), + reinterpret_cast(memhndl), sizeof(cudaIpcMemHandle_t), comm->comm_intra); // Check for NVLINK support before attempting IPC operations @@ -689,7 +697,8 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator * } } if (!peer_access_available) { - free(tmp); + cudaFreeHost(tmp); + cudaFreeHost(memhndl); NVTE_ERROR( "No peer-to-peer access available between GPUs. This platform does not support the " "GPU-to-GPU " @@ -712,7 +721,8 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator * comm->peer_ptr[hndl], comm->nvsize * sizeof(void *), cudaMemcpyHostToDevice)); NVTE_CHECK_CUDA(cudaDeviceSynchronize()); - free(tmp); + cudaFreeHost(tmp); + cudaFreeHost(memhndl); #if CUDART_VERSION >= 12010 } #endif From 6753c338d864ad976457f1abf3ad6f70369618f5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Apr 2026 21:54:27 +0000 Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp index f3225e0ae5..e2d23ff4cc 100644 --- a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp +++ b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp @@ -672,8 +672,8 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator * NVTE_CHECK_CUDA(cudaIpcGetMemHandle(memhndl, *gpubuff)); cudaIpcMemHandle_t *tmp; - NVTE_CHECK_CUDA(cudaMallocHost(reinterpret_cast(&tmp), - comm->nvsize * sizeof(cudaIpcMemHandle_t))); + NVTE_CHECK_CUDA( + cudaMallocHost(reinterpret_cast(&tmp), comm->nvsize * sizeof(cudaIpcMemHandle_t))); comm->_allgather(reinterpret_cast(tmp), comm->nvsize * sizeof(cudaIpcMemHandle_t), reinterpret_cast(memhndl), sizeof(cudaIpcMemHandle_t), From 30a7e537b5c74366890f99addacfe4579424663f Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Wed, 8 Apr 2026 22:07:14 +0000 Subject: [PATCH 3/4] use raid Signed-off-by: Phuong Nguyen --- .../userbuffers/userbuffers-host.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp index e2d23ff4cc..e934ce83ce 100644 --- a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp +++ b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp @@ -665,16 +665,21 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator * // Use cudaMallocHost (pinned host memory) so these buffers are CPU-accessible (plain memcpy) // and GPU DMA-accessible, allowing the allgather callback to pass them directly to NCCL - // without additional staging copies. + // without additional staging copies. RAII guards ensure the pinned pages are released on + // every exit path, including exceptions thrown by NVTE_CHECK_CUDA / NVTE_ERROR. + struct PinnedDeleter { + void operator()(void *p) const { if (p) cudaFreeHost(p); } + }; cudaIpcMemHandle_t *memhndl; NVTE_CHECK_CUDA( cudaMallocHost(reinterpret_cast(&memhndl), sizeof(cudaIpcMemHandle_t))); + std::unique_ptr memhndl_guard(memhndl); NVTE_CHECK_CUDA(cudaIpcGetMemHandle(memhndl, *gpubuff)); cudaIpcMemHandle_t *tmp; - NVTE_CHECK_CUDA( - cudaMallocHost(reinterpret_cast(&tmp), comm->nvsize * sizeof(cudaIpcMemHandle_t))); - + NVTE_CHECK_CUDA(cudaMallocHost(reinterpret_cast(&tmp), + comm->nvsize * sizeof(cudaIpcMemHandle_t))); + std::unique_ptr tmp_guard(tmp); comm->_allgather(reinterpret_cast(tmp), comm->nvsize * sizeof(cudaIpcMemHandle_t), reinterpret_cast(memhndl), sizeof(cudaIpcMemHandle_t), comm->comm_intra); @@ -697,8 +702,6 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator * } } if (!peer_access_available) { - cudaFreeHost(tmp); - cudaFreeHost(memhndl); NVTE_ERROR( "No peer-to-peer access available between GPUs. This platform does not support the " "GPU-to-GPU " @@ -721,8 +724,6 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator * comm->peer_ptr[hndl], comm->nvsize * sizeof(void *), cudaMemcpyHostToDevice)); NVTE_CHECK_CUDA(cudaDeviceSynchronize()); - cudaFreeHost(tmp); - cudaFreeHost(memhndl); #if CUDART_VERSION >= 12010 } #endif From b5eb5876a4d7cb5c4d5a7ecada553e75f11334d7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Apr 2026 22:09:36 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../comm_gemm_overlap/userbuffers/userbuffers-host.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp index e934ce83ce..1dcde51d4b 100644 --- a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp +++ b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp @@ -668,7 +668,9 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator * // without additional staging copies. RAII guards ensure the pinned pages are released on // every exit path, including exceptions thrown by NVTE_CHECK_CUDA / NVTE_ERROR. struct PinnedDeleter { - void operator()(void *p) const { if (p) cudaFreeHost(p); } + void operator()(void *p) const { + if (p) cudaFreeHost(p); + } }; cudaIpcMemHandle_t *memhndl; NVTE_CHECK_CUDA( @@ -677,8 +679,8 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator * NVTE_CHECK_CUDA(cudaIpcGetMemHandle(memhndl, *gpubuff)); cudaIpcMemHandle_t *tmp; - NVTE_CHECK_CUDA(cudaMallocHost(reinterpret_cast(&tmp), - comm->nvsize * sizeof(cudaIpcMemHandle_t))); + NVTE_CHECK_CUDA( + cudaMallocHost(reinterpret_cast(&tmp), comm->nvsize * sizeof(cudaIpcMemHandle_t))); std::unique_ptr tmp_guard(tmp); comm->_allgather(reinterpret_cast(tmp), comm->nvsize * sizeof(cudaIpcMemHandle_t), reinterpret_cast(memhndl), sizeof(cudaIpcMemHandle_t),