From 661aa52bd10b770b16d10e3e8c20b344f24fb6ff Mon Sep 17 00:00:00 2001
From: Phuong Nguyen <phuonguyen@nvidia.com>
Date: Wed, 8 Apr 2026 21:49:28 +0000
Subject: [PATCH 1/4] fixed mem alloc for AG

Signed-off-by: Phuong Nguyen <phuonguyen@nvidia.com>
---
 .../userbuffers/userbuffers-host.cpp          | 24 +++++++++++++------
 1 file changed, 17 insertions(+), 7 deletions(-)
diff --git a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp
index 6ff9d63a2d..f3225e0ae5 100644
--- a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp
+++ b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp
@@ -662,13 +662,21 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *
     }
 
     NVTE_CHECK(comm->nvsize <= 8, "CUDA IPC supports only up to 8 GPUs in an NVLink domain.");
-    cudaIpcMemHandle_t memhndl;
-    NVTE_CHECK_CUDA(cudaIpcGetMemHandle(&memhndl, *gpubuff));
 
-    cudaIpcMemHandle_t *tmp =
-        reinterpret_cast<cudaIpcMemHandle_t *>(malloc(comm->nvsize * sizeof(cudaIpcMemHandle_t)));
+    // Use cudaMallocHost (pinned host memory) so these buffers are CPU-accessible (plain memcpy)
+    // and GPU DMA-accessible, allowing the allgather callback to pass them directly to NCCL
+    // without additional staging copies.
+    cudaIpcMemHandle_t *memhndl;
+    NVTE_CHECK_CUDA(
+        cudaMallocHost(reinterpret_cast<void **>(&memhndl), sizeof(cudaIpcMemHandle_t)));
+    NVTE_CHECK_CUDA(cudaIpcGetMemHandle(memhndl, *gpubuff));
+
+    cudaIpcMemHandle_t *tmp;
+    NVTE_CHECK_CUDA(cudaMallocHost(reinterpret_cast<void **>(&tmp),
+          comm->nvsize * sizeof(cudaIpcMemHandle_t)));
+
     comm->_allgather(reinterpret_cast<void *>(tmp), comm->nvsize * sizeof(cudaIpcMemHandle_t),
-                     reinterpret_cast<void *>(&memhndl), sizeof(cudaIpcMemHandle_t),
+                     reinterpret_cast<void *>(memhndl), sizeof(cudaIpcMemHandle_t),
                      comm->comm_intra);
 
     // Check for NVLINK support before attempting IPC operations
@@ -689,7 +697,8 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *
         }
       }
       if (!peer_access_available) {
-        free(tmp);
+        cudaFreeHost(tmp);
+        cudaFreeHost(memhndl);
         NVTE_ERROR(
             "No peer-to-peer access available between GPUs. This platform does not support the "
             "GPU-to-GPU "
@@ -712,7 +721,8 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *
         comm->peer_ptr[hndl], comm->nvsize * sizeof(void *), cudaMemcpyHostToDevice));
 
     NVTE_CHECK_CUDA(cudaDeviceSynchronize());
-    free(tmp);
+    cudaFreeHost(tmp);
+    cudaFreeHost(memhndl);
 #if CUDART_VERSION >= 12010
   }
 #endif

From 6753c338d864ad976457f1abf3ad6f70369618f5 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 8 Apr 2026 21:54:27 +0000
Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp
index f3225e0ae5..e2d23ff4cc 100644
--- a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp
+++ b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp
@@ -672,8 +672,8 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *
     NVTE_CHECK_CUDA(cudaIpcGetMemHandle(memhndl, *gpubuff));
 
     cudaIpcMemHandle_t *tmp;
-    NVTE_CHECK_CUDA(cudaMallocHost(reinterpret_cast<void **>(&tmp),
-          comm->nvsize * sizeof(cudaIpcMemHandle_t)));
+    NVTE_CHECK_CUDA(
+        cudaMallocHost(reinterpret_cast<void **>(&tmp), comm->nvsize * sizeof(cudaIpcMemHandle_t)));
 
     comm->_allgather(reinterpret_cast<void *>(tmp), comm->nvsize * sizeof(cudaIpcMemHandle_t),
                      reinterpret_cast<void *>(memhndl), sizeof(cudaIpcMemHandle_t),

From 30a7e537b5c74366890f99addacfe4579424663f Mon Sep 17 00:00:00 2001
From: Phuong Nguyen <phuonguyen@nvidia.com>
Date: Wed, 8 Apr 2026 22:07:14 +0000
Subject: [PATCH 3/4] use raid

Signed-off-by: Phuong Nguyen <phuonguyen@nvidia.com>
---
 .../userbuffers/userbuffers-host.cpp            | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp
index e2d23ff4cc..e934ce83ce 100644
--- a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp
+++ b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp
@@ -665,16 +665,21 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *
 
     // Use cudaMallocHost (pinned host memory) so these buffers are CPU-accessible (plain memcpy)
     // and GPU DMA-accessible, allowing the allgather callback to pass them directly to NCCL
-    // without additional staging copies.
+    // without additional staging copies. RAII guards ensure the pinned pages are released on
+    // every exit path, including exceptions thrown by NVTE_CHECK_CUDA / NVTE_ERROR.
+    struct PinnedDeleter {
+      void operator()(void *p) const { if (p) cudaFreeHost(p); }
+    };
     cudaIpcMemHandle_t *memhndl;
     NVTE_CHECK_CUDA(
         cudaMallocHost(reinterpret_cast<void **>(&memhndl), sizeof(cudaIpcMemHandle_t)));
+    std::unique_ptr<void, PinnedDeleter> memhndl_guard(memhndl);
     NVTE_CHECK_CUDA(cudaIpcGetMemHandle(memhndl, *gpubuff));
 
     cudaIpcMemHandle_t *tmp;
-    NVTE_CHECK_CUDA(
-        cudaMallocHost(reinterpret_cast<void **>(&tmp), comm->nvsize * sizeof(cudaIpcMemHandle_t)));
-
+    NVTE_CHECK_CUDA(cudaMallocHost(reinterpret_cast<void **>(&tmp),
+          comm->nvsize * sizeof(cudaIpcMemHandle_t)));
+    std::unique_ptr<void, PinnedDeleter> tmp_guard(tmp);
     comm->_allgather(reinterpret_cast<void *>(tmp), comm->nvsize * sizeof(cudaIpcMemHandle_t),
                      reinterpret_cast<void *>(memhndl), sizeof(cudaIpcMemHandle_t),
                      comm->comm_intra);
@@ -697,8 +702,6 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *
         }
       }
       if (!peer_access_available) {
-        cudaFreeHost(tmp);
-        cudaFreeHost(memhndl);
         NVTE_ERROR(
             "No peer-to-peer access available between GPUs. This platform does not support the "
             "GPU-to-GPU "
@@ -721,8 +724,6 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *
         comm->peer_ptr[hndl], comm->nvsize * sizeof(void *), cudaMemcpyHostToDevice));
 
     NVTE_CHECK_CUDA(cudaDeviceSynchronize());
-    cudaFreeHost(tmp);
-    cudaFreeHost(memhndl);
 #if CUDART_VERSION >= 12010
   }
 #endif

From b5eb5876a4d7cb5c4d5a7ecada553e75f11334d7 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 8 Apr 2026 22:09:36 +0000
Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../comm_gemm_overlap/userbuffers/userbuffers-host.cpp    | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp
index e934ce83ce..1dcde51d4b 100644
--- a/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp
+++ b/transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp
@@ -668,7 +668,9 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *
     // without additional staging copies. RAII guards ensure the pinned pages are released on
     // every exit path, including exceptions thrown by NVTE_CHECK_CUDA / NVTE_ERROR.
     struct PinnedDeleter {
-      void operator()(void *p) const { if (p) cudaFreeHost(p); }
+      void operator()(void *p) const {
+        if (p) cudaFreeHost(p);
+      }
     };
     cudaIpcMemHandle_t *memhndl;
     NVTE_CHECK_CUDA(
@@ -677,8 +679,8 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *
     NVTE_CHECK_CUDA(cudaIpcGetMemHandle(memhndl, *gpubuff));
 
     cudaIpcMemHandle_t *tmp;
-    NVTE_CHECK_CUDA(cudaMallocHost(reinterpret_cast<void **>(&tmp),
-          comm->nvsize * sizeof(cudaIpcMemHandle_t)));
+    NVTE_CHECK_CUDA(
+        cudaMallocHost(reinterpret_cast<void **>(&tmp), comm->nvsize * sizeof(cudaIpcMemHandle_t)));
     std::unique_ptr<void, PinnedDeleter> tmp_guard(tmp);
     comm->_allgather(reinterpret_cast<void *>(tmp), comm->nvsize * sizeof(cudaIpcMemHandle_t),
                      reinterpret_cast<void *>(memhndl), sizeof(cudaIpcMemHandle_t),