From 5bcd28dbefd33688a24c0c8ce8ebed263f00ff5c Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Sun, 15 Mar 2026 15:07:36 -0400
Subject: [PATCH 01/13] coll/base: add allocator param to internal collective
 algorithms
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Thread an mca_allocator_base_module_t *allocator parameter through all
internal coll/base algorithm functions that allocate Pattern A scratch
buffers (sized via opal_datatype_span, gap-adjusted, passed to PML or
ompi_op_reduce). COLL_BASE_ALLOC/COLL_BASE_FREE macros dispatch to the
allocator when non-NULL, or fall back to malloc/free when NULL.

Existing callers in coll/basic and coll/acoll pass NULL, as does coll/tuned
everywhere except the gather path described below, so current host-malloc
behavior is preserved for them with no functional change.

opal/mca/accelerator/base: add opal_accelerator_base_get_device_allocator()
helper that returns a cached, per-device bucket allocator backed by
opal_accelerator.mem_alloc/mem_release. Created lazily on first use.

coll/tuned decision functions: for gather (scratch is data-movement only,
no ompi_op_reduce), detect device buffers via opal_accelerator.check_addr
and pass the device allocator to gather_intra_do_this. For all reduction
operations (allreduce, reduce, reduce_scatter, reduce_scatter_block, scan,
exscan) always pass NULL — device-side reduction via ompi_op_reduce is
not yet supported.

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 ompi/mca/coll/acoll/coll_acoll_allreduce.c    | 30 +++----
 ompi/mca/coll/acoll/coll_acoll_reduce.c       | 14 ++--
 ompi/mca/coll/acoll/coll_acoll_utils.h        | 19 ++++-
 ompi/mca/coll/base/coll_base_allreduce.c      | 66 ++++++++--------
 ompi/mca/coll/base/coll_base_exscan.c         | 12 ++-
 ompi/mca/coll/base/coll_base_functions.h      | 60 +++++++-------
 ompi/mca/coll/base/coll_base_gather.c         | 14 ++--
 ompi/mca/coll/base/coll_base_reduce.c         | 78 +++++++++----------
 ompi/mca/coll/base/coll_base_reduce_scatter.c | 53 ++++++-------
 .../base/coll_base_reduce_scatter_block.c     | 65 +++++++---------
 ompi/mca/coll/base/coll_base_scan.c           | 12 ++-
 .../basic/coll_basic_reduce_scatter_block.c   |  2 +-
 ompi/mca/coll/tuned/coll_tuned.h              | 14 ++--
 .../tuned/coll_tuned_allreduce_decision.c     | 13 ++--
 .../coll/tuned/coll_tuned_decision_dynamic.c  | 68 ++++++++++++----
 .../coll/tuned/coll_tuned_decision_fixed.c    | 32 ++++++--
 .../coll/tuned/coll_tuned_exscan_decision.c   |  4 +-
 .../coll/tuned/coll_tuned_gather_decision.c   |  5 +-
 .../coll/tuned/coll_tuned_reduce_decision.c   | 23 ++++--
 ...coll_tuned_reduce_scatter_block_decision.c | 14 ++--
 .../coll_tuned_reduce_scatter_decision.c      | 15 ++--
 .../mca/coll/tuned/coll_tuned_scan_decision.c |  4 +-
 .../accelerator/base/accelerator_base_frame.c | 75 ++++++++++++++++++
 opal/mca/accelerator/base/base.h              |  9 +++
 24 files changed, 439 insertions(+), 262 deletions(-)

diff --git a/ompi/mca/coll/acoll/coll_acoll_allreduce.c b/ompi/mca/coll/acoll/coll_acoll_allreduce.c
index 3b40fef39f9..6da452ec719 100644
--- a/ompi/mca/coll/acoll/coll_acoll_allreduce.c
+++ b/ompi/mca/coll/acoll/coll_acoll_allreduce.c
@@ -483,7 +483,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
     /* Falling back to recursivedoubling for non-commutative operators to be safe */
     if (!ompi_op_is_commute(op)) {
         return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, comm,
-                                                                module);
+                                                                module, NULL);
     }
 
     /* Obtain the subcomms structure */
@@ -497,7 +497,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
     /* Fallback to knomial if subc is not obtained */
     if (NULL == subc) {
         return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, comm,
-                                                                module);
+                                                                module, NULL);
     }
     if (!subc->initialized) {
         err = mca_coll_acoll_comm_split_init(comm, acoll_module, subc, 0);
@@ -513,7 +513,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
     if (num_nodes > 1) {
         if (total_dsize > 16384) {
             return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op,
-                                                                    comm, module);
+                                                                    comm, module, NULL);
         }
         int use_socket = acoll_module->use_socket != -1 ? acoll_module->use_socket : 0;
         coll_acoll_subcomms_t *soc_subc = NULL;
@@ -525,7 +525,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
         /* Validate communicator hierarchy before proceeding */
         if (NULL == soc_comm || NULL == ldr_comm) {
             return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op,
-                                                                    comm, module);
+                                                                    comm, module, NULL);
         }
 
         err = check_and_create_subc(soc_comm, acoll_module, &soc_subc);
@@ -573,10 +573,10 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
             if (ompi_comm_size(ldr_comm) > 1 && -1 != ldr_root) {
                 if ((MPI_IN_PLACE == sbuf)) {
                     err = ompi_coll_base_allreduce_intra_recursivedoubling(MPI_IN_PLACE, rbuf, count, dtype, op,
-                                                                           ldr_comm, module);
+                                                                           ldr_comm, module, NULL);
                 } else {
                     err = ompi_coll_base_allreduce_intra_recursivedoubling(tmp_sbuf, rbuf, count, dtype, op,
-                                                                           ldr_comm, module);
+                                                                           ldr_comm, module, NULL);
                 }
                 if (MPI_SUCCESS != err) {
                     if (NULL != inplacebuf_free) {
@@ -607,23 +607,23 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
     if (1 == num_nodes) {
         if (total_dsize < 32) {
             return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op,
-                                                                    comm, module);
+                                                                    comm, module, NULL);
         } else if ((total_dsize < 512) && is_opt) {
             return mca_coll_acoll_allreduce_small_msgs_h(sbuf, rbuf, count, dtype, op, comm, module,
                                                          subc, 1);
         } else if (total_dsize <= 2048) {
             return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op,
-                                                                    comm, module);
+                                                                    comm, module, NULL);
         } else if (total_dsize < 65536) {
             if (1 == alg) {
                 return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype,
-                                                                        op, comm, module);
+                                                                        op, comm, module, NULL);
             } else if (2 == alg) {
                 return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype,
-                                                                        op, comm, module);
+                                                                        op, comm, module, NULL);
             } else { /*3 == alg */
                 return ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf, count, dtype, op,
-                                                                     comm, module, 0);
+                                                                     comm, module, 0, NULL);
             }
         } else if (total_dsize < 4194304) {
             if (((0 != subc->smsc_use_sr_buf) || (subc->smsc_buf_size > 2 * total_dsize))
@@ -631,7 +631,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
                 return mca_coll_acoll_allreduce_smsc_f(sbuf, rbuf, count, dtype, op, comm, module, subc);
             } else {
                 return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype,
-                                                                        op, comm, module);
+                                                                        op, comm, module, NULL);
             }
         } else if (total_dsize <= 16777216) {
             if (((0 != subc->smsc_use_sr_buf) || (subc->smsc_buf_size > 2 * total_dsize))
@@ -640,7 +640,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
                 return mca_coll_acoll_bcast(rbuf, count, dtype, 0, comm, module);
             } else {
                 return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype,
-                                                                        op, comm, module);
+                                                                        op, comm, module, NULL);
             }
         } else {
             if (((0 != subc->smsc_use_sr_buf) || (subc->smsc_buf_size > 2 * total_dsize))
@@ -648,13 +648,13 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
                 return mca_coll_acoll_allreduce_smsc_f(sbuf, rbuf, count, dtype, op, comm, module, subc);
             } else {
                 return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype,
-                                                                        op, comm, module);
+                                                                        op, comm, module, NULL);
             }
         }
 
     } else {
         return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, comm,
-                                                                module);
+                                                                module, NULL);
     }
     return MPI_SUCCESS;
 }
diff --git a/ompi/mca/coll/acoll/coll_acoll_reduce.c b/ompi/mca/coll/acoll/coll_acoll_reduce.c
index 28fc3c62c6a..8fee8f7c2a2 100644
--- a/ompi/mca/coll/acoll/coll_acoll_reduce.c
+++ b/ompi/mca/coll/acoll/coll_acoll_reduce.c
@@ -360,11 +360,11 @@ int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count,
     /* Falling back to inorder binary for non-commutative operators to be safe */
     if (!ompi_op_is_commute(op)) {
         return ompi_coll_base_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype, op, root, comm,
-                                                           module, 0, 0);
+                                                           module, 0, 0, NULL);
     }
     if (0 != root) { // ToDo: support non-zero root
         return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype, op, root, comm,
-                                                    module, 0, 0);
+                                                    module, 0, 0, NULL);
     }
 
     /* Disable shm/xpmem based optimizations if: */
@@ -396,7 +396,7 @@ int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count,
     /* Fallback to knomial if subc is not obtained */
     if (NULL == subc) {
         return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype, op, root, comm,
-                                                    module, 0, 0);
+                                                    module, 0, 0, NULL);
     }
 
     if (!subc->initialized || (root != subc->prev_init_root)) {
@@ -422,10 +422,10 @@ int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count,
                                                                 comm, module);
             } else if (2 == alg) {
                 return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype, op, root,
-                                                            comm, module, 0, 0);
+                                                            comm, module, 0, 0, NULL);
             } else { /* either 3 == alg or acoll_module->red_algo is not 0, 1, 2*/
                 return ompi_coll_base_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype, op,
-                                                                   root, comm, module, 0, 0);
+                                                                   root, comm, module, 0, 0, NULL);
             }
         } else {
             if ((((0 != subc->smsc_use_sr_buf)
@@ -437,7 +437,7 @@ int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count,
                                                    module, subc);
             } else {
                 return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype, op,
-                                                                   root, comm, module, 0, 0);
+                                                                   root, comm, module, 0, 0, NULL);
             }
         }
     } else {
@@ -446,7 +446,7 @@ int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count,
                                           subc);
         } else {
             return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype, op, root, comm,
-                                                        module, 0, 0);
+                                                        module, 0, 0, NULL);
         }
     }
     return MPI_SUCCESS;
diff --git a/ompi/mca/coll/acoll/coll_acoll_utils.h b/ompi/mca/coll/acoll/coll_acoll_utils.h
index 41d02381b5f..1fc9af3a931 100644
--- a/ompi/mca/coll/acoll/coll_acoll_utils.h
+++ b/ompi/mca/coll/acoll/coll_acoll_utils.h
@@ -33,6 +33,19 @@ extern int mca_coll_acoll_without_smsc;
 extern int mca_coll_acoll_smsc_use_sr_buf;
 extern int mca_coll_acoll_barrier_algo;
 
+/* Adapter with the coll_allreduce module signature: forwards to base recursive
+ * doubling with a NULL allocator, i.e. scratch buffers use host malloc/free. */
+static inline int
+ompi_coll_acoll_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, size_t count,
+                                                  struct ompi_datatype_t *dtype,
+                                                  struct ompi_op_t *op,
+                                                  struct ompi_communicator_t *comm,
+                                                  mca_coll_base_module_t *module)
+{
+    return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op,
+                                                            comm, module, NULL);
+}
+
 /*
  * Hybrid backoff spin-wait with adaptive progress calls.
  * Optimized for intra-node shared memory synchronization.
@@ -440,7 +453,7 @@ static inline int mca_coll_acoll_comm_split_init(ompi_communicator_t *comm,
     int rank = ompi_comm_rank(comm);
 
     (comm)->c_coll->coll_allgather = ompi_coll_base_allgather_intra_ring;
-    (comm)->c_coll->coll_allreduce = ompi_coll_base_allreduce_intra_recursivedoubling;
+    (comm)->c_coll->coll_allreduce = ompi_coll_acoll_allreduce_intra_recursivedoubling;
     (comm)->c_coll->coll_bcast = ompi_coll_base_bcast_intra_basic_linear;
     if (!subc->initialized) {
         OBJ_CONSTRUCT(&comm_info, opal_info_t);
@@ -538,14 +551,14 @@ static inline int mca_coll_acoll_comm_split_init(ompi_communicator_t *comm,
         coll_bcast_loc = (subc->local_comm)->c_coll->coll_bcast;
         (subc->local_comm)->c_coll->coll_allgather = ompi_coll_base_allgather_intra_ring;
         (subc->local_comm)->c_coll->coll_allreduce
-            = ompi_coll_base_allreduce_intra_recursivedoubling;
+            = ompi_coll_acoll_allreduce_intra_recursivedoubling;
         (subc->local_comm)->c_coll->coll_bcast = ompi_coll_base_bcast_intra_basic_linear;
         coll_allreduce_soc = (subc->socket_comm)->c_coll->coll_allreduce;
         coll_allgather_soc = (subc->socket_comm)->c_coll->coll_allgather;
         coll_bcast_soc = (subc->socket_comm)->c_coll->coll_bcast;
         (subc->socket_comm)->c_coll->coll_allgather = ompi_coll_base_allgather_intra_ring;
         (subc->socket_comm)->c_coll->coll_allreduce
-            = ompi_coll_base_allreduce_intra_recursivedoubling;
+            = ompi_coll_acoll_allreduce_intra_recursivedoubling;
         (subc->socket_comm)->c_coll->coll_bcast = ompi_coll_base_bcast_intra_basic_linear;
     }
 
diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c
index 82d57328b2e..ae1e27aac40 100644
--- a/ompi/mca/coll/base/coll_base_allreduce.c
+++ b/ompi/mca/coll/base/coll_base_allreduce.c
@@ -136,7 +136,8 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf,
                                                   struct ompi_datatype_t *dtype,
                                                   struct ompi_op_t *op,
                                                   struct ompi_communicator_t *comm,
-                                                  mca_coll_base_module_t *module)
+                                                  mca_coll_base_module_t *module,
+                                                  mca_allocator_base_module_t *allocator)
 {
     int ret, line, rank, size, adjsize, remote, distance;
     int newrank, newremote, extra_ranks;
@@ -160,7 +161,7 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf,
 
     /* Allocate and initialize temporary send buffer */
     span = opal_datatype_span(&dtype->super, count, &gap);
-    inplacebuf_free = (char*) malloc(span);
+    inplacebuf_free = (char*) COLL_BASE_ALLOC(allocator, span);
     if (NULL == inplacebuf_free) { ret = -1; line = __LINE__; goto error_hndl; }
     inplacebuf = inplacebuf_free - gap;
 
@@ -266,14 +267,14 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf,
         if (ret < 0) { line = __LINE__; goto error_hndl; }
     }
 
-    if (NULL != inplacebuf_free) free(inplacebuf_free);
+    COLL_BASE_FREE(allocator, inplacebuf_free);
     return MPI_SUCCESS;
 
  error_hndl:
     OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n",
                  __FILE__, line, rank, ret));
     (void)line;  // silence compiler warning
-    if (NULL != inplacebuf_free) free(inplacebuf_free);
+    COLL_BASE_FREE(allocator, inplacebuf_free);
     return ret;
 }
 
@@ -346,7 +347,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count,
                                      struct ompi_datatype_t *dtype,
                                      struct ompi_op_t *op,
                                      struct ompi_communicator_t *comm,
-                                     mca_coll_base_module_t *module)
+                                     mca_coll_base_module_t *module,
+                                     mca_allocator_base_module_t *allocator)
 {
     int ret, line, rank, size, k, recv_from, send_to, block_count, inbi;
     int early_segcount, late_segcount, split_rank, max_segcount;
@@ -377,7 +379,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count,
         return (ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf,
                                                                   count,
                                                                   dtype, op,
-                                                                  comm, module));
+                                                                  comm, module,
+                                                                  allocator));
     }
 
     /* Allocate and initialize temporary buffers */
@@ -401,10 +404,10 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count,
     max_real_segsize = true_extent + (max_segcount - 1) * extent;
 
 
-    inbuf[0] = (char*)malloc(max_real_segsize);
+    inbuf[0] = (char*)COLL_BASE_ALLOC(allocator, max_real_segsize);
     if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; }
     if (size > 2) {
-        inbuf[1] = (char*)malloc(max_real_segsize);
+        inbuf[1] = (char*)COLL_BASE_ALLOC(allocator, max_real_segsize);
         if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; }
     }
 
@@ -524,8 +527,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count,
 
     }
 
-    if (NULL != inbuf[0]) free(inbuf[0]);
-    if (NULL != inbuf[1]) free(inbuf[1]);
+    COLL_BASE_FREE(allocator, inbuf[0]);
+    COLL_BASE_FREE(allocator, inbuf[1]);
 
     return MPI_SUCCESS;
 
@@ -534,8 +537,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count,
                  __FILE__, line, rank, ret));
     ompi_coll_base_free_reqs(reqs, 2);
     (void)line;  // silence compiler warning
-    if (NULL != inbuf[0]) free(inbuf[0]);
-    if (NULL != inbuf[1]) free(inbuf[1]);
+    COLL_BASE_FREE(allocator, inbuf[0]);
+    COLL_BASE_FREE(allocator, inbuf[1]);
     return ret;
 }
 
@@ -624,7 +627,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size
                                                struct ompi_op_t *op,
                                                struct ompi_communicator_t *comm,
                                                mca_coll_base_module_t *module,
-                                               uint32_t segsize)
+                                               uint32_t segsize,
+                                               mca_allocator_base_module_t *allocator)
 {
     int ret, line, rank, size, k, recv_from, send_to;
     int early_blockcount, late_blockcount, split_rank;
@@ -660,7 +664,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size
         if (count < (size_t) (size * segcount)) {
             OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_ring_segmented rank %d/%d, count %zu, switching to regular ring", rank, size, count));
             return (ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype, op,
-                                                         comm, module));
+                                                         comm, module, allocator));
         }
 
     /* Determine the number of phases of the algorithm */
@@ -689,10 +693,10 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size
      max_real_segsize = opal_datatype_span(&dtype->super, max_segcount, &gap);
 
     /* Allocate and initialize temporary buffers */
-    inbuf[0] = (char*)malloc(max_real_segsize);
+    inbuf[0] = (char*)COLL_BASE_ALLOC(allocator, max_real_segsize);
     if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; }
     if (size > 2) {
-        inbuf[1] = (char*)malloc(max_real_segsize);
+        inbuf[1] = (char*)COLL_BASE_ALLOC(allocator, max_real_segsize);
         if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; }
     }
 
@@ -844,8 +848,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size
 
     }
 
-    if (NULL != inbuf[0]) free(inbuf[0]);
-    if (NULL != inbuf[1]) free(inbuf[1]);
+    COLL_BASE_FREE(allocator, inbuf[0]);
+    COLL_BASE_FREE(allocator, inbuf[1]);
 
     return MPI_SUCCESS;
 
@@ -854,8 +858,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size
                  __FILE__, line, rank, ret));
     ompi_coll_base_free_reqs(reqs, 2);
     (void)line;  // silence compiler warning
-    if (NULL != inbuf[0]) free(inbuf[0]);
-    if (NULL != inbuf[1]) free(inbuf[1]);
+    COLL_BASE_FREE(allocator, inbuf[0]);
+    COLL_BASE_FREE(allocator, inbuf[1]);
     return ret;
 }
 
@@ -974,7 +978,7 @@ ompi_coll_base_allreduce_intra_basic_linear(const void *sbuf, void *rbuf, size_t
 int ompi_coll_base_allreduce_intra_redscat_allgather(
     const void *sbuf, void *rbuf, size_t count, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module)
+    mca_coll_base_module_t *module, mca_allocator_base_module_t *allocator)
 {
     int *rindex = NULL, *rcount = NULL, *sindex = NULL, *scount = NULL;
 
@@ -1006,7 +1010,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather(
 
     /* Temporary buffer for receiving messages */
     char *tmp_buf = NULL;
-    char *tmp_buf_raw = (char *)malloc(dsize);
+    char *tmp_buf_raw = (char *)COLL_BASE_ALLOC(allocator, dsize);
     if (NULL == tmp_buf_raw)
         return OMPI_ERR_OUT_OF_RESOURCE;
     tmp_buf = tmp_buf_raw - gap;
@@ -1234,8 +1238,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather(
     }
 
   cleanup_and_return:
-    if (NULL != tmp_buf_raw)
-        free(tmp_buf_raw);
+    COLL_BASE_FREE(allocator, tmp_buf_raw);
     if (NULL != rindex)
         free(rindex);
     if (NULL != sindex)
@@ -1268,7 +1271,8 @@ int ompi_coll_base_allreduce_intra_allgather_reduce(const void *sbuf, void *rbuf
                                                     struct ompi_datatype_t *dtype,
                                                     struct ompi_op_t *op,
                                                     struct ompi_communicator_t *comm,
-                                                    mca_coll_base_module_t *module)
+                                                    mca_coll_base_module_t *module,
+                                                    mca_allocator_base_module_t *allocator)
 {
     int line = -1;
     char *partial_buf = NULL;
@@ -1289,10 +1293,10 @@ int ompi_coll_base_allreduce_intra_allgather_reduce(const void *sbuf, void *rbuf
     }
     ptrdiff_t buf_size, gap = 0;
     buf_size = opal_datatype_span(&dtype->super, (int64_t)count * size, &gap);
-    partial_buf = (char *) malloc(buf_size);
+    partial_buf = (char *) COLL_BASE_ALLOC(allocator, buf_size);
     partial_buf_start = partial_buf - gap;
     buf_size = opal_datatype_span(&dtype->super, (int64_t)count, &gap);
-    tmpsend = (char *) malloc(buf_size);
+    tmpsend = (char *) COLL_BASE_ALLOC(allocator, buf_size);
     tmpsend_start = tmpsend - gap;
 
     err = ompi_datatype_copy_content_same_ddt(dtype, count,
@@ -1320,18 +1324,18 @@ int ompi_coll_base_allreduce_intra_allgather_reduce(const void *sbuf, void *rbuf
                                               (char*)partial_buf_start);
     if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
 
-    if (NULL != partial_buf) free(partial_buf);
-    if (NULL != tmpsend) free(tmpsend);
+    COLL_BASE_FREE(allocator, partial_buf);
+    COLL_BASE_FREE(allocator, tmpsend);
     return MPI_SUCCESS;
 
 err_hndl:
     if (NULL != partial_buf) {
-        free(partial_buf);
+        COLL_BASE_FREE(allocator, partial_buf);
         partial_buf = NULL;
         partial_buf_start = NULL;
     }
      if (NULL != tmpsend) {
-        free(tmpsend);
+        COLL_BASE_FREE(allocator, tmpsend);
         tmpsend = NULL;
         tmpsend_start = NULL;
     }
diff --git a/ompi/mca/coll/base/coll_base_exscan.c b/ompi/mca/coll/base/coll_base_exscan.c
index d702eb361b9..1e9cab4b942 100644
--- a/ompi/mca/coll/base/coll_base_exscan.c
+++ b/ompi/mca/coll/base/coll_base_exscan.c
@@ -142,7 +142,7 @@ ompi_coll_base_exscan_intra_linear(const void *sbuf, void *rbuf, size_t count,
 int ompi_coll_base_exscan_intra_recursivedoubling(
     const void *sendbuf, void *recvbuf, size_t count, struct ompi_datatype_t *datatype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module)
+    mca_coll_base_module_t *module, mca_allocator_base_module_t *allocator)
 {
     int err = MPI_SUCCESS;
     char *tmpsend_raw = NULL, *tmprecv_raw = NULL;
@@ -158,8 +158,8 @@ int ompi_coll_base_exscan_intra_recursivedoubling(
 
     ptrdiff_t dsize, gap;
     dsize = opal_datatype_span(&datatype->super, count, &gap);
-    tmpsend_raw = malloc(dsize);
-    tmprecv_raw = malloc(dsize);
+    tmpsend_raw = COLL_BASE_ALLOC(allocator, dsize);
+    tmprecv_raw = COLL_BASE_ALLOC(allocator, dsize);
     if (NULL == tmpsend_raw || NULL == tmprecv_raw) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -215,9 +215,7 @@ int ompi_coll_base_exscan_intra_recursivedoubling(
     }
 
 cleanup_and_return:
-    if (NULL != tmpsend_raw)
-        free(tmpsend_raw);
-    if (NULL != tmprecv_raw)
-        free(tmprecv_raw);
+    COLL_BASE_FREE(allocator, tmpsend_raw);
+    COLL_BASE_FREE(allocator, tmprecv_raw);
     return err;
 }
diff --git a/ompi/mca/coll/base/coll_base_functions.h b/ompi/mca/coll/base/coll_base_functions.h
index b59ef55f437..c0195eafd0f 100644
--- a/ompi/mca/coll/base/coll_base_functions.h
+++ b/ompi/mca/coll/base/coll_base_functions.h
@@ -36,6 +36,16 @@
 #include "ompi/mca/coll/coll.h"
 #include "ompi/info/info.h"
 #include "ompi/request/request.h"
+#include "opal/mca/allocator/allocator.h"
+
+/* Allocator-aware helpers for Pattern-A scratch buffers; allocator=NULL falls
+ * back to malloc/free. COLL_BASE_FREE ignores NULL ptr; args may be re-evaluated. */
+#define COLL_BASE_ALLOC(allocator, size) \
+    ((allocator) ? (allocator)->alc_alloc((allocator), (size), 0) : malloc(size))
+
+#define COLL_BASE_FREE(allocator, ptr) \
+    do { if (ptr) { if (allocator) (allocator)->alc_free((allocator), (ptr)); \
+                    else free(ptr); } } while (0)
 
 /* need to include our own topo prototypes so we can malloc data on the comm correctly */
 #include "coll_base_topo.h"
@@ -206,12 +216,12 @@ int ompi_coll_base_allgatherv_intra_two_procs(ALLGATHERV_ARGS);
 
 /* All Reduce */
 int ompi_coll_base_allreduce_intra_nonoverlapping(ALLREDUCE_ARGS);
-int ompi_coll_base_allreduce_intra_recursivedoubling(ALLREDUCE_ARGS);
-int ompi_coll_base_allreduce_intra_ring(ALLREDUCE_ARGS);
-int ompi_coll_base_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsize);
+int ompi_coll_base_allreduce_intra_recursivedoubling(ALLREDUCE_ARGS, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_allreduce_intra_ring(ALLREDUCE_ARGS, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsize, mca_allocator_base_module_t *allocator);
 int ompi_coll_base_allreduce_intra_basic_linear(ALLREDUCE_ARGS);
-int ompi_coll_base_allreduce_intra_redscat_allgather(ALLREDUCE_ARGS);
-int ompi_coll_base_allreduce_intra_allgather_reduce(ALLREDUCE_ARGS);
+int ompi_coll_base_allreduce_intra_redscat_allgather(ALLREDUCE_ARGS, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_allreduce_intra_allgather_reduce(ALLREDUCE_ARGS, mca_allocator_base_module_t *allocator);
 
 /* AlltoAll */
 int ompi_coll_base_alltoall_intra_pairwise(ALLTOALL_ARGS);
@@ -255,44 +265,42 @@ int ompi_coll_base_bcast_intra_scatter_allgather(BCAST_ARGS, uint32_t segsize);
 int ompi_coll_base_bcast_intra_scatter_allgather_ring(BCAST_ARGS, uint32_t segsize);
 
 /* Exscan */
-int ompi_coll_base_exscan_intra_recursivedoubling(EXSCAN_ARGS);
+int ompi_coll_base_exscan_intra_recursivedoubling(EXSCAN_ARGS, mca_allocator_base_module_t *allocator);
 int ompi_coll_base_exscan_intra_linear(EXSCAN_ARGS);
-int ompi_coll_base_exscan_intra_recursivedoubling(EXSCAN_ARGS);
 
 /* Gather */
 int ompi_coll_base_gather_intra_basic_linear(GATHER_ARGS);
-int ompi_coll_base_gather_intra_binomial(GATHER_ARGS);
+int ompi_coll_base_gather_intra_binomial(GATHER_ARGS, mca_allocator_base_module_t *allocator);
 int ompi_coll_base_gather_intra_linear_sync(GATHER_ARGS, int first_segment_size);
 
 /* GatherV */
 
 /* Reduce */
-int ompi_coll_base_reduce_generic(REDUCE_ARGS, ompi_coll_tree_t* tree, size_t count_by_segment, int max_outstanding_reqs);
+int ompi_coll_base_reduce_generic(REDUCE_ARGS, ompi_coll_tree_t* tree, size_t count_by_segment, int max_outstanding_reqs, mca_allocator_base_module_t *allocator);
 int ompi_coll_base_reduce_intra_basic_linear(REDUCE_ARGS);
-int ompi_coll_base_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout, int max_outstanding_reqs );
-int ompi_coll_base_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
-int ompi_coll_base_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
-int ompi_coll_base_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
-int ompi_coll_base_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs );
-int ompi_coll_base_reduce_intra_redscat_gather(REDUCE_ARGS);
-int ompi_coll_base_reduce_intra_knomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, int radix);
+int ompi_coll_base_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout, int max_outstanding_reqs, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_reduce_intra_redscat_gather(REDUCE_ARGS, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_reduce_intra_knomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, int radix, mca_allocator_base_module_t *allocator);
 
 /* Reduce_scatter */
-int ompi_coll_base_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS);
-int ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(REDUCESCATTER_ARGS);
-int ompi_coll_base_reduce_scatter_intra_ring(REDUCESCATTER_ARGS);
-int ompi_coll_base_reduce_scatter_intra_butterfly(REDUCESCATTER_ARGS);
+int ompi_coll_base_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(REDUCESCATTER_ARGS, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_reduce_scatter_intra_ring(REDUCESCATTER_ARGS, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_reduce_scatter_intra_butterfly(REDUCESCATTER_ARGS, mca_allocator_base_module_t *allocator);
 
 /* Reduce_scatter_block */
-int ompi_coll_base_reduce_scatter_block_basic_linear(REDUCESCATTERBLOCK_ARGS);
-int ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(REDUCESCATTERBLOCK_ARGS);
-int ompi_coll_base_reduce_scatter_block_intra_recursivehalving(REDUCESCATTERBLOCK_ARGS);
-int ompi_coll_base_reduce_scatter_block_intra_butterfly(REDUCESCATTERBLOCK_ARGS);
+int ompi_coll_base_reduce_scatter_block_basic_linear(REDUCESCATTERBLOCK_ARGS, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(REDUCESCATTERBLOCK_ARGS, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_reduce_scatter_block_intra_recursivehalving(REDUCESCATTERBLOCK_ARGS, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_reduce_scatter_block_intra_butterfly(REDUCESCATTERBLOCK_ARGS, mca_allocator_base_module_t *allocator);
 
 /* Scan */
-int ompi_coll_base_scan_intra_recursivedoubling(SCAN_ARGS);
+int ompi_coll_base_scan_intra_recursivedoubling(SCAN_ARGS, mca_allocator_base_module_t *allocator);
 int ompi_coll_base_scan_intra_linear(SCAN_ARGS);
-int ompi_coll_base_scan_intra_recursivedoubling(SCAN_ARGS);
 
 /* Scatter */
 int ompi_coll_base_scatter_intra_basic_linear(SCATTER_ARGS);
diff --git a/ompi/mca/coll/base/coll_base_gather.c b/ompi/mca/coll/base/coll_base_gather.c
index a4486152157..894af0a085a 100644
--- a/ompi/mca/coll/base/coll_base_gather.c
+++ b/ompi/mca/coll/base/coll_base_gather.c
@@ -44,7 +44,8 @@ ompi_coll_base_gather_intra_binomial(const void *sbuf, size_t scount,
                                       struct ompi_datatype_t *rdtype,
                                       int root,
                                       struct ompi_communicator_t *comm,
-                                      mca_coll_base_module_t *module)
+                                      mca_coll_base_module_t *module,
+                                      mca_allocator_base_module_t *allocator)
 {
     int line = -1, i, rank, vrank, size, err;
     size_t total_recv = 0;
@@ -82,7 +83,7 @@ ompi_coll_base_gather_intra_binomial(const void *sbuf, size_t scount,
         } else {
             /* root is not on 0, allocate temp buffer for recv,
              * rotate data at the end */
-            tempbuf = (char *) malloc(rsize);
+            tempbuf = (char *) COLL_BASE_ALLOC(allocator, rsize);
             if (NULL == tempbuf) {
                 err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
             }
@@ -107,7 +108,7 @@ ompi_coll_base_gather_intra_binomial(const void *sbuf, size_t scount,
          * to the property of binimoal tree */
         ompi_datatype_type_extent(sdtype, &sextent);
         ssize = opal_datatype_span(&sdtype->super, (int64_t)scount * size, &sgap);
-        tempbuf = (char *) malloc(ssize);
+        tempbuf = (char *) COLL_BASE_ALLOC(allocator, ssize);
         if (NULL == tempbuf) {
             err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
         }
@@ -180,17 +181,16 @@ ompi_coll_base_gather_intra_binomial(const void *sbuf, size_t scount,
                                                       (char *) rbuf, ptmp + rextent * (ptrdiff_t)rcount * (ptrdiff_t)(size-root));
             if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
 
-            free(tempbuf);
+            COLL_BASE_FREE(allocator, tempbuf);
         }
     } else if (!(vrank % 2)) {
         /* other non-leaf nodes */
-        free(tempbuf);
+        COLL_BASE_FREE(allocator, tempbuf);
     }
     return MPI_SUCCESS;
 
  err_hndl:
-    if (NULL != tempbuf)
-        free(tempbuf);
+    COLL_BASE_FREE(allocator, tempbuf);
 
     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,  "%s:%4d\tError occurred %d, rank %2d",
                  __FILE__, line, err, rank));
diff --git a/ompi/mca/coll/base/coll_base_reduce.c b/ompi/mca/coll/base/coll_base_reduce.c
index f127abef311..588973aa832 100644
--- a/ompi/mca/coll/base/coll_base_reduce.c
+++ b/ompi/mca/coll/base/coll_base_reduce.c
@@ -66,7 +66,7 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
                                     int root, ompi_communicator_t* comm,
                                     mca_coll_base_module_t *module,
                                     ompi_coll_tree_t* tree, size_t count_by_segment,
-                                    int max_outstanding_reqs )
+                                    int max_outstanding_reqs, mca_allocator_base_module_t *allocator )
 {
     char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL};
     char *accumbuf = NULL, *accumbuf_free = NULL;
@@ -106,7 +106,7 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
         if( (NULL == accumbuf) || (root != rank) ) {
             /* Allocate temporary accumulator buffer. */
             size = opal_datatype_span(&datatype->super, original_count, &gap);
-            accumbuf_free = (char*)malloc(size);
+            accumbuf_free = (char*)COLL_BASE_ALLOC(allocator, size);
             if (accumbuf_free == NULL) {
                 line = __LINE__; ret = -1; goto error_hndl;
             }
@@ -123,7 +123,7 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
         }
         /* Allocate two buffers for incoming segments */
         real_segment_size = opal_datatype_span(&datatype->super, count_by_segment, &gap);
-        inbuf_free[0] = (char*) malloc(real_segment_size);
+        inbuf_free[0] = (char*) COLL_BASE_ALLOC(allocator, real_segment_size);
         if( inbuf_free[0] == NULL ) {
             line = __LINE__; ret = -1; goto error_hndl;
         }
@@ -131,7 +131,7 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
         /* if there is chance to overlap communication -
            allocate second buffer */
         if( (num_segments > 1) || (tree->tree_nextsize > 1) ) {
-            inbuf_free[1] = (char*) malloc(real_segment_size);
+            inbuf_free[1] = (char*) COLL_BASE_ALLOC(allocator, real_segment_size);
             if( inbuf_free[1] == NULL ) {
                 line = __LINE__; ret = -1; goto error_hndl;
             }
@@ -242,9 +242,9 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
         } /* end of for each segment */
 
         /* clean up */
-        if( inbuf_free[0] != NULL) free(inbuf_free[0]);
-        if( inbuf_free[1] != NULL) free(inbuf_free[1]);
-        if( accumbuf_free != NULL ) free(accumbuf_free);
+        COLL_BASE_FREE(allocator, inbuf_free[0]);
+        COLL_BASE_FREE(allocator, inbuf_free[1]);
+        COLL_BASE_FREE(allocator, accumbuf_free);
     }
 
     /* leaf nodes
@@ -365,9 +365,9 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
         }
         ompi_coll_base_free_reqs(sreq, max_outstanding_reqs);
     }
-    if( inbuf_free[0] != NULL ) free(inbuf_free[0]);
-    if( inbuf_free[1] != NULL ) free(inbuf_free[1]);
-    if( accumbuf_free != NULL ) free(accumbuf);
+    COLL_BASE_FREE(allocator, inbuf_free[0]);
+    COLL_BASE_FREE(allocator, inbuf_free[1]);
+    COLL_BASE_FREE(allocator, accumbuf_free);
     OPAL_OUTPUT (( ompi_coll_base_framework.framework_output,
                    "ERROR_HNDL: node %d file %s line %d error %d\n",
                    rank, __FILE__, line, ret ));
@@ -388,7 +388,7 @@ int ompi_coll_base_reduce_intra_chain( const void *sendbuf, void *recvbuf, size_
                                         ompi_communicator_t* comm,
                                         mca_coll_base_module_t *module,
                                         uint32_t segsize, int fanout,
-                                        int max_outstanding_reqs )
+                                        int max_outstanding_reqs, mca_allocator_base_module_t *allocator )
 {
     size_t segcount = count;
     size_t typelng;
@@ -408,7 +408,7 @@ int ompi_coll_base_reduce_intra_chain( const void *sendbuf, void *recvbuf, size_
     return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype,
                                            op, root, comm, module,
                                            data->cached_chain,
-                                           segcount, max_outstanding_reqs );
+                                           segcount, max_outstanding_reqs, allocator );
 }
 
 
@@ -418,7 +418,7 @@ int ompi_coll_base_reduce_intra_pipeline( const void *sendbuf, void *recvbuf,
                                            ompi_communicator_t* comm,
                                            mca_coll_base_module_t *module,
                                            uint32_t segsize,
-                                           int max_outstanding_reqs  )
+                                           int max_outstanding_reqs, mca_allocator_base_module_t *allocator  )
 {
     size_t segcount = count;
     size_t typelng;
@@ -440,7 +440,7 @@ int ompi_coll_base_reduce_intra_pipeline( const void *sendbuf, void *recvbuf,
     return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype,
                                            op, root, comm, module,
                                            data->cached_pipeline,
-                                           segcount, max_outstanding_reqs );
+                                           segcount, max_outstanding_reqs, allocator );
 }
 
 int ompi_coll_base_reduce_intra_binary( const void *sendbuf, void *recvbuf,
@@ -449,7 +449,7 @@ int ompi_coll_base_reduce_intra_binary( const void *sendbuf, void *recvbuf,
                                          ompi_communicator_t* comm,
                                          mca_coll_base_module_t *module,
                                          uint32_t segsize,
-                                         int max_outstanding_reqs  )
+                                         int max_outstanding_reqs, mca_allocator_base_module_t *allocator  )
 {
     size_t segcount = count;
     size_t typelng;
@@ -471,7 +471,7 @@ int ompi_coll_base_reduce_intra_binary( const void *sendbuf, void *recvbuf,
     return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype,
                                            op, root, comm, module,
                                            data->cached_bintree,
-                                           segcount, max_outstanding_reqs );
+                                           segcount, max_outstanding_reqs, allocator );
 }
 
 int ompi_coll_base_reduce_intra_binomial( const void *sendbuf, void *recvbuf,
@@ -480,7 +480,7 @@ int ompi_coll_base_reduce_intra_binomial( const void *sendbuf, void *recvbuf,
                                            ompi_communicator_t* comm,
                                            mca_coll_base_module_t *module,
                                            uint32_t segsize,
-                                           int max_outstanding_reqs  )
+                                           int max_outstanding_reqs, mca_allocator_base_module_t *allocator  )
 {
     size_t segcount = count;
     size_t typelng;
@@ -502,7 +502,7 @@ int ompi_coll_base_reduce_intra_binomial( const void *sendbuf, void *recvbuf,
     return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype,
                                            op, root, comm, module,
                                            data->cached_in_order_bmtree,
-                                           segcount, max_outstanding_reqs );
+                                           segcount, max_outstanding_reqs, allocator );
 }
 
 /*
@@ -519,7 +519,7 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv
                                                   ompi_communicator_t* comm,
                                                   mca_coll_base_module_t *module,
                                                   uint32_t segsize,
-                                                  int max_outstanding_reqs  )
+                                                  int max_outstanding_reqs, mca_allocator_base_module_t *allocator  )
 {
     int ret, rank, size, io_root, segcount = count;
     void *use_this_sendbuf = NULL;
@@ -560,7 +560,7 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv
         dsize = opal_datatype_span(&datatype->super, count, &gap);
 
         if ((root == rank) && (MPI_IN_PLACE == sendbuf)) {
-            tmpbuf_free = (char *) malloc(dsize);
+            tmpbuf_free = (char *) COLL_BASE_ALLOC(allocator, dsize);
             if (NULL == tmpbuf_free) {
                 return MPI_ERR_INTERN;
             }
@@ -570,7 +570,7 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv
                                                 (char*)recvbuf);
             use_this_sendbuf = tmpbuf;
         } else if (io_root == rank) {
-            tmpbuf_free = (char *) malloc(dsize);
+            tmpbuf_free = (char *) COLL_BASE_ALLOC(allocator, dsize);
             if (NULL == tmpbuf_free) {
                 return MPI_ERR_INTERN;
             }
@@ -583,9 +583,9 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv
     ret = ompi_coll_base_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype,
                                           op, io_root, comm, module,
                                           data->cached_in_order_bintree,
-                                          segcount, max_outstanding_reqs );
+                                          segcount, max_outstanding_reqs, allocator );
     if (MPI_SUCCESS != ret) {
-        free(tmpbuf_free);
+        COLL_BASE_FREE(allocator, tmpbuf_free);
         return ret;
     }
 
@@ -597,7 +597,7 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv
                                     MCA_COLL_BASE_TAG_REDUCE, comm,
                                     MPI_STATUS_IGNORE));
             if (MPI_SUCCESS != ret) {
-                free(tmpbuf_free);
+                COLL_BASE_FREE(allocator, tmpbuf_free);
                 return ret;
             }
 
@@ -607,13 +607,13 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv
                                     MCA_COLL_BASE_TAG_REDUCE,
                                     MCA_PML_BASE_SEND_STANDARD, comm));
             if (MPI_SUCCESS != ret) {
-                free(tmpbuf_free);
+                COLL_BASE_FREE(allocator, tmpbuf_free);
                 return ret;
             }
         }
     }
     if (NULL != tmpbuf_free) {
-        free(tmpbuf_free);
+        COLL_BASE_FREE(allocator, tmpbuf_free);
     }
 
     return MPI_SUCCESS;
@@ -812,7 +812,7 @@ ompi_coll_base_reduce_intra_basic_linear(const void *sbuf, void *rbuf, size_t co
 int ompi_coll_base_reduce_intra_redscat_gather(
     const void *sbuf, void *rbuf, size_t count, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, int root, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module)
+    mca_coll_base_module_t *module, mca_allocator_base_module_t *allocator)
 {
     int comm_size = ompi_comm_size(comm);
     int rank = ompi_comm_rank(comm);
@@ -844,7 +844,7 @@ int ompi_coll_base_reduce_intra_redscat_gather(
 
     /* Temporary buffers */
     char *tmp_buf_raw = NULL, *rbuf_raw = NULL;
-    tmp_buf_raw = malloc(dsize);
+    tmp_buf_raw = COLL_BASE_ALLOC(allocator, dsize);
     if (NULL == tmp_buf_raw) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -852,7 +852,7 @@ int ompi_coll_base_reduce_intra_redscat_gather(
     char *tmp_buf = tmp_buf_raw - gap;
 
     if (rank != root) {
-        rbuf_raw = malloc(dsize);
+        rbuf_raw = COLL_BASE_ALLOC(allocator, dsize);
         if (NULL == rbuf_raw) {
             err = OMPI_ERR_OUT_OF_RESOURCE;
             goto cleanup_and_return;
@@ -1129,10 +1129,8 @@ int ompi_coll_base_reduce_intra_redscat_gather(
     }
 
   cleanup_and_return:
-    if (NULL != tmp_buf_raw)
-        free(tmp_buf_raw);
-    if (NULL != rbuf_raw)
-        free(rbuf_raw);
+    COLL_BASE_FREE(allocator, tmp_buf_raw);
+    COLL_BASE_FREE(allocator, rbuf_raw);
     if (NULL != rindex)
         free(rindex);
     if (NULL != sindex)
@@ -1170,7 +1168,7 @@ int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf,
                                            ompi_communicator_t* comm,
                                            mca_coll_base_module_t *module,
                                            uint32_t segsize,
-                                           int max_outstanding_reqs, int radix)
+                                           int max_outstanding_reqs, int radix, mca_allocator_base_module_t *allocator)
 {
     int err = OMPI_SUCCESS, rank, line;
     ptrdiff_t extent, lb;
@@ -1215,7 +1213,7 @@ int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf,
         sendtmpbuf = (char *)recvbuf;
     }
     buf_size = opal_datatype_span(&datatype->super, (int64_t)count, &gap);
-    reduce_buf = (char *)malloc(buf_size);
+    reduce_buf = (char *)COLL_BASE_ALLOC(allocator, buf_size);
     reduce_buf_start = reduce_buf - gap;
     err = ompi_datatype_copy_content_same_ddt(datatype, count,
                                               (char*)reduce_buf_start,
@@ -1227,7 +1225,7 @@ int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf,
     max_reqs = num_children;
     if(!is_leaf) {
         buf_size = opal_datatype_span(&datatype->super, (int64_t)count * num_children, &gap);
-        child_buf = (char *)malloc(buf_size);
+        child_buf = (char *)COLL_BASE_ALLOC(allocator, buf_size);
         child_buf_start = child_buf - gap;
         reqs = ompi_coll_base_comm_get_reqs(data, max_reqs);
     }
@@ -1275,18 +1273,18 @@ int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf,
         if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
     }
 
-    if (NULL != child_buf) free(child_buf);
-    if (NULL != reduce_buf) free(reduce_buf);
+    COLL_BASE_FREE(allocator, child_buf);
+    COLL_BASE_FREE(allocator, reduce_buf);
     return MPI_SUCCESS;
 
  err_hndl:
     if (NULL != child_buf) {
-        free(child_buf);
+        COLL_BASE_FREE(allocator, child_buf);
         child_buf = NULL;
         child_buf_start = NULL;
     }
     if (NULL != reduce_buf) {
-        free(reduce_buf);
+        COLL_BASE_FREE(allocator, reduce_buf);
         reduce_buf = NULL;
         reduce_buf_start = NULL;
     }
diff --git a/ompi/mca/coll/base/coll_base_reduce_scatter.c b/ompi/mca/coll/base/coll_base_reduce_scatter.c
index 7a838936378..2d9119bb1da 100644
--- a/ompi/mca/coll/base/coll_base_reduce_scatter.c
+++ b/ompi/mca/coll/base/coll_base_reduce_scatter.c
@@ -49,7 +49,8 @@ int ompi_coll_base_reduce_scatter_intra_nonoverlapping(const void *sbuf, void *r
                                                         struct ompi_datatype_t *dtype,
                                                         struct ompi_op_t *op,
                                                         struct ompi_communicator_t *comm,
-                                                        mca_coll_base_module_t *module)
+                                                        mca_coll_base_module_t *module,
+                                                        mca_allocator_base_module_t *allocator)
 {
     int err, i, rank, size, total_count;
     ptrdiff_t *displs = NULL;
@@ -82,14 +83,14 @@ int ompi_coll_base_reduce_scatter_intra_nonoverlapping(const void *sbuf, void *r
             ptrdiff_t dsize, gap = 0;
             dsize = opal_datatype_span(&dtype->super, total_count, &gap);
 
-            tmprbuf_free = (char*) malloc(dsize);
+            tmprbuf_free = (char*) COLL_BASE_ALLOC(allocator, dsize);
             tmprbuf = tmprbuf_free - gap;
         }
         err = comm->c_coll->coll_reduce (sbuf, tmprbuf, total_count,
                                         dtype, op, root, comm, comm->c_coll->coll_reduce_module);
     }
     if (MPI_SUCCESS != err) {
-        if (NULL != tmprbuf_free) free(tmprbuf_free);
+        COLL_BASE_FREE(allocator, tmprbuf_free);
         return err;
     }
 
@@ -109,7 +110,7 @@ int ompi_coll_base_reduce_scatter_intra_nonoverlapping(const void *sbuf, void *r
                                            root, comm, comm->c_coll->coll_scatterv_module);
     }
     free(displs);
-    if (NULL != tmprbuf_free) free(tmprbuf_free);
+    COLL_BASE_FREE(allocator, tmprbuf_free);
 
     return err;
 }
@@ -138,7 +139,8 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf,
                                                             struct ompi_datatype_t *dtype,
                                                             struct ompi_op_t *op,
                                                             struct ompi_communicator_t *comm,
-                                                            mca_coll_base_module_t *module)
+                                                            mca_coll_base_module_t *module,
+                                                            mca_allocator_base_module_t *allocator)
 {
     int i, rank, size, err = OMPI_SUCCESS;
     int tmp_size, remain = 0, tmp_rank;
@@ -180,7 +182,7 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf,
     }
 
     /* Allocate temporary receive buffer. */
-    recv_buf_free = (char*) malloc(buf_size);
+    recv_buf_free = (char*) COLL_BASE_ALLOC(allocator, buf_size);
     recv_buf = recv_buf_free - gap;
     if (NULL == recv_buf_free) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
@@ -188,7 +190,7 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf,
     }
 
     /* allocate temporary buffer for results */
-    result_buf_free = (char*) malloc(buf_size);
+    result_buf_free = (char*) COLL_BASE_ALLOC(allocator, buf_size);
     result_buf = result_buf_free - gap;
 
     /* copy local buffer into the temporary results */
@@ -391,8 +393,8 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf,
 
  cleanup:
     if (NULL != disps) free(disps);
-    if (NULL != recv_buf_free) free(recv_buf_free);
-    if (NULL != result_buf_free) free(result_buf_free);
+    COLL_BASE_FREE(allocator, recv_buf_free);
+    COLL_BASE_FREE(allocator, result_buf_free);
 
     return err;
 }
@@ -464,7 +466,8 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou
                                           struct ompi_datatype_t *dtype,
                                           struct ompi_op_t *op,
                                           struct ompi_communicator_t *comm,
-                                          mca_coll_base_module_t *module)
+                                          mca_coll_base_module_t *module,
+                                          mca_allocator_base_module_t *allocator)
 {
     int ret, line, rank, size, i, k, recv_from, send_to;
     int inbi;
@@ -518,15 +521,15 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou
     max_real_segsize = opal_datatype_span(&dtype->super, max_block_count, &gap);
     dsize = opal_datatype_span(&dtype->super, total_count, &gap);
 
-    accumbuf_free = (char*)malloc(dsize);
+    accumbuf_free = (char*)COLL_BASE_ALLOC(allocator, dsize);
     if (NULL == accumbuf_free) { ret = -1; line = __LINE__; goto error_hndl; }
     accumbuf = accumbuf_free - gap;
 
-    inbuf_free[0] = (char*)malloc(max_real_segsize);
+    inbuf_free[0] = (char*)COLL_BASE_ALLOC(allocator, max_real_segsize);
     if (NULL == inbuf_free[0]) { ret = -1; line = __LINE__; goto error_hndl; }
     inbuf[0] = inbuf_free[0] - gap;
     if (size > 2) {
-        inbuf_free[1] = (char*)malloc(max_real_segsize);
+        inbuf_free[1] = (char*)COLL_BASE_ALLOC(allocator, max_real_segsize);
         if (NULL == inbuf_free[1]) { ret = -1; line = __LINE__; goto error_hndl; }
         inbuf[1] = inbuf_free[1] - gap;
     }
@@ -615,9 +618,9 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou
     if (ret < 0) { line = __LINE__; goto error_hndl; }
 
     if (NULL != displs) free(displs);
-    if (NULL != accumbuf_free) free(accumbuf_free);
-    if (NULL != inbuf_free[0]) free(inbuf_free[0]);
-    if (NULL != inbuf_free[1]) free(inbuf_free[1]);
+    COLL_BASE_FREE(allocator, accumbuf_free);
+    COLL_BASE_FREE(allocator, inbuf_free[0]);
+    COLL_BASE_FREE(allocator, inbuf_free[1]);
 
     return MPI_SUCCESS;
 
@@ -626,9 +629,9 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou
                  __FILE__, line, rank, ret));
     (void)line;  // silence compiler warning
     if (NULL != displs) free(displs);
-    if (NULL != accumbuf_free) free(accumbuf_free);
-    if (NULL != inbuf_free[0]) free(inbuf_free[0]);
-    if (NULL != inbuf_free[1]) free(inbuf_free[1]);
+    COLL_BASE_FREE(allocator, accumbuf_free);
+    COLL_BASE_FREE(allocator, inbuf_free[0]);
+    COLL_BASE_FREE(allocator, inbuf_free[1]);
     return ret;
 }
 
@@ -701,7 +704,7 @@ int
 ompi_coll_base_reduce_scatter_intra_butterfly(
     const void *sbuf, void *rbuf, ompi_count_array_t rcounts, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module)
+    mca_coll_base_module_t *module, mca_allocator_base_module_t *allocator)
 {
     char *tmpbuf[2] = {NULL, NULL}, *psend, *precv;
     ptrdiff_t *displs = NULL, index;
@@ -729,8 +732,8 @@ ompi_coll_base_reduce_scatter_intra_butterfly(
 
     ompi_datatype_type_extent(dtype, &extent);
     span = opal_datatype_span(&dtype->super, totalcount, &gap);
-    tmpbuf[0] = malloc(span);
-    tmpbuf[1] = malloc(span);
+    tmpbuf[0] = COLL_BASE_ALLOC(allocator, span);
+    tmpbuf[1] = COLL_BASE_ALLOC(allocator, span);
     if (NULL == tmpbuf[0] || NULL == tmpbuf[1]) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -899,9 +902,7 @@ ompi_coll_base_reduce_scatter_intra_butterfly(
 cleanup_and_return:
     if (displs)
         free(displs);
-    if (tmpbuf[0])
-        free(tmpbuf[0]);
-    if (tmpbuf[1])
-        free(tmpbuf[1]);
+    COLL_BASE_FREE(allocator, tmpbuf[0]);
+    COLL_BASE_FREE(allocator, tmpbuf[1]);
     return err;
 }
diff --git a/ompi/mca/coll/base/coll_base_reduce_scatter_block.c b/ompi/mca/coll/base/coll_base_reduce_scatter_block.c
index ca4a6989bec..47b143a24ec 100644
--- a/ompi/mca/coll/base/coll_base_reduce_scatter_block.c
+++ b/ompi/mca/coll/base/coll_base_reduce_scatter_block.c
@@ -59,7 +59,8 @@ ompi_coll_base_reduce_scatter_block_basic_linear(const void *sbuf, void *rbuf, s
                                                  struct ompi_datatype_t *dtype,
                                                  struct ompi_op_t *op,
                                                  struct ompi_communicator_t *comm,
-                                                 mca_coll_base_module_t *module)
+                                                 mca_coll_base_module_t *module,
+                                                 mca_allocator_base_module_t *allocator)
 {
     int rank, size, err = OMPI_SUCCESS;
     size_t count;
@@ -101,7 +102,7 @@ ompi_coll_base_reduce_scatter_block_basic_linear(const void *sbuf, void *rbuf, s
         if (0 == rank) {
             /* temporary receive buffer.  See coll_basic_reduce.c for
                details on sizing */
-            recv_buf_free = (char*) malloc(span);
+            recv_buf_free = (char*) COLL_BASE_ALLOC(allocator, span);
             if (NULL == recv_buf_free) {
                 err = OMPI_ERR_OUT_OF_RESOURCE;
                 goto cleanup;
@@ -151,7 +152,7 @@ ompi_coll_base_reduce_scatter_block_basic_linear(const void *sbuf, void *rbuf, s
         if (0 == rank) {
             /* temporary receive buffer.  See coll_basic_reduce.c for
                details on sizing */
-            recv_buf_free = (char*) malloc(span);
+            recv_buf_free = (char*) COLL_BASE_ALLOC(allocator, span);
             if (NULL == recv_buf_free) {
                 err = OMPI_ERR_OUT_OF_RESOURCE;
                 goto cleanup;
@@ -174,7 +175,7 @@ ompi_coll_base_reduce_scatter_block_basic_linear(const void *sbuf, void *rbuf, s
     }
 
  cleanup:
-    if (NULL != recv_buf_free) free(recv_buf_free);
+    COLL_BASE_FREE(allocator, recv_buf_free);
 
     return err;
 }
@@ -198,7 +199,7 @@ int
 ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(
     const void *sbuf, void *rbuf, size_t rcount, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module)
+    mca_coll_base_module_t *module, mca_allocator_base_module_t *allocator)
 {
     struct ompi_datatype_t *dtypesend = NULL, *dtyperecv = NULL;
     char *tmprecv_raw = NULL, *tmpbuf_raw = NULL, *tmprecv, *tmpbuf;
@@ -225,12 +226,12 @@ ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(
          * will overflow an int data type.
          * Fallback to the linear algorithm.
          */
-        return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount, dtype, op, comm, module);
+        return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount, dtype, op, comm, module, allocator);
     }
     ompi_datatype_type_extent(dtype, &extent);
     span = opal_datatype_span(&dtype->super, totalcount, &gap);
-    tmpbuf_raw = malloc(span);
-    tmprecv_raw = malloc(span);
+    tmpbuf_raw = COLL_BASE_ALLOC(allocator, span);
+    tmprecv_raw = COLL_BASE_ALLOC(allocator, span);
     if (NULL == tmpbuf_raw || NULL == tmprecv_raw) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -368,10 +369,8 @@ ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(
         ompi_datatype_destroy(&dtypesend);
     if (dtyperecv)
         ompi_datatype_destroy(&dtyperecv);
-    if (tmpbuf_raw)
-        free(tmpbuf_raw);
-    if (tmprecv_raw)
-        free(tmprecv_raw);
+    COLL_BASE_FREE(allocator, tmpbuf_raw);
+    COLL_BASE_FREE(allocator, tmprecv_raw);
     return err;
 }
 
@@ -406,7 +405,7 @@ int
 ompi_coll_base_reduce_scatter_block_intra_recursivehalving(
     const void *sbuf, void *rbuf, size_t rcount, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module)
+    mca_coll_base_module_t *module, mca_allocator_base_module_t *allocator)
 {
     char *tmprecv_raw = NULL, *tmpbuf_raw = NULL, *tmprecv, *tmpbuf;
     ptrdiff_t span, gap, totalcount, extent;
@@ -425,14 +424,14 @@ ompi_coll_base_reduce_scatter_block_intra_recursivehalving(
                      "coll:base:reduce_scatter_block_intra_recursivehalving: rank %d/%d "
                      "switching to basic reduce_scatter_block", rank, comm_size));
         return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount, dtype,
-                                                                op, comm, module);
+                                                                op, comm, module, allocator);
     }
 
     totalcount = comm_size * (size_t)rcount;
     ompi_datatype_type_extent(dtype, &extent);
     span = opal_datatype_span(&dtype->super, totalcount, &gap);
-    tmpbuf_raw = malloc(span);
-    tmprecv_raw = malloc(span);
+    tmpbuf_raw = COLL_BASE_ALLOC(allocator, span);
+    tmprecv_raw = COLL_BASE_ALLOC(allocator, span);
     if (NULL == tmpbuf_raw || NULL == tmprecv_raw) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -576,17 +575,15 @@ ompi_coll_base_reduce_scatter_block_intra_recursivehalving(
     }
 
 cleanup_and_return:
-    if (tmpbuf_raw)
-        free(tmpbuf_raw);
-    if (tmprecv_raw)
-        free(tmprecv_raw);
+    COLL_BASE_FREE(allocator, tmpbuf_raw);
+    COLL_BASE_FREE(allocator, tmprecv_raw);
     return err;
 }
 
 static int ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2(
     const void *sbuf, void *rbuf, size_t rcount, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module);
+    mca_coll_base_module_t *module, mca_allocator_base_module_t *allocator);
 
 /*
  * ompi_coll_base_reduce_scatter_block_intra_butterfly
@@ -648,7 +645,7 @@ int
 ompi_coll_base_reduce_scatter_block_intra_butterfly(
     const void *sbuf, void *rbuf, size_t rcount, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module)
+    mca_coll_base_module_t *module, mca_allocator_base_module_t *allocator)
 {
     char *tmpbuf[2] = {NULL, NULL}, *psend, *precv;
     ptrdiff_t span, gap, totalcount, extent;
@@ -665,14 +662,14 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly(
     if (!(comm_size & (comm_size - 1))) {
         /* Special case: comm_size is a power of two */
         return ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2(
-                   sbuf, rbuf, rcount, dtype, op, comm, module);
+                   sbuf, rbuf, rcount, dtype, op, comm, module, allocator);
     }
 
     totalcount = comm_size * (size_t)rcount;
     ompi_datatype_type_extent(dtype, &extent);
     span = opal_datatype_span(&dtype->super, totalcount, &gap);
-    tmpbuf[0] = malloc(span);
-    tmpbuf[1] = malloc(span);
+    tmpbuf[0] = COLL_BASE_ALLOC(allocator, span);
+    tmpbuf[1] = COLL_BASE_ALLOC(allocator, span);
     if (NULL == tmpbuf[0] || NULL == tmpbuf[1]) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -843,10 +840,8 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly(
     }
 
 cleanup_and_return:
-    if (tmpbuf[0])
-        free(tmpbuf[0]);
-    if (tmpbuf[1])
-        free(tmpbuf[1]);
+    COLL_BASE_FREE(allocator, tmpbuf[0]);
+    COLL_BASE_FREE(allocator, tmpbuf[1]);
     return err;
 }
 
@@ -895,7 +890,7 @@ static int
 ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2(
     const void *sbuf, void *rbuf, size_t rcount, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module)
+    mca_coll_base_module_t *module, mca_allocator_base_module_t *allocator)
 {
     char *tmpbuf[2] = {NULL, NULL}, *psend, *precv;
     ptrdiff_t span, gap, totalcount, extent;
@@ -909,8 +904,8 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2(
     totalcount = comm_size * (size_t)rcount;
     ompi_datatype_type_extent(dtype, &extent);
     span = opal_datatype_span(&dtype->super, totalcount, &gap);
-    tmpbuf[0] = malloc(span);
-    tmpbuf[1] = malloc(span);
+    tmpbuf[0] = COLL_BASE_ALLOC(allocator, span);
+    tmpbuf[1] = COLL_BASE_ALLOC(allocator, span);
     if (NULL == tmpbuf[0] || NULL == tmpbuf[1]) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -969,9 +964,7 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2(
     if (MPI_SUCCESS != err) { goto cleanup_and_return; }
 
 cleanup_and_return:
-    if (tmpbuf[0])
-        free(tmpbuf[0]);
-    if (tmpbuf[1])
-        free(tmpbuf[1]);
+    COLL_BASE_FREE(allocator, tmpbuf[0]);
+    COLL_BASE_FREE(allocator, tmpbuf[1]);
     return err;
 }
diff --git a/ompi/mca/coll/base/coll_base_scan.c b/ompi/mca/coll/base/coll_base_scan.c
index 9ac99ed255e..0d69ec8062f 100644
--- a/ompi/mca/coll/base/coll_base_scan.c
+++ b/ompi/mca/coll/base/coll_base_scan.c
@@ -157,7 +157,7 @@ ompi_coll_base_scan_intra_linear(const void *sbuf, void *rbuf, size_t count,
 int ompi_coll_base_scan_intra_recursivedoubling(
     const void *sendbuf, void *recvbuf, size_t count, struct ompi_datatype_t *datatype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module)
+    mca_coll_base_module_t *module, mca_allocator_base_module_t *allocator)
 {
     int err = MPI_SUCCESS;
     char *tmpsend_raw = NULL, *tmprecv_raw = NULL;
@@ -179,8 +179,8 @@ int ompi_coll_base_scan_intra_recursivedoubling(
 
     ptrdiff_t dsize, gap;
     dsize = opal_datatype_span(&datatype->super, count, &gap);
-    tmpsend_raw = malloc(dsize);
-    tmprecv_raw = malloc(dsize);
+    tmpsend_raw = COLL_BASE_ALLOC(allocator, dsize);
+    tmprecv_raw = COLL_BASE_ALLOC(allocator, dsize);
     if (NULL == tmpsend_raw || NULL == tmprecv_raw) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -222,9 +222,7 @@ int ompi_coll_base_scan_intra_recursivedoubling(
     }
 
 cleanup_and_return:
-    if (NULL != tmpsend_raw)
-        free(tmpsend_raw);
-    if (NULL != tmprecv_raw)
-        free(tmprecv_raw);
+    COLL_BASE_FREE(allocator, tmpsend_raw);
+    COLL_BASE_FREE(allocator, tmprecv_raw);
     return err;
 }
diff --git a/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c b/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c
index 5eb7adfda50..136255fe145 100644
--- a/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c
+++ b/ompi/mca/coll/basic/coll_basic_reduce_scatter_block.c
@@ -59,7 +59,7 @@ mca_coll_basic_reduce_scatter_block_intra(const void *sbuf, void *rbuf, size_t r
                                           struct ompi_communicator_t *comm,
                                           mca_coll_base_module_t *module)
 {
-    return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount, dtype, op, comm, module);
+    return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount, dtype, op, comm, module, NULL);
 }
 
 /*
diff --git a/ompi/mca/coll/tuned/coll_tuned.h b/ompi/mca/coll/tuned/coll_tuned.h
index 53bb8705aa0..6446821093c 100644
--- a/ompi/mca/coll/tuned/coll_tuned.h
+++ b/ompi/mca/coll/tuned/coll_tuned.h
@@ -115,7 +115,7 @@ int ompi_coll_tuned_allgatherv_intra_check_forced_init(coll_tuned_force_algorith
 int ompi_coll_tuned_allreduce_intra_dec_fixed(ALLREDUCE_ARGS);
 int ompi_coll_tuned_allreduce_intra_disjoint_dec_fixed(ALLREDUCE_ARGS);
 int ompi_coll_tuned_allreduce_intra_dec_dynamic(ALLREDUCE_ARGS);
-int ompi_coll_tuned_allreduce_intra_do_this(ALLREDUCE_ARGS, int algorithm, int faninout, int segsize);
+int ompi_coll_tuned_allreduce_intra_do_this(ALLREDUCE_ARGS, int algorithm, int faninout, int segsize, mca_allocator_base_module_t *allocator);
 int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 /* AlltoAll */
@@ -146,25 +146,25 @@ int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mc
 /* Gather */
 int ompi_coll_tuned_gather_intra_dec_fixed(GATHER_ARGS);
 int ompi_coll_tuned_gather_intra_dec_dynamic(GATHER_ARGS);
-int ompi_coll_tuned_gather_intra_do_this(GATHER_ARGS, int algorithm, int faninout, int segsize);
+int ompi_coll_tuned_gather_intra_do_this(GATHER_ARGS, int algorithm, int faninout, int segsize, mca_allocator_base_module_t *allocator);
 int ompi_coll_tuned_gather_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 /* Reduce */
 int ompi_coll_tuned_reduce_intra_dec_fixed(REDUCE_ARGS);
 int ompi_coll_tuned_reduce_intra_dec_dynamic(REDUCE_ARGS);
-int ompi_coll_tuned_reduce_intra_do_this(REDUCE_ARGS, int algorithm, int faninout, int segsize, int max_oustanding_reqs);
+int ompi_coll_tuned_reduce_intra_do_this(REDUCE_ARGS, int algorithm, int faninout, int segsize, int max_oustanding_reqs, mca_allocator_base_module_t *allocator);
 int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 /* Reduce_scatter */
 int ompi_coll_tuned_reduce_scatter_intra_dec_fixed(REDUCESCATTER_ARGS);
 int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(REDUCESCATTER_ARGS);
-int ompi_coll_tuned_reduce_scatter_intra_do_this(REDUCESCATTER_ARGS, int algorithm, int faninout, int segsize);
+int ompi_coll_tuned_reduce_scatter_intra_do_this(REDUCESCATTER_ARGS, int algorithm, int faninout, int segsize, mca_allocator_base_module_t *allocator);
 int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 /* Reduce_scatter_block */
 int ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed(REDUCESCATTERBLOCK_ARGS);
 int ompi_coll_tuned_reduce_scatter_block_intra_dec_dynamic(REDUCESCATTERBLOCK_ARGS);
-int ompi_coll_tuned_reduce_scatter_block_intra_do_this(REDUCESCATTERBLOCK_ARGS, int algorithm, int faninout, int segsize);
+int ompi_coll_tuned_reduce_scatter_block_intra_do_this(REDUCESCATTERBLOCK_ARGS, int algorithm, int faninout, int segsize, mca_allocator_base_module_t *allocator);
 int ompi_coll_tuned_reduce_scatter_block_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 /* Scatter */
@@ -176,13 +176,13 @@ int ompi_coll_tuned_scatter_intra_check_forced_init (coll_tuned_force_algorithm_
 /* Exscan */
 int ompi_coll_tuned_exscan_intra_dec_fixed(EXSCAN_ARGS);
 int ompi_coll_tuned_exscan_intra_dec_dynamic(EXSCAN_ARGS);
-int ompi_coll_tuned_exscan_intra_do_this(EXSCAN_ARGS, int algorithm);
+int ompi_coll_tuned_exscan_intra_do_this(EXSCAN_ARGS, int algorithm, mca_allocator_base_module_t *allocator);
 int ompi_coll_tuned_exscan_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 /* Scan */
 int ompi_coll_tuned_scan_intra_dec_fixed(SCAN_ARGS);
 int ompi_coll_tuned_scan_intra_dec_dynamic(SCAN_ARGS);
-int ompi_coll_tuned_scan_intra_do_this(SCAN_ARGS, int algorithm);
+int ompi_coll_tuned_scan_intra_do_this(SCAN_ARGS, int algorithm, mca_allocator_base_module_t *allocator);
 int ompi_coll_tuned_scan_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 struct mca_coll_tuned_component_t {
diff --git a/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c b/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c
index 9a63d8c5abb..0b38ae01f24 100644
--- a/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c
@@ -130,7 +130,8 @@ int ompi_coll_tuned_allreduce_intra_do_this(const void *sbuf, void *rbuf, size_t
                                             struct ompi_op_t *op,
                                             struct ompi_communicator_t *comm,
                                             mca_coll_base_module_t *module,
-                                            int algorithm, int faninout, int segsize)
+                                            int algorithm, int faninout, int segsize,
+                                            mca_allocator_base_module_t *allocator)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:allreduce_intra_do_this algorithm %d topo fan in/out %d segsize %d",
@@ -144,15 +145,15 @@ int ompi_coll_tuned_allreduce_intra_do_this(const void *sbuf, void *rbuf, size_t
     case (2):
         return ompi_coll_base_allreduce_intra_nonoverlapping(sbuf, rbuf, count, dtype, op, comm, module);
     case (3):
-        return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, comm, module);
+        return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, comm, module, allocator);
     case (4):
-        return ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype, op, comm, module);
+        return ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype, op, comm, module, allocator);
     case (5):
-        return ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf, count, dtype, op, comm, module, segsize);
+        return ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf, count, dtype, op, comm, module, segsize, allocator);
     case (6):
-        return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, comm, module);
+        return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, comm, module, allocator);
     case (7):
-        return ompi_coll_base_allreduce_intra_allgather_reduce(sbuf, rbuf, count, dtype, op, comm, module);
+        return ompi_coll_base_allreduce_intra_allgather_reduce(sbuf, rbuf, count, dtype, op, comm, module, allocator);
     } /* switch */
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:allreduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c b/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c
index 5d6a699b301..30e5fdb8b78 100644
--- a/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c
+++ b/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c
@@ -25,6 +25,8 @@
 
 #include "mpi.h"
 #include "ompi/constants.h"
+#include "opal/mca/accelerator/accelerator.h"
+#include "opal/mca/accelerator/base/base.h"
 #include "ompi/datatype/ompi_datatype.h"
 #include "ompi/communicator/communicator.h"
 #include "ompi/mca/coll/base/base.h"
@@ -63,12 +65,16 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (const void *sbuf, void *rbuf, size_
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "ompi_coll_tuned_allreduce_intra_dec_dynamic"));
 
+    /* Scratch buffers are used for reductions (ompi_op_reduce); device-side
+     * reduction is not yet supported, so always use the host allocator (NULL). */
+
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[ALLREDUCE].algorithm) {
         return ompi_coll_tuned_allreduce_intra_do_this(sbuf, rbuf, count, dtype, op, comm, module,
                                                        tuned_module->user_forced[ALLREDUCE].algorithm,
                                                        tuned_module->user_forced[ALLREDUCE].tree_fanout,
-                                                       tuned_module->user_forced[ALLREDUCE].segsize);
+                                                       tuned_module->user_forced[ALLREDUCE].segsize,
+                                                       NULL);
     }
 
     /* check to see if we have some filebased rules */
@@ -87,7 +93,7 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (const void *sbuf, void *rbuf, size_
             /* we have found a valid choice from the file based rules for this message size */
             return ompi_coll_tuned_allreduce_intra_do_this (sbuf, rbuf, count, dtype, op,
                                                             comm, module,
-                                                            alg, faninout, segsize);
+                                                            alg, faninout, segsize, NULL);
         } /* found a method */
     } /*end if any com rules to check */
 
@@ -317,6 +323,9 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( const void *sbuf, void *rbuf,
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_intra_dec_dynamic"));
 
+    /* Scratch buffers are used for reductions (ompi_op_reduce); device-side
+     * reduction is not yet supported, so always use the host allocator (NULL). */
+
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[REDUCE].algorithm) {
         return ompi_coll_tuned_reduce_intra_do_this(sbuf, rbuf, count, dtype,
@@ -324,7 +333,8 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( const void *sbuf, void *rbuf,
                                                     tuned_module->user_forced[REDUCE].algorithm,
                                                     tuned_module->user_forced[REDUCE].chain_fanout,
                                                     tuned_module->user_forced[REDUCE].segsize,
-                                                    tuned_module->user_forced[REDUCE].max_requests);
+                                                    tuned_module->user_forced[REDUCE].max_requests,
+                                                    NULL);
     }
 
     /* check to see if we have some filebased rules */
@@ -345,7 +355,7 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( const void *sbuf, void *rbuf,
             return  ompi_coll_tuned_reduce_intra_do_this (sbuf, rbuf, count, dtype,
                                                           op, root, comm, module,
                                                           alg, faninout,
-                                                          segsize, max_requests);
+                                                          segsize, max_requests, NULL);
         } /* found a method */
     } /*end if any com rules to check */
 
@@ -374,13 +384,17 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(const void *sbuf, void *rbu
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_scatter_intra_dec_dynamic"));
 
+    /* Scratch buffers are used for reductions (ompi_op_reduce); device-side
+     * reduction is not yet supported, so always use the host allocator (NULL). */
+
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[REDUCESCATTER].algorithm) {
         return ompi_coll_tuned_reduce_scatter_intra_do_this(sbuf, rbuf, rcounts, dtype,
                                                             op, comm, module,
                                                             tuned_module->user_forced[REDUCESCATTER].algorithm,
                                                             tuned_module->user_forced[REDUCESCATTER].chain_fanout,
-                                                            tuned_module->user_forced[REDUCESCATTER].segsize);
+                                                            tuned_module->user_forced[REDUCESCATTER].segsize,
+                                                            NULL);
     }
 
     /* check to see if we have some filebased rules */
@@ -401,7 +415,7 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(const void *sbuf, void *rbu
             /* we have found a valid choice from the file based rules for this message size */
             return  ompi_coll_tuned_reduce_scatter_intra_do_this (sbuf, rbuf, rcounts, dtype,
                                                                   op, comm, module,
-                                                                  alg, faninout, segsize);
+                                                                  alg, faninout, segsize, NULL);
         } /* found a method */
     } /*end if any com rules to check */
 
@@ -430,13 +444,17 @@ int ompi_coll_tuned_reduce_scatter_block_intra_dec_dynamic(const void *sbuf, voi
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_scatter_block_intra_dec_dynamic"));
 
+    /* Scratch buffers are used for reductions (ompi_op_reduce); device-side
+     * reduction is not yet supported, so always use the host allocator (NULL). */
+
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[REDUCESCATTERBLOCK].algorithm) {
         return ompi_coll_tuned_reduce_scatter_block_intra_do_this(sbuf, rbuf, rcount, dtype,
                                                                   op, comm, module,
                                                                   tuned_module->user_forced[REDUCESCATTERBLOCK].algorithm,
                                                                   tuned_module->user_forced[REDUCESCATTERBLOCK].chain_fanout,
-                                                                  tuned_module->user_forced[REDUCESCATTERBLOCK].segsize);
+                                                                  tuned_module->user_forced[REDUCESCATTERBLOCK].segsize,
+                                                                  NULL);
     }
 
     /* check to see if we have some filebased rules */
@@ -456,7 +474,7 @@ int ompi_coll_tuned_reduce_scatter_block_intra_dec_dynamic(const void *sbuf, voi
             /* we have found a valid choice from the file based rules for this message size */
             return  ompi_coll_tuned_reduce_scatter_block_intra_do_this (sbuf, rbuf, rcount, dtype,
                                                                         op, comm, module,
-                                                                        alg, faninout, segsize);
+                                                                        alg, faninout, segsize, NULL);
         } /* found a method */
     } /* end if any com rules to check */
 
@@ -600,10 +618,23 @@ int ompi_coll_tuned_gather_intra_dec_dynamic(const void *sbuf, size_t scount,
                                              mca_coll_base_module_t *module)
 {
     mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
+    mca_allocator_base_module_t *allocator = NULL;
 
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
                  "ompi_coll_tuned_gather_intra_dec_dynamic"));
 
+    /* The scratch buffer is used for data movement only (no ompi_op_reduce).
+     * Use the device allocator if sbuf (unless MPI_IN_PLACE) or rbuf is on a device. */
+    {
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE &&
+             opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            allocator = opal_accelerator_base_get_device_allocator(_dev_id);
+        }
+    }
+
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[GATHER].algorithm) {
         return ompi_coll_tuned_gather_intra_do_this(sbuf, scount, sdtype,
@@ -611,7 +642,8 @@ int ompi_coll_tuned_gather_intra_dec_dynamic(const void *sbuf, size_t scount,
                                                     root, comm, module,
                                                     tuned_module->user_forced[GATHER].algorithm,
                                                     tuned_module->user_forced[GATHER].tree_fanout,
-                                                    tuned_module->user_forced[GATHER].segsize);
+                                                    tuned_module->user_forced[GATHER].segsize,
+                                                    allocator);
     }
 
     /**
@@ -633,7 +665,7 @@ int ompi_coll_tuned_gather_intra_dec_dynamic(const void *sbuf, size_t scount,
             return ompi_coll_tuned_gather_intra_do_this (sbuf, scount, sdtype,
                                                          rbuf, rcount, rdtype,
                                                          root, comm, module,
-                                                         alg, faninout, segsize);
+                                                         alg, faninout, segsize, allocator);
         } /* found a method */
     } /*end if any com rules to check */
 
@@ -703,11 +735,15 @@ int ompi_coll_tuned_exscan_intra_dec_dynamic(const void *sbuf, void* rbuf, size_
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
                  "ompi_coll_tuned_exscan_intra_dec_dynamic"));
 
+    /* Scratch buffers are used for reductions (ompi_op_reduce); device-side
+     * reduction is not yet supported, so always use the host allocator (NULL). */
+
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[EXSCAN].algorithm) {
         return ompi_coll_tuned_exscan_intra_do_this(sbuf, rbuf, count, dtype,
                                                     op, comm, module,
-                                                    tuned_module->user_forced[EXSCAN].algorithm);
+                                                    tuned_module->user_forced[EXSCAN].algorithm,
+                                                    NULL);
     }
 
     /**
@@ -728,7 +764,7 @@ int ompi_coll_tuned_exscan_intra_dec_dynamic(const void *sbuf, void* rbuf, size_
             /* we have found a valid choice from the file based rules for this message size */
             return ompi_coll_tuned_exscan_intra_do_this (sbuf, rbuf, count, dtype,
                                                          op, comm, module,
-                                                         alg);
+                                                         alg, NULL);
         } /* found a method */
     } /*end if any com rules to check */
 
@@ -747,11 +783,15 @@ int ompi_coll_tuned_scan_intra_dec_dynamic(const void *sbuf, void* rbuf, size_t
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
                  "ompi_coll_tuned_scan_intra_dec_dynamic"));
 
+    /* Scratch buffers are used for reductions (ompi_op_reduce); device-side
+     * reduction is not yet supported, so always use the host allocator (NULL). */
+
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[SCAN].algorithm) {
         return ompi_coll_tuned_scan_intra_do_this(sbuf, rbuf, count, dtype,
                                                   op, comm, module,
-                                                  tuned_module->user_forced[SCAN].algorithm);
+                                                  tuned_module->user_forced[SCAN].algorithm,
+                                                  NULL);
     }
 
     /**
@@ -772,7 +812,7 @@ int ompi_coll_tuned_scan_intra_dec_dynamic(const void *sbuf, void* rbuf, size_t
             /* we have found a valid choice from the file based rules for this message size */
             return ompi_coll_tuned_scan_intra_do_this (sbuf, rbuf, count, dtype,
                                                        op, comm, module,
-                                                       alg);
+                                                       alg, NULL);
         } /* found a method */
     } /*end if any com rules to check */
 
diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c
index 3b0077c9bcc..b66a71563fe 100644
--- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c
+++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c
@@ -29,6 +29,8 @@
 
 #include "mpi.h"
 #include "opal/util/bit_ops.h"
+#include "opal/mca/accelerator/accelerator.h"
+#include "opal/mca/accelerator/base/base.h"
 #include "ompi/datatype/ompi_datatype.h"
 #include "ompi/communicator/communicator.h"
 #include "ompi/mca/coll/coll.h"
@@ -214,8 +216,10 @@ ompi_coll_tuned_allreduce_intra_dec_fixed(const void *sbuf, void *rbuf, size_t c
         }
     }
 
+    /* Scratch buffers are used for reductions (ompi_op_reduce); device-side
+     * reduction is not yet supported, so always use the host allocator. */
     return ompi_coll_tuned_allreduce_intra_do_this (sbuf, rbuf, count, dtype, op,
-                                                    comm, module, alg, 0, 0);
+                                                    comm, module, alg, 0, 0, NULL);
 }
 
 
@@ -1073,10 +1077,12 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( const void *sendbuf, void *recvbuf,
         }
     }
 
+    /* Scratch buffers are used for reductions (ompi_op_reduce); device-side
+     * reduction is not yet supported, so always use the host allocator. */
     int faninout = 2;
     return  ompi_coll_tuned_reduce_intra_do_this (sendbuf, recvbuf, count, datatype,
                                                   op, root, comm, module,
-                                                  alg, faninout, 0, 0);
+                                                  alg, faninout, 0, 0, NULL);
 }
 
 /*
@@ -1223,9 +1229,11 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( const void *sbuf, void *rbuf
         }
     }
 
+    /* Scratch buffers are used for reductions (ompi_op_reduce); device-side
+     * reduction is not yet supported, so always use the host allocator. */
     return  ompi_coll_tuned_reduce_scatter_intra_do_this (sbuf, rbuf, rcounts, dtype,
                                                           op, comm, module,
-                                                          alg, 0, 0);
+                                                          alg, 0, 0, NULL);
 }
 
 /*
@@ -1344,9 +1352,11 @@ int ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed(const void *sbuf, void
         }
     }
 
+    /* Scratch buffers are used for reductions (ompi_op_reduce); device-side
+     * reduction is not yet supported, so always use the host allocator. */
     return  ompi_coll_tuned_reduce_scatter_block_intra_do_this (sbuf, rbuf, rcount, dtype,
                                                                 op, comm, module,
-                                                                alg, 0, 0);
+                                                                alg, 0, 0, NULL);
 }
 
 /*
@@ -1656,6 +1666,7 @@ int ompi_coll_tuned_gather_intra_dec_fixed(const void *sbuf, size_t scount,
 {
     int communicator_size, alg, rank;
     size_t dsize, total_dsize;
+    mca_allocator_base_module_t *allocator = NULL;
 
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
                  "ompi_coll_tuned_gather_intra_dec_fixed"));
@@ -1720,10 +1731,21 @@ int ompi_coll_tuned_gather_intra_dec_fixed(const void *sbuf, size_t scount,
         alg = 2;
     }
 
+    /* Scratch buffer is used for data movement only (no ompi_op_reduce).
+     * Use device allocator when user buffers are on device. */
+    {
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE &&
+             opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            allocator = opal_accelerator_base_get_device_allocator(_dev_id);
+        }
+    }
     return ompi_coll_tuned_gather_intra_do_this (sbuf, scount, sdtype,
                                                  rbuf, rcount, rdtype,
                                                  root, comm, module,
-                                                 alg, 0, 0);
+                                                 alg, 0, 0, allocator);
 }
 
 /*
diff --git a/ompi/mca/coll/tuned/coll_tuned_exscan_decision.c b/ompi/mca/coll/tuned/coll_tuned_exscan_decision.c
index 48288c5d7d7..781e2d240a8 100644
--- a/ompi/mca/coll/tuned/coll_tuned_exscan_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_exscan_decision.c
@@ -93,7 +93,7 @@ int ompi_coll_tuned_exscan_intra_do_this(const void *sbuf, void* rbuf, size_t co
                                          struct ompi_op_t *op,
                                          struct ompi_communicator_t *comm,
                                          mca_coll_base_module_t *module,
-                                         int algorithm)
+                                         int algorithm, mca_allocator_base_module_t *allocator)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:exscan_intra_do_this selected algorithm %d",
@@ -104,7 +104,7 @@ int ompi_coll_tuned_exscan_intra_do_this(const void *sbuf, void* rbuf, size_t co
     case (1):  return ompi_coll_base_exscan_intra_linear(sbuf, rbuf, count, dtype,
                                                          op, comm, module);
     case (2):  return ompi_coll_base_exscan_intra_recursivedoubling(sbuf, rbuf, count, dtype,
-                                                                    op, comm, module);
+                                                                    op, comm, module, allocator);
     } /* switch */
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:exscan_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
diff --git a/ompi/mca/coll/tuned/coll_tuned_gather_decision.c b/ompi/mca/coll/tuned/coll_tuned_gather_decision.c
index d356202a3bf..1845f32a00b 100644
--- a/ompi/mca/coll/tuned/coll_tuned_gather_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_gather_decision.c
@@ -130,7 +130,8 @@ ompi_coll_tuned_gather_intra_do_this(const void *sbuf, size_t scount,
                                      int root,
                                      struct ompi_communicator_t *comm,
                                      mca_coll_base_module_t *module,
-                                     int algorithm, int faninout, int segsize)
+                                     int algorithm, int faninout, int segsize,
+                                     mca_allocator_base_module_t *allocator)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
                  "coll:tuned:gather_intra_do_this selected algorithm %d topo faninout %d segsize %d",
@@ -148,7 +149,7 @@ ompi_coll_tuned_gather_intra_do_this(const void *sbuf, size_t scount,
     case (2):
         return ompi_coll_base_gather_intra_binomial(sbuf, scount, sdtype,
                                                     rbuf, rcount, rdtype,
-                                                    root, comm, module);
+                                                    root, comm, module, allocator);
     case (3):
         return ompi_coll_base_gather_intra_linear_sync(sbuf, scount, sdtype,
                                                        rbuf, rcount, rdtype,
diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c b/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c
index 6ae3c00f7d9..2d16e88ed06 100644
--- a/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c
@@ -154,7 +154,8 @@ int ompi_coll_tuned_reduce_intra_do_this(const void *sbuf, void* rbuf, size_t co
                                          struct ompi_communicator_t *comm,
                                          mca_coll_base_module_t *module,
                                          int algorithm, int faninout,
-                                         int segsize, int max_requests )
+                                         int segsize, int max_requests,
+                                         mca_allocator_base_module_t *allocator)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_intra_do_this selected algorithm %d topo faninout %d segsize %d",
@@ -167,25 +168,31 @@ int ompi_coll_tuned_reduce_intra_do_this(const void *sbuf, void* rbuf, size_t co
                                                                op, root, comm, module);
     case (2):  return ompi_coll_base_reduce_intra_chain(sbuf, rbuf, count, dtype,
                                                         op, root, comm, module,
-                                                        segsize, faninout, max_requests);
+                                                        segsize, faninout, max_requests,
+                                                        allocator);
     case (3):  return ompi_coll_base_reduce_intra_pipeline(sbuf, rbuf, count, dtype,
                                                            op, root, comm, module,
-                                                           segsize, max_requests);
+                                                           segsize, max_requests,
+                                                           allocator);
     case (4):  return ompi_coll_base_reduce_intra_binary(sbuf, rbuf, count, dtype,
                                                          op, root, comm, module,
-                                                         segsize, max_requests);
+                                                         segsize, max_requests,
+                                                         allocator);
     case (5):  return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype,
                                                            op, root, comm, module,
-                                                           segsize, max_requests);
+                                                           segsize, max_requests,
+                                                           allocator);
     case (6):  return ompi_coll_base_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype,
                                                                   op, root, comm, module,
-                                                                  segsize, max_requests);
+                                                                  segsize, max_requests,
+                                                                  allocator);
     case (7):  return ompi_coll_base_reduce_intra_redscat_gather(sbuf, rbuf, count, dtype,
-                                                                  op, root, comm, module);
+                                                                  op, root, comm, module,
+                                                                  allocator);
     case (8):  return ompi_coll_base_reduce_intra_knomial(sbuf, rbuf, count, dtype,
                                                           op, root, comm, module,
                                                           segsize, max_requests,
-                                                          faninout);
+                                                          faninout, allocator);
     } /* switch */
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_block_decision.c b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_block_decision.c
index f4f6bdb7590..c9e00e62585 100644
--- a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_block_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_block_decision.c
@@ -123,7 +123,8 @@ int ompi_coll_tuned_reduce_scatter_block_intra_do_this(const void *sbuf, void *r
                                                        struct ompi_op_t *op,
                                                        struct ompi_communicator_t *comm,
                                                        mca_coll_base_module_t *module,
-                                                       int algorithm, int faninout, int segsize)
+                                                       int algorithm, int faninout, int segsize,
+                                                       mca_allocator_base_module_t *allocator)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_scatter_block_intra_do_this selected algorithm %d topo faninout %d segsize %d",
@@ -133,13 +134,16 @@ int ompi_coll_tuned_reduce_scatter_block_intra_do_this(const void *sbuf, void *r
     case (0): return ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed(sbuf, rbuf, rcount,
                                                                           dtype, op, comm, module);
     case (1): return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount,
-                                                                      dtype, op, comm, module);
+                                                                      dtype, op, comm, module,
+                                                                      allocator);
     case (2): return ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(sbuf, rbuf, rcount,
-                                                                                 dtype, op, comm, module);
+                                                                                 dtype, op, comm, module,
+                                                                                 allocator);
     case (3): return ompi_coll_base_reduce_scatter_block_intra_recursivehalving(sbuf, rbuf, rcount,
-                                                                                dtype, op, comm, module);
+                                                                                dtype, op, comm, module,
+                                                                                allocator);
     case (4): return ompi_coll_base_reduce_scatter_block_intra_butterfly(sbuf, rbuf, rcount, dtype, op, comm,
-                                                                         module);
+                                                                         module, allocator);
     } /* switch */
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_scatter_block_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c
index 16747598b6e..6146a71e849 100644
--- a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c
@@ -130,7 +130,8 @@ int ompi_coll_tuned_reduce_scatter_intra_do_this(const void *sbuf, void* rbuf,
                                                  struct ompi_op_t *op,
                                                  struct ompi_communicator_t *comm,
                                                  mca_coll_base_module_t *module,
-                                                 int algorithm, int faninout, int segsize)
+                                                 int algorithm, int faninout, int segsize,
+                                                 mca_allocator_base_module_t *allocator)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d",
@@ -140,13 +141,17 @@ int ompi_coll_tuned_reduce_scatter_intra_do_this(const void *sbuf, void* rbuf,
     case (0): return ompi_coll_tuned_reduce_scatter_intra_dec_fixed(sbuf, rbuf, rcounts,
                                                                     dtype, op, comm, module);
     case (1): return ompi_coll_base_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
-                                                                        dtype, op, comm, module);
+                                                                        dtype, op, comm, module,
+                                                                        allocator);
     case (2): return ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
-                                                                                dtype, op, comm, module);
+                                                                                dtype, op, comm, module,
+                                                                                allocator);
     case (3): return ompi_coll_base_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
-                                                              dtype, op, comm, module);
+                                                              dtype, op, comm, module,
+                                                              allocator);
     case (4): return ompi_coll_base_reduce_scatter_intra_butterfly(sbuf, rbuf, rcounts,
-                                                                   dtype, op, comm, module);
+                                                                   dtype, op, comm, module,
+                                                                   allocator);
     } /* switch */
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
diff --git a/ompi/mca/coll/tuned/coll_tuned_scan_decision.c b/ompi/mca/coll/tuned/coll_tuned_scan_decision.c
index 903e76c4694..d3db038a550 100644
--- a/ompi/mca/coll/tuned/coll_tuned_scan_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_scan_decision.c
@@ -93,7 +93,7 @@ int ompi_coll_tuned_scan_intra_do_this(const void *sbuf, void* rbuf, size_t coun
                                          struct ompi_op_t *op,
                                          struct ompi_communicator_t *comm,
                                          mca_coll_base_module_t *module,
-                                         int algorithm)
+                                         int algorithm, mca_allocator_base_module_t *allocator)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:scan_intra_do_this selected algorithm %d",
@@ -104,7 +104,7 @@ int ompi_coll_tuned_scan_intra_do_this(const void *sbuf, void* rbuf, size_t coun
     case (1):  return ompi_coll_base_scan_intra_linear(sbuf, rbuf, count, dtype,
                                                        op, comm, module);
     case (2):  return ompi_coll_base_scan_intra_recursivedoubling(sbuf, rbuf, count, dtype,
-                                                                  op, comm, module);
+                                                                  op, comm, module, allocator);
     } /* switch */
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:scan_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
diff --git a/opal/mca/accelerator/base/accelerator_base_frame.c b/opal/mca/accelerator/base/accelerator_base_frame.c
index 55e13f3773b..6cfb135e556 100644
--- a/opal/mca/accelerator/base/accelerator_base_frame.c
+++ b/opal/mca/accelerator/base/accelerator_base_frame.c
@@ -18,6 +18,9 @@
 #include "opal/mca/accelerator/base/base.h"
 #include "opal/mca/base/base.h"
 #include "opal/mca/mca.h"
+#include "opal/mca/allocator/allocator.h"
+#include "opal/mca/allocator/bucket/allocator_bucket_alloc.h"
+#include "opal/mca/threads/mutex.h"
 
 /*
  * The following file was created by configure.  It contains extern
@@ -30,6 +33,70 @@
 opal_accelerator_base_module_t opal_accelerator = {0};
 opal_accelerator_base_component_t opal_accelerator_base_selected_component = {{0}};
 
+#define OPAL_ACCELERATOR_MAX_DEVICES 16
+#define OPAL_ACCELERATOR_ALLOC_NUM_BUCKETS 8
+
+static mca_allocator_base_module_t *opal_accel_device_allocators[OPAL_ACCELERATOR_MAX_DEVICES];
+static opal_mutex_t opal_accel_alloc_lock = OPAL_MUTEX_STATIC_INIT;
+
+typedef struct {
+    int dev_id;
+} opal_accel_alloc_ctx_t;
+
+static void *opal_accel_seg_alloc(void *ctx, size_t *size)
+{
+    opal_accel_alloc_ctx_t *ac = (opal_accel_alloc_ctx_t *)ctx;
+    void *ptr = NULL;
+    if (OPAL_SUCCESS != opal_accelerator.mem_alloc(ac->dev_id, &ptr, *size)) {
+        return NULL;
+    }
+    return ptr;
+}
+
+static void opal_accel_seg_free(void *ctx, void *seg)
+{
+    opal_accel_alloc_ctx_t *ac = (opal_accel_alloc_ctx_t *)ctx;
+    opal_accelerator.mem_release(ac->dev_id, seg);
+}
+
+mca_allocator_base_module_t *
+opal_accelerator_base_get_device_allocator(int dev_id)
+{
+    mca_allocator_bucket_t *bucket;
+    opal_accel_alloc_ctx_t *ctx;
+
+    if (dev_id < 0 || dev_id >= OPAL_ACCELERATOR_MAX_DEVICES) {
+        return NULL;
+    }
+    if (NULL == opal_accelerator.mem_alloc) {
+        return NULL;
+    }
+    if (opal_accel_device_allocators[dev_id] != NULL) {
+        return opal_accel_device_allocators[dev_id];
+    }
+
+    OPAL_THREAD_LOCK(&opal_accel_alloc_lock);
+    if (opal_accel_device_allocators[dev_id] == NULL) {
+        ctx = (opal_accel_alloc_ctx_t *)malloc(sizeof(*ctx));
+        if (NULL == ctx) {
+            OPAL_THREAD_UNLOCK(&opal_accel_alloc_lock);
+            return NULL;
+        }
+        ctx->dev_id = dev_id;
+        bucket = mca_allocator_bucket_init(NULL, OPAL_ACCELERATOR_ALLOC_NUM_BUCKETS,
+                                           opal_accel_seg_alloc, opal_accel_seg_free);
+        if (NULL == bucket) {
+            free(ctx);
+            OPAL_THREAD_UNLOCK(&opal_accel_alloc_lock);
+            return NULL;
+        }
+        bucket->super.alc_context = ctx;
+        opal_accel_device_allocators[dev_id] = &bucket->super;
+    }
+    OPAL_THREAD_UNLOCK(&opal_accel_alloc_lock);
+    return opal_accel_device_allocators[dev_id];
+}
+
 static int opal_accelerator_base_frame_register(mca_base_register_flag_t flags)
 {
     return OPAL_SUCCESS;
@@ -37,6 +104,14 @@ static int opal_accelerator_base_frame_register(mca_base_register_flag_t flags)
 
 static int opal_accelerator_base_frame_close(void)
 {
+    for (int i = 0; i < OPAL_ACCELERATOR_MAX_DEVICES; i++) {
+        if (opal_accel_device_allocators[i] != NULL) {
+            opal_accel_alloc_ctx_t *ctx = (opal_accel_alloc_ctx_t *)opal_accel_device_allocators[i]->alc_context;
+            opal_accel_device_allocators[i]->alc_finalize(opal_accel_device_allocators[i]);
+            free(ctx);
+            opal_accel_device_allocators[i] = NULL;
+        }
+    }
     return mca_base_framework_components_close(&opal_accelerator_base_framework, NULL);
 }
 
diff --git a/opal/mca/accelerator/base/base.h b/opal/mca/accelerator/base/base.h
index e5922032ea8..94892d15c90 100644
--- a/opal/mca/accelerator/base/base.h
+++ b/opal/mca/accelerator/base/base.h
@@ -20,6 +20,7 @@
 #include "opal/mca/accelerator/accelerator.h"
 #include "opal/mca/base/mca_base_framework.h"
 #include "opal/mca/mca.h"
+#include "opal/mca/allocator/allocator.h"
 
 
 BEGIN_C_DECLS
@@ -33,6 +34,14 @@ OPAL_DECLSPEC int opal_accelerator_base_select(void);
 
 OPAL_DECLSPEC extern opal_accelerator_base_component_t opal_accelerator_base_selected_component;
 
+/**
+ * Return a pooled allocator for device memory on the given device.
+ * Created lazily and cached until the accelerator framework is closed.
+ * Returns NULL if no accelerator is available, dev_id is invalid, or creation fails.
+ */
+OPAL_DECLSPEC mca_allocator_base_module_t *
+opal_accelerator_base_get_device_allocator(int dev_id);
+
 END_C_DECLS
 
 #endif

From d404a04c5ccb70610cfa5a597edcd35f2a38d3db Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Sun, 15 Mar 2026 16:01:51 -0400
Subject: [PATCH 02/13] accelerator/base: switch device pool to basic allocator
 with segment tracking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the bucket allocator with the basic (first-fit + coalescing)
allocator for per-device GPU scratch buffer pools.  The basic allocator
splits large free blocks to serve smaller requests and merges adjacent
free blocks on release, giving good reuse across the varying scratch
buffer sizes produced by collective algorithms.

The per-device allocator array is now heap-allocated lazily on the first
call to opal_accelerator_base_get_device_allocator, sized to the actual
device count from opal_accelerator.num_devices().

The basic allocator's finalize does not call seg_free, so GPU segments
would otherwise leak.  Each GPU segment allocated via seg_alloc is now
recorded in a per-context opal_list_t.  On framework close, the list is
drained first — calling opal_accelerator.mem_release on every segment —
before alc_finalize cleans up the allocator's internal structures.

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../accelerator/base/accelerator_base_frame.c | 135 ++++++++++++++----
 1 file changed, 108 insertions(+), 27 deletions(-)

diff --git a/opal/mca/accelerator/base/accelerator_base_frame.c b/opal/mca/accelerator/base/accelerator_base_frame.c
index 6cfb135e556..bbcfc84b37c 100644
--- a/opal/mca/accelerator/base/accelerator_base_frame.c
+++ b/opal/mca/accelerator/base/accelerator_base_frame.c
@@ -19,7 +19,7 @@
 #include "opal/mca/base/base.h"
 #include "opal/mca/mca.h"
 #include "opal/mca/allocator/allocator.h"
-#include "opal/mca/allocator/bucket/allocator_bucket_alloc.h"
+#include "opal/mca/allocator/basic/allocator_basic.h"
 #include "opal/mca/threads/mutex.h"
 
 /*
@@ -33,66 +33,130 @@
 opal_accelerator_base_module_t opal_accelerator = {0};
 opal_accelerator_base_component_t opal_accelerator_base_selected_component = {{0}};
 
-#define OPAL_ACCELERATOR_MAX_DEVICES 16
-#define OPAL_ACCELERATOR_ALLOC_NUM_BUCKETS 8
-
-static mca_allocator_base_module_t *opal_accel_device_allocators[OPAL_ACCELERATOR_MAX_DEVICES];
+/* Per-device allocator pool — allocated lazily to num_devices on first use. */
+static mca_allocator_base_module_t **opal_accel_device_allocators = NULL;
+static int opal_accel_num_devices = 0;
 static opal_mutex_t opal_accel_alloc_lock = OPAL_MUTEX_STATIC_INIT;
 
+/*
+ * Tracks a single GPU segment returned by opal_accelerator.mem_alloc so it
+ * can be released on cleanup.  The basic allocator never calls seg_free
+ * (its compact is a no-op and its finalize does not release segments), so
+ * we keep our own list instead of relying on it.
+ */
+struct opal_accel_alloc_seg_t {
+    opal_list_item_t super;
+    void *ptr;
+};
+typedef struct opal_accel_alloc_seg_t opal_accel_alloc_seg_t;
+OBJ_CLASS_INSTANCE(opal_accel_alloc_seg_t, opal_list_item_t, NULL, NULL);
+
 typedef struct {
     int dev_id;
+    opal_list_t segs; /* every GPU segment allocated via seg_alloc */
 } opal_accel_alloc_ctx_t;
 
+/*
+ * seg_alloc is called (under the basic allocator's internal lock) whenever the
+ * free list has no block large enough.  Record each new GPU segment so it can
+ * be released on cleanup.
+ */
 static void *opal_accel_seg_alloc(void *ctx, size_t *size)
 {
-    opal_accel_alloc_ctx_t *ac = (opal_accel_alloc_ctx_t *)ctx;
+    opal_accel_alloc_ctx_t *ac = (opal_accel_alloc_ctx_t *) ctx;
+    opal_accel_alloc_seg_t *seg;
     void *ptr = NULL;
+
     if (OPAL_SUCCESS != opal_accelerator.mem_alloc(ac->dev_id, &ptr, *size)) {
         return NULL;
     }
+
+    seg = OBJ_NEW(opal_accel_alloc_seg_t);
+    if (OPAL_LIKELY(NULL != seg)) {
+        seg->ptr = ptr;
+        opal_list_append(&ac->segs, &seg->super);
+    }
     return ptr;
 }
 
+/* seg_free is wired into the allocator API but never invoked during normal
+ * operation (basic allocator compact is a no-op).  Cleanup is handled
+ * explicitly in opal_accelerator_base_frame_close via the segs list. */
 static void opal_accel_seg_free(void *ctx, void *seg)
 {
-    opal_accel_alloc_ctx_t *ac = (opal_accel_alloc_ctx_t *)ctx;
-    opal_accelerator.mem_release(ac->dev_id, seg);
+    (void) ctx;
+    (void) seg;
 }
 
 mca_allocator_base_module_t *
 opal_accelerator_base_get_device_allocator(int dev_id)
 {
-    mca_allocator_bucket_t *bucket;
+    mca_allocator_base_module_t *alloc;
     opal_accel_alloc_ctx_t *ctx;
 
-    if (dev_id < 0 || dev_id >= OPAL_ACCELERATOR_MAX_DEVICES) {
-        return NULL;
-    }
-    if (NULL == opal_accelerator.mem_alloc) {
+    if (dev_id < 0 || NULL == opal_accelerator.mem_alloc) {
         return NULL;
     }
-    if (opal_accel_device_allocators[dev_id] != NULL) {
+
+    /* Fast path: array already sized and slot already filled. */
+    if (NULL != opal_accel_device_allocators
+        && dev_id < opal_accel_num_devices
+        && NULL != opal_accel_device_allocators[dev_id]) {
         return opal_accel_device_allocators[dev_id];
     }
 
     OPAL_THREAD_LOCK(&opal_accel_alloc_lock);
-    if (opal_accel_device_allocators[dev_id] == NULL) {
-        ctx = (opal_accel_alloc_ctx_t *)malloc(sizeof(*ctx));
+
+    /* Lazily allocate the per-device array on first call. */
+    if (NULL == opal_accel_device_allocators) {
+        int num_devices = 0;
+        if (OPAL_SUCCESS != opal_accelerator.num_devices(&num_devices) || num_devices <= 0) {
+            OPAL_THREAD_UNLOCK(&opal_accel_alloc_lock);
+            return NULL;
+        }
+        opal_accel_device_allocators = calloc(num_devices,
+                                              sizeof(*opal_accel_device_allocators));
+        if (NULL == opal_accel_device_allocators) {
+            OPAL_THREAD_UNLOCK(&opal_accel_alloc_lock);
+            return NULL;
+        }
+        opal_accel_num_devices = num_devices;
+    }
+
+    if (dev_id >= opal_accel_num_devices) {
+        OPAL_THREAD_UNLOCK(&opal_accel_alloc_lock);
+        return NULL;
+    }
+
+    if (NULL == opal_accel_device_allocators[dev_id]) {
+        ctx = (opal_accel_alloc_ctx_t *) malloc(sizeof(*ctx));
         if (NULL == ctx) {
             OPAL_THREAD_UNLOCK(&opal_accel_alloc_lock);
             return NULL;
         }
         ctx->dev_id = dev_id;
-        bucket = mca_allocator_bucket_init(NULL, OPAL_ACCELERATOR_ALLOC_NUM_BUCKETS,
-                                           opal_accel_seg_alloc, opal_accel_seg_free);
-        if (NULL == bucket) {
+        OBJ_CONSTRUCT(&ctx->segs, opal_list_t);
+        /*
+         * Use the basic (first-fit + coalescing) allocator rather than the
+         * bucket allocator.  When a large block is freed it can be split to
+         * serve a smaller future request, and adjacent free blocks are merged
+         * back together, giving good reuse across the varying scratch-buffer
+         * sizes produced by collective algorithms.  GPU segments are retained
+         * by the allocator until the accelerator framework is closed, where
+         * they are released explicitly via the tracked segs list.
+         */
+        alloc = mca_allocator_basic_component_init(true,
+                                                   opal_accel_seg_alloc,
+                                                   opal_accel_seg_free,
+                                                   ctx);
+        if (NULL == alloc) {
             free(ctx);
             OPAL_THREAD_UNLOCK(&opal_accel_alloc_lock);
             return NULL;
         }
-        bucket->super.alc_context = ctx;
-        opal_accel_device_allocators[dev_id] = &bucket->super;
+        opal_accel_device_allocators[dev_id] = alloc;
     }
+
     OPAL_THREAD_UNLOCK(&opal_accel_alloc_lock);
     return opal_accel_device_allocators[dev_id];
 }
@@ -104,13 +168,30 @@ static int opal_accelerator_base_frame_register(mca_base_register_flag_t flags)
 
 static int opal_accelerator_base_frame_close(void)
 {
-    for (int i = 0; i < OPAL_ACCELERATOR_MAX_DEVICES; i++) {
-        if (opal_accel_device_allocators[i] != NULL) {
-            opal_accel_alloc_ctx_t *ctx = (opal_accel_alloc_ctx_t *)opal_accel_device_allocators[i]->alc_context;
-            opal_accel_device_allocators[i]->alc_finalize(opal_accel_device_allocators[i]);
-            free(ctx);
-            opal_accel_device_allocators[i] = NULL;
+    if (NULL != opal_accel_device_allocators) {
+        for (int i = 0; i < opal_accel_num_devices; i++) {
+            if (NULL != opal_accel_device_allocators[i]) {
+                opal_accel_alloc_ctx_t *ctx =
+                    (opal_accel_alloc_ctx_t *) opal_accel_device_allocators[i]->alc_context;
+                opal_accel_alloc_seg_t *seg;
+
+                /* Release all GPU segments tracked in seg_alloc before the
+                 * basic allocator frees its internal structures. */
+                while (NULL != (seg = (opal_accel_alloc_seg_t *)
+                                      opal_list_remove_first(&ctx->segs))) {
+                    opal_accelerator.mem_release(ctx->dev_id, seg->ptr);
+                    OBJ_RELEASE(seg);
+                }
+                OBJ_DESTRUCT(&ctx->segs);
+
+                opal_accel_device_allocators[i]->alc_finalize(opal_accel_device_allocators[i]);
+                free(ctx);
+                opal_accel_device_allocators[i] = NULL;
+            }
         }
+        free(opal_accel_device_allocators);
+        opal_accel_device_allocators = NULL;
+        opal_accel_num_devices = 0;
     }
     return mca_base_framework_components_close(&opal_accelerator_base_framework, NULL);
 }

From b912eb4b9551600940f2c2b0efdc0c89f8849d1b Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Fri, 10 Apr 2026 12:22:19 -0400
Subject: [PATCH 03/13] coll/base: replace allocator with GPU op session in
 reduction algorithms

Introduce ompi_op_gpu_session_t as the plumbing for future persistent
GPU reduction kernels. Replace the mca_allocator_base_module_t *allocator
parameter with ompi_op_gpu_session_t *session in all six reduction
algorithm families (allreduce, reduce, reduce_scatter, reduce_scatter_block,
scan, exscan) and their coll/tuned dispatch functions.

Add COLL_BASE_REDUCE / COLL_SESSION_ALLOC / COLL_SESSION_FREE macros that
dispatch to the GPU session when non-NULL, and fall back to ompi_op_reduce /
malloc / free otherwise. Add optional opc_session_begin/reduce/end function
pointers to ompi_op_base_component_t for future GPU op components.

Phase 1 only: ompi_op_gpu_session_begin() is a stub that always returns NULL,
so all code paths remain identical to before on host-only systems.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 ompi/mca/coll/base/coll_base_allreduce.c      |  97 ++++++++--------
 ompi/mca/coll/base/coll_base_exscan.c         |  19 +--
 ompi/mca/coll/base/coll_base_functions.h      |  65 +++++++----
 ompi/mca/coll/base/coll_base_reduce.c         | 108 +++++++++---------
 ompi/mca/coll/base/coll_base_reduce_scatter.c |  73 ++++++------
 .../base/coll_base_reduce_scatter_block.c     |  97 ++++++++--------
 ompi/mca/coll/base/coll_base_scan.c           |  19 +--
 ompi/mca/coll/tuned/coll_tuned.h              |  13 ++-
 .../tuned/coll_tuned_allreduce_decision.c     |  12 +-
 .../coll/tuned/coll_tuned_decision_dynamic.c  |  18 +--
 .../coll/tuned/coll_tuned_decision_fixed.c    |  12 +-
 .../coll/tuned/coll_tuned_exscan_decision.c   |   4 +-
 .../coll/tuned/coll_tuned_reduce_decision.c   |  16 +--
 ...coll_tuned_reduce_scatter_block_decision.c |  10 +-
 .../coll_tuned_reduce_scatter_decision.c      |  10 +-
 .../mca/coll/tuned/coll_tuned_scan_decision.c |   4 +-
 ompi/mca/op/op.h                              |  33 ++++++
 ompi/op/Makefile.am                           |   2 +
 ompi/op/op_gpu_session.c                      |  46 ++++++++
 ompi/op/op_gpu_session.h                      |  60 ++++++++++
 20 files changed, 437 insertions(+), 281 deletions(-)
 create mode 100644 ompi/op/op_gpu_session.c
 create mode 100644 ompi/op/op_gpu_session.h

diff --git a/ompi/mca/coll/base/coll_base_allreduce.c b/ompi/mca/coll/base/coll_base_allreduce.c
index ae1e27aac40..1d605258102 100644
--- a/ompi/mca/coll/base/coll_base_allreduce.c
+++ b/ompi/mca/coll/base/coll_base_allreduce.c
@@ -40,6 +40,7 @@
 #include "ompi/mca/pml/pml.h"
 #include "ompi/op/op.h"
 #include "ompi/mca/coll/base/coll_base_functions.h"
+#include "ompi/op/op_gpu_session.h"
 #include "coll_base_topo.h"
 #include "coll_base_util.h"
 
@@ -137,7 +138,7 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf,
                                                   struct ompi_op_t *op,
                                                   struct ompi_communicator_t *comm,
                                                   mca_coll_base_module_t *module,
-                                                  mca_allocator_base_module_t *allocator)
+                                                  ompi_op_gpu_session_t *session)
 {
     int ret, line, rank, size, adjsize, remote, distance;
     int newrank, newremote, extra_ranks;
@@ -161,7 +162,7 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf,
 
     /* Allocate and initialize temporary send buffer */
     span = opal_datatype_span(&dtype->super, count, &gap);
-    inplacebuf_free = (char*) COLL_BASE_ALLOC(allocator, span);
+    inplacebuf_free = (char*) COLL_SESSION_ALLOC(session, span);
     if (NULL == inplacebuf_free) { ret = -1; line = __LINE__; goto error_hndl; }
     inplacebuf = inplacebuf_free - gap;
 
@@ -201,7 +202,7 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf,
                                     MPI_STATUS_IGNORE));
             if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
             /* tmpsend = tmprecv (op) tmpsend */
-            ompi_op_reduce(op, tmprecv, tmpsend, count, dtype);
+            COLL_BASE_REDUCE(session, op, tmprecv, tmpsend, count, dtype);
             newrank = rank >> 1;
         }
     } else {
@@ -231,13 +232,13 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf,
         /* Apply operation */
         if (rank < remote) {
             /* tmprecv = tmpsend (op) tmprecv */
-            ompi_op_reduce(op, tmpsend, tmprecv, count, dtype);
+            COLL_BASE_REDUCE(session, op, tmpsend, tmprecv, count, dtype);
             tmpswap = tmprecv;
             tmprecv = tmpsend;
             tmpsend = tmpswap;
         } else {
             /* tmpsend = tmprecv (op) tmpsend */
-            ompi_op_reduce(op, tmprecv, tmpsend, count, dtype);
+            COLL_BASE_REDUCE(session, op, tmprecv, tmpsend, count, dtype);
         }
     }
 
@@ -267,14 +268,14 @@ ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf,
         if (ret < 0) { line = __LINE__; goto error_hndl; }
     }
 
-    COLL_BASE_FREE(allocator, inplacebuf_free);
+    COLL_SESSION_FREE(session, inplacebuf_free);
     return MPI_SUCCESS;
 
  error_hndl:
     OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n",
                  __FILE__, line, rank, ret));
     (void)line;  // silence compiler warning
-    COLL_BASE_FREE(allocator, inplacebuf_free);
+    COLL_SESSION_FREE(session, inplacebuf_free);
     return ret;
 }
 
@@ -348,7 +349,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count,
                                      struct ompi_op_t *op,
                                      struct ompi_communicator_t *comm,
                                      mca_coll_base_module_t *module,
-                                     mca_allocator_base_module_t *allocator)
+                                     ompi_op_gpu_session_t *session)
 {
     int ret, line, rank, size, k, recv_from, send_to, block_count, inbi;
     int early_segcount, late_segcount, split_rank, max_segcount;
@@ -380,7 +381,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count,
                                                                   count,
                                                                   dtype, op,
                                                                   comm, module,
-                                                                  allocator));
+                                                                  session));
     }
 
     /* Allocate and initialize temporary buffers */
@@ -404,10 +405,10 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count,
     max_real_segsize = true_extent + (max_segcount - 1) * extent;
 
 
-    inbuf[0] = (char*)COLL_BASE_ALLOC(allocator, max_real_segsize);
+    inbuf[0] = (char*)COLL_SESSION_ALLOC(session, max_real_segsize);
     if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; }
     if (size > 2) {
-        inbuf[1] = (char*)COLL_BASE_ALLOC(allocator, max_real_segsize);
+        inbuf[1] = (char*)COLL_SESSION_ALLOC(session, max_real_segsize);
         if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; }
     }
 
@@ -475,7 +476,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count,
                         ((ptrdiff_t)prevblock * late_segcount + split_rank));
         block_count = ((prevblock < split_rank)? early_segcount : late_segcount);
         tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent;
-        ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, block_count, dtype);
+        COLL_BASE_REDUCE(session, op, inbuf[inbi ^ 0x1], tmprecv, block_count, dtype);
 
         /* send previous block to send_to */
         ret = MCA_PML_CALL(send(tmprecv, block_count, dtype, send_to,
@@ -496,7 +497,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count,
                     ((ptrdiff_t)recv_from * late_segcount + split_rank));
     block_count = ((recv_from < split_rank)? early_segcount : late_segcount);
     tmprecv = ((char*)rbuf) + (ptrdiff_t)block_offset * extent;
-    ompi_op_reduce(op, inbuf[inbi], tmprecv, block_count, dtype);
+    COLL_BASE_REDUCE(session, op, inbuf[inbi], tmprecv, block_count, dtype);
 
     /* Distribution loop - variation of ring allgather */
     send_to = (rank + 1) % size;
@@ -527,8 +528,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count,
 
     }
 
-    COLL_BASE_FREE(allocator, inbuf[0]);
-    COLL_BASE_FREE(allocator, inbuf[1]);
+    COLL_SESSION_FREE(session, inbuf[0]);
+    COLL_SESSION_FREE(session, inbuf[1]);
 
     return MPI_SUCCESS;
 
@@ -537,8 +538,8 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, size_t count,
                  __FILE__, line, rank, ret));
     ompi_coll_base_free_reqs(reqs, 2);
     (void)line;  // silence compiler warning
-    COLL_BASE_FREE(allocator, inbuf[0]);
-    COLL_BASE_FREE(allocator, inbuf[1]);
+    COLL_SESSION_FREE(session, inbuf[0]);
+    COLL_SESSION_FREE(session, inbuf[1]);
     return ret;
 }
 
@@ -628,7 +629,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size
                                                struct ompi_communicator_t *comm,
                                                mca_coll_base_module_t *module,
                                                uint32_t segsize,
-                                               mca_allocator_base_module_t *allocator)
+                                               ompi_op_gpu_session_t *session)
 {
     int ret, line, rank, size, k, recv_from, send_to;
     int early_blockcount, late_blockcount, split_rank;
@@ -664,7 +665,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size
         if (count < (size_t) (size * segcount)) {
             OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_ring_segmented rank %d/%d, count %zu, switching to regular ring", rank, size, count));
             return (ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype, op,
-                                                         comm, module, allocator));
+                                                         comm, module, session));
         }
 
     /* Determine the number of phases of the algorithm */
@@ -693,10 +694,10 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size
      max_real_segsize = opal_datatype_span(&dtype->super, max_segcount, &gap);
 
     /* Allocate and initialize temporary buffers */
-    inbuf[0] = (char*)COLL_BASE_ALLOC(allocator, max_real_segsize);
+    inbuf[0] = (char*)COLL_SESSION_ALLOC(session, max_real_segsize);
     if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; }
     if (size > 2) {
-        inbuf[1] = (char*)COLL_BASE_ALLOC(allocator, max_real_segsize);
+        inbuf[1] = (char*)COLL_SESSION_ALLOC(session, max_real_segsize);
         if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; }
     }
 
@@ -787,7 +788,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size
                             ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
                             ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase));
             tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent;
-            ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, phase_count, dtype);
+            COLL_BASE_REDUCE(session, op, inbuf[inbi ^ 0x1], tmprecv, phase_count, dtype);
 
             /* send previous block to send_to */
             ret = MCA_PML_CALL(send(tmprecv, phase_count, dtype, send_to,
@@ -816,7 +817,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size
                         ((ptrdiff_t)phase * (ptrdiff_t)early_phase_segcount) :
                         ((ptrdiff_t)phase * (ptrdiff_t)late_phase_segcount + split_phase));
         tmprecv = ((char*)rbuf) + (ptrdiff_t)(block_offset + phase_offset) * extent;
-        ompi_op_reduce(op, inbuf[inbi], tmprecv, phase_count, dtype);
+        COLL_BASE_REDUCE(session, op, inbuf[inbi], tmprecv, phase_count, dtype);
     }
 
     /* Distribution loop - variation of ring allgather */
@@ -848,8 +849,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size
 
     }
 
-    COLL_BASE_FREE(allocator, inbuf[0]);
-    COLL_BASE_FREE(allocator, inbuf[1]);
+    COLL_SESSION_FREE(session, inbuf[0]);
+    COLL_SESSION_FREE(session, inbuf[1]);
 
     return MPI_SUCCESS;
 
@@ -858,8 +859,8 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, size
                  __FILE__, line, rank, ret));
     ompi_coll_base_free_reqs(reqs, 2);
     (void)line;  // silence compiler warning
-    COLL_BASE_FREE(allocator, inbuf[0]);
-    COLL_BASE_FREE(allocator, inbuf[1]);
+    COLL_SESSION_FREE(session, inbuf[0]);
+    COLL_SESSION_FREE(session, inbuf[1]);
     return ret;
 }
 
@@ -978,7 +979,7 @@ ompi_coll_base_allreduce_intra_basic_linear(const void *sbuf, void *rbuf, size_t
 int ompi_coll_base_allreduce_intra_redscat_allgather(
     const void *sbuf, void *rbuf, size_t count, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module, mca_allocator_base_module_t *allocator)
+    mca_coll_base_module_t *module, ompi_op_gpu_session_t *session)
 {
     int *rindex = NULL, *rcount = NULL, *sindex = NULL, *scount = NULL;
 
@@ -1010,7 +1011,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather(
 
     /* Temporary buffer for receiving messages */
     char *tmp_buf = NULL;
-    char *tmp_buf_raw = (char *)COLL_BASE_ALLOC(allocator, dsize);
+    char *tmp_buf_raw = (char *)COLL_SESSION_ALLOC(session, dsize);
     if (NULL == tmp_buf_raw)
         return OMPI_ERR_OUT_OF_RESOURCE;
     tmp_buf = tmp_buf_raw - gap;
@@ -1060,8 +1061,8 @@ int ompi_coll_base_allreduce_intra_redscat_allgather(
             if (MPI_SUCCESS != err) { goto cleanup_and_return; }
 
             /* Reduce on the right half of the buffers (result in rbuf) */
-            ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent,
-                           (char *)rbuf + count_lhalf * extent, count_rhalf, dtype);
+            COLL_BASE_REDUCE(session, op, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent,
+                             (char *)rbuf + count_lhalf * extent, count_rhalf, dtype);
 
             /* Send the right half to the left neighbor */
             err = MCA_PML_CALL(send((char *)rbuf + (ptrdiff_t)count_lhalf * extent,
@@ -1088,7 +1089,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather(
             if (MPI_SUCCESS != err) { goto cleanup_and_return; }
 
             /* Reduce on the right half of the buffers (result in rbuf) */
-            ompi_op_reduce(op, tmp_buf, rbuf, count_lhalf, dtype);
+            COLL_BASE_REDUCE(session, op, tmp_buf, rbuf, count_lhalf, dtype);
 
             /* Recv the right half from the right neighbor */
             err = MCA_PML_CALL(recv((char *)rbuf + (ptrdiff_t)count_lhalf * extent,
@@ -1169,9 +1170,9 @@ int ompi_coll_base_allreduce_intra_redscat_allgather(
             if (MPI_SUCCESS != err) { goto cleanup_and_return; }
 
             /* Local reduce: rbuf[] = tmp_buf[] <op> rbuf[] */
-            ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)rindex[step] * extent,
-                           (char *)rbuf + (ptrdiff_t)rindex[step] * extent,
-                           rcount[step], dtype);
+            COLL_BASE_REDUCE(session, op, (char *)tmp_buf + (ptrdiff_t)rindex[step] * extent,
+                             (char *)rbuf + (ptrdiff_t)rindex[step] * extent,
+                             rcount[step], dtype);
 
             /* Move the current window to the received message */
             if (step + 1 < nsteps) {
@@ -1238,7 +1239,7 @@ int ompi_coll_base_allreduce_intra_redscat_allgather(
     }
 
   cleanup_and_return:
-    COLL_BASE_FREE(allocator, tmp_buf_raw);
+    COLL_SESSION_FREE(session, tmp_buf_raw);
     if (NULL != rindex)
         free(rindex);
     if (NULL != sindex)
@@ -1272,7 +1273,7 @@ int ompi_coll_base_allreduce_intra_allgather_reduce(const void *sbuf, void *rbuf
                                                     struct ompi_op_t *op,
                                                     struct ompi_communicator_t *comm,
                                                     mca_coll_base_module_t *module,
-                                                    mca_allocator_base_module_t *allocator)
+                                                    ompi_op_gpu_session_t *session)
 {
     int line = -1;
     char *partial_buf = NULL;
@@ -1293,10 +1294,10 @@ int ompi_coll_base_allreduce_intra_allgather_reduce(const void *sbuf, void *rbuf
     }
     ptrdiff_t buf_size, gap = 0;
     buf_size = opal_datatype_span(&dtype->super, (int64_t)count * size, &gap);
-    partial_buf = (char *) COLL_BASE_ALLOC(allocator, buf_size);
+    partial_buf = (char *) COLL_SESSION_ALLOC(session, buf_size);
     partial_buf_start = partial_buf - gap;
     buf_size = opal_datatype_span(&dtype->super, (int64_t)count, &gap);
-    tmpsend = (char *) COLL_BASE_ALLOC(allocator, buf_size);
+    tmpsend = (char *) COLL_SESSION_ALLOC(session, buf_size);
     tmpsend_start = tmpsend - gap;
 
     err = ompi_datatype_copy_content_same_ddt(dtype, count,
@@ -1311,11 +1312,11 @@ int ompi_coll_base_allreduce_intra_allgather_reduce(const void *sbuf, void *rbuf
     if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
 
     for (int target = 1; target < size; target++) {
-        ompi_op_reduce(op,
-                       partial_buf_start + (ptrdiff_t)target * count * extent,
-                       partial_buf_start,
-                       count,
-                       dtype);
+        COLL_BASE_REDUCE(session, op,
+                         partial_buf_start + (ptrdiff_t)target * count * extent,
+                         partial_buf_start,
+                         count,
+                         dtype);
     }
 
     // move data to rbuf
@@ -1324,18 +1325,18 @@ int ompi_coll_base_allreduce_intra_allgather_reduce(const void *sbuf, void *rbuf
                                               (char*)partial_buf_start);
     if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
 
-    COLL_BASE_FREE(allocator, partial_buf);
-    COLL_BASE_FREE(allocator, tmpsend);
+    COLL_SESSION_FREE(session, partial_buf);
+    COLL_SESSION_FREE(session, tmpsend);
     return MPI_SUCCESS;
 
 err_hndl:
     if (NULL != partial_buf) {
-        COLL_BASE_FREE(allocator, partial_buf);
+        COLL_SESSION_FREE(session, partial_buf);
         partial_buf = NULL;
         partial_buf_start = NULL;
     }
      if (NULL != tmpsend) {
-        COLL_BASE_FREE(allocator, tmpsend);
+        COLL_SESSION_FREE(session, tmpsend);
         tmpsend = NULL;
         tmpsend_start = NULL;
     }
diff --git a/ompi/mca/coll/base/coll_base_exscan.c b/ompi/mca/coll/base/coll_base_exscan.c
index 1e9cab4b942..d49b7912f7c 100644
--- a/ompi/mca/coll/base/coll_base_exscan.c
+++ b/ompi/mca/coll/base/coll_base_exscan.c
@@ -23,6 +23,7 @@
 #include "ompi/mca/coll/base/coll_base_util.h"
 #include "ompi/mca/pml/pml.h"
 #include "ompi/op/op.h"
+#include "ompi/op/op_gpu_session.h"
 
 /*
  * ompi_coll_base_exscan_intra_linear
@@ -142,7 +143,7 @@ ompi_coll_base_exscan_intra_linear(const void *sbuf, void *rbuf, size_t count,
 int ompi_coll_base_exscan_intra_recursivedoubling(
     const void *sendbuf, void *recvbuf, size_t count, struct ompi_datatype_t *datatype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module, mca_allocator_base_module_t *allocator)
+    mca_coll_base_module_t *module, ompi_op_gpu_session_t *session)
 {
     int err = MPI_SUCCESS;
     char *tmpsend_raw = NULL, *tmprecv_raw = NULL;
@@ -158,8 +159,8 @@ int ompi_coll_base_exscan_intra_recursivedoubling(
 
     ptrdiff_t dsize, gap;
     dsize = opal_datatype_span(&datatype->super, count, &gap);
-    tmpsend_raw = COLL_BASE_ALLOC(allocator, dsize);
-    tmprecv_raw = COLL_BASE_ALLOC(allocator, dsize);
+    tmpsend_raw = COLL_SESSION_ALLOC(session, dsize);
+    tmprecv_raw = COLL_SESSION_ALLOC(session, dsize);
     if (NULL == tmpsend_raw || NULL == tmprecv_raw) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -195,17 +196,17 @@ int ompi_coll_base_exscan_intra_recursivedoubling(
                     is_first_block = 0;
                 } else {
                     /* Accumulate prefix reduction: recvbuf = precv <op> recvbuf */
-                    ompi_op_reduce(op, precv, recvbuf, count, datatype);
+                    COLL_BASE_REDUCE(session, op, precv, recvbuf, count, datatype);
                 }
                 /* Partial result: psend = precv <op> psend */
-                ompi_op_reduce(op, precv, psend, count, datatype);
+                COLL_BASE_REDUCE(session, op, precv, psend, count, datatype);
             } else {
                 if (is_commute) {
                     /* psend = precv <op> psend */
-                    ompi_op_reduce(op, precv, psend, count, datatype);
+                    COLL_BASE_REDUCE(session, op, precv, psend, count, datatype);
                 } else {
                     /* precv = psend <op> precv */
-                    ompi_op_reduce(op, psend, precv, count, datatype);
+                    COLL_BASE_REDUCE(session, op, psend, precv, count, datatype);
                     char *tmp = psend;
                     psend = precv;
                     precv = tmp;
@@ -215,7 +216,7 @@ int ompi_coll_base_exscan_intra_recursivedoubling(
     }
 
 cleanup_and_return:
-    COLL_BASE_FREE(allocator, tmpsend_raw);
-    COLL_BASE_FREE(allocator, tmprecv_raw);
+    COLL_SESSION_FREE(session, tmpsend_raw);
+    COLL_SESSION_FREE(session, tmprecv_raw);
     return err;
 }
diff --git a/ompi/mca/coll/base/coll_base_functions.h b/ompi/mca/coll/base/coll_base_functions.h
index c0195eafd0f..77e07533f72 100644
--- a/ompi/mca/coll/base/coll_base_functions.h
+++ b/ompi/mca/coll/base/coll_base_functions.h
@@ -37,6 +37,7 @@
 #include "ompi/info/info.h"
 #include "ompi/request/request.h"
 #include "opal/mca/allocator/allocator.h"
+#include "ompi/op/op_gpu_session.h"
 
 /* Allocator-aware helpers for Pattern-A scratch buffers.
  * Pass allocator=NULL to fall back to plain malloc/free. */
@@ -47,6 +48,24 @@
     do { if (ptr) { if (allocator) (allocator)->alc_free((allocator), (ptr)); \
                     else free(ptr); } } while (0)
 
+/* GPU session-aware helpers for reductions and their scratch buffers.
+ * With a non-NULL session, reduce via ompi_op_gpu_session_reduce() and allocate
+ * from session->allocator; with session=NULL, use ompi_op_reduce and malloc/free. */
+#define COLL_BASE_REDUCE(session, op, src, dst, count, dtype)                  \
+    do {                                                                        \
+        if (NULL != (session))                                                  \
+            ompi_op_gpu_session_reduce((session), (src), (dst), (count));      \
+        else                                                                    \
+            ompi_op_reduce((op), (src), (dst), (count), (dtype));              \
+    } while (0)
+
+#define COLL_SESSION_ALLOC(session, size) \
+    ((session) ? COLL_BASE_ALLOC((session)->allocator, (size)) : malloc(size))
+
+#define COLL_SESSION_FREE(session, ptr) \
+    do { if (session) { COLL_BASE_FREE((session)->allocator, (ptr)); } \
+         else { if (ptr) free(ptr); } } while (0)
+
 /* need to include our own topo prototypes so we can malloc data on the comm correctly */
 #include "coll_base_topo.h"
 
@@ -216,12 +235,12 @@ int ompi_coll_base_allgatherv_intra_two_procs(ALLGATHERV_ARGS);
 
 /* All Reduce */
 int ompi_coll_base_allreduce_intra_nonoverlapping(ALLREDUCE_ARGS);
-int ompi_coll_base_allreduce_intra_recursivedoubling(ALLREDUCE_ARGS, mca_allocator_base_module_t *allocator);
-int ompi_coll_base_allreduce_intra_ring(ALLREDUCE_ARGS, mca_allocator_base_module_t *allocator);
-int ompi_coll_base_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsize, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_allreduce_intra_recursivedoubling(ALLREDUCE_ARGS, ompi_op_gpu_session_t *session);
+int ompi_coll_base_allreduce_intra_ring(ALLREDUCE_ARGS, ompi_op_gpu_session_t *session);
+int ompi_coll_base_allreduce_intra_ring_segmented(ALLREDUCE_ARGS, uint32_t segsize, ompi_op_gpu_session_t *session);
 int ompi_coll_base_allreduce_intra_basic_linear(ALLREDUCE_ARGS);
-int ompi_coll_base_allreduce_intra_redscat_allgather(ALLREDUCE_ARGS, mca_allocator_base_module_t *allocator);
-int ompi_coll_base_allreduce_intra_allgather_reduce(ALLREDUCE_ARGS, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_allreduce_intra_redscat_allgather(ALLREDUCE_ARGS, ompi_op_gpu_session_t *session);
+int ompi_coll_base_allreduce_intra_allgather_reduce(ALLREDUCE_ARGS, ompi_op_gpu_session_t *session);
 
 /* AlltoAll */
 int ompi_coll_base_alltoall_intra_pairwise(ALLTOALL_ARGS);
@@ -265,7 +284,7 @@ int ompi_coll_base_bcast_intra_scatter_allgather(BCAST_ARGS, uint32_t segsize);
 int ompi_coll_base_bcast_intra_scatter_allgather_ring(BCAST_ARGS, uint32_t segsize);
 
 /* Exscan */
-int ompi_coll_base_exscan_intra_recursivedoubling(EXSCAN_ARGS, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_exscan_intra_recursivedoubling(EXSCAN_ARGS, ompi_op_gpu_session_t *session);
 int ompi_coll_base_exscan_intra_linear(EXSCAN_ARGS);
 
 /* Gather */
@@ -276,30 +295,30 @@ int ompi_coll_base_gather_intra_linear_sync(GATHER_ARGS, int first_segment_size)
 /* GatherV */
 
 /* Reduce */
-int ompi_coll_base_reduce_generic(REDUCE_ARGS, ompi_coll_tree_t* tree, size_t count_by_segment, int max_outstanding_reqs, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_reduce_generic(REDUCE_ARGS, ompi_coll_tree_t* tree, size_t count_by_segment, int max_outstanding_reqs, ompi_op_gpu_session_t *session);
 int ompi_coll_base_reduce_intra_basic_linear(REDUCE_ARGS);
-int ompi_coll_base_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout, int max_outstanding_reqs, mca_allocator_base_module_t *allocator);
-int ompi_coll_base_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, mca_allocator_base_module_t *allocator);
-int ompi_coll_base_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, mca_allocator_base_module_t *allocator);
-int ompi_coll_base_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, mca_allocator_base_module_t *allocator);
-int ompi_coll_base_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, mca_allocator_base_module_t *allocator);
-int ompi_coll_base_reduce_intra_redscat_gather(REDUCE_ARGS, mca_allocator_base_module_t *allocator);
-int ompi_coll_base_reduce_intra_knomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, int radix, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_reduce_intra_chain(REDUCE_ARGS, uint32_t segsize, int fanout, int max_outstanding_reqs, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_intra_pipeline(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_intra_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_intra_binomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_intra_in_order_binary(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_intra_redscat_gather(REDUCE_ARGS, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_intra_knomial(REDUCE_ARGS, uint32_t segsize, int max_outstanding_reqs, int radix, ompi_op_gpu_session_t *session);
 
 /* Reduce_scatter */
-int ompi_coll_base_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS, mca_allocator_base_module_t *allocator);
-int ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(REDUCESCATTER_ARGS, mca_allocator_base_module_t *allocator);
-int ompi_coll_base_reduce_scatter_intra_ring(REDUCESCATTER_ARGS, mca_allocator_base_module_t *allocator);
-int ompi_coll_base_reduce_scatter_intra_butterfly(REDUCESCATTER_ARGS, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_reduce_scatter_intra_nonoverlapping(REDUCESCATTER_ARGS, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(REDUCESCATTER_ARGS, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_scatter_intra_ring(REDUCESCATTER_ARGS, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_scatter_intra_butterfly(REDUCESCATTER_ARGS, ompi_op_gpu_session_t *session);
 
 /* Reduce_scatter_block */
-int ompi_coll_base_reduce_scatter_block_basic_linear(REDUCESCATTERBLOCK_ARGS, mca_allocator_base_module_t *allocator);
-int ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(REDUCESCATTERBLOCK_ARGS, mca_allocator_base_module_t *allocator);
-int ompi_coll_base_reduce_scatter_block_intra_recursivehalving(REDUCESCATTERBLOCK_ARGS, mca_allocator_base_module_t *allocator);
-int ompi_coll_base_reduce_scatter_block_intra_butterfly(REDUCESCATTERBLOCK_ARGS, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_reduce_scatter_block_basic_linear(REDUCESCATTERBLOCK_ARGS, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(REDUCESCATTERBLOCK_ARGS, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_scatter_block_intra_recursivehalving(REDUCESCATTERBLOCK_ARGS, ompi_op_gpu_session_t *session);
+int ompi_coll_base_reduce_scatter_block_intra_butterfly(REDUCESCATTERBLOCK_ARGS, ompi_op_gpu_session_t *session);
 
 /* Scan */
-int ompi_coll_base_scan_intra_recursivedoubling(SCAN_ARGS, mca_allocator_base_module_t *allocator);
+int ompi_coll_base_scan_intra_recursivedoubling(SCAN_ARGS, ompi_op_gpu_session_t *session);
 int ompi_coll_base_scan_intra_linear(SCAN_ARGS);
 
 /* Scatter */
diff --git a/ompi/mca/coll/base/coll_base_reduce.c b/ompi/mca/coll/base/coll_base_reduce.c
index 588973aa832..3a2a50171b0 100644
--- a/ompi/mca/coll/base/coll_base_reduce.c
+++ b/ompi/mca/coll/base/coll_base_reduce.c
@@ -38,6 +38,7 @@
 #include "ompi/mca/pml/pml.h"
 #include "ompi/op/op.h"
 #include "ompi/mca/coll/base/coll_base_functions.h"
+#include "ompi/op/op_gpu_session.h"
 #include "coll_base_topo.h"
 #include "coll_base_util.h"
 
@@ -66,7 +67,7 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
                                     int root, ompi_communicator_t* comm,
                                     mca_coll_base_module_t *module,
                                     ompi_coll_tree_t* tree, size_t count_by_segment,
-                                    int max_outstanding_reqs, mca_allocator_base_module_t *allocator )
+                                    int max_outstanding_reqs, ompi_op_gpu_session_t *session )
 {
     char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL};
     char *accumbuf = NULL, *accumbuf_free = NULL;
@@ -106,7 +107,7 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
         if( (NULL == accumbuf) || (root != rank) ) {
             /* Allocate temporary accumulator buffer. */
             size = opal_datatype_span(&datatype->super, original_count, &gap);
-            accumbuf_free = (char*)COLL_BASE_ALLOC(allocator, size);
+            accumbuf_free = (char*)COLL_SESSION_ALLOC(session, size);
             if (accumbuf_free == NULL) {
                 line = __LINE__; ret = -1; goto error_hndl;
             }
@@ -123,7 +124,7 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
         }
         /* Allocate two buffers for incoming segments */
         real_segment_size = opal_datatype_span(&datatype->super, count_by_segment, &gap);
-        inbuf_free[0] = (char*) COLL_BASE_ALLOC(allocator, real_segment_size);
+        inbuf_free[0] = (char*) COLL_SESSION_ALLOC(session, real_segment_size);
         if( inbuf_free[0] == NULL ) {
             line = __LINE__; ret = -1; goto error_hndl;
         }
@@ -131,7 +132,7 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
         /* if there is chance to overlap communication -
            allocate second buffer */
         if( (num_segments > 1) || (tree->tree_nextsize > 1) ) {
-            inbuf_free[1] = (char*) COLL_BASE_ALLOC(allocator, real_segment_size);
+            inbuf_free[1] = (char*) COLL_SESSION_ALLOC(session, real_segment_size);
             if( inbuf_free[1] == NULL ) {
                 line = __LINE__; ret = -1; goto error_hndl;
             }
@@ -202,9 +203,9 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
                         }
                     }
                     /* apply operation */
-                    ompi_op_reduce(op, local_op_buffer,
-                                   accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment,
-                                   recvcount, datatype );
+                    COLL_BASE_REDUCE(session, op, local_op_buffer,
+                                     accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment,
+                                     recvcount, datatype);
                 } else if ( segindex > 0 ) {
                     void* accumulator = accumbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment;
                     if( tree->tree_nextsize <= 1 ) {
@@ -213,8 +214,8 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
                             local_op_buffer = sendtmpbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment;
                         }
                     }
-                    ompi_op_reduce(op, local_op_buffer, accumulator, prevcount,
-                                   datatype );
+                    COLL_BASE_REDUCE(session, op, local_op_buffer, accumulator, prevcount,
+                                     datatype);
 
                     /* all reduced on available data this step (i) complete,
                      * pass to the next process unless you are the root.
@@ -242,9 +243,9 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
         } /* end of for each segment */
 
         /* clean up */
-        COLL_BASE_FREE(allocator, inbuf_free[0]);
-        COLL_BASE_FREE(allocator, inbuf_free[1]);
-        COLL_BASE_FREE(allocator, accumbuf_free);
+        COLL_SESSION_FREE(session, inbuf_free[0]);
+        COLL_SESSION_FREE(session, inbuf_free[1]);
+        COLL_SESSION_FREE(session, accumbuf_free);
     }
 
     /* leaf nodes
@@ -365,9 +366,9 @@ int ompi_coll_base_reduce_generic( const void* sendbuf, void* recvbuf, size_t or
         }
         ompi_coll_base_free_reqs(sreq, max_outstanding_reqs);
     }
-    COLL_BASE_FREE(allocator, inbuf_free[0]);
-    COLL_BASE_FREE(allocator, inbuf_free[1]);
-    COLL_BASE_FREE(allocator, accumbuf_free);
+    COLL_SESSION_FREE(session, inbuf_free[0]);
+    COLL_SESSION_FREE(session, inbuf_free[1]);
+    COLL_SESSION_FREE(session, accumbuf_free);
     OPAL_OUTPUT (( ompi_coll_base_framework.framework_output,
                    "ERROR_HNDL: node %d file %s line %d error %d\n",
                    rank, __FILE__, line, ret ));
@@ -388,7 +389,7 @@ int ompi_coll_base_reduce_intra_chain( const void *sendbuf, void *recvbuf, size_
                                         ompi_communicator_t* comm,
                                         mca_coll_base_module_t *module,
                                         uint32_t segsize, int fanout,
-                                        int max_outstanding_reqs, mca_allocator_base_module_t *allocator )
+                                        int max_outstanding_reqs, ompi_op_gpu_session_t *session )
 {
     size_t segcount = count;
     size_t typelng;
@@ -408,7 +409,7 @@ int ompi_coll_base_reduce_intra_chain( const void *sendbuf, void *recvbuf, size_
     return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype,
                                            op, root, comm, module,
                                            data->cached_chain,
-                                           segcount, max_outstanding_reqs, allocator );
+                                           segcount, max_outstanding_reqs, session );
 }
 
 
@@ -418,7 +419,7 @@ int ompi_coll_base_reduce_intra_pipeline( const void *sendbuf, void *recvbuf,
                                            ompi_communicator_t* comm,
                                            mca_coll_base_module_t *module,
                                            uint32_t segsize,
-                                           int max_outstanding_reqs, mca_allocator_base_module_t *allocator  )
+                                           int max_outstanding_reqs, ompi_op_gpu_session_t *session  )
 {
     size_t segcount = count;
     size_t typelng;
@@ -440,7 +441,7 @@ int ompi_coll_base_reduce_intra_pipeline( const void *sendbuf, void *recvbuf,
     return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype,
                                            op, root, comm, module,
                                            data->cached_pipeline,
-                                           segcount, max_outstanding_reqs, allocator );
+                                           segcount, max_outstanding_reqs, session );
 }
 
 int ompi_coll_base_reduce_intra_binary( const void *sendbuf, void *recvbuf,
@@ -449,7 +450,7 @@ int ompi_coll_base_reduce_intra_binary( const void *sendbuf, void *recvbuf,
                                          ompi_communicator_t* comm,
                                          mca_coll_base_module_t *module,
                                          uint32_t segsize,
-                                         int max_outstanding_reqs, mca_allocator_base_module_t *allocator  )
+                                         int max_outstanding_reqs, ompi_op_gpu_session_t *session  )
 {
     size_t segcount = count;
     size_t typelng;
@@ -471,7 +472,7 @@ int ompi_coll_base_reduce_intra_binary( const void *sendbuf, void *recvbuf,
     return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype,
                                            op, root, comm, module,
                                            data->cached_bintree,
-                                           segcount, max_outstanding_reqs, allocator );
+                                           segcount, max_outstanding_reqs, session );
 }
 
 int ompi_coll_base_reduce_intra_binomial( const void *sendbuf, void *recvbuf,
@@ -480,7 +481,7 @@ int ompi_coll_base_reduce_intra_binomial( const void *sendbuf, void *recvbuf,
                                            ompi_communicator_t* comm,
                                            mca_coll_base_module_t *module,
                                            uint32_t segsize,
-                                           int max_outstanding_reqs, mca_allocator_base_module_t *allocator  )
+                                           int max_outstanding_reqs, ompi_op_gpu_session_t *session  )
 {
     size_t segcount = count;
     size_t typelng;
@@ -502,7 +503,7 @@ int ompi_coll_base_reduce_intra_binomial( const void *sendbuf, void *recvbuf,
     return ompi_coll_base_reduce_generic( sendbuf, recvbuf, count, datatype,
                                            op, root, comm, module,
                                            data->cached_in_order_bmtree,
-                                           segcount, max_outstanding_reqs, allocator );
+                                           segcount, max_outstanding_reqs, session );
 }
 
 /*
@@ -519,7 +520,7 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv
                                                   ompi_communicator_t* comm,
                                                   mca_coll_base_module_t *module,
                                                   uint32_t segsize,
-                                                  int max_outstanding_reqs, mca_allocator_base_module_t *allocator  )
+                                                  int max_outstanding_reqs, ompi_op_gpu_session_t *session  )
 {
     int ret, rank, size, io_root, segcount = count;
     void *use_this_sendbuf = NULL;
@@ -560,7 +561,7 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv
         dsize = opal_datatype_span(&datatype->super, count, &gap);
 
         if ((root == rank) && (MPI_IN_PLACE == sendbuf)) {
-            tmpbuf_free = (char *) COLL_BASE_ALLOC(allocator, dsize);
+            tmpbuf_free = (char *) COLL_SESSION_ALLOC(session, dsize);
             if (NULL == tmpbuf_free) {
                 return MPI_ERR_INTERN;
             }
@@ -570,7 +571,7 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv
                                                 (char*)recvbuf);
             use_this_sendbuf = tmpbuf;
         } else if (io_root == rank) {
-            tmpbuf_free = (char *) COLL_BASE_ALLOC(allocator, dsize);
+            tmpbuf_free = (char *) COLL_SESSION_ALLOC(session, dsize);
             if (NULL == tmpbuf_free) {
                 return MPI_ERR_INTERN;
             }
@@ -583,9 +584,9 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv
     ret = ompi_coll_base_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype,
                                           op, io_root, comm, module,
                                           data->cached_in_order_bintree,
-                                          segcount, max_outstanding_reqs, allocator );
+                                          segcount, max_outstanding_reqs, session );
     if (MPI_SUCCESS != ret) {
-        COLL_BASE_FREE(allocator, tmpbuf_free);
+        COLL_SESSION_FREE(session, tmpbuf_free);
         return ret;
     }
 
@@ -597,7 +598,7 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv
                                     MCA_COLL_BASE_TAG_REDUCE, comm,
                                     MPI_STATUS_IGNORE));
             if (MPI_SUCCESS != ret) {
-                COLL_BASE_FREE(allocator, tmpbuf_free);
+                COLL_SESSION_FREE(session, tmpbuf_free);
                 return ret;
             }
 
@@ -607,13 +608,13 @@ int ompi_coll_base_reduce_intra_in_order_binary( const void *sendbuf, void *recv
                                     MCA_COLL_BASE_TAG_REDUCE,
                                     MCA_PML_BASE_SEND_STANDARD, comm));
             if (MPI_SUCCESS != ret) {
-                COLL_BASE_FREE(allocator, tmpbuf_free);
+                COLL_SESSION_FREE(session, tmpbuf_free);
                 return ret;
             }
         }
     }
     if (NULL != tmpbuf_free) {
-        COLL_BASE_FREE(allocator, tmpbuf_free);
+        COLL_SESSION_FREE(session, tmpbuf_free);
     }
 
     return MPI_SUCCESS;
@@ -812,7 +813,7 @@ ompi_coll_base_reduce_intra_basic_linear(const void *sbuf, void *rbuf, size_t co
 int ompi_coll_base_reduce_intra_redscat_gather(
     const void *sbuf, void *rbuf, size_t count, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, int root, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module, mca_allocator_base_module_t *allocator)
+    mca_coll_base_module_t *module, ompi_op_gpu_session_t *session)
 {
     int comm_size = ompi_comm_size(comm);
     int rank = ompi_comm_rank(comm);
@@ -844,7 +845,7 @@ int ompi_coll_base_reduce_intra_redscat_gather(
 
     /* Temporary buffers */
     char *tmp_buf_raw = NULL, *rbuf_raw = NULL;
-    tmp_buf_raw = COLL_BASE_ALLOC(allocator, dsize);
+    tmp_buf_raw = COLL_SESSION_ALLOC(session, dsize);
     if (NULL == tmp_buf_raw) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -852,7 +853,7 @@ int ompi_coll_base_reduce_intra_redscat_gather(
     char *tmp_buf = tmp_buf_raw - gap;
 
     if (rank != root) {
-        rbuf_raw = COLL_BASE_ALLOC(allocator, dsize);
+        rbuf_raw = COLL_SESSION_ALLOC(session, dsize);
         if (NULL == rbuf_raw) {
             err = OMPI_ERR_OUT_OF_RESOURCE;
             goto cleanup_and_return;
@@ -906,8 +907,8 @@ int ompi_coll_base_reduce_intra_redscat_gather(
             if (MPI_SUCCESS != err) { goto cleanup_and_return; }
 
             /* Reduce on the right half of the buffers (result in rbuf) */
-            ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent,
-                           (char *)rbuf + count_lhalf * extent, count_rhalf, dtype);
+            COLL_BASE_REDUCE(session, op, (char *)tmp_buf + (ptrdiff_t)count_lhalf * extent,
+                             (char *)rbuf + count_lhalf * extent, count_rhalf, dtype);
 
             /* Send the right half to the left neighbor */
             err = MCA_PML_CALL(send((char *)rbuf + (ptrdiff_t)count_lhalf * extent,
@@ -934,7 +935,7 @@ int ompi_coll_base_reduce_intra_redscat_gather(
             if (MPI_SUCCESS != err) { goto cleanup_and_return; }
 
             /* Reduce on the right half of the buffers (result in rbuf) */
-            ompi_op_reduce(op, tmp_buf, rbuf, count_lhalf, dtype);
+            COLL_BASE_REDUCE(session, op, tmp_buf, rbuf, count_lhalf, dtype);
 
             /* Recv the right half from the right neighbor */
             err = MCA_PML_CALL(recv((char *)rbuf + (ptrdiff_t)count_lhalf * extent,
@@ -1016,9 +1017,9 @@ int ompi_coll_base_reduce_intra_redscat_gather(
             if (MPI_SUCCESS != err) { goto cleanup_and_return; }
 
             /* Local reduce: rbuf[] = tmp_buf[] <op> rbuf[] */
-            ompi_op_reduce(op, (char *)tmp_buf + (ptrdiff_t)rindex[step] * extent,
-                           (char *)rbuf + (ptrdiff_t)rindex[step] * extent,
-                           rcount[step], dtype);
+            COLL_BASE_REDUCE(session, op, (char *)tmp_buf + (ptrdiff_t)rindex[step] * extent,
+                             (char *)rbuf + (ptrdiff_t)rindex[step] * extent,
+                             rcount[step], dtype);
 
             /* Move the current window to the received message */
             if (step + 1 < nsteps) {
@@ -1129,8 +1130,8 @@ int ompi_coll_base_reduce_intra_redscat_gather(
     }
 
   cleanup_and_return:
-    COLL_BASE_FREE(allocator, tmp_buf_raw);
-    COLL_BASE_FREE(allocator, rbuf_raw);
+    COLL_SESSION_FREE(session, tmp_buf_raw);
+    COLL_SESSION_FREE(session, rbuf_raw);
     if (NULL != rindex)
         free(rindex);
     if (NULL != sindex)
@@ -1168,7 +1169,7 @@ int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf,
                                            ompi_communicator_t* comm,
                                            mca_coll_base_module_t *module,
                                            uint32_t segsize,
-                                           int max_outstanding_reqs, int radix, mca_allocator_base_module_t *allocator)
+                                           int max_outstanding_reqs, int radix, ompi_op_gpu_session_t *session)
 {
     int err = OMPI_SUCCESS, rank, line;
     ptrdiff_t extent, lb;
@@ -1213,7 +1214,7 @@ int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf,
         sendtmpbuf = (char *)recvbuf;
     }
     buf_size = opal_datatype_span(&datatype->super, (int64_t)count, &gap);
-    reduce_buf = (char *)COLL_BASE_ALLOC(allocator, buf_size);
+    reduce_buf = (char *)COLL_SESSION_ALLOC(session, buf_size);
     reduce_buf_start = reduce_buf - gap;
     err = ompi_datatype_copy_content_same_ddt(datatype, count,
                                               (char*)reduce_buf_start,
@@ -1225,7 +1226,7 @@ int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf,
     max_reqs = num_children;
     if(!is_leaf) {
         buf_size = opal_datatype_span(&datatype->super, (int64_t)count * num_children, &gap);
-        child_buf = (char *)COLL_BASE_ALLOC(allocator, buf_size);
+        child_buf = (char *)COLL_SESSION_ALLOC(session, buf_size);
         child_buf_start = child_buf - gap;
         reqs = ompi_coll_base_comm_get_reqs(data, max_reqs);
     }
@@ -1248,11 +1249,10 @@ int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf,
     }
 
     for (int i = 0; i < num_children; i++) {
-        ompi_op_reduce(op,
-                       child_buf_start + (ptrdiff_t)i * count * extent,
-                       reduce_buf,
-                       count,
-                       datatype);
+        COLL_BASE_REDUCE(session, op,
+                         child_buf_start + (ptrdiff_t)i * count * extent,
+                         reduce_buf,
+                         count, datatype);
     }
 
     if (rank != root) {
@@ -1273,18 +1273,18 @@ int ompi_coll_base_reduce_intra_knomial( const void *sendbuf, void *recvbuf,
         if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
     }
 
-    COLL_BASE_FREE(allocator, child_buf);
-    COLL_BASE_FREE(allocator, reduce_buf);
+    COLL_SESSION_FREE(session, child_buf);
+    COLL_SESSION_FREE(session, reduce_buf);
     return MPI_SUCCESS;
 
  err_hndl:
     if (NULL != child_buf) {
-        COLL_BASE_FREE(allocator, child_buf);
+        COLL_SESSION_FREE(session, child_buf);
         child_buf = NULL;
         child_buf_start = NULL;
     }
     if (NULL != reduce_buf) {
-        COLL_BASE_FREE(allocator, reduce_buf);
+        COLL_SESSION_FREE(session, reduce_buf);
         reduce_buf = NULL;
         reduce_buf_start = NULL;
     }
diff --git a/ompi/mca/coll/base/coll_base_reduce_scatter.c b/ompi/mca/coll/base/coll_base_reduce_scatter.c
index 2d9119bb1da..62930cccedd 100644
--- a/ompi/mca/coll/base/coll_base_reduce_scatter.c
+++ b/ompi/mca/coll/base/coll_base_reduce_scatter.c
@@ -34,6 +34,7 @@
 #include "ompi/mca/coll/base/coll_tags.h"
 #include "ompi/mca/pml/pml.h"
 #include "ompi/op/op.h"
+#include "ompi/op/op_gpu_session.h"
 #include "ompi/mca/coll/base/coll_base_functions.h"
 #include "coll_base_topo.h"
 #include "coll_base_util.h"
@@ -50,7 +51,7 @@ int ompi_coll_base_reduce_scatter_intra_nonoverlapping(const void *sbuf, void *r
                                                         struct ompi_op_t *op,
                                                         struct ompi_communicator_t *comm,
                                                         mca_coll_base_module_t *module,
-                                                        mca_allocator_base_module_t *allocator)
+                                                        ompi_op_gpu_session_t *session)
 {
     int err, i, rank, size, total_count;
     ptrdiff_t *displs = NULL;
@@ -83,14 +84,14 @@ int ompi_coll_base_reduce_scatter_intra_nonoverlapping(const void *sbuf, void *r
             ptrdiff_t dsize, gap = 0;
             dsize = opal_datatype_span(&dtype->super, total_count, &gap);
 
-            tmprbuf_free = (char*) COLL_BASE_ALLOC(allocator, dsize);
+            tmprbuf_free = (char*) COLL_SESSION_ALLOC(session, dsize);
             tmprbuf = tmprbuf_free - gap;
         }
         err = comm->c_coll->coll_reduce (sbuf, tmprbuf, total_count,
                                         dtype, op, root, comm, comm->c_coll->coll_reduce_module);
     }
     if (MPI_SUCCESS != err) {
-        COLL_BASE_FREE(allocator, tmprbuf_free);
+        COLL_SESSION_FREE(session, tmprbuf_free);
         return err;
     }
 
@@ -110,7 +111,7 @@ int ompi_coll_base_reduce_scatter_intra_nonoverlapping(const void *sbuf, void *r
                                            root, comm, comm->c_coll->coll_scatterv_module);
     }
     free(displs);
-    COLL_BASE_FREE(allocator, tmprbuf_free);
+    COLL_SESSION_FREE(session, tmprbuf_free);
 
     return err;
 }
@@ -140,7 +141,7 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf,
                                                             struct ompi_op_t *op,
                                                             struct ompi_communicator_t *comm,
                                                             mca_coll_base_module_t *module,
-                                                            mca_allocator_base_module_t *allocator)
+                                                            ompi_op_gpu_session_t *session)
 {
     int i, rank, size, err = OMPI_SUCCESS;
     int tmp_size, remain = 0, tmp_rank;
@@ -182,7 +183,7 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf,
     }
 
     /* Allocate temporary receive buffer. */
-    recv_buf_free = (char*) COLL_BASE_ALLOC(allocator, buf_size);
+    recv_buf_free = (char*) COLL_SESSION_ALLOC(session, buf_size);
     recv_buf = recv_buf_free - gap;
     if (NULL == recv_buf_free) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
@@ -190,7 +191,7 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf,
     }
 
     /* allocate temporary buffer for results */
-    result_buf_free = (char*) COLL_BASE_ALLOC(allocator, buf_size);
+    result_buf_free = (char*) COLL_SESSION_ALLOC(session, buf_size);
     result_buf = result_buf_free - gap;
 
     /* copy local buffer into the temporary results */
@@ -223,7 +224,7 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf,
                                     comm, MPI_STATUS_IGNORE));
 
             /* integrate their results into our temp results */
-            ompi_op_reduce(op, recv_buf, result_buf, count, dtype);
+            COLL_BASE_REDUCE(session, op, recv_buf, result_buf, count, dtype);
 
             /* adjust rank to be the bottom "remain" ranks */
             tmp_rank = rank / 2;
@@ -341,10 +342,10 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf,
                     goto cleanup;
                 }
 
-                ompi_op_reduce(op,
-                               recv_buf + tmp_disps[recv_index] * extent,
-                               result_buf + tmp_disps[recv_index] * extent,
-                               recv_count, dtype);
+                COLL_BASE_REDUCE(session, op,
+                                 recv_buf + tmp_disps[recv_index] * extent,
+                                 result_buf + tmp_disps[recv_index] * extent,
+                                 recv_count, dtype);
             }
 
             /* update for next iteration */
@@ -393,8 +394,8 @@ ompi_coll_base_reduce_scatter_intra_basic_recursivehalving( const void *sbuf,
 
  cleanup:
     if (NULL != disps) free(disps);
-    COLL_BASE_FREE(allocator, recv_buf_free);
-    COLL_BASE_FREE(allocator, result_buf_free);
+    COLL_SESSION_FREE(session, recv_buf_free);
+    COLL_SESSION_FREE(session, result_buf_free);
 
     return err;
 }
@@ -467,7 +468,7 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou
                                           struct ompi_op_t *op,
                                           struct ompi_communicator_t *comm,
                                           mca_coll_base_module_t *module,
-                                          mca_allocator_base_module_t *allocator)
+                                          ompi_op_gpu_session_t *session)
 {
     int ret, line, rank, size, i, k, recv_from, send_to;
     int inbi;
@@ -521,15 +522,15 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou
     max_real_segsize = opal_datatype_span(&dtype->super, max_block_count, &gap);
     dsize = opal_datatype_span(&dtype->super, total_count, &gap);
 
-    accumbuf_free = (char*)COLL_BASE_ALLOC(allocator, dsize);
+    accumbuf_free = (char*)COLL_SESSION_ALLOC(session, dsize);
     if (NULL == accumbuf_free) { ret = -1; line = __LINE__; goto error_hndl; }
     accumbuf = accumbuf_free - gap;
 
-    inbuf_free[0] = (char*)COLL_BASE_ALLOC(allocator, max_real_segsize);
+    inbuf_free[0] = (char*)COLL_SESSION_ALLOC(session, max_real_segsize);
     if (NULL == inbuf_free[0]) { ret = -1; line = __LINE__; goto error_hndl; }
     inbuf[0] = inbuf_free[0] - gap;
     if (size > 2) {
-        inbuf_free[1] = (char*)COLL_BASE_ALLOC(allocator, max_real_segsize);
+        inbuf_free[1] = (char*)COLL_SESSION_ALLOC(session, max_real_segsize);
         if (NULL == inbuf_free[1]) { ret = -1; line = __LINE__; goto error_hndl; }
         inbuf[1] = inbuf_free[1] - gap;
     }
@@ -594,7 +595,7 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou
            rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock]
         */
         tmprecv = accumbuf + displs[prevblock] * extent;
-        ompi_op_reduce(op, inbuf[inbi ^ 0x1], tmprecv, ompi_count_array_get(rcounts, prevblock), dtype);
+        COLL_BASE_REDUCE(session, op, inbuf[inbi ^ 0x1], tmprecv, ompi_count_array_get(rcounts, prevblock), dtype);
 
         /* send previous block to send_to */
         ret = MCA_PML_CALL(send(tmprecv, ompi_count_array_get(rcounts, prevblock), dtype, send_to,
@@ -610,7 +611,7 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou
     /* Apply operation on the last block (my block)
        rbuf[rank] = inbuf[inbi] (op) rbuf[rank] */
     tmprecv = accumbuf + displs[rank] * extent;
-    ompi_op_reduce(op, inbuf[inbi], tmprecv, ompi_count_array_get(rcounts, rank), dtype);
+    COLL_BASE_REDUCE(session, op, inbuf[inbi], tmprecv, ompi_count_array_get(rcounts, rank), dtype);
 
     /* Copy result from tmprecv to rbuf */
     ret = ompi_datatype_copy_content_same_ddt(dtype, ompi_count_array_get(rcounts, rank),
@@ -618,9 +619,9 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou
     if (ret < 0) { line = __LINE__; goto error_hndl; }
 
     if (NULL != displs) free(displs);
-    COLL_BASE_FREE(allocator, accumbuf_free);
-    COLL_BASE_FREE(allocator, inbuf_free[0]);
-    COLL_BASE_FREE(allocator, inbuf_free[1]);
+    COLL_SESSION_FREE(session, accumbuf_free);
+    COLL_SESSION_FREE(session, inbuf_free[0]);
+    COLL_SESSION_FREE(session, inbuf_free[1]);
 
     return MPI_SUCCESS;
 
@@ -629,9 +630,9 @@ ompi_coll_base_reduce_scatter_intra_ring( const void *sbuf, void *rbuf, ompi_cou
                  __FILE__, line, rank, ret));
     (void)line;  // silence compiler warning
     if (NULL != displs) free(displs);
-    COLL_BASE_FREE(allocator, accumbuf_free);
-    COLL_BASE_FREE(allocator, inbuf_free[0]);
-    COLL_BASE_FREE(allocator, inbuf_free[1]);
+    COLL_SESSION_FREE(session, accumbuf_free);
+    COLL_SESSION_FREE(session, inbuf_free[0]);
+    COLL_SESSION_FREE(session, inbuf_free[1]);
     return ret;
 }
 
@@ -704,7 +705,7 @@ int
 ompi_coll_base_reduce_scatter_intra_butterfly(
     const void *sbuf, void *rbuf, ompi_count_array_t rcounts, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module, mca_allocator_base_module_t *allocator)
+    mca_coll_base_module_t *module, ompi_op_gpu_session_t *session)
 {
     char *tmpbuf[2] = {NULL, NULL}, *psend, *precv;
     ptrdiff_t *displs = NULL, index;
@@ -732,8 +733,8 @@ ompi_coll_base_reduce_scatter_intra_butterfly(
 
     ompi_datatype_type_extent(dtype, &extent);
     span = opal_datatype_span(&dtype->super, totalcount, &gap);
-    tmpbuf[0] = COLL_BASE_ALLOC(allocator, span);
-    tmpbuf[1] = COLL_BASE_ALLOC(allocator, span);
+    tmpbuf[0] = COLL_SESSION_ALLOC(session, span);
+    tmpbuf[1] = COLL_SESSION_ALLOC(session, span);
     if (NULL == tmpbuf[0] || NULL == tmpbuf[1]) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -784,7 +785,7 @@ ompi_coll_base_reduce_scatter_intra_butterfly(
                                     MCA_COLL_BASE_TAG_REDUCE_SCATTER,
                                     comm, MPI_STATUS_IGNORE));
             if (OMPI_SUCCESS != err) { goto cleanup_and_return; }
-            ompi_op_reduce(op, precv, psend, totalcount, dtype);
+            COLL_BASE_REDUCE(session, op, precv, psend, totalcount, dtype);
             /* Adjust rank to be the bottom "remain" ranks */
             vrank = rank / 2;
         }
@@ -840,15 +841,15 @@ ompi_coll_base_reduce_scatter_intra_butterfly(
 
             if (vrank < vpeer) {
                 /* precv = psend <op> precv */
-                ompi_op_reduce(op, psend + rdispl * extent,
-                               precv + rdispl * extent, recv_count, dtype);
+                COLL_BASE_REDUCE(session, op, psend + rdispl * extent,
+                                 precv + rdispl * extent, recv_count, dtype);
                 char *p = psend;
                 psend = precv;
                 precv = p;
             } else {
                 /* psend = precv <op> psend */
-                ompi_op_reduce(op, precv + rdispl * extent,
-                               psend + rdispl * extent, recv_count, dtype);
+                COLL_BASE_REDUCE(session, op, precv + rdispl * extent,
+                                 psend + rdispl * extent, recv_count, dtype);
             }
             send_index = recv_index;
         }
@@ -902,7 +903,7 @@ ompi_coll_base_reduce_scatter_intra_butterfly(
 cleanup_and_return:
     if (displs)
         free(displs);
-    COLL_BASE_FREE(allocator, tmpbuf[0]);
-    COLL_BASE_FREE(allocator, tmpbuf[1]);
+    COLL_SESSION_FREE(session, tmpbuf[0]);
+    COLL_SESSION_FREE(session, tmpbuf[1]);
     return err;
 }
diff --git a/ompi/mca/coll/base/coll_base_reduce_scatter_block.c b/ompi/mca/coll/base/coll_base_reduce_scatter_block.c
index 47b143a24ec..19319b67ff8 100644
--- a/ompi/mca/coll/base/coll_base_reduce_scatter_block.c
+++ b/ompi/mca/coll/base/coll_base_reduce_scatter_block.c
@@ -38,6 +38,7 @@
 #include "ompi/mca/coll/basic/coll_basic.h"
 #include "ompi/mca/pml/pml.h"
 #include "ompi/op/op.h"
+#include "ompi/op/op_gpu_session.h"
 #include "coll_tags.h"
 #include "coll_base_functions.h"
 #include "coll_base_topo.h"
@@ -60,7 +61,7 @@ ompi_coll_base_reduce_scatter_block_basic_linear(const void *sbuf, void *rbuf, s
                                                  struct ompi_op_t *op,
                                                  struct ompi_communicator_t *comm,
                                                  mca_coll_base_module_t *module,
-                                                 mca_allocator_base_module_t *allocator)
+                                                 ompi_op_gpu_session_t *session)
 {
     int rank, size, err = OMPI_SUCCESS;
     size_t count;
@@ -102,7 +103,7 @@ ompi_coll_base_reduce_scatter_block_basic_linear(const void *sbuf, void *rbuf, s
         if (0 == rank) {
             /* temporary receive buffer.  See coll_basic_reduce.c for
                details on sizing */
-            recv_buf_free = (char*) COLL_BASE_ALLOC(allocator, span);
+            recv_buf_free = (char*) COLL_SESSION_ALLOC(session, span);
             if (NULL == recv_buf_free) {
                 err = OMPI_ERR_OUT_OF_RESOURCE;
                 goto cleanup;
@@ -152,7 +153,7 @@ ompi_coll_base_reduce_scatter_block_basic_linear(const void *sbuf, void *rbuf, s
         if (0 == rank) {
             /* temporary receive buffer.  See coll_basic_reduce.c for
                details on sizing */
-            recv_buf_free = (char*) COLL_BASE_ALLOC(allocator, span);
+            recv_buf_free = (char*) COLL_SESSION_ALLOC(session, span);
             if (NULL == recv_buf_free) {
                 err = OMPI_ERR_OUT_OF_RESOURCE;
                 goto cleanup;
@@ -175,7 +176,7 @@ ompi_coll_base_reduce_scatter_block_basic_linear(const void *sbuf, void *rbuf, s
     }
 
  cleanup:
-    COLL_BASE_FREE(allocator, recv_buf_free);
+    COLL_SESSION_FREE(session, recv_buf_free);
 
     return err;
 }
@@ -199,7 +200,7 @@ int
 ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(
     const void *sbuf, void *rbuf, size_t rcount, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module, mca_allocator_base_module_t *allocator)
+    mca_coll_base_module_t *module, ompi_op_gpu_session_t *session)
 {
     struct ompi_datatype_t *dtypesend = NULL, *dtyperecv = NULL;
     char *tmprecv_raw = NULL, *tmpbuf_raw = NULL, *tmprecv, *tmpbuf;
@@ -226,12 +227,12 @@ ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(
          * will overflow an int data type.
          * Fallback to the linear algorithm.
          */
-        return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount, dtype, op, comm, module, allocator);
+        return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount, dtype, op, comm, module, session);
     }
     ompi_datatype_type_extent(dtype, &extent);
     span = opal_datatype_span(&dtype->super, totalcount, &gap);
-    tmpbuf_raw = COLL_BASE_ALLOC(allocator, span);
-    tmprecv_raw = COLL_BASE_ALLOC(allocator, span);
+    tmpbuf_raw = COLL_SESSION_ALLOC(session, span);
+    tmprecv_raw = COLL_SESSION_ALLOC(session, span);
     if (NULL == tmpbuf_raw || NULL == tmprecv_raw) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -341,15 +342,15 @@ ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(
         if (is_block_received) {
             /* After reduction the result must be in tmpbuf */
             if (is_commutative || (remote_tree_root < cur_tree_root)) {
-                ompi_op_reduce(op, tmprecv, tmpbuf, blocklens[0], dtype);
-                ompi_op_reduce(op, tmprecv + (ptrdiff_t)displs[1] * extent,
-                               tmpbuf + (ptrdiff_t)displs[1] * extent,
-                               blocklens[1], dtype);
+                COLL_BASE_REDUCE(session, op, tmprecv, tmpbuf, blocklens[0], dtype);
+                COLL_BASE_REDUCE(session, op, tmprecv + (ptrdiff_t)displs[1] * extent,
+                                 tmpbuf + (ptrdiff_t)displs[1] * extent,
+                                 blocklens[1], dtype);
             } else {
-                ompi_op_reduce(op, tmpbuf, tmprecv, blocklens[0], dtype);
-                ompi_op_reduce(op, tmpbuf + (ptrdiff_t)displs[1] * extent,
-                               tmprecv + (ptrdiff_t)displs[1] * extent,
-                               blocklens[1], dtype);
+                COLL_BASE_REDUCE(session, op, tmpbuf, tmprecv, blocklens[0], dtype);
+                COLL_BASE_REDUCE(session, op, tmpbuf + (ptrdiff_t)displs[1] * extent,
+                                 tmprecv + (ptrdiff_t)displs[1] * extent,
+                                 blocklens[1], dtype);
                 err = ompi_datatype_copy_content_same_ddt(dtyperecv, 1,
                                                           tmpbuf, tmprecv);
                 if (MPI_SUCCESS != err) { goto cleanup_and_return; }
@@ -369,8 +370,8 @@ ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(
         ompi_datatype_destroy(&dtypesend);
     if (dtyperecv)
         ompi_datatype_destroy(&dtyperecv);
-    COLL_BASE_FREE(allocator, tmpbuf_raw);
-    COLL_BASE_FREE(allocator, tmprecv_raw);
+    COLL_SESSION_FREE(session, tmpbuf_raw);
+    COLL_SESSION_FREE(session, tmprecv_raw);
     return err;
 }
 
@@ -405,7 +406,7 @@ int
 ompi_coll_base_reduce_scatter_block_intra_recursivehalving(
     const void *sbuf, void *rbuf, size_t rcount, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module, mca_allocator_base_module_t *allocator)
+    mca_coll_base_module_t *module, ompi_op_gpu_session_t *session)
 {
     char *tmprecv_raw = NULL, *tmpbuf_raw = NULL, *tmprecv, *tmpbuf;
     ptrdiff_t span, gap, totalcount, extent;
@@ -424,14 +425,14 @@ ompi_coll_base_reduce_scatter_block_intra_recursivehalving(
                      "coll:base:reduce_scatter_block_intra_recursivehalving: rank %d/%d "
                      "switching to basic reduce_scatter_block", rank, comm_size));
         return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount, dtype,
-                                                                op, comm, module, allocator);
+                                                                op, comm, module, session);
     }
 
     totalcount = comm_size * (size_t)rcount;
     ompi_datatype_type_extent(dtype, &extent);
     span = opal_datatype_span(&dtype->super, totalcount, &gap);
-    tmpbuf_raw = COLL_BASE_ALLOC(allocator, span);
-    tmprecv_raw = COLL_BASE_ALLOC(allocator, span);
+    tmpbuf_raw = COLL_SESSION_ALLOC(session, span);
+    tmprecv_raw = COLL_SESSION_ALLOC(session, span);
     if (NULL == tmpbuf_raw || NULL == tmprecv_raw) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -481,7 +482,7 @@ ompi_coll_base_reduce_scatter_block_intra_recursivehalving(
                                     MCA_COLL_BASE_TAG_REDUCE_SCATTER_BLOCK,
                                     comm, MPI_STATUS_IGNORE));
             if (OMPI_SUCCESS != err) { goto cleanup_and_return; }
-            ompi_op_reduce(op, tmprecv, tmpbuf, totalcount, dtype);
+            COLL_BASE_REDUCE(session, op, tmprecv, tmpbuf, totalcount, dtype);
             /* Adjust rank to be the bottom "remain" ranks */
             vrank = rank / 2;
         }
@@ -545,8 +546,8 @@ ompi_coll_base_reduce_scatter_block_intra_recursivehalving(
             if (recv_count > 0) {
                 err = ompi_request_wait(&request, MPI_STATUS_IGNORE);
                 if (OMPI_SUCCESS != err) { goto cleanup_and_return; }
-                ompi_op_reduce(op, tmprecv + rdispl * extent,
-                               tmpbuf + rdispl * extent, recv_count, dtype);
+                COLL_BASE_REDUCE(session, op, tmprecv + rdispl * extent,
+                                 tmpbuf + rdispl * extent, recv_count, dtype);
             }
             send_index = recv_index;
             last_index = recv_index + mask;
@@ -575,15 +576,15 @@ ompi_coll_base_reduce_scatter_block_intra_recursivehalving(
     }
 
 cleanup_and_return:
-    COLL_BASE_FREE(allocator, tmpbuf_raw);
-    COLL_BASE_FREE(allocator, tmprecv_raw);
+    COLL_SESSION_FREE(session, tmpbuf_raw);
+    COLL_SESSION_FREE(session, tmprecv_raw);
     return err;
 }
 
 static int ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2(
     const void *sbuf, void *rbuf, size_t rcount, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module, mca_allocator_base_module_t *allocator);
+    mca_coll_base_module_t *module, ompi_op_gpu_session_t *session);
 
 /*
  * ompi_coll_base_reduce_scatter_block_intra_butterfly
@@ -645,7 +646,7 @@ int
 ompi_coll_base_reduce_scatter_block_intra_butterfly(
     const void *sbuf, void *rbuf, size_t rcount, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module, mca_allocator_base_module_t *allocator)
+    mca_coll_base_module_t *module, ompi_op_gpu_session_t *session)
 {
     char *tmpbuf[2] = {NULL, NULL}, *psend, *precv;
     ptrdiff_t span, gap, totalcount, extent;
@@ -662,14 +663,14 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly(
     if (!(comm_size & (comm_size - 1))) {
         /* Special case: comm_size is a power of two */
         return ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2(
-                   sbuf, rbuf, rcount, dtype, op, comm, module, allocator);
+                   sbuf, rbuf, rcount, dtype, op, comm, module, session);
     }
 
     totalcount = comm_size * (size_t)rcount;
     ompi_datatype_type_extent(dtype, &extent);
     span = opal_datatype_span(&dtype->super, totalcount, &gap);
-    tmpbuf[0] = COLL_BASE_ALLOC(allocator, span);
-    tmpbuf[1] = COLL_BASE_ALLOC(allocator, span);
+    tmpbuf[0] = COLL_SESSION_ALLOC(session, span);
+    tmpbuf[1] = COLL_SESSION_ALLOC(session, span);
     if (NULL == tmpbuf[0] || NULL == tmpbuf[1]) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -720,7 +721,7 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly(
                                     MCA_COLL_BASE_TAG_REDUCE_SCATTER_BLOCK,
                                     comm, MPI_STATUS_IGNORE));
             if (OMPI_SUCCESS != err) { goto cleanup_and_return; }
-            ompi_op_reduce(op, precv, psend, totalcount, dtype);
+            COLL_BASE_REDUCE(session, op, precv, psend, totalcount, dtype);
             /* Adjust rank to be the bottom "remain" ranks */
             vrank = rank / 2;
         }
@@ -777,15 +778,15 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly(
 
             if (vrank < vpeer) {
                 /* precv = psend <op> precv */
-                ompi_op_reduce(op, psend + (ptrdiff_t)rdispl * extent,
-                               precv + (ptrdiff_t)rdispl * extent, recv_count, dtype);
+                COLL_BASE_REDUCE(session, op, psend + (ptrdiff_t)rdispl * extent,
+                                 precv + (ptrdiff_t)rdispl * extent, recv_count, dtype);
                 char *p = psend;
                 psend = precv;
                 precv = p;
             } else {
                 /* psend = precv <op> psend */
-                ompi_op_reduce(op, precv + (ptrdiff_t)rdispl * extent,
-                               psend + (ptrdiff_t)rdispl * extent, recv_count, dtype);
+                COLL_BASE_REDUCE(session, op, precv + (ptrdiff_t)rdispl * extent,
+                                 psend + (ptrdiff_t)rdispl * extent, recv_count, dtype);
             }
             send_index = recv_index;
         }
@@ -840,8 +841,8 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly(
     }
 
 cleanup_and_return:
-    COLL_BASE_FREE(allocator, tmpbuf[0]);
-    COLL_BASE_FREE(allocator, tmpbuf[1]);
+    COLL_SESSION_FREE(session, tmpbuf[0]);
+    COLL_SESSION_FREE(session, tmpbuf[1]);
     return err;
 }
 
@@ -890,7 +891,7 @@ static int
 ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2(
     const void *sbuf, void *rbuf, size_t rcount, struct ompi_datatype_t *dtype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module, mca_allocator_base_module_t *allocator)
+    mca_coll_base_module_t *module, ompi_op_gpu_session_t *session)
 {
     char *tmpbuf[2] = {NULL, NULL}, *psend, *precv;
     ptrdiff_t span, gap, totalcount, extent;
@@ -904,8 +905,8 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2(
     totalcount = comm_size * (size_t)rcount;
     ompi_datatype_type_extent(dtype, &extent);
     span = opal_datatype_span(&dtype->super, totalcount, &gap);
-    tmpbuf[0] = COLL_BASE_ALLOC(allocator, span);
-    tmpbuf[1] = COLL_BASE_ALLOC(allocator, span);
+    tmpbuf[0] = COLL_SESSION_ALLOC(session, span);
+    tmpbuf[1] = COLL_SESSION_ALLOC(session, span);
     if (NULL == tmpbuf[0] || NULL == tmpbuf[1]) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -946,15 +947,15 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2(
 
         if (rank < peer) {
             /* precv = psend <op> precv */
-            ompi_op_reduce(op, psend + (ptrdiff_t)recv_index * extent,
-                           precv + (ptrdiff_t)recv_index * extent, nblocks, dtype);
+            COLL_BASE_REDUCE(session, op, psend + (ptrdiff_t)recv_index * extent,
+                             precv + (ptrdiff_t)recv_index * extent, nblocks, dtype);
             char *p = psend;
             psend = precv;
             precv = p;
         } else {
             /* psend = precv <op> psend */
-            ompi_op_reduce(op, precv + (ptrdiff_t)recv_index * extent,
-                           psend + (ptrdiff_t)recv_index * extent, nblocks, dtype);
+            COLL_BASE_REDUCE(session, op, precv + (ptrdiff_t)recv_index * extent,
+                             psend + (ptrdiff_t)recv_index * extent, nblocks, dtype);
         }
         send_index = recv_index;
     }
@@ -964,7 +965,7 @@ ompi_coll_base_reduce_scatter_block_intra_butterfly_pof2(
     if (MPI_SUCCESS != err) { goto cleanup_and_return; }
 
 cleanup_and_return:
-    COLL_BASE_FREE(allocator, tmpbuf[0]);
-    COLL_BASE_FREE(allocator, tmpbuf[1]);
+    COLL_SESSION_FREE(session, tmpbuf[0]);
+    COLL_SESSION_FREE(session, tmpbuf[1]);
     return err;
 }
diff --git a/ompi/mca/coll/base/coll_base_scan.c b/ompi/mca/coll/base/coll_base_scan.c
index 0d69ec8062f..3cf663432ee 100644
--- a/ompi/mca/coll/base/coll_base_scan.c
+++ b/ompi/mca/coll/base/coll_base_scan.c
@@ -23,6 +23,7 @@
 #include "ompi/mca/coll/base/coll_base_util.h"
 #include "ompi/mca/pml/pml.h"
 #include "ompi/op/op.h"
+#include "ompi/op/op_gpu_session.h"
 
 /*
  * ompi_coll_base_scan_intra_linear
@@ -157,7 +158,7 @@ ompi_coll_base_scan_intra_linear(const void *sbuf, void *rbuf, size_t count,
 int ompi_coll_base_scan_intra_recursivedoubling(
     const void *sendbuf, void *recvbuf, size_t count, struct ompi_datatype_t *datatype,
     struct ompi_op_t *op, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module, mca_allocator_base_module_t *allocator)
+    mca_coll_base_module_t *module, ompi_op_gpu_session_t *session)
 {
     int err = MPI_SUCCESS;
     char *tmpsend_raw = NULL, *tmprecv_raw = NULL;
@@ -179,8 +180,8 @@ int ompi_coll_base_scan_intra_recursivedoubling(
 
     ptrdiff_t dsize, gap;
     dsize = opal_datatype_span(&datatype->super, count, &gap);
-    tmpsend_raw = COLL_BASE_ALLOC(allocator, dsize);
-    tmprecv_raw = COLL_BASE_ALLOC(allocator, dsize);
+    tmpsend_raw = COLL_SESSION_ALLOC(session, dsize);
+    tmprecv_raw = COLL_SESSION_ALLOC(session, dsize);
     if (NULL == tmpsend_raw || NULL == tmprecv_raw) {
         err = OMPI_ERR_OUT_OF_RESOURCE;
         goto cleanup_and_return;
@@ -203,16 +204,16 @@ int ompi_coll_base_scan_intra_recursivedoubling(
 
             if (rank > remote) {
                 /* Accumulate prefix reduction: recvbuf = precv <op> recvbuf */
-                ompi_op_reduce(op, precv, recvbuf, count, datatype);
+                COLL_BASE_REDUCE(session, op, precv, recvbuf, count, datatype);
                 /* Partial result: psend = precv <op> psend */
-                ompi_op_reduce(op, precv, psend, count, datatype);
+                COLL_BASE_REDUCE(session, op, precv, psend, count, datatype);
             } else {
                 if (is_commute) {
                     /* psend = precv <op> psend */
-                    ompi_op_reduce(op, precv, psend, count, datatype);
+                    COLL_BASE_REDUCE(session, op, precv, psend, count, datatype);
                 } else {
                     /* precv = psend <op> precv */
-                    ompi_op_reduce(op, psend, precv, count, datatype);
+                    COLL_BASE_REDUCE(session, op, psend, precv, count, datatype);
                     char *tmp = psend;
                     psend = precv;
                     precv = tmp;
@@ -222,7 +223,7 @@ int ompi_coll_base_scan_intra_recursivedoubling(
     }
 
 cleanup_and_return:
-    COLL_BASE_FREE(allocator, tmpsend_raw);
-    COLL_BASE_FREE(allocator, tmprecv_raw);
+    COLL_SESSION_FREE(session, tmpsend_raw);
+    COLL_SESSION_FREE(session, tmprecv_raw);
     return err;
 }
diff --git a/ompi/mca/coll/tuned/coll_tuned.h b/ompi/mca/coll/tuned/coll_tuned.h
index 6446821093c..59c1d0ee1f8 100644
--- a/ompi/mca/coll/tuned/coll_tuned.h
+++ b/ompi/mca/coll/tuned/coll_tuned.h
@@ -26,6 +26,7 @@
 #include "ompi/request/request.h"
 #include "ompi/mca/coll/base/coll_base_functions.h"
 #include "opal/util/output.h"
+#include "ompi/op/op_gpu_session.h"
 
 /* also need the dynamic rule structures */
 #include "coll_tuned_dynamic_rules.h"
@@ -115,7 +116,7 @@ int ompi_coll_tuned_allgatherv_intra_check_forced_init(coll_tuned_force_algorith
 int ompi_coll_tuned_allreduce_intra_dec_fixed(ALLREDUCE_ARGS);
 int ompi_coll_tuned_allreduce_intra_disjoint_dec_fixed(ALLREDUCE_ARGS);
 int ompi_coll_tuned_allreduce_intra_dec_dynamic(ALLREDUCE_ARGS);
-int ompi_coll_tuned_allreduce_intra_do_this(ALLREDUCE_ARGS, int algorithm, int faninout, int segsize, mca_allocator_base_module_t *allocator);
+int ompi_coll_tuned_allreduce_intra_do_this(ALLREDUCE_ARGS, int algorithm, int faninout, int segsize, ompi_op_gpu_session_t *session);
 int ompi_coll_tuned_allreduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 /* AlltoAll */
@@ -152,19 +153,19 @@ int ompi_coll_tuned_gather_intra_check_forced_init (coll_tuned_force_algorithm_m
 /* Reduce */
 int ompi_coll_tuned_reduce_intra_dec_fixed(REDUCE_ARGS);
 int ompi_coll_tuned_reduce_intra_dec_dynamic(REDUCE_ARGS);
-int ompi_coll_tuned_reduce_intra_do_this(REDUCE_ARGS, int algorithm, int faninout, int segsize, int max_oustanding_reqs, mca_allocator_base_module_t *allocator);
+int ompi_coll_tuned_reduce_intra_do_this(REDUCE_ARGS, int algorithm, int faninout, int segsize, int max_oustanding_reqs, ompi_op_gpu_session_t *session);
 int ompi_coll_tuned_reduce_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 /* Reduce_scatter */
 int ompi_coll_tuned_reduce_scatter_intra_dec_fixed(REDUCESCATTER_ARGS);
 int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(REDUCESCATTER_ARGS);
-int ompi_coll_tuned_reduce_scatter_intra_do_this(REDUCESCATTER_ARGS, int algorithm, int faninout, int segsize, mca_allocator_base_module_t *allocator);
+int ompi_coll_tuned_reduce_scatter_intra_do_this(REDUCESCATTER_ARGS, int algorithm, int faninout, int segsize, ompi_op_gpu_session_t *session);
 int ompi_coll_tuned_reduce_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 /* Reduce_scatter_block */
 int ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed(REDUCESCATTERBLOCK_ARGS);
 int ompi_coll_tuned_reduce_scatter_block_intra_dec_dynamic(REDUCESCATTERBLOCK_ARGS);
-int ompi_coll_tuned_reduce_scatter_block_intra_do_this(REDUCESCATTERBLOCK_ARGS, int algorithm, int faninout, int segsize, mca_allocator_base_module_t *allocator);
+int ompi_coll_tuned_reduce_scatter_block_intra_do_this(REDUCESCATTERBLOCK_ARGS, int algorithm, int faninout, int segsize, ompi_op_gpu_session_t *session);
 int ompi_coll_tuned_reduce_scatter_block_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 /* Scatter */
@@ -176,13 +177,13 @@ int ompi_coll_tuned_scatter_intra_check_forced_init (coll_tuned_force_algorithm_
 /* Exscan */
 int ompi_coll_tuned_exscan_intra_dec_fixed(EXSCAN_ARGS);
 int ompi_coll_tuned_exscan_intra_dec_dynamic(EXSCAN_ARGS);
-int ompi_coll_tuned_exscan_intra_do_this(EXSCAN_ARGS, int algorithm, mca_allocator_base_module_t *allocator);
+int ompi_coll_tuned_exscan_intra_do_this(EXSCAN_ARGS, int algorithm, ompi_op_gpu_session_t *session);
 int ompi_coll_tuned_exscan_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 /* Scan */
 int ompi_coll_tuned_scan_intra_dec_fixed(SCAN_ARGS);
 int ompi_coll_tuned_scan_intra_dec_dynamic(SCAN_ARGS);
-int ompi_coll_tuned_scan_intra_do_this(SCAN_ARGS, int algorithm, mca_allocator_base_module_t *allocator);
+int ompi_coll_tuned_scan_intra_do_this(SCAN_ARGS, int algorithm, ompi_op_gpu_session_t *session);
 int ompi_coll_tuned_scan_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 struct mca_coll_tuned_component_t {
diff --git a/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c b/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c
index 0b38ae01f24..113779c90b7 100644
--- a/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_allreduce_decision.c
@@ -131,7 +131,7 @@ int ompi_coll_tuned_allreduce_intra_do_this(const void *sbuf, void *rbuf, size_t
                                             struct ompi_communicator_t *comm,
                                             mca_coll_base_module_t *module,
                                             int algorithm, int faninout, int segsize,
-                                            mca_allocator_base_module_t *allocator)
+                                            ompi_op_gpu_session_t *session)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:allreduce_intra_do_this algorithm %d topo fan in/out %d segsize %d",
@@ -145,15 +145,15 @@ int ompi_coll_tuned_allreduce_intra_do_this(const void *sbuf, void *rbuf, size_t
     case (2):
         return ompi_coll_base_allreduce_intra_nonoverlapping(sbuf, rbuf, count, dtype, op, comm, module);
     case (3):
-        return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, comm, module, allocator);
+        return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, comm, module, session);
     case (4):
-        return ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype, op, comm, module, allocator);
+        return ompi_coll_base_allreduce_intra_ring(sbuf, rbuf, count, dtype, op, comm, module, session);
     case (5):
-        return ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf, count, dtype, op, comm, module, segsize, allocator);
+        return ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf, count, dtype, op, comm, module, segsize, session);
     case (6):
-        return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, comm, module, allocator);
+        return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, comm, module, session);
     case (7):
-        return ompi_coll_base_allreduce_intra_allgather_reduce(sbuf, rbuf, count, dtype, op, comm, module, allocator);
+        return ompi_coll_base_allreduce_intra_allgather_reduce(sbuf, rbuf, count, dtype, op, comm, module, session);
     } /* switch */
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:allreduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c b/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c
index 30e5fdb8b78..31204c472bb 100644
--- a/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c
+++ b/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c
@@ -65,8 +65,7 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (const void *sbuf, void *rbuf, size_
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "ompi_coll_tuned_allreduce_intra_dec_dynamic"));
 
-    /* Scratch buffers are used for reductions (ompi_op_reduce); device-side
-     * reduction is not yet supported, so always use the host allocator (NULL). */
+    /* session=NULL uses host ompi_op_reduce path. */
 
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[ALLREDUCE].algorithm) {
@@ -323,8 +322,7 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( const void *sbuf, void *rbuf,
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_intra_dec_dynamic"));
 
-    /* Scratch buffers are used for reductions (ompi_op_reduce); device-side
-     * reduction is not yet supported, so always use the host allocator (NULL). */
+    /* session=NULL uses host ompi_op_reduce path. */
 
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[REDUCE].algorithm) {
@@ -384,8 +382,7 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(const void *sbuf, void *rbu
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_scatter_intra_dec_dynamic"));
 
-    /* Scratch buffers are used for reductions (ompi_op_reduce); device-side
-     * reduction is not yet supported, so always use the host allocator (NULL). */
+    /* session=NULL uses host ompi_op_reduce path. */
 
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[REDUCESCATTER].algorithm) {
@@ -444,8 +441,7 @@ int ompi_coll_tuned_reduce_scatter_block_intra_dec_dynamic(const void *sbuf, voi
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_scatter_block_intra_dec_dynamic"));
 
-    /* Scratch buffers are used for reductions (ompi_op_reduce); device-side
-     * reduction is not yet supported, so always use the host allocator (NULL). */
+    /* session=NULL uses host ompi_op_reduce path. */
 
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[REDUCESCATTERBLOCK].algorithm) {
@@ -735,8 +731,7 @@ int ompi_coll_tuned_exscan_intra_dec_dynamic(const void *sbuf, void* rbuf, size_
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
                  "ompi_coll_tuned_exscan_intra_dec_dynamic"));
 
-    /* Scratch buffers are used for reductions (ompi_op_reduce); device-side
-     * reduction is not yet supported, so always use the host allocator (NULL). */
+    /* session=NULL uses host ompi_op_reduce path. */
 
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[EXSCAN].algorithm) {
@@ -783,8 +778,7 @@ int ompi_coll_tuned_scan_intra_dec_dynamic(const void *sbuf, void* rbuf, size_t
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
                  "ompi_coll_tuned_scan_intra_dec_dynamic"));
 
-    /* Scratch buffers are used for reductions (ompi_op_reduce); device-side
-     * reduction is not yet supported, so always use the host allocator (NULL). */
+    /* session=NULL uses host ompi_op_reduce path. */
 
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[SCAN].algorithm) {
diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c
index b66a71563fe..d4994ba8c4a 100644
--- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c
+++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c
@@ -216,8 +216,7 @@ ompi_coll_tuned_allreduce_intra_dec_fixed(const void *sbuf, void *rbuf, size_t c
         }
     }
 
-    /* Scratch buffers are used for reductions (ompi_op_reduce); device-side
-     * reduction is not yet supported, so always use the host allocator. */
+    /* session=NULL uses host ompi_op_reduce path. */
     return ompi_coll_tuned_allreduce_intra_do_this (sbuf, rbuf, count, dtype, op,
                                                     comm, module, alg, 0, 0, NULL);
 }
@@ -1077,8 +1076,7 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( const void *sendbuf, void *recvbuf,
         }
     }
 
-    /* Scratch buffers are used for reductions (ompi_op_reduce); device-side
-     * reduction is not yet supported, so always use the host allocator. */
+    /* session=NULL uses host ompi_op_reduce path. */
     int faninout = 2;
     return  ompi_coll_tuned_reduce_intra_do_this (sendbuf, recvbuf, count, datatype,
                                                   op, root, comm, module,
@@ -1229,8 +1227,7 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( const void *sbuf, void *rbuf
         }
     }
 
-    /* Scratch buffers are used for reductions (ompi_op_reduce); device-side
-     * reduction is not yet supported, so always use the host allocator. */
+    /* session=NULL uses host ompi_op_reduce path. */
     return  ompi_coll_tuned_reduce_scatter_intra_do_this (sbuf, rbuf, rcounts, dtype,
                                                           op, comm, module,
                                                           alg, 0, 0, NULL);
@@ -1352,8 +1349,7 @@ int ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed(const void *sbuf, void
         }
     }
 
-    /* Scratch buffers are used for reductions (ompi_op_reduce); device-side
-     * reduction is not yet supported, so always use the host allocator. */
+    /* session=NULL uses host ompi_op_reduce path. */
     return  ompi_coll_tuned_reduce_scatter_block_intra_do_this (sbuf, rbuf, rcount, dtype,
                                                                 op, comm, module,
                                                                 alg, 0, 0, NULL);
diff --git a/ompi/mca/coll/tuned/coll_tuned_exscan_decision.c b/ompi/mca/coll/tuned/coll_tuned_exscan_decision.c
index 781e2d240a8..50f16dc27cd 100644
--- a/ompi/mca/coll/tuned/coll_tuned_exscan_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_exscan_decision.c
@@ -93,7 +93,7 @@ int ompi_coll_tuned_exscan_intra_do_this(const void *sbuf, void* rbuf, size_t co
                                          struct ompi_op_t *op,
                                          struct ompi_communicator_t *comm,
                                          mca_coll_base_module_t *module,
-                                         int algorithm, mca_allocator_base_module_t *allocator)
+                                         int algorithm, ompi_op_gpu_session_t *session)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:exscan_intra_do_this selected algorithm %d",
@@ -104,7 +104,7 @@ int ompi_coll_tuned_exscan_intra_do_this(const void *sbuf, void* rbuf, size_t co
     case (1):  return ompi_coll_base_exscan_intra_linear(sbuf, rbuf, count, dtype,
                                                          op, comm, module);
     case (2):  return ompi_coll_base_exscan_intra_recursivedoubling(sbuf, rbuf, count, dtype,
-                                                                    op, comm, module, allocator);
+                                                                    op, comm, module, session);
     } /* switch */
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:exscan_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c b/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c
index 2d16e88ed06..f935a680116 100644
--- a/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_reduce_decision.c
@@ -155,7 +155,7 @@ int ompi_coll_tuned_reduce_intra_do_this(const void *sbuf, void* rbuf, size_t co
                                          mca_coll_base_module_t *module,
                                          int algorithm, int faninout,
                                          int segsize, int max_requests,
-                                         mca_allocator_base_module_t *allocator)
+                                         ompi_op_gpu_session_t *session)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_intra_do_this selected algorithm %d topo faninout %d segsize %d",
@@ -169,30 +169,30 @@ int ompi_coll_tuned_reduce_intra_do_this(const void *sbuf, void* rbuf, size_t co
     case (2):  return ompi_coll_base_reduce_intra_chain(sbuf, rbuf, count, dtype,
                                                         op, root, comm, module,
                                                         segsize, faninout, max_requests,
-                                                        allocator);
+                                                        session);
     case (3):  return ompi_coll_base_reduce_intra_pipeline(sbuf, rbuf, count, dtype,
                                                            op, root, comm, module,
                                                            segsize, max_requests,
-                                                           allocator);
+                                                           session);
     case (4):  return ompi_coll_base_reduce_intra_binary(sbuf, rbuf, count, dtype,
                                                          op, root, comm, module,
                                                          segsize, max_requests,
-                                                         allocator);
+                                                         session);
     case (5):  return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype,
                                                            op, root, comm, module,
                                                            segsize, max_requests,
-                                                           allocator);
+                                                           session);
     case (6):  return ompi_coll_base_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype,
                                                                   op, root, comm, module,
                                                                   segsize, max_requests,
-                                                                  allocator);
+                                                                  session);
     case (7):  return ompi_coll_base_reduce_intra_redscat_gather(sbuf, rbuf, count, dtype,
                                                                   op, root, comm, module,
-                                                                  allocator);
+                                                                  session);
     case (8):  return ompi_coll_base_reduce_intra_knomial(sbuf, rbuf, count, dtype,
                                                           op, root, comm, module,
                                                           segsize, max_requests,
-                                                          faninout, allocator);
+                                                          faninout, session);
     } /* switch */
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_block_decision.c b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_block_decision.c
index c9e00e62585..7a05ce9ee72 100644
--- a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_block_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_block_decision.c
@@ -124,7 +124,7 @@ int ompi_coll_tuned_reduce_scatter_block_intra_do_this(const void *sbuf, void *r
                                                        struct ompi_communicator_t *comm,
                                                        mca_coll_base_module_t *module,
                                                        int algorithm, int faninout, int segsize,
-                                                       mca_allocator_base_module_t *allocator)
+                                                       ompi_op_gpu_session_t *session)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_scatter_block_intra_do_this selected algorithm %d topo faninout %d segsize %d",
@@ -135,15 +135,15 @@ int ompi_coll_tuned_reduce_scatter_block_intra_do_this(const void *sbuf, void *r
                                                                           dtype, op, comm, module);
     case (1): return ompi_coll_base_reduce_scatter_block_basic_linear(sbuf, rbuf, rcount,
                                                                       dtype, op, comm, module,
-                                                                      allocator);
+                                                                      session);
     case (2): return ompi_coll_base_reduce_scatter_block_intra_recursivedoubling(sbuf, rbuf, rcount,
                                                                                  dtype, op, comm, module,
-                                                                                 allocator);
+                                                                                 session);
     case (3): return ompi_coll_base_reduce_scatter_block_intra_recursivehalving(sbuf, rbuf, rcount,
                                                                                 dtype, op, comm, module,
-                                                                                allocator);
+                                                                                session);
     case (4): return ompi_coll_base_reduce_scatter_block_intra_butterfly(sbuf, rbuf, rcount, dtype, op, comm,
-                                                                         module, allocator);
+                                                                         module, session);
     } /* switch */
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_scatter_block_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
diff --git a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c
index 6146a71e849..5c79333e567 100644
--- a/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_reduce_scatter_decision.c
@@ -131,7 +131,7 @@ int ompi_coll_tuned_reduce_scatter_intra_do_this(const void *sbuf, void* rbuf,
                                                  struct ompi_communicator_t *comm,
                                                  mca_coll_base_module_t *module,
                                                  int algorithm, int faninout, int segsize,
-                                                 mca_allocator_base_module_t *allocator)
+                                                 ompi_op_gpu_session_t *session)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d",
@@ -142,16 +142,16 @@ int ompi_coll_tuned_reduce_scatter_intra_do_this(const void *sbuf, void* rbuf,
                                                                     dtype, op, comm, module);
     case (1): return ompi_coll_base_reduce_scatter_intra_nonoverlapping(sbuf, rbuf, rcounts,
                                                                         dtype, op, comm, module,
-                                                                        allocator);
+                                                                        session);
     case (2): return ompi_coll_base_reduce_scatter_intra_basic_recursivehalving(sbuf, rbuf, rcounts,
                                                                                 dtype, op, comm, module,
-                                                                                allocator);
+                                                                                session);
     case (3): return ompi_coll_base_reduce_scatter_intra_ring(sbuf, rbuf, rcounts,
                                                               dtype, op, comm, module,
-                                                              allocator);
+                                                              session);
     case (4): return ompi_coll_base_reduce_scatter_intra_butterfly(sbuf, rbuf, rcounts,
                                                                    dtype, op, comm, module,
-                                                                   allocator);
+                                                                   session);
     } /* switch */
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_scatter_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
diff --git a/ompi/mca/coll/tuned/coll_tuned_scan_decision.c b/ompi/mca/coll/tuned/coll_tuned_scan_decision.c
index d3db038a550..e16a2376c65 100644
--- a/ompi/mca/coll/tuned/coll_tuned_scan_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_scan_decision.c
@@ -93,7 +93,7 @@ int ompi_coll_tuned_scan_intra_do_this(const void *sbuf, void* rbuf, size_t coun
                                          struct ompi_op_t *op,
                                          struct ompi_communicator_t *comm,
                                          mca_coll_base_module_t *module,
-                                         int algorithm, mca_allocator_base_module_t *allocator)
+                                         int algorithm, ompi_op_gpu_session_t *session)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:scan_intra_do_this selected algorithm %d",
@@ -104,7 +104,7 @@ int ompi_coll_tuned_scan_intra_do_this(const void *sbuf, void* rbuf, size_t coun
     case (1):  return ompi_coll_base_scan_intra_linear(sbuf, rbuf, count, dtype,
                                                        op, comm, module);
     case (2):  return ompi_coll_base_scan_intra_recursivedoubling(sbuf, rbuf, count, dtype,
-                                                                  op, comm, module, allocator);
+                                                                  op, comm, module, session);
     } /* switch */
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:scan_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
diff --git a/ompi/mca/op/op.h b/ompi/mca/op/op.h
index 34d26376ab9..45b8f81c1b1 100644
--- a/ompi/mca/op/op.h
+++ b/ompi/mca/op/op.h
@@ -326,6 +326,34 @@ typedef struct ompi_op_base_module_1_0_0_t *
   (*ompi_op_base_component_op_query_1_0_0_fn_t)
     (struct ompi_op_t *op, int *priority);
 
+/* Forward declaration for GPU session (defined in ompi/op/op_gpu_session.h) */
+struct ompi_op_gpu_session_t;
+
+/**
+ * Optional component hook: create a GPU reduction session for the given
+ * (op, dtype) on a specific device.  Returns NULL if this component does
+ * not support the combination (caller tries the next component).
+ */
+typedef struct ompi_op_gpu_session_t *
+  (*ompi_op_base_component_session_begin_fn_t)(struct ompi_op_t *op,
+                                               struct ompi_datatype_t *dtype,
+                                               int dev_id);
+
+/**
+ * Optional component hook: post one reduction to the persistent kernel and
+ * block until done.
+ */
+typedef void (*ompi_op_base_component_session_reduce_fn_t)(
+                  struct ompi_op_gpu_session_t *session,
+                  const void *src, void *dst, size_t count);
+
+/**
+ * Optional component hook: shut down persistent kernel and free session.
+ * Must be NULL-safe.
+ */
+typedef void (*ompi_op_base_component_session_end_fn_t)(
+                  struct ompi_op_gpu_session_t *session);
+
 /**
  * Op component interface.
  *
@@ -343,6 +371,11 @@ typedef struct ompi_op_base_component_1_0_0_t {
     ompi_op_base_component_init_query_fn_t opc_init_query;
     /** Query whether component is usable for given op */
     ompi_op_base_component_op_query_1_0_0_fn_t opc_op_query;
+
+    /** Optional: GPU session lifecycle hooks.  NULL in host-only components. */
+    ompi_op_base_component_session_begin_fn_t  opc_session_begin;
+    ompi_op_base_component_session_reduce_fn_t opc_session_reduce;
+    ompi_op_base_component_session_end_fn_t    opc_session_end;
 } ompi_op_base_component_1_0_0_t;
 
 
diff --git a/ompi/op/Makefile.am b/ompi/op/Makefile.am
index 5599c31311b..b86bb1a3965 100644
--- a/ompi/op/Makefile.am
+++ b/ompi/op/Makefile.am
@@ -23,5 +23,7 @@
 # ompi/Makefile.am
 
 headers += op/op.h
+headers += op/op_gpu_session.h
 
 lib@OMPI_LIBMPI_NAME@_la_SOURCES += op/op.c
+lib@OMPI_LIBMPI_NAME@_la_SOURCES += op/op_gpu_session.c
diff --git a/ompi/op/op_gpu_session.c b/ompi/op/op_gpu_session.c
new file mode 100644
index 00000000000..b3aefdd5c98
--- /dev/null
+++ b/ompi/op/op_gpu_session.c
@@ -0,0 +1,46 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#include "ompi/op/op_gpu_session.h"
+#include "ompi/op/op.h"
+
+ompi_op_gpu_session_t *
+ompi_op_gpu_session_begin(struct ompi_op_t *op,
+                          struct ompi_datatype_t *dtype,
+                          int dev_id)
+{
+    /* Phase 1 stub: no GPU op components yet.  Always return NULL so that
+     * all callers use the host ompi_op_reduce path. */
+    (void) op;
+    (void) dtype;
+    (void) dev_id;
+    return NULL;
+}
+
+void
+ompi_op_gpu_session_reduce(ompi_op_gpu_session_t *session,
+                           const void *src, void *dst, size_t count)
+{
+    /* Must not be called when session is NULL */
+    (void) session;
+    (void) src;
+    (void) dst;
+    (void) count;
+}
+
+void
+ompi_op_gpu_session_end(ompi_op_gpu_session_t *session)
+{
+    /* NULL-safe no-op in Phase 1 */
+    (void) session;
+}
diff --git a/ompi/op/op_gpu_session.h b/ompi/op/op_gpu_session.h
new file mode 100644
index 00000000000..79ff195e18e
--- /dev/null
+++ b/ompi/op/op_gpu_session.h
@@ -0,0 +1,60 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef OMPI_OP_GPU_SESSION_H
+#define OMPI_OP_GPU_SESSION_H
+
+#include "ompi_config.h"
+#include "opal/mca/allocator/allocator.h"
+
+BEGIN_C_DECLS
+
+struct ompi_op_t;
+struct ompi_datatype_t;
+
+/**
+ * Per-collective GPU reduction session.  Created by ompi_op_gpu_session_begin()
+ * before a collective algorithm's reduction loop starts, and destroyed by
+ * ompi_op_gpu_session_end() after the loop completes.  When no GPU op
+ * component is available or the (op, dtype) combination has no GPU kernel,
+ * begin() returns NULL and all callers fall back to ompi_op_reduce().
+ */
+typedef struct ompi_op_gpu_session_t {
+    int                          dev_id;
+    mca_allocator_base_module_t *allocator;  /* GPU scratch allocator for this session */
+    void                        *backend;    /* opaque: cuda or rocm session state */
+} ompi_op_gpu_session_t;
+
+/**
+ * Create a GPU reduction session and launch a persistent reduction kernel.
+ * Returns NULL if no GPU op component supports this (op, dtype) combination
+ * or if no GPU op component is loaded — the caller must then use ompi_op_reduce.
+ */
+OMPI_DECLSPEC ompi_op_gpu_session_t *ompi_op_gpu_session_begin(struct ompi_op_t *op,
+                                                                struct ompi_datatype_t *dtype,
+                                                                int dev_id);
+
+/**
+ * Post one reduction command (src op dst → dst) to the persistent kernel and
+ * wait for completion.  Behavior is undefined if session is NULL.
+ */
+OMPI_DECLSPEC void ompi_op_gpu_session_reduce(ompi_op_gpu_session_t *session,
+                                               const void *src, void *dst, size_t count);
+
+/**
+ * Shut down the persistent kernel, synchronize the GPU stream, and free all
+ * session resources.  NULL-safe.
+ */
+OMPI_DECLSPEC void ompi_op_gpu_session_end(ompi_op_gpu_session_t *session);
+
+END_C_DECLS
+
+#endif /* OMPI_OP_GPU_SESSION_H */

From e4631fe8d1351bc2bf24ac192afe5e69dc61a725 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Fri, 10 Apr 2026 16:14:17 -0400
Subject: [PATCH 04/13] op/session: add stop/restart/free pool for GPU
 reduction sessions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the single opc_session_end hook with three separate hooks
(stop, restart, free) enabling a session freelist that avoids GPU
resource reallocation between collective invocations.

Pool lifecycle:
- session_end() signals the persistent kernel to exit, synchronizes
  the stream, then pushes the session onto a flat dev_id-keyed freelist.
  GPU stream and managed memory remain allocated.
- session_begin() pops a matching dev_id entry from the pool and calls
  restart_fn(session, op, dtype) to reset state and relaunch the
  appropriate persistent kernel — no cudaMalloc/hipMalloc overhead on
  the reuse path.  If restart_fn returns false (no kernel for this
  op/dtype combination), the session is freed and NULL is returned.
- pool_finalize() drains the freelist at MPI_Finalize, calling free_fn
  (releases stream, managed memory, priv) then free(session) for each
  entry.

Update ompi/mca/op/cuda and ompi/mca/op/rocm components to implement
the split session_stop / session_restart / session_free functions.

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 ompi/mca/op/base/op_base_frame.c     |  10 +-
 ompi/mca/op/cuda/Makefile.am         |  69 +++++++
 ompi/mca/op/cuda/configure.m4        | 108 ++++++++++
 ompi/mca/op/cuda/op_cuda.h           |  54 +++++
 ompi/mca/op/cuda/op_cuda_component.c | 113 +++++++++++
 ompi/mca/op/cuda/op_cuda_kernels.cu  | 286 +++++++++++++++++++++++++++
 ompi/mca/op/cuda/op_cuda_session.c   | 268 +++++++++++++++++++++++++
 ompi/mca/op/op.h                     |  34 +++-
 ompi/mca/op/rocm/Makefile.am         |  68 +++++++
 ompi/mca/op/rocm/configure.m4        |  68 +++++++
 ompi/mca/op/rocm/op_rocm.h           |  54 +++++
 ompi/mca/op/rocm/op_rocm_component.c | 109 ++++++++++
 ompi/mca/op/rocm/op_rocm_kernels.cpp | 286 +++++++++++++++++++++++++++
 ompi/mca/op/rocm/op_rocm_session.c   | 261 ++++++++++++++++++++++++
 ompi/op/op_gpu_session.c             | 188 ++++++++++++++++--
 ompi/op/op_gpu_session.h             |  65 +++++-
 16 files changed, 2016 insertions(+), 25 deletions(-)
 create mode 100644 ompi/mca/op/cuda/Makefile.am
 create mode 100644 ompi/mca/op/cuda/configure.m4
 create mode 100644 ompi/mca/op/cuda/op_cuda.h
 create mode 100644 ompi/mca/op/cuda/op_cuda_component.c
 create mode 100644 ompi/mca/op/cuda/op_cuda_kernels.cu
 create mode 100644 ompi/mca/op/cuda/op_cuda_session.c
 create mode 100644 ompi/mca/op/rocm/Makefile.am
 create mode 100644 ompi/mca/op/rocm/configure.m4
 create mode 100644 ompi/mca/op/rocm/op_rocm.h
 create mode 100644 ompi/mca/op/rocm/op_rocm_component.c
 create mode 100644 ompi/mca/op/rocm/op_rocm_kernels.cpp
 create mode 100644 ompi/mca/op/rocm/op_rocm_session.c

diff --git a/ompi/mca/op/base/op_base_frame.c b/ompi/mca/op/base/op_base_frame.c
index 90167300851..89a4e912387 100644
--- a/ompi/mca/op/base/op_base_frame.c
+++ b/ompi/mca/op/base/op_base_frame.c
@@ -29,6 +29,7 @@
 #include "ompi/constants.h"
 #include "ompi/mca/op/op.h"
 #include "ompi/mca/op/base/base.h"
+#include "ompi/op/op_gpu_session.h"
 
 
 /*
@@ -59,5 +60,12 @@ OBJ_CLASS_INSTANCE(ompi_op_base_module_t, opal_object_t,
 OBJ_CLASS_INSTANCE(ompi_op_base_module_1_0_0_t, opal_object_t,
                    module_constructor_1_0_0, NULL);
 
-MCA_BASE_FRAMEWORK_DECLARE(ompi, op, NULL, NULL, NULL, NULL,
+static int
+op_base_close(void)
+{
+    ompi_op_gpu_session_pool_finalize();  /* drain pooled sessions while component hooks are still available */
+    return mca_base_framework_components_close(&ompi_op_base_framework, NULL);  /* custom close must close components itself */
+}
+
+MCA_BASE_FRAMEWORK_DECLARE(ompi, op, NULL, NULL, NULL, op_base_close,
                            mca_op_base_static_components, 0);
diff --git a/ompi/mca/op/cuda/Makefile.am b/ompi/mca/op/cuda/Makefile.am
new file mode 100644
index 00000000000..c826455d9a7
--- /dev/null
+++ b/ompi/mca/op/cuda/Makefile.am
@@ -0,0 +1,69 @@
+#
+# Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+#                         reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# C sources (compiled by the normal C compiler)
+sources = \
+    op_cuda.h \
+    op_cuda_component.c \
+    op_cuda_session.c
+
+# The .cu file is compiled separately by nvcc and linked in as a plain
+# object.  Libtool does not know how to compile CUDA, so we use a custom
+# rule.  The resulting object is appended to LIBADD for both DSO and static
+# builds.
+
+EXTRA_DIST  = op_cuda_kernels.cu
+CLEANFILES  = op_cuda_kernels.o
+
+# Include paths forwarded to nvcc so it can find ompi_config.h and the
+# op/mca headers.
+NVCC_INCLUDES = \
+    -I$(top_srcdir) \
+    -I$(top_builddir) \
+    -I$(top_srcdir)/ompi \
+    -I$(top_builddir)/ompi
+
+op_cuda_kernels.o: $(srcdir)/op_cuda_kernels.cu \
+                   $(srcdir)/op_cuda.h
+	$(NVCC) $(NVCCFLAGS) $(NVCC_INCLUDES) \
+	    $(op_cuda_CPPFLAGS) \
+	    --compiler-options "$(DEFS)" -Xcompiler -fPIC \
+	    -c $< -o $@
+
+AM_CPPFLAGS = $(op_cuda_CPPFLAGS)
+
+# ----------------------------------------------------------------------------
+# DSO build
+# ----------------------------------------------------------------------------
+if MCA_BUILD_ompi_op_cuda_DSO
+component_install = mca_op_cuda.la
+component_noinst  =
+else
+component_install =
+component_noinst  = libmca_op_cuda.la
+endif
+
+mcacomponentdir = $(ompilibdir)
+mcacomponent_LTLIBRARIES = $(component_install)
+
+mca_op_cuda_la_SOURCES  = $(sources)
+mca_op_cuda_la_LDFLAGS  = -module -avoid-version $(op_cuda_LDFLAGS)
+mca_op_cuda_la_LIBADD   = $(op_cuda_LIBS) op_cuda_kernels.o
+mca_op_cuda_la_CPPFLAGS = $(op_cuda_CPPFLAGS)
+
+# ----------------------------------------------------------------------------
+# Static (convenience library) build
+# ----------------------------------------------------------------------------
+noinst_LTLIBRARIES = $(component_noinst)
+
+libmca_op_cuda_la_SOURCES  = $(sources)
+libmca_op_cuda_la_LDFLAGS  = -module -avoid-version $(op_cuda_LDFLAGS)
+libmca_op_cuda_la_LIBADD   = $(op_cuda_LIBS) op_cuda_kernels.o
+libmca_op_cuda_la_CPPFLAGS = $(op_cuda_CPPFLAGS)
diff --git a/ompi/mca/op/cuda/configure.m4 b/ompi/mca/op/cuda/configure.m4
new file mode 100644
index 00000000000..02081ab2090
--- /dev/null
+++ b/ompi/mca/op/cuda/configure.m4
@@ -0,0 +1,108 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+#                         reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# MCA_ompi_op_cuda_CONFIG([action-if-can-compile],
+#                          [action-if-cant-compile])
+# ------------------------------------------------
+# Build the CUDA persistent-kernel op component only when the CUDA
+# runtime (libcudart + cuda_runtime.h) and nvcc are available.
+#
+# Requires that OPAL_CHECK_CUDA has already been called (which sets
+# $CUDA_SUPPORT, $opal_cuda_incdir, and $with_cuda).
+#
+# Sets:
+#   op_cuda_CPPFLAGS — include path for cuda_runtime.h
+#   op_cuda_LDFLAGS  — library search path for libcudart
+#   op_cuda_LIBS     — -lcudart
+#   NVCC             — path to the nvcc compiler
+#   NVCCFLAGS        — default nvcc flags (min arch SM 7.0 for __nanosleep)
+#
+AC_DEFUN([MCA_ompi_op_cuda_CONFIG],[
+    AC_CONFIG_FILES([ompi/mca/op/cuda/Makefile])
+
+    # Ensure the top-level CUDA driver-API check has been performed.
+    AC_REQUIRE([OPAL_CHECK_CUDA])
+
+    OPAL_VAR_SCOPE_PUSH([op_cuda_happy op_cuda_save_CPPFLAGS op_cuda_save_LDFLAGS op_cuda_save_LIBS op_cuda_libdir op_cuda_nvcc_path])
+
+    op_cuda_happy=no
+
+    AS_IF([test "x$CUDA_SUPPORT" = "x1"],
+      [
+        op_cuda_save_CPPFLAGS="$CPPFLAGS"
+        op_cuda_save_LDFLAGS="$LDFLAGS"
+        op_cuda_save_LIBS="$LIBS"
+
+        CPPFLAGS="-I$opal_cuda_incdir $CPPFLAGS"
+
+        # Verify that the runtime header is present alongside cuda.h.
+        AC_CHECK_HEADER([cuda_runtime.h],
+          [op_cuda_happy=yes],
+          [AC_MSG_WARN([cuda_runtime.h not found; skipping op/cuda component])
+           op_cuda_happy=no])
+
+        # Locate libcudart — prefer lib64, fall back to lib.
+        AS_IF([test "$op_cuda_happy" = "yes"],
+          [op_cuda_libdir=""
+           AS_IF([test -d "$with_cuda/lib64"],
+                 [op_cuda_libdir="$with_cuda/lib64"],
+                 [AS_IF([test -d "$with_cuda/lib"],
+                        [op_cuda_libdir="$with_cuda/lib"],
+                        [AS_IF([test -d "/usr/local/cuda/lib64"],
+                               [op_cuda_libdir="/usr/local/cuda/lib64"])])])
+           AS_IF([test -n "$op_cuda_libdir"],
+                 [LDFLAGS="-L$op_cuda_libdir $LDFLAGS"])
+           AC_CHECK_LIB([cudart], [cudaGetDeviceCount],
+             [op_cuda_happy=yes],
+             [AC_MSG_WARN([libcudart not found; skipping op/cuda component])
+              op_cuda_happy=no])
+          ])
+
+        # Locate nvcc.
+        AS_IF([test "$op_cuda_happy" = "yes"],
+          [op_cuda_nvcc_path="$PATH"
+           AS_IF([test -d "$with_cuda/bin"],
+                 [op_cuda_nvcc_path="$with_cuda/bin:$PATH"])
+           AC_PATH_PROG([NVCC], [nvcc], [not_found], [$op_cuda_nvcc_path])
+           AS_IF([test "$NVCC" = "not_found"],
+                 [AC_MSG_WARN([nvcc not found; skipping op/cuda component])
+                  op_cuda_happy=no])
+          ])
+
+        # Populate the output variables.
+        AS_IF([test "$op_cuda_happy" = "yes"],
+          [op_cuda_CPPFLAGS="-I$opal_cuda_incdir"
+           AS_IF([test -n "$op_cuda_libdir"],
+                 [op_cuda_LDFLAGS="-L$op_cuda_libdir"],
+                 [op_cuda_LDFLAGS=""])
+           op_cuda_LIBS="-lcudart"
+           # __nanosleep requires SM 7.0 (Volta) or later.
+           AS_IF([test "x$NVCCFLAGS" = "x"],
+                 [NVCCFLAGS="-arch=sm_70"])
+          ])
+
+        CPPFLAGS="$op_cuda_save_CPPFLAGS"
+        LDFLAGS="$op_cuda_save_LDFLAGS"
+        LIBS="$op_cuda_save_LIBS"
+      ])
+
+    AC_SUBST([op_cuda_CPPFLAGS])
+    AC_SUBST([op_cuda_LDFLAGS])
+    AC_SUBST([op_cuda_LIBS])
+    AC_SUBST([NVCC])
+    AC_SUBST([NVCCFLAGS])
+
+    # Evaluate the result before OPAL_VAR_SCOPE_POP unsets op_cuda_happy.
+    AS_IF([test "$op_cuda_happy" = "yes"],
+          [$1],
+          [$2])
+    OPAL_VAR_SCOPE_POP
+])dnl
diff --git a/ompi/mca/op/cuda/op_cuda.h b/ompi/mca/op/cuda/op_cuda.h
new file mode 100644
index 00000000000..58770e435ed
--- /dev/null
+++ b/ompi/mca/op/cuda/op_cuda.h
@@ -0,0 +1,54 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef OMPI_MCA_OP_CUDA_H
+#define OMPI_MCA_OP_CUDA_H
+
+#include "ompi_config.h"
+#include <cuda_runtime.h>
+
+#include "ompi/mca/op/op.h"
+#include "ompi/op/op_gpu_session.h"  /* defines ompi_op_gpu_cmd_t */
+
+BEGIN_C_DECLS
+
+/**
+ * Private per-session state owned by the cuda component.
+ * Stored in ompi_op_gpu_session_t.backend.
+ */
+typedef struct {
+    ompi_op_gpu_cmd_t   *cmd;       /* managed-memory command slot        */
+    volatile int32_t    *shutdown;  /* managed-memory shutdown flag        */
+    cudaStream_t         stream;    /* private CUDA stream for this session */
+} ompi_op_cuda_session_priv_t;
+
+/**
+ * Host-side launcher function type.
+ * Launches the persistent kernel for one (op, type) combination.
+ */
+typedef void (*ompi_op_cuda_launcher_fn_t)(ompi_op_gpu_cmd_t *cmd,
+                                           volatile int32_t  *shutdown,
+                                           cudaStream_t       stream);
+
+/**
+ * 2D table [op_index][type_index] of launcher functions.
+ * NULL entries indicate unsupported (op, type) combinations; the session
+ * machinery returns NULL for those and the caller falls back to the host path.
+ *
+ * Indexed by OMPI_OP_BASE_FORTRAN_* × OMPI_OP_BASE_TYPE_*.
+ * Defined (and initialized) in op_cuda_kernels.cu.
+ */
+OMPI_DECLSPEC extern ompi_op_cuda_launcher_fn_t
+ompi_op_cuda_kernel_fns[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
+
+END_C_DECLS
+
+#endif /* OMPI_MCA_OP_CUDA_H */
diff --git a/ompi/mca/op/cuda/op_cuda_component.c b/ompi/mca/op/cuda/op_cuda_component.c
new file mode 100644
index 00000000000..97571fdc4bb
--- /dev/null
+++ b/ompi/mca/op/cuda/op_cuda_component.c
@@ -0,0 +1,113 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#include <cuda_runtime.h>
+
+#include "ompi/constants.h"
+#include "ompi/op/op.h"
+#include "ompi/mca/op/op.h"
+#include "ompi/mca/op/base/base.h"
+#include "ompi/op/op_gpu_session.h"
+#include "ompi/mca/op/cuda/op_cuda.h"
+
+/* Forward declarations of session hooks (implemented in op_cuda_session.c) */
+ompi_op_gpu_session_t *ompi_op_cuda_session_begin(struct ompi_op_t *op,
+                                                   struct ompi_datatype_t *dtype,
+                                                   int dev_id);
+void ompi_op_cuda_session_reduce(ompi_op_gpu_session_t *session,
+                                 const void *src, void *dst, size_t count);
+void ompi_op_cuda_session_stop(ompi_op_gpu_session_t *session);
+bool ompi_op_cuda_session_restart(ompi_op_gpu_session_t *session,
+                                   struct ompi_op_t *op,
+                                   struct ompi_datatype_t *dtype);
+void ompi_op_cuda_session_free(ompi_op_gpu_session_t *session);
+
+static int cuda_component_open(void);
+static int cuda_component_close(void);
+static int cuda_component_init_query(bool enable_progress_threads,
+                                     bool enable_mpi_thread_multiple);
+static struct ompi_op_base_module_1_0_0_t *
+    cuda_component_op_query(struct ompi_op_t *op, int *priority);
+
+/*
+ * Public component descriptor.
+ *
+ * This component does not provide per-op/per-type function pointers
+ * (opc_op_query returns NULL).  Its sole contribution is the five session
+ * hooks (begin, reduce, stop, restart, free) for persistent GPU kernels.
+ */
+ompi_op_base_component_1_0_0_t mca_op_cuda_component = {
+    .opc_version = {
+        OMPI_OP_BASE_VERSION_1_0_0,
+
+        .mca_component_name = "cuda",
+        MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
+                              OMPI_RELEASE_VERSION),
+        .mca_open_component  = cuda_component_open,
+        .mca_close_component = cuda_component_close,
+    },
+    .opc_data = {
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    },
+
+    .opc_init_query = cuda_component_init_query,
+    .opc_op_query   = cuda_component_op_query,
+
+    /* GPU session hooks */
+    .opc_session_begin   = ompi_op_cuda_session_begin,
+    .opc_session_reduce  = ompi_op_cuda_session_reduce,
+    .opc_session_stop    = ompi_op_cuda_session_stop,
+    .opc_session_restart = ompi_op_cuda_session_restart,
+    .opc_session_free    = ompi_op_cuda_session_free,
+};
+MCA_BASE_COMPONENT_INIT(ompi, op, cuda)
+
+static int
+cuda_component_open(void)
+{
+    return OMPI_SUCCESS;
+}
+
+static int
+cuda_component_close(void)
+{
+    return OMPI_SUCCESS;
+}
+
+/*
+ * Only activate this component when at least one CUDA-capable device is
+ * present in the system.
+ */
+static int
+cuda_component_init_query(bool enable_progress_threads,
+                          bool enable_mpi_thread_multiple)
+{
+    int device_count = 0;
+    cudaError_t err  = cudaGetDeviceCount(&device_count);
+    if (cudaSuccess != err || device_count <= 0) {
+        return OMPI_ERR_NOT_SUPPORTED;
+    }
+    return OMPI_SUCCESS;
+}
+
+/*
+ * We do not provide per-op function pointers, only session hooks, so
+ * always return NULL here.
+ */
+static struct ompi_op_base_module_1_0_0_t *
+cuda_component_op_query(struct ompi_op_t *op, int *priority)
+{
+    (void) op;
+    (void) priority;
+    return NULL;
+}
diff --git a/ompi/mca/op/cuda/op_cuda_kernels.cu b/ompi/mca/op/cuda/op_cuda_kernels.cu
new file mode 100644
index 00000000000..162eeac606d
--- /dev/null
+++ b/ompi/mca/op/cuda/op_cuda_kernels.cu
@@ -0,0 +1,286 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/*
+ * Persistent reduction kernels for the CUDA op component.
+ *
+ * Each kernel runs one block of 256 threads and loops indefinitely,
+ * sleeping between polls to reduce power consumption.  The host posts
+ * a command by writing src/dst/count into the managed-memory slot and
+ * then setting status=1.  The kernel executes the reduction, then sets
+ * status=2.  The host spins on status until it sees 2, then resets it
+ * to 0 for the next call.  A separate shutdown flag terminates the loop
+ * at session end.
+ */
+
+#include <stdint.h>
+#include <cuda_runtime.h>
+
+#include "ompi/mca/op/op.h"
+#include "ompi/mca/op/cuda/op_cuda.h"
+
+/* -------------------------------------------------------------------------
+ * PERSISTENT_KERNEL(kname, ctype, op_expr) defines the __global__ function
+ * ompi_op_cuda_persistent_<kname>; op_expr updates dst[i] in place from src[i].
+ * ------------------------------------------------------------------------- */
+#define PERSISTENT_KERNEL(kname, ctype, op_expr)                               \
+__global__ void ompi_op_cuda_persistent_##kname(                               \
+        ompi_op_gpu_cmd_t *cmd, volatile int32_t *shutdown)                    \
+{                                                                               \
+    __shared__ const ctype *s_src; __shared__ ctype *s_dst;                    \
+    __shared__ int64_t s_n;        __shared__ int s_stop;                      \
+    volatile ompi_op_gpu_cmd_t *vcmd = cmd;  /* forces a fresh load per poll */ \
+    for (;;) {                                                                  \
+        if (0 == threadIdx.x) {   /* one poller: no divergent decisions */      \
+            while (vcmd->status != 1 && !*shutdown) { __nanosleep(1000); }     \
+            __threadfence_system();   /* read operands only after status */     \
+            s_src = (const ctype *) vcmd->src; s_dst = (ctype *) vcmd->dst;    \
+            s_n = (int64_t) vcmd->count;       s_stop = (0 != *shutdown);      \
+        }                                                                       \
+        __syncthreads();          /* publish thread 0's decision */             \
+        if (s_stop) break;        /* uniform: whole block exits together */     \
+        const ctype * __restrict__ src = s_src;                                 \
+              ctype * __restrict__ dst = s_dst;                                 \
+        const int64_t n = s_n;                                                  \
+        for (int64_t i = (int64_t)threadIdx.x; i < n; i += blockDim.x) { op_expr; } \
+        __syncthreads();          /* every update issued before "done" */       \
+        if (0 == threadIdx.x) { __threadfence_system(); vcmd->status = 2; }    \
+    }                                                                           \
+}
+
+/* =========================================================================
+ * Kernel instantiations
+ * ========================================================================= */
+
+/* --- MAX --- */
+PERSISTENT_KERNEL(max_int8,   int8_t,   dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(max_uint8,  uint8_t,  dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(max_int16,  int16_t,  dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(max_uint16, uint16_t, dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(max_int32,  int32_t,  dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(max_uint32, uint32_t, dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(max_int64,  int64_t,  dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(max_uint64, uint64_t, dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(max_float,  float,    dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(max_double, double,   dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+
+/* --- MIN --- */
+PERSISTENT_KERNEL(min_int8,   int8_t,   dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(min_uint8,  uint8_t,  dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(min_int16,  int16_t,  dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(min_uint16, uint16_t, dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(min_int32,  int32_t,  dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(min_uint32, uint32_t, dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(min_int64,  int64_t,  dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(min_uint64, uint64_t, dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(min_float,  float,    dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(min_double, double,   dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+
+/* --- SUM --- */
+PERSISTENT_KERNEL(sum_int8,   int8_t,   dst[i] += src[i])
+PERSISTENT_KERNEL(sum_uint8,  uint8_t,  dst[i] += src[i])
+PERSISTENT_KERNEL(sum_int16,  int16_t,  dst[i] += src[i])
+PERSISTENT_KERNEL(sum_uint16, uint16_t, dst[i] += src[i])
+PERSISTENT_KERNEL(sum_int32,  int32_t,  dst[i] += src[i])
+PERSISTENT_KERNEL(sum_uint32, uint32_t, dst[i] += src[i])
+PERSISTENT_KERNEL(sum_int64,  int64_t,  dst[i] += src[i])
+PERSISTENT_KERNEL(sum_uint64, uint64_t, dst[i] += src[i])
+PERSISTENT_KERNEL(sum_float,  float,    dst[i] += src[i])
+PERSISTENT_KERNEL(sum_double, double,   dst[i] += src[i])
+
+/* --- PROD --- */
+PERSISTENT_KERNEL(prod_int8,   int8_t,   dst[i] *= src[i])
+PERSISTENT_KERNEL(prod_uint8,  uint8_t,  dst[i] *= src[i])
+PERSISTENT_KERNEL(prod_int16,  int16_t,  dst[i] *= src[i])
+PERSISTENT_KERNEL(prod_uint16, uint16_t, dst[i] *= src[i])
+PERSISTENT_KERNEL(prod_int32,  int32_t,  dst[i] *= src[i])
+PERSISTENT_KERNEL(prod_uint32, uint32_t, dst[i] *= src[i])
+PERSISTENT_KERNEL(prod_int64,  int64_t,  dst[i] *= src[i])
+PERSISTENT_KERNEL(prod_uint64, uint64_t, dst[i] *= src[i])
+PERSISTENT_KERNEL(prod_float,  float,    dst[i] *= src[i])
+PERSISTENT_KERNEL(prod_double, double,   dst[i] *= src[i])
+
+/* --- BAND (bitwise AND, integer types only) --- */
+PERSISTENT_KERNEL(band_int8,   int8_t,   dst[i] &= src[i])
+PERSISTENT_KERNEL(band_uint8,  uint8_t,  dst[i] &= src[i])
+PERSISTENT_KERNEL(band_int16,  int16_t,  dst[i] &= src[i])
+PERSISTENT_KERNEL(band_uint16, uint16_t, dst[i] &= src[i])
+PERSISTENT_KERNEL(band_int32,  int32_t,  dst[i] &= src[i])
+PERSISTENT_KERNEL(band_uint32, uint32_t, dst[i] &= src[i])
+PERSISTENT_KERNEL(band_int64,  int64_t,  dst[i] &= src[i])
+PERSISTENT_KERNEL(band_uint64, uint64_t, dst[i] &= src[i])
+
+/* --- BOR (bitwise OR, integer types only) --- */
+PERSISTENT_KERNEL(bor_int8,   int8_t,   dst[i] |= src[i])
+PERSISTENT_KERNEL(bor_uint8,  uint8_t,  dst[i] |= src[i])
+PERSISTENT_KERNEL(bor_int16,  int16_t,  dst[i] |= src[i])
+PERSISTENT_KERNEL(bor_uint16, uint16_t, dst[i] |= src[i])
+PERSISTENT_KERNEL(bor_int32,  int32_t,  dst[i] |= src[i])
+PERSISTENT_KERNEL(bor_uint32, uint32_t, dst[i] |= src[i])
+PERSISTENT_KERNEL(bor_int64,  int64_t,  dst[i] |= src[i])
+PERSISTENT_KERNEL(bor_uint64, uint64_t, dst[i] |= src[i])
+
+/* --- BXOR (bitwise XOR, integer types only) --- */
+PERSISTENT_KERNEL(bxor_int8,   int8_t,   dst[i] ^= src[i])
+PERSISTENT_KERNEL(bxor_uint8,  uint8_t,  dst[i] ^= src[i])
+PERSISTENT_KERNEL(bxor_int16,  int16_t,  dst[i] ^= src[i])
+PERSISTENT_KERNEL(bxor_uint16, uint16_t, dst[i] ^= src[i])
+PERSISTENT_KERNEL(bxor_int32,  int32_t,  dst[i] ^= src[i])
+PERSISTENT_KERNEL(bxor_uint32, uint32_t, dst[i] ^= src[i])
+PERSISTENT_KERNEL(bxor_int64,  int64_t,  dst[i] ^= src[i])
+PERSISTENT_KERNEL(bxor_uint64, uint64_t, dst[i] ^= src[i])
+
+/* =========================================================================
+ * Per-kernel host launchers (1 block x 256 threads); errors are not checked.
+ * ========================================================================= */
+#define LAUNCHER(kname)                                                        \
+static void launch_##kname(ompi_op_gpu_cmd_t *cmd,                            \
+                            volatile int32_t  *sd,                             \
+                            cudaStream_t       stream)                         \
+{                                                                               \
+    ompi_op_cuda_persistent_##kname<<<1, 256, 0, stream>>>(cmd, sd);          \
+}
+
+LAUNCHER(max_int8)    LAUNCHER(max_uint8)
+LAUNCHER(max_int16)   LAUNCHER(max_uint16)
+LAUNCHER(max_int32)   LAUNCHER(max_uint32)
+LAUNCHER(max_int64)   LAUNCHER(max_uint64)
+LAUNCHER(max_float)   LAUNCHER(max_double)
+
+LAUNCHER(min_int8)    LAUNCHER(min_uint8)
+LAUNCHER(min_int16)   LAUNCHER(min_uint16)
+LAUNCHER(min_int32)   LAUNCHER(min_uint32)
+LAUNCHER(min_int64)   LAUNCHER(min_uint64)
+LAUNCHER(min_float)   LAUNCHER(min_double)
+
+LAUNCHER(sum_int8)    LAUNCHER(sum_uint8)
+LAUNCHER(sum_int16)   LAUNCHER(sum_uint16)
+LAUNCHER(sum_int32)   LAUNCHER(sum_uint32)
+LAUNCHER(sum_int64)   LAUNCHER(sum_uint64)
+LAUNCHER(sum_float)   LAUNCHER(sum_double)
+
+LAUNCHER(prod_int8)   LAUNCHER(prod_uint8)
+LAUNCHER(prod_int16)  LAUNCHER(prod_uint16)
+LAUNCHER(prod_int32)  LAUNCHER(prod_uint32)
+LAUNCHER(prod_int64)  LAUNCHER(prod_uint64)
+LAUNCHER(prod_float)  LAUNCHER(prod_double)
+
+LAUNCHER(band_int8)   LAUNCHER(band_uint8)
+LAUNCHER(band_int16)  LAUNCHER(band_uint16)
+LAUNCHER(band_int32)  LAUNCHER(band_uint32)
+LAUNCHER(band_int64)  LAUNCHER(band_uint64)
+
+LAUNCHER(bor_int8)    LAUNCHER(bor_uint8)
+LAUNCHER(bor_int16)   LAUNCHER(bor_uint16)
+LAUNCHER(bor_int32)   LAUNCHER(bor_uint32)
+LAUNCHER(bor_int64)   LAUNCHER(bor_uint64)
+
+LAUNCHER(bxor_int8)   LAUNCHER(bxor_uint8)
+LAUNCHER(bxor_int16)  LAUNCHER(bxor_uint16)
+LAUNCHER(bxor_int32)  LAUNCHER(bxor_uint32)
+LAUNCHER(bxor_int64)  LAUNCHER(bxor_uint64)
+
+/* =========================================================================
+ * 2D launcher table [op_index][type_index], rows OMPI_OP_BASE_FORTRAN_* and
+ * columns OMPI_OP_BASE_TYPE_*.  Entries left NULL mean "not supported on
+ * GPU", so the caller falls back to the host path.  NOTE(review): array
+ * designators are C99; confirm the compiler used for this .cu accepts them.
+ * ========================================================================= */
+ompi_op_cuda_launcher_fn_t
+ompi_op_cuda_kernel_fns[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = {
+
+    [OMPI_OP_BASE_FORTRAN_MAX] = {
+        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_max_int8,
+        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_max_uint8,
+        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_max_int16,
+        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_max_uint16,
+        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_max_int32,
+        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_max_uint32,
+        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_max_int64,
+        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_max_uint64,
+        [OMPI_OP_BASE_TYPE_FLOAT]    = launch_max_float,
+        [OMPI_OP_BASE_TYPE_DOUBLE]   = launch_max_double,
+    },
+
+    [OMPI_OP_BASE_FORTRAN_MIN] = {
+        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_min_int8,
+        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_min_uint8,
+        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_min_int16,
+        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_min_uint16,
+        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_min_int32,
+        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_min_uint32,
+        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_min_int64,
+        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_min_uint64,
+        [OMPI_OP_BASE_TYPE_FLOAT]    = launch_min_float,
+        [OMPI_OP_BASE_TYPE_DOUBLE]   = launch_min_double,
+    },
+
+    [OMPI_OP_BASE_FORTRAN_SUM] = {
+        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_sum_int8,
+        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_sum_uint8,
+        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_sum_int16,
+        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_sum_uint16,
+        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_sum_int32,
+        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_sum_uint32,
+        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_sum_int64,
+        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_sum_uint64,
+        [OMPI_OP_BASE_TYPE_FLOAT]    = launch_sum_float,
+        [OMPI_OP_BASE_TYPE_DOUBLE]   = launch_sum_double,
+    },
+
+    [OMPI_OP_BASE_FORTRAN_PROD] = {
+        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_prod_int8,
+        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_prod_uint8,
+        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_prod_int16,
+        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_prod_uint16,
+        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_prod_int32,
+        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_prod_uint32,
+        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_prod_int64,
+        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_prod_uint64,
+        [OMPI_OP_BASE_TYPE_FLOAT]    = launch_prod_float,
+        [OMPI_OP_BASE_TYPE_DOUBLE]   = launch_prod_double,
+    },
+
+    [OMPI_OP_BASE_FORTRAN_BAND] = {
+        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_band_int8,
+        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_band_uint8,
+        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_band_int16,
+        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_band_uint16,
+        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_band_int32,
+        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_band_uint32,
+        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_band_int64,
+        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_band_uint64,
+    },
+
+    [OMPI_OP_BASE_FORTRAN_BOR] = {
+        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_bor_int8,
+        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_bor_uint8,
+        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_bor_int16,
+        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_bor_uint16,
+        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_bor_int32,
+        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_bor_uint32,
+        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_bor_int64,
+        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_bor_uint64,
+    },
+
+    [OMPI_OP_BASE_FORTRAN_BXOR] = {
+        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_bxor_int8,
+        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_bxor_uint8,
+        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_bxor_int16,
+        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_bxor_uint16,
+        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_bxor_int32,
+        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_bxor_uint32,
+        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_bxor_int64,
+        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_bxor_uint64,
+    },
+
+    /* LAND, LOR, LXOR, MAXLOC, MINLOC, REPLACE, NO_OP: all NULL → host path */
+};
diff --git a/ompi/mca/op/cuda/op_cuda_session.c b/ompi/mca/op/cuda/op_cuda_session.c
new file mode 100644
index 00000000000..892fc9d2e10
--- /dev/null
+++ b/ompi/mca/op/cuda/op_cuda_session.c
@@ -0,0 +1,268 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/*
+ * Session lifecycle for the CUDA persistent-kernel op component.
+ *
+ * session_begin:   look up the kernel in the 2D launcher table, allocate
+ *                  managed-memory command slot + shutdown flag, create a
+ *                  private CUDA stream, and launch the persistent kernel.
+ *
+ * session_reduce:  write src/dst/count to the command slot, set status=1
+ *                  to wake the kernel, and spin until status==2.
+ *
+ * session_stop:    signal the persistent kernel to exit and synchronize the
+ *                  stream.  GPU stream and managed memory remain allocated
+ *                  so the session can be reused via session_restart.
+ *
+ * session_restart: reconfigure an idle (stopped) session for a new (op, dtype)
+ *                  combination and relaunch the appropriate persistent kernel.
+ *                  Returns false if no GPU kernel exists for the combination.
+ *
+ * session_free:    release the CUDA stream, managed memory, and backend
+ *                  private state when a session is permanently discarded.
+ *                  Does NOT free the ompi_op_gpu_session_t struct.
+ */
+
+#include "ompi_config.h"
+#include <stdbool.h>
+#include <stdlib.h>
+#include <sched.h>
+
+#include <cuda_runtime.h>
+
+#include "ompi/op/op.h"
+#include "ompi/datatype/ompi_datatype.h"
+#include "ompi/op/op_gpu_session.h"
+#include "ompi/mca/op/op.h"
+#include "ompi/mca/op/cuda/op_cuda.h"
+
+/* ompi_op_ddt_map[] maps dtype->id → OMPI_OP_BASE_TYPE_* (-1 if none) */
+extern int ompi_op_ddt_map[OMPI_DATATYPE_MAX_PREDEFINED];
+
+/* --------------------------------------------------------------------------
+ * ompi_op_cuda_session_begin
+ *
+ * Looks up the persistent kernel for (op, dtype), allocates the managed-memory
+ * command slot and shutdown flag, creates a private stream and launches the
+ * kernel.  Returns a malloc'd ompi_op_gpu_session_t, or NULL if there is no
+ * GPU kernel for the combination, if the current device lacks concurrent
+ * managed access (the host touches the command slot while the kernel runs),
+ * or if any allocation or CUDA call fails (partial state is released).
+ * -------------------------------------------------------------------------- */
+ompi_op_gpu_session_t *
+ompi_op_cuda_session_begin(struct ompi_op_t *op,
+                           struct ompi_datatype_t *dtype,
+                           int dev_id)
+{
+    ompi_op_gpu_session_t *session = NULL;
+    ompi_op_cuda_session_priv_t *priv = NULL;
+    bool have_stream = false;
+    int cur_dev = 0, concurrent = 0;
+    int op_idx   = op->o_f_to_c_index;
+    int type_idx = (dtype->id >= 0 && dtype->id < OMPI_DATATYPE_MAX_PREDEFINED)
+                   ? ompi_op_ddt_map[dtype->id] : -1;
+
+    if (op_idx  < 0 || op_idx  >= OMPI_OP_BASE_FORTRAN_OP_MAX ||
+        type_idx < 0 || type_idx >= OMPI_OP_BASE_TYPE_MAX) {
+        return NULL;
+    }
+
+    ompi_op_cuda_launcher_fn_t launcher = ompi_op_cuda_kernel_fns[op_idx][type_idx];
+    if (NULL == launcher) {
+        return NULL;   /* no GPU kernel for this (op, type) combination */
+    }
+
+    /* Host access to managed memory while a kernel runs needs this attribute */
+    if (cudaSuccess != cudaGetDevice(&cur_dev) ||
+        cudaSuccess != cudaDeviceGetAttribute(&concurrent,
+                           cudaDevAttrConcurrentManagedAccess, cur_dev) ||
+        0 == concurrent) {
+        return NULL;
+    }
+
+    /* Public session struct plus zeroed private state (NULL-safe cleanup) */
+    session = malloc(sizeof(*session));
+    priv    = calloc(1, sizeof(*priv));
+    if (NULL == session || NULL == priv) {
+        goto fail;
+    }
+
+    /* Managed-memory command slot (accessible by both CPU and GPU) */
+    if (cudaSuccess != cudaMallocManaged((void **) &priv->cmd,
+                           sizeof(ompi_op_gpu_cmd_t), cudaMemAttachGlobal)) {
+        priv->cmd = NULL;
+        goto fail;
+    }
+    priv->cmd->src    = NULL;
+    priv->cmd->dst    = NULL;
+    priv->cmd->count  = 0;
+    priv->cmd->status = 0;
+
+    /* Managed-memory shutdown flag */
+    if (cudaSuccess != cudaMallocManaged((void **) &priv->shutdown,
+                           sizeof(int32_t), cudaMemAttachGlobal)) {
+        priv->shutdown = NULL;
+        goto fail;
+    }
+    *priv->shutdown = 0;
+
+    /* Dedicated non-blocking stream for this session */
+    if (cudaSuccess != cudaStreamCreateWithFlags(&priv->stream,
+                                                 cudaStreamNonBlocking)) {
+        goto fail;
+    }
+    have_stream = true;
+
+    /* Launch the persistent kernel (1 block, 256 threads) */
+    launcher(priv->cmd, priv->shutdown, priv->stream);
+    if (cudaSuccess != cudaGetLastError()) {
+        goto fail;
+    }
+
+    session->dev_id    = dev_id;
+    session->allocator = NULL;   /* scratch allocator wired in Phase 4 */
+    session->backend   = priv;
+    return session;
+
+fail:
+    if (NULL != priv) {
+        if (have_stream) {
+            cudaStreamDestroy(priv->stream);
+        }
+        cudaFree((void *) priv->shutdown);
+        cudaFree(priv->cmd);
+    }
+    free(priv);
+    free(session);
+    return NULL;
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_cuda_session_reduce
+ * Posts one reduction command to the persistent kernel and blocks (yielding
+ * the CPU) until it completes: dst[i] = dst[i] op src[i] for i in [0, count).
+ * Both src and dst must be accessible from the GPU (device or managed mem).
+ * -------------------------------------------------------------------------- */
+void
+ompi_op_cuda_session_reduce(ompi_op_gpu_session_t *session,
+                            const void *src, void *dst, size_t count)
+{
+    ompi_op_cuda_session_priv_t *priv =
+        (ompi_op_cuda_session_priv_t *) session->backend;
+    /* volatile view: each poll below is a real load of the shared slot */
+    volatile ompi_op_gpu_cmd_t *cmd = priv->cmd;
+
+    /* Write operands before signalling the kernel */
+    cmd->src   = src;
+    cmd->dst   = dst;
+    cmd->count = (int64_t) count;
+
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);   /* operands before status */
+    cmd->status = 1;                           /* wake the kernel */
+
+    /* Spin-wait for the kernel to signal completion */
+    while (2 != cmd->status) {
+        sched_yield();   /* relinquish CPU timeslice while waiting */
+    }
+    __atomic_thread_fence(__ATOMIC_ACQUIRE);   /* results after status==2 */
+    cmd->status = 0;                           /* reset for the next call */
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_cuda_session_stop
+ *
+ * Raises the shutdown flag so the persistent kernel leaves its loop, then
+ * blocks until the session stream is idle.  Nothing is released here: the
+ * stream and managed memory stay valid for ompi_op_cuda_session_restart.
+ * -------------------------------------------------------------------------- */
+void
+ompi_op_cuda_session_stop(ompi_op_gpu_session_t *session)
+{
+    ompi_op_cuda_session_priv_t *state =
+        (ompi_op_cuda_session_priv_t *) session->backend;
+
+    /* Ask the kernel to exit; fence so the store is issued before we wait */
+    *state->shutdown = 1;
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);
+
+    /* Drain the stream; it remains usable afterwards */
+    cudaStreamSynchronize(state->stream);
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_cuda_session_restart
+ *
+ * Re-arms a stopped session for a (possibly different) (op, dtype) pair:
+ * clears the shutdown flag and command slot, then launches the matching
+ * persistent kernel on the session's existing stream.  Returns false when
+ * the pair has no GPU kernel or the launch reports an error.
+ * -------------------------------------------------------------------------- */
+bool
+ompi_op_cuda_session_restart(ompi_op_gpu_session_t *session,
+                              struct ompi_op_t *op,
+                              struct ompi_datatype_t *dtype)
+{
+    const int row = op->o_f_to_c_index;
+    int col = -1;
+
+    /* Only ids below the predefined limit are looked up in the type map */
+    if (dtype->id < OMPI_DATATYPE_MAX_PREDEFINED) {
+        col = ompi_op_ddt_map[dtype->id];
+    }
+
+    const bool row_ok = (row >= 0 && row < OMPI_OP_BASE_FORTRAN_OP_MAX);
+    const bool col_ok = (col >= 0 && col < OMPI_OP_BASE_TYPE_MAX);
+    if (!row_ok || !col_ok) {
+        return false;
+    }
+
+    ompi_op_cuda_launcher_fn_t launch = ompi_op_cuda_kernel_fns[row][col];
+    if (NULL == launch) {
+        return false;
+    }
+
+    ompi_op_cuda_session_priv_t *state =
+        (ompi_op_cuda_session_priv_t *) session->backend;
+    /* Back to the idle state a freshly begun session starts from */
+    *state->shutdown   = 0;
+    state->cmd->src    = NULL;
+    state->cmd->dst    = NULL;
+    state->cmd->count  = 0;
+    state->cmd->status = 0;
+
+    /* Launch the kernel for the new pair; success iff no launch error */
+    launch(state->cmd, state->shutdown, state->stream);
+    return cudaSuccess == cudaGetLastError();
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_cuda_session_free
+ * Free the CUDA stream, managed memory, and backend private state; the
+ * ompi_op_gpu_session_t struct itself is NOT freed (session_destroy does that).
+ * -------------------------------------------------------------------------- */
+void
+ompi_op_cuda_session_free(ompi_op_gpu_session_t *session)
+{
+    ompi_op_cuda_session_priv_t *priv =
+        (ompi_op_cuda_session_priv_t *) session->backend;
+    if (NULL == priv) {
+        return;
+    }
+
+    /* cudaFree waits on the device: stop a still-resident kernel first */
+    ompi_op_cuda_session_stop(session);
+    cudaStreamDestroy(priv->stream);
+    cudaFree((void *) priv->shutdown);
+    cudaFree(priv->cmd);
+    free(priv);
+    session->backend = NULL;
+}
diff --git a/ompi/mca/op/op.h b/ompi/mca/op/op.h
index 45b8f81c1b1..e7034959757 100644
--- a/ompi/mca/op/op.h
+++ b/ompi/mca/op/op.h
@@ -348,10 +348,30 @@ typedef void (*ompi_op_base_component_session_reduce_fn_t)(
                   const void *src, void *dst, size_t count);
 
 /**
- * Optional component hook: shut down persistent kernel and free session.
- * Must be NULL-safe.
+ * Optional component hook: signal the persistent kernel to exit and
+ * synchronize the stream.  The session struct and its managed memory remain
+ * allocated so the session can be recycled by opc_session_restart.
  */
-typedef void (*ompi_op_base_component_session_end_fn_t)(
+typedef void (*ompi_op_base_component_session_stop_fn_t)(
+                  struct ompi_op_gpu_session_t *session);
+
+/**
+ * Optional component hook: reconfigure an idle (stopped) session for a new
+ * (op, dtype) combination and relaunch the appropriate persistent kernel.
+ * Returns true on success; false if no GPU kernel exists for this combination
+ * (caller should return the session to the pool and fall back to host path).
+ */
+typedef bool (*ompi_op_base_component_session_restart_fn_t)(
+                  struct ompi_op_gpu_session_t *session,
+                  struct ompi_op_t *op,
+                  struct ompi_datatype_t *dtype);
+
+/**
+ * Optional component hook: free managed memory, GPU stream, and backend
+ * private state.  Called when a pooled session is permanently discarded.
+ * Must NOT free the ompi_op_gpu_session_t struct itself.
+ */
+typedef void (*ompi_op_base_component_session_free_fn_t)(
                   struct ompi_op_gpu_session_t *session);
 
 /**
@@ -373,9 +393,11 @@ typedef struct ompi_op_base_component_1_0_0_t {
     ompi_op_base_component_op_query_1_0_0_fn_t opc_op_query;
 
     /** Optional: GPU session lifecycle hooks.  NULL in host-only components. */
-    ompi_op_base_component_session_begin_fn_t  opc_session_begin;
-    ompi_op_base_component_session_reduce_fn_t opc_session_reduce;
-    ompi_op_base_component_session_end_fn_t    opc_session_end;
+    ompi_op_base_component_session_begin_fn_t   opc_session_begin;
+    ompi_op_base_component_session_reduce_fn_t  opc_session_reduce;
+    ompi_op_base_component_session_stop_fn_t    opc_session_stop;
+    ompi_op_base_component_session_restart_fn_t opc_session_restart;
+    ompi_op_base_component_session_free_fn_t    opc_session_free;
 } ompi_op_base_component_1_0_0_t;
 
 
diff --git a/ompi/mca/op/rocm/Makefile.am b/ompi/mca/op/rocm/Makefile.am
new file mode 100644
index 00000000000..95f993858c1
--- /dev/null
+++ b/ompi/mca/op/rocm/Makefile.am
@@ -0,0 +1,68 @@
+#
+# Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+#                         reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# C sources (compiled by the normal C compiler)
+sources = \
+    op_rocm.h \
+    op_rocm_component.c \
+    op_rocm_session.c
+
+# The .cpp file is compiled separately by hipcc and linked in as a plain
+# object.  Libtool does not know how to compile HIP C++, so we use a custom
+# rule.  The resulting object is appended to LIBADD for both DSO and static
+# builds.
+
+EXTRA_DIST  = op_rocm_kernels.cpp
+CLEANFILES  = op_rocm_kernels.o
+
+# Include paths forwarded to hipcc so it can find ompi_config.h and the
+# op/mca headers.
+HIPCC_INCLUDES = \
+    -I$(top_srcdir) \
+    -I$(top_builddir) \
+    -I$(top_srcdir)/ompi \
+    -I$(top_builddir)/ompi
+# Always build PIC: this object is linked into the DSO as well as the static lib.
+op_rocm_kernels.o: $(srcdir)/op_rocm_kernels.cpp \
+                   $(srcdir)/op_rocm.h
+	$(HIPCC) $(HIPCCFLAGS) -fPIC $(HIPCC_INCLUDES) \
+	    $(op_rocm_CPPFLAGS) \
+	    -c $< -o $@
+
+AM_CPPFLAGS = $(op_rocm_CPPFLAGS)
+
+# ----------------------------------------------------------------------------
+# DSO build
+# ----------------------------------------------------------------------------
+if MCA_BUILD_ompi_op_rocm_DSO
+component_install = mca_op_rocm.la
+component_noinst  =
+else
+component_install =
+component_noinst  = libmca_op_rocm.la
+endif
+
+mcacomponentdir = $(ompilibdir)
+mcacomponent_LTLIBRARIES = $(component_install)
+
+mca_op_rocm_la_SOURCES  = $(sources)
+mca_op_rocm_la_LDFLAGS  = -module -avoid-version $(op_rocm_LDFLAGS)
+mca_op_rocm_la_LIBADD   = $(op_rocm_LIBS) op_rocm_kernels.o
+mca_op_rocm_la_CPPFLAGS = $(op_rocm_CPPFLAGS)
+
+# ----------------------------------------------------------------------------
+# Static (convenience library) build
+# ----------------------------------------------------------------------------
+noinst_LTLIBRARIES = $(component_noinst)
+
+libmca_op_rocm_la_SOURCES  = $(sources)
+libmca_op_rocm_la_LDFLAGS  = -module -avoid-version $(op_rocm_LDFLAGS)
+libmca_op_rocm_la_LIBADD   = $(op_rocm_LIBS) op_rocm_kernels.o
+libmca_op_rocm_la_CPPFLAGS = $(op_rocm_CPPFLAGS)
diff --git a/ompi/mca/op/rocm/configure.m4 b/ompi/mca/op/rocm/configure.m4
new file mode 100644
index 00000000000..eaa6c18458c
--- /dev/null
+++ b/ompi/mca/op/rocm/configure.m4
@@ -0,0 +1,68 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+#                         reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# MCA_ompi_op_rocm_CONFIG([action-if-can-compile],
+#                          [action-if-cant-compile])
+# ------------------------------------------------
+# Build the ROCm persistent-kernel op component only when the HIP runtime
+# (libamdhip64 + hip/hip_runtime.h) and hipcc are available.
+#
+# Calls OPAL_CHECK_ROCM to locate headers and libraries, then separately
+# finds hipcc.  Sets:
+#   op_rocm_CPPFLAGS — include/define flags for HIP (includes -D__HIP_PLATFORM_AMD__)
+#   op_rocm_LDFLAGS  — library search path for libamdhip64
+#   op_rocm_LIBS     — -lamdhip64
+#   HIPCC            — path to the hipcc compiler
+#   HIPCCFLAGS       — default hipcc flags
+#
+AC_DEFUN([MCA_ompi_op_rocm_CONFIG],[
+    AC_CONFIG_FILES([ompi/mca/op/rocm/Makefile])
+
+    OPAL_VAR_SCOPE_PUSH([op_rocm_happy op_rocm_hipcc_path])
+
+    op_rocm_happy=no
+
+    # OPAL_CHECK_ROCM calls OAC_CHECK_PACKAGE and sets:
+    #   op_rocm_CPPFLAGS, op_rocm_LDFLAGS, op_rocm_LIBS
+    # It also sets ROCM_SUPPORT=1 on success.
+    OPAL_CHECK_ROCM([op_rocm],
+        [op_rocm_happy=yes],
+        [op_rocm_happy=no])
+
+    # Find hipcc alongside the ROCm installation.
+    AS_IF([test "$op_rocm_happy" = "yes"],
+      [op_rocm_hipcc_path="$PATH"
+       AS_IF([test -n "$with_rocm" && test "$with_rocm" != "no" && test -d "$with_rocm/bin"],
+             [op_rocm_hipcc_path="$with_rocm/bin:$PATH"],
+             [AS_IF([test -d "/opt/rocm/bin"],
+                    [op_rocm_hipcc_path="/opt/rocm/bin:$PATH"])])
+       AC_PATH_PROG([HIPCC], [hipcc], [not_found], [$op_rocm_hipcc_path])
+       AS_IF([test "$HIPCC" = "not_found"],
+             [AC_MSG_WARN([hipcc not found; skipping op/rocm component])
+              op_rocm_happy=no])
+      ])
+
+    # Default HIPCCFLAGS (targets gfx906 only) unless the user already set them.
+    AS_IF([test "$op_rocm_happy" = "yes" && test "x$HIPCCFLAGS" = "x"],
+          [HIPCCFLAGS="--offload-arch=gfx906"])
+
+    AC_SUBST([op_rocm_CPPFLAGS])
+    AC_SUBST([op_rocm_LDFLAGS])
+    AC_SUBST([op_rocm_LIBS])
+    AC_SUBST([HIPCC])
+    AC_SUBST([HIPCCFLAGS])
+
+    OPAL_VAR_SCOPE_POP
+
+    AS_IF([test "$op_rocm_happy" = "yes"],
+          [$1],
+          [$2])
+])dnl
diff --git a/ompi/mca/op/rocm/op_rocm.h b/ompi/mca/op/rocm/op_rocm.h
new file mode 100644
index 00000000000..7974cc82f04
--- /dev/null
+++ b/ompi/mca/op/rocm/op_rocm.h
@@ -0,0 +1,54 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef OMPI_MCA_OP_ROCM_H
+#define OMPI_MCA_OP_ROCM_H
+
+#include "ompi_config.h"
+#include <hip/hip_runtime.h>
+
+#include "ompi/mca/op/op.h"
+#include "ompi/op/op_gpu_session.h"  /* defines ompi_op_gpu_cmd_t */
+
+BEGIN_C_DECLS
+
+/**
+ * Private per-session state owned by the rocm component.
+ * Stored in ompi_op_gpu_session_t.backend; released by the session_free hook.
+ */
+typedef struct {
+    ompi_op_gpu_cmd_t   *cmd;       /* managed-memory command slot          */
+    volatile int32_t    *shutdown;  /* managed flag; 1 asks kernel to exit  */
+    hipStream_t          stream;    /* private HIP stream for this session  */
+} ompi_op_rocm_session_priv_t;
+
+/**
+ * Host-side launcher: starts the persistent kernel for one (op, type)
+ * combination on @p stream, handing it @p cmd and @p shutdown.
+ */
+typedef void (*ompi_op_rocm_launcher_fn_t)(ompi_op_gpu_cmd_t *cmd,
+                                           volatile int32_t  *shutdown,
+                                           hipStream_t        stream);
+
+/**
+ * 2D table [op_index][type_index] of launcher functions.
+ * NULL entries indicate unsupported (op, type) combinations; the session
+ * machinery returns NULL for those and the caller falls back to the host path.
+ *
+ * Indexed by OMPI_OP_BASE_FORTRAN_* × OMPI_OP_BASE_TYPE_*.
+ * Defined (and initialized) in op_rocm_kernels.cpp.
+ */
+OMPI_DECLSPEC extern ompi_op_rocm_launcher_fn_t
+ompi_op_rocm_kernel_fns[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
+
+END_C_DECLS
+
+#endif /* OMPI_MCA_OP_ROCM_H */
diff --git a/ompi/mca/op/rocm/op_rocm_component.c b/ompi/mca/op/rocm/op_rocm_component.c
new file mode 100644
index 00000000000..69c801580f0
--- /dev/null
+++ b/ompi/mca/op/rocm/op_rocm_component.c
@@ -0,0 +1,109 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#include <hip/hip_runtime.h>
+
+#include "ompi/constants.h"
+#include "ompi/op/op.h"
+#include "ompi/mca/op/op.h"
+#include "ompi/mca/op/base/base.h"
+#include "ompi/op/op_gpu_session.h"
+#include "ompi/mca/op/rocm/op_rocm.h"
+
+/* Forward declarations of session hooks (implemented in op_rocm_session.c) */
+ompi_op_gpu_session_t *ompi_op_rocm_session_begin(struct ompi_op_t *op,
+                                                   struct ompi_datatype_t *dtype,
+                                                   int dev_id);
+void ompi_op_rocm_session_reduce(ompi_op_gpu_session_t *session,
+                                  const void *src, void *dst, size_t count);
+void ompi_op_rocm_session_stop(ompi_op_gpu_session_t *session);
+bool ompi_op_rocm_session_restart(ompi_op_gpu_session_t *session,
+                                   struct ompi_op_t *op,
+                                   struct ompi_datatype_t *dtype);
+void ompi_op_rocm_session_free(ompi_op_gpu_session_t *session);
+
+static int rocm_component_open(void);
+static int rocm_component_close(void);
+static int rocm_component_init_query(bool enable_progress_threads,
+                                      bool enable_mpi_thread_multiple);
+static struct ompi_op_base_module_1_0_0_t *
+    rocm_component_op_query(struct ompi_op_t *op, int *priority);
+
+/*
+ * Public component descriptor: open/close/query callbacks plus session hooks.
+ */
+ompi_op_base_component_1_0_0_t mca_op_rocm_component = {
+    .opc_version = {
+        OMPI_OP_BASE_VERSION_1_0_0,
+
+        .mca_component_name = "rocm",
+        MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
+                              OMPI_RELEASE_VERSION),
+        .mca_open_component  = rocm_component_open,
+        .mca_close_component = rocm_component_close,
+    },
+    .opc_data = {
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    },
+
+    .opc_init_query = rocm_component_init_query,
+    .opc_op_query   = rocm_component_op_query,
+
+    /* GPU session hooks (implemented in op_rocm_session.c) */
+    .opc_session_begin   = ompi_op_rocm_session_begin,
+    .opc_session_reduce  = ompi_op_rocm_session_reduce,
+    .opc_session_stop    = ompi_op_rocm_session_stop,
+    .opc_session_restart = ompi_op_rocm_session_restart,
+    .opc_session_free    = ompi_op_rocm_session_free,
+};
+MCA_BASE_COMPONENT_INIT(ompi, op, rocm)
+
+static int
+rocm_component_open(void)
+{
+    return OMPI_SUCCESS;   /* no component-level state to set up */
+}
+
+static int
+rocm_component_close(void)
+{
+    return OMPI_SUCCESS;   /* nothing was acquired in open, nothing to release */
+}
+
+/*
+ * Only activate this component when at least one ROCm-capable device is
+ * present in the system.
+ */
+static int
+rocm_component_init_query(bool enable_progress_threads,
+                           bool enable_mpi_thread_multiple)
+{
+    int num_devices = 0;
+    /* The threading flags do not influence the decision. */
+    (void) enable_progress_threads;
+    (void) enable_mpi_thread_multiple;
+    return (hipSuccess == hipGetDeviceCount(&num_devices) && num_devices > 0)
+               ? OMPI_SUCCESS : OMPI_ERR_NOT_SUPPORTED;
+}
+
+/*
+ * We do not provide per-op function pointers, only session hooks, so
+ * always return NULL here.
+ */
+static struct ompi_op_base_module_1_0_0_t *
+rocm_component_op_query(struct ompi_op_t *op, int *priority)
+{
+    (void) op;         /* not inspected */
+    (void) priority;   /* left unwritten: no module is offered */
+    return NULL;
+}
diff --git a/ompi/mca/op/rocm/op_rocm_kernels.cpp b/ompi/mca/op/rocm/op_rocm_kernels.cpp
new file mode 100644
index 00000000000..cf42e3ac019
--- /dev/null
+++ b/ompi/mca/op/rocm/op_rocm_kernels.cpp
@@ -0,0 +1,286 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/*
+ * Persistent reduction kernels for the ROCm op component.
+ *
+ * Structure mirrors op_cuda_kernels.cu; differences from the CUDA version:
+ *   - #include <hip/hip_runtime.h> instead of <cuda_runtime.h>
+ *   - __builtin_amdgcn_s_sleep(1) replaces __nanosleep(1000) for the
+ *     low-power spin-wait (GCN instruction; ~64 clock sleep)
+ *   - All cuda* API calls replaced by their hip* equivalents
+ *   - Launcher macro uses ompi_op_rocm_persistent_* prefix
+ */
+
+#include <stdint.h>
+#include <hip/hip_runtime.h>
+
+#include "ompi/mca/op/op.h"
+#include "ompi/mca/op/rocm/op_rocm.h"
+
+/* -------------------------------------------------------------------------
+ * PERSISTENT_KERNEL(kname, ctype, op_expr) generates the single-block kernel
+ * ompi_op_rocm_persistent_<kname>(cmd, shutdown).  While *shutdown is zero it
+ * waits for cmd->status == 1, applies op_expr (a statement updating dst[i]
+ * in place from src[i]) to cmd->count elements, then publishes status 2.
+ * ------------------------------------------------------------------------- */
+#define PERSISTENT_KERNEL(kname, ctype, op_expr)                               \
+__global__ void ompi_op_rocm_persistent_##kname(                               \
+        ompi_op_gpu_cmd_t *cmd, volatile int32_t *shutdown)                    \
+{                                                                               \
+    while (!*shutdown) {                                                        \
+        /* Spin-wait for work; sleep ~64 clocks between polls to save power */ \
+        while (cmd->status != 1 && !*shutdown) { __builtin_amdgcn_s_sleep(1); } \
+        if (*shutdown) break;                                                   \
+        const ctype * __restrict__ src = (const ctype *) cmd->src;             \
+              ctype * __restrict__ dst = (      ctype *) cmd->dst;             \
+        int64_t n = cmd->count;                                                 \
+        for (int64_t i = (int64_t)threadIdx.x; i < n; i += blockDim.x) {      \
+            op_expr;                                                            \
+        }                                                                       \
+        __syncthreads();              /* every thread finished its elements */  \
+        if (threadIdx.x == 0) {                                                 \
+            __threadfence_system();   /* ensure dst writes reach host */        \
+            cmd->status = 2;          /* signal done */                         \
+        }                                                                       \
+        /* keep threads from re-polling while status is still 1 (would redo the command) */ \
+        __syncthreads();                                                        \
+    }                                                                           \
+}
+
+/* =========================================================================
+ * Kernel instantiations
+ * ========================================================================= */
+
+/* --- MAX --- */
+PERSISTENT_KERNEL(max_int8,   int8_t,   dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(max_uint8,  uint8_t,  dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(max_int16,  int16_t,  dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(max_uint16, uint16_t, dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(max_int32,  int32_t,  dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(max_uint32, uint32_t, dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(max_int64,  int64_t,  dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(max_uint64, uint64_t, dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(max_float,  float,    dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(max_double, double,   dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+
+/* --- MIN --- */
+PERSISTENT_KERNEL(min_int8,   int8_t,   dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(min_uint8,  uint8_t,  dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(min_int16,  int16_t,  dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(min_uint16, uint16_t, dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(min_int32,  int32_t,  dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(min_uint32, uint32_t, dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(min_int64,  int64_t,  dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(min_uint64, uint64_t, dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(min_float,  float,    dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(min_double, double,   dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+
+/* --- SUM --- */
+PERSISTENT_KERNEL(sum_int8,   int8_t,   dst[i] += src[i])
+PERSISTENT_KERNEL(sum_uint8,  uint8_t,  dst[i] += src[i])
+PERSISTENT_KERNEL(sum_int16,  int16_t,  dst[i] += src[i])
+PERSISTENT_KERNEL(sum_uint16, uint16_t, dst[i] += src[i])
+PERSISTENT_KERNEL(sum_int32,  int32_t,  dst[i] += src[i])
+PERSISTENT_KERNEL(sum_uint32, uint32_t, dst[i] += src[i])
+PERSISTENT_KERNEL(sum_int64,  int64_t,  dst[i] += src[i])
+PERSISTENT_KERNEL(sum_uint64, uint64_t, dst[i] += src[i])
+PERSISTENT_KERNEL(sum_float,  float,    dst[i] += src[i])
+PERSISTENT_KERNEL(sum_double, double,   dst[i] += src[i])
+
+/* --- PROD --- */
+PERSISTENT_KERNEL(prod_int8,   int8_t,   dst[i] *= src[i])
+PERSISTENT_KERNEL(prod_uint8,  uint8_t,  dst[i] *= src[i])
+PERSISTENT_KERNEL(prod_int16,  int16_t,  dst[i] *= src[i])
+PERSISTENT_KERNEL(prod_uint16, uint16_t, dst[i] *= src[i])
+PERSISTENT_KERNEL(prod_int32,  int32_t,  dst[i] *= src[i])
+PERSISTENT_KERNEL(prod_uint32, uint32_t, dst[i] *= src[i])
+PERSISTENT_KERNEL(prod_int64,  int64_t,  dst[i] *= src[i])
+PERSISTENT_KERNEL(prod_uint64, uint64_t, dst[i] *= src[i])
+PERSISTENT_KERNEL(prod_float,  float,    dst[i] *= src[i])
+PERSISTENT_KERNEL(prod_double, double,   dst[i] *= src[i])
+
+/* --- BAND (bitwise AND, integer types only) --- */
+PERSISTENT_KERNEL(band_int8,   int8_t,   dst[i] &= src[i])
+PERSISTENT_KERNEL(band_uint8,  uint8_t,  dst[i] &= src[i])
+PERSISTENT_KERNEL(band_int16,  int16_t,  dst[i] &= src[i])
+PERSISTENT_KERNEL(band_uint16, uint16_t, dst[i] &= src[i])
+PERSISTENT_KERNEL(band_int32,  int32_t,  dst[i] &= src[i])
+PERSISTENT_KERNEL(band_uint32, uint32_t, dst[i] &= src[i])
+PERSISTENT_KERNEL(band_int64,  int64_t,  dst[i] &= src[i])
+PERSISTENT_KERNEL(band_uint64, uint64_t, dst[i] &= src[i])
+
+/* --- BOR (bitwise OR, integer types only) --- */
+PERSISTENT_KERNEL(bor_int8,   int8_t,   dst[i] |= src[i])
+PERSISTENT_KERNEL(bor_uint8,  uint8_t,  dst[i] |= src[i])
+PERSISTENT_KERNEL(bor_int16,  int16_t,  dst[i] |= src[i])
+PERSISTENT_KERNEL(bor_uint16, uint16_t, dst[i] |= src[i])
+PERSISTENT_KERNEL(bor_int32,  int32_t,  dst[i] |= src[i])
+PERSISTENT_KERNEL(bor_uint32, uint32_t, dst[i] |= src[i])
+PERSISTENT_KERNEL(bor_int64,  int64_t,  dst[i] |= src[i])
+PERSISTENT_KERNEL(bor_uint64, uint64_t, dst[i] |= src[i])
+
+/* --- BXOR (bitwise XOR, integer types only) --- */
+PERSISTENT_KERNEL(bxor_int8,   int8_t,   dst[i] ^= src[i])
+PERSISTENT_KERNEL(bxor_uint8,  uint8_t,  dst[i] ^= src[i])
+PERSISTENT_KERNEL(bxor_int16,  int16_t,  dst[i] ^= src[i])
+PERSISTENT_KERNEL(bxor_uint16, uint16_t, dst[i] ^= src[i])
+PERSISTENT_KERNEL(bxor_int32,  int32_t,  dst[i] ^= src[i])
+PERSISTENT_KERNEL(bxor_uint32, uint32_t, dst[i] ^= src[i])
+PERSISTENT_KERNEL(bxor_int64,  int64_t,  dst[i] ^= src[i])
+PERSISTENT_KERNEL(bxor_uint64, uint64_t, dst[i] ^= src[i])
+
+/* =========================================================================
+ * Host-side launcher wrappers — one per kernel, 1 block × 256 threads.
+ * ========================================================================= */
+#define LAUNCHER(kname)                                                        \
+static void launch_##kname(ompi_op_gpu_cmd_t *slot,                           \
+                            volatile int32_t  *stop_flag,                      \
+                            hipStream_t        queue)                          \
+{   /* grid = 1 block, block = 256 threads, no dynamic shared memory */        \
+    ompi_op_rocm_persistent_##kname<<<1, 256, 0, queue>>>(slot, stop_flag);   \
+}
+
+LAUNCHER(max_int8)    LAUNCHER(max_uint8)
+LAUNCHER(max_int16)   LAUNCHER(max_uint16)
+LAUNCHER(max_int32)   LAUNCHER(max_uint32)
+LAUNCHER(max_int64)   LAUNCHER(max_uint64)
+LAUNCHER(max_float)   LAUNCHER(max_double)
+
+LAUNCHER(min_int8)    LAUNCHER(min_uint8)
+LAUNCHER(min_int16)   LAUNCHER(min_uint16)
+LAUNCHER(min_int32)   LAUNCHER(min_uint32)
+LAUNCHER(min_int64)   LAUNCHER(min_uint64)
+LAUNCHER(min_float)   LAUNCHER(min_double)
+
+LAUNCHER(sum_int8)    LAUNCHER(sum_uint8)
+LAUNCHER(sum_int16)   LAUNCHER(sum_uint16)
+LAUNCHER(sum_int32)   LAUNCHER(sum_uint32)
+LAUNCHER(sum_int64)   LAUNCHER(sum_uint64)
+LAUNCHER(sum_float)   LAUNCHER(sum_double)
+
+LAUNCHER(prod_int8)   LAUNCHER(prod_uint8)
+LAUNCHER(prod_int16)  LAUNCHER(prod_uint16)
+LAUNCHER(prod_int32)  LAUNCHER(prod_uint32)
+LAUNCHER(prod_int64)  LAUNCHER(prod_uint64)
+LAUNCHER(prod_float)  LAUNCHER(prod_double)
+
+LAUNCHER(band_int8)   LAUNCHER(band_uint8)
+LAUNCHER(band_int16)  LAUNCHER(band_uint16)
+LAUNCHER(band_int32)  LAUNCHER(band_uint32)
+LAUNCHER(band_int64)  LAUNCHER(band_uint64)
+
+LAUNCHER(bor_int8)    LAUNCHER(bor_uint8)
+LAUNCHER(bor_int16)   LAUNCHER(bor_uint16)
+LAUNCHER(bor_int32)   LAUNCHER(bor_uint32)
+LAUNCHER(bor_int64)   LAUNCHER(bor_uint64)
+
+LAUNCHER(bxor_int8)   LAUNCHER(bxor_uint8)
+LAUNCHER(bxor_int16)  LAUNCHER(bxor_uint16)
+LAUNCHER(bxor_int32)  LAUNCHER(bxor_uint32)
+LAUNCHER(bxor_int64)  LAUNCHER(bxor_uint64)
+
+/* =========================================================================
+ * 2D launcher table [op_index][type_index]
+ *
+ * Indexed by OMPI_OP_BASE_FORTRAN_* (rows) × OMPI_OP_BASE_TYPE_* (columns).
+ * Zero/NULL entries mean "not supported on GPU" → host fallback.
+ * ========================================================================= */
+ompi_op_rocm_launcher_fn_t
+ompi_op_rocm_kernel_fns[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = {
+
+    [OMPI_OP_BASE_FORTRAN_MAX] = {
+        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_max_int8,
+        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_max_uint8,
+        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_max_int16,
+        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_max_uint16,
+        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_max_int32,
+        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_max_uint32,
+        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_max_int64,
+        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_max_uint64,
+        [OMPI_OP_BASE_TYPE_FLOAT]    = launch_max_float,
+        [OMPI_OP_BASE_TYPE_DOUBLE]   = launch_max_double,
+    },
+
+    [OMPI_OP_BASE_FORTRAN_MIN] = {
+        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_min_int8,
+        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_min_uint8,
+        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_min_int16,
+        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_min_uint16,
+        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_min_int32,
+        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_min_uint32,
+        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_min_int64,
+        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_min_uint64,
+        [OMPI_OP_BASE_TYPE_FLOAT]    = launch_min_float,
+        [OMPI_OP_BASE_TYPE_DOUBLE]   = launch_min_double,
+    },
+
+    [OMPI_OP_BASE_FORTRAN_SUM] = {
+        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_sum_int8,
+        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_sum_uint8,
+        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_sum_int16,
+        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_sum_uint16,
+        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_sum_int32,
+        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_sum_uint32,
+        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_sum_int64,
+        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_sum_uint64,
+        [OMPI_OP_BASE_TYPE_FLOAT]    = launch_sum_float,
+        [OMPI_OP_BASE_TYPE_DOUBLE]   = launch_sum_double,
+    },
+
+    [OMPI_OP_BASE_FORTRAN_PROD] = {
+        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_prod_int8,
+        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_prod_uint8,
+        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_prod_int16,
+        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_prod_uint16,
+        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_prod_int32,
+        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_prod_uint32,
+        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_prod_int64,
+        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_prod_uint64,
+        [OMPI_OP_BASE_TYPE_FLOAT]    = launch_prod_float,
+        [OMPI_OP_BASE_TYPE_DOUBLE]   = launch_prod_double,
+    },
+
+    [OMPI_OP_BASE_FORTRAN_BAND] = {
+        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_band_int8,
+        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_band_uint8,
+        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_band_int16,
+        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_band_uint16,
+        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_band_int32,
+        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_band_uint32,
+        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_band_int64,
+        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_band_uint64,
+    },
+
+    [OMPI_OP_BASE_FORTRAN_BOR] = {
+        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_bor_int8,
+        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_bor_uint8,
+        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_bor_int16,
+        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_bor_uint16,
+        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_bor_int32,
+        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_bor_uint32,
+        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_bor_int64,
+        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_bor_uint64,
+    },
+
+    [OMPI_OP_BASE_FORTRAN_BXOR] = {
+        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_bxor_int8,
+        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_bxor_uint8,
+        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_bxor_int16,
+        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_bxor_uint16,
+        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_bxor_int32,
+        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_bxor_uint32,
+        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_bxor_int64,
+        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_bxor_uint64,
+    },
+
+    /* LAND, LOR, LXOR, MAXLOC, MINLOC, REPLACE, NO_OP: all NULL → host path */
+};
diff --git a/ompi/mca/op/rocm/op_rocm_session.c b/ompi/mca/op/rocm/op_rocm_session.c
new file mode 100644
index 00000000000..8dbbe56c87f
--- /dev/null
+++ b/ompi/mca/op/rocm/op_rocm_session.c
@@ -0,0 +1,261 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
+/*
+ * Copyright (c) 2025      Amazon.com, Inc. or its affiliates.  All rights
+ *                         reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/*
+ * Session lifecycle for the ROCm persistent-kernel op component.
+ * Mirrors op_cuda_session.c with hip* API calls in place of cuda*.
+ *
+ * session_begin:   look up the kernel in the 2D launcher table, allocate
+ *                  managed-memory command slot + shutdown flag, create a
+ *                  private HIP stream, and launch the persistent kernel.
+ *
+ * session_reduce:  write src/dst/count to the command slot, set status=1
+ *                  to wake the kernel, and spin until status==2.
+ *
+ * session_stop:    signal the persistent kernel to exit and synchronize the
+ *                  stream.  GPU stream and managed memory remain allocated
+ *                  so the session can be reused via session_restart.
+ *
+ * session_restart: reconfigure an idle (stopped) session for a new (op, dtype)
+ *                  combination and relaunch the appropriate persistent kernel.
+ *                  Returns false if no GPU kernel exists for the combination.
+ *
+ * session_free:    release the HIP stream, managed memory, and backend
+ *                  private state when a session is permanently discarded.
+ *                  Does NOT free the ompi_op_gpu_session_t struct.
+ */
+
+#include "ompi_config.h"
+#include <stdbool.h>
+#include <stdlib.h>
+#include <sched.h>
+
+#include <hip/hip_runtime.h>
+
+#include "ompi/op/op.h"
+#include "ompi/datatype/ompi_datatype.h"
+#include "ompi/op/op_gpu_session.h"
+#include "ompi/mca/op/op.h"
+#include "ompi/mca/op/rocm/op_rocm.h"
+
+/* ompi_op_ddt_map[] maps dtype->id → OMPI_OP_BASE_TYPE_* (-1 if none) */
+extern int ompi_op_ddt_map[OMPI_DATATYPE_MAX_PREDEFINED];
+
+/* --------------------------------------------------------------------------
+ * ompi_op_rocm_session_begin
+ *
+ * Look up the launcher for (op, dtype), allocate the managed-memory command
+ * slot and shutdown flag, create a private stream, and launch the persistent
+ * kernel.  dev_id is only recorded in the session; no device is selected here.
+ * Returns NULL (after releasing everything acquired so far) when the pair has
+ * no kernel or when an allocation or HIP call fails.
+ * -------------------------------------------------------------------------- */
+ompi_op_gpu_session_t *
+ompi_op_rocm_session_begin(struct ompi_op_t *op,
+                            struct ompi_datatype_t *dtype,
+                            int dev_id)
+{
+    ompi_op_gpu_session_t       *session = NULL;
+    ompi_op_rocm_session_priv_t *priv    = NULL;
+    hipError_t                   err;
+
+    int op_idx   = op->o_f_to_c_index;
+    int type_idx = (dtype->id < OMPI_DATATYPE_MAX_PREDEFINED)
+                   ? ompi_op_ddt_map[dtype->id] : -1;
+
+    if (op_idx  < 0 || op_idx  >= OMPI_OP_BASE_FORTRAN_OP_MAX ||
+        type_idx < 0 || type_idx >= OMPI_OP_BASE_TYPE_MAX) {
+        return NULL;
+    }
+
+    ompi_op_rocm_launcher_fn_t launcher = ompi_op_rocm_kernel_fns[op_idx][type_idx];
+    if (NULL == launcher) {
+        return NULL;   /* no GPU kernel for this (op, type) combination */
+    }
+
+    /* Zero-initialize both structs: fields not assigned below (e.g. the pool
+     * link) must not stay indeterminate, and NULL priv->cmd / priv->shutdown
+     * tell the error path what still needs releasing. */
+    session = (ompi_op_gpu_session_t *) calloc(1, sizeof(*session));
+    priv    = (ompi_op_rocm_session_priv_t *) calloc(1, sizeof(*priv));
+    if (NULL == session || NULL == priv) {
+        goto fail;
+    }
+
+    /* Managed-memory command slot (accessible by both CPU and GPU) */
+    err = hipMallocManaged((void **) &priv->cmd,
+                           sizeof(ompi_op_gpu_cmd_t),
+                           hipMemAttachGlobal);
+    if (hipSuccess != err) {
+        priv->cmd = NULL;
+        goto fail;
+    }
+    priv->cmd->src    = NULL;
+    priv->cmd->dst    = NULL;
+    priv->cmd->count  = 0;
+    priv->cmd->status = 0;
+
+    /* Managed-memory shutdown flag */
+    err = hipMallocManaged((void **) &priv->shutdown,
+                           sizeof(int32_t),
+                           hipMemAttachGlobal);
+    if (hipSuccess != err) {
+        priv->shutdown = NULL;
+        goto fail;
+    }
+    *priv->shutdown = 0;
+
+    /* Dedicated non-blocking stream for this session */
+    err = hipStreamCreateWithFlags(&priv->stream, hipStreamNonBlocking);
+    if (hipSuccess != err) {
+        goto fail;
+    }
+
+    /* Launch the persistent kernel (1 block, 256 threads) */
+    launcher(priv->cmd, priv->shutdown, priv->stream);
+    err = hipGetLastError();
+    if (hipSuccess != err) {
+        hipStreamDestroy(priv->stream);
+        goto fail;
+    }
+
+    session->dev_id    = dev_id;
+    session->allocator = NULL;   /* scratch allocator wired in Phase 4 */
+    session->backend   = priv;
+
+    return session;
+
+fail:
+    if (NULL != priv) {
+        hipFree((void *) priv->shutdown);   /* hipFree(NULL) is a no-op */
+        hipFree(priv->cmd);
+    }
+    free(priv);
+    free(session);
+    return NULL;
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_rocm_session_reduce
+ *
+ * Submit one (src, dst, count) command and block until the kernel sets status 2.
+ * -------------------------------------------------------------------------- */
+void
+ompi_op_rocm_session_reduce(ompi_op_gpu_session_t *session,
+                             const void *src, void *dst, size_t count)
+{
+    ompi_op_rocm_session_priv_t *priv =
+        (ompi_op_rocm_session_priv_t *) session->backend;
+
+    /* Write operands before signalling the kernel */
+    priv->cmd->src   = src;
+    priv->cmd->dst   = dst;
+    priv->cmd->count = (int64_t) count;
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);   /* ensure writes visible to GPU */
+    priv->cmd->status = 1;                     /* wake the kernel */
+
+    while (2 != priv->cmd->status) {   /* spin until the kernel signals done */
+        sched_yield();   /* relinquish CPU timeslice while waiting */
+    }
+    /* Keep later reads of dst from being ordered before the status read */
+    __atomic_thread_fence(__ATOMIC_ACQUIRE);
+    priv->cmd->status = 0;   /* reset for the next call */
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_rocm_session_stop
+ *
+ * Signal the persistent kernel to exit and wait for the stream to drain.
+ * The HIP stream and managed memory remain allocated so the session can be
+ * recycled via ompi_op_rocm_session_restart.
+ * -------------------------------------------------------------------------- */
+void
+ompi_op_rocm_session_stop(ompi_op_gpu_session_t *session)
+{
+    ompi_op_rocm_session_priv_t *state =
+        (ompi_op_rocm_session_priv_t *) session->backend;
+
+    /* Raise the flag polled by the persistent kernel so it leaves its loop. */
+    *state->shutdown = 1;
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);
+
+    /* Block until the stream has drained; the stream is not destroyed here. */
+    (void) hipStreamSynchronize(state->stream);
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_rocm_session_restart
+ *
+ * Reconfigure an idle (stopped) session for a new (op, dtype) combination
+ * and relaunch the appropriate persistent kernel.  Returns false if no GPU
+ * kernel exists for this combination.
+ * -------------------------------------------------------------------------- */
+bool
+ompi_op_rocm_session_restart(ompi_op_gpu_session_t *session,
+                              struct ompi_op_t *op,
+                              struct ompi_datatype_t *dtype)
+{
+    ompi_op_rocm_session_priv_t *state;
+    ompi_op_rocm_launcher_fn_t   launch_fn;
+    int row = op->o_f_to_c_index;
+    int col = -1;
+
+    /* Only ids below OMPI_DATATYPE_MAX_PREDEFINED index ompi_op_ddt_map. */
+    if (dtype->id < OMPI_DATATYPE_MAX_PREDEFINED) {
+        col = ompi_op_ddt_map[dtype->id];
+    }
+
+    if (row < 0 || row >= OMPI_OP_BASE_FORTRAN_OP_MAX ||
+        col < 0 || col >= OMPI_OP_BASE_TYPE_MAX) {
+        return false;
+    }
+
+    /* A NULL slot means no GPU kernel exists for this (op, dtype). */
+    launch_fn = ompi_op_rocm_kernel_fns[row][col];
+    if (NULL == launch_fn) {
+        return false;
+    }
+
+    /* Clear the shutdown flag and command slot before relaunching. */
+    state = (ompi_op_rocm_session_priv_t *) session->backend;
+    *state->shutdown   = 0;
+    state->cmd->src    = NULL;
+    state->cmd->dst    = NULL;
+    state->cmd->count  = 0;
+    state->cmd->status = 0;
+
+    /* Start the kernel; report whether HIP accepted the launch. */
+    launch_fn(state->cmd, state->shutdown, state->stream);
+    return hipSuccess == hipGetLastError();
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_rocm_session_free
+ *
+ * Free the HIP stream, managed memory, and backend private state.
+ * Does NOT free the ompi_op_gpu_session_t struct (that is the caller's
+ * responsibility, done by session_destroy in op_gpu_session.c).
+ * -------------------------------------------------------------------------- */
+void
+ompi_op_rocm_session_free(ompi_op_gpu_session_t *session)
+{
+    ompi_op_rocm_session_priv_t *state =
+        (ompi_op_rocm_session_priv_t *) session->backend;
+
+    if (NULL != state) {
+        hipStreamDestroy(state->stream);
+        hipFree((void *) state->shutdown);   /* cast drops volatile */
+        hipFree(state->cmd);
+        free(state);
+        /* Clear the pointer so a repeated call takes the no-op path. */
+        session->backend = NULL;
+    }
+}
diff --git a/ompi/op/op_gpu_session.c b/ompi/op/op_gpu_session.c
index b3aefdd5c98..aec06f44df6 100644
--- a/ompi/op/op_gpu_session.c
+++ b/ompi/op/op_gpu_session.c
@@ -9,38 +9,202 @@
  * $HEADER$
  */
 
+/*
+ * Dispatcher and freelist pool for GPU reduction sessions.
+ *
+ * Sessions are expensive to create: each one allocates managed memory and
+ * creates a private GPU stream.  Rather than destroy a session at the end
+ * of every collective and recreate it at the start of the next, we keep a
+ * flat pool of idle sessions keyed by dev_id.
+ *
+ * Pool lifecycle:
+ *   session_end()  — stops the persistent kernel (GPU stream and managed
+ *                    memory remain allocated), then pushes the session onto
+ *                    the freelist.
+ *   session_begin() — if a matching dev_id entry is found, pops it and calls
+ *                    restart_fn(session, op, dtype) to reconfigure and relaunch
+ *                    the appropriate kernel; no cudaMalloc / hipMalloc overhead.
+ *                    On pool miss, iterates op components to allocate fresh.
+ *
+ * Pool layout:
+ *   session_pool_head — singly-linked freelist, linked through session->pool_next
+ *   session_pool_count — current freelist length (global cap = SESSION_POOL_MAX)
+ *   session_pool_lock  — single mutex protecting all pool state
+ *
+ * SESSION_POOL_MAX caps the total number of idle sessions.  Sessions beyond
+ * the cap are permanently destroyed rather than pooled to bound GPU resource
+ * accumulation.
+ */
+
 #include "ompi_config.h"
 
+#include <stdlib.h>
+
+#include "opal/class/opal_list.h"
+#include "opal/mca/base/base.h"
+#include "opal/mca/threads/mutex.h"
+#include "ompi/mca/op/op.h"
+#include "ompi/mca/op/base/base.h"
 #include "ompi/op/op_gpu_session.h"
 #include "ompi/op/op.h"
 
+/* Maximum number of idle sessions kept in the pool. */
+#define SESSION_POOL_MAX 8
+
+static ompi_op_gpu_session_t *session_pool_head  = NULL;
+static int                    session_pool_count  = 0;
+static opal_mutex_t           session_pool_lock   = OPAL_MUTEX_STATIC_INIT;
+
+/* --------------------------------------------------------------------------
+ * session_destroy — permanently shut down a session and free all resources.
+ * Called when the pool is full, when a restart fails, or at finalization.
+ * -------------------------------------------------------------------------- */
+static void
+session_destroy(ompi_op_gpu_session_t *session)
+{
+    session->free_fn(session);   /* component frees stream, managed mem, priv */
+    free(session);               /* the struct itself is released here, not by free_fn */
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_gpu_session_begin
+ *
+ * 1. Walk the pool freelist for a matching dev_id entry.
+ * 2. On hit: pop the idle session, call restart_fn to reconfigure for the
+ *    new (op, dtype) and relaunch the kernel.  If restart fails (no kernel
+ *    for this combination), destroy the session and return NULL.
+ * 3. On pool miss: iterate op components to create a new session; wire
+ *    dispatch hooks before returning, or return NULL if no component can.
+ * -------------------------------------------------------------------------- */
 ompi_op_gpu_session_t *
 ompi_op_gpu_session_begin(struct ompi_op_t *op,
                           struct ompi_datatype_t *dtype,
                           int dev_id)
 {
-    /* Phase 1 stub: no GPU op components yet.  Always return NULL so that
-     * all callers use the host ompi_op_reduce path. */
-    (void) op;
-    (void) dtype;
-    (void) dev_id;
+    /* Check pool for a reusable idle session on this device. */
+    OPAL_THREAD_LOCK(&session_pool_lock);
+    ompi_op_gpu_session_t **pp = &session_pool_head;
+    while (NULL != *pp) {
+        if ((*pp)->dev_id == dev_id) {
+            /* Found a matching idle session — remove from freelist. */
+            ompi_op_gpu_session_t *s = *pp;
+            *pp = s->pool_next;
+            session_pool_count--;
+            OPAL_THREAD_UNLOCK(&session_pool_lock);   /* restart_fn below runs unlocked */
+            s->pool_next = NULL;
+
+            /* Reconfigure the session for the new (op, dtype). */
+            if (!s->restart_fn(s, op, dtype)) {
+                /* No GPU kernel for this combination; release and return NULL. */
+                session_destroy(s);
+                return NULL;
+            }
+            return s;
+        }
+        pp = &(*pp)->pool_next;
+    }
+    OPAL_THREAD_UNLOCK(&session_pool_lock);
+
+    /* Pool miss — create a fresh session via the first matching component. */
+    mca_base_component_list_item_t *cli;
+    OPAL_LIST_FOREACH(cli, &ompi_op_base_framework.framework_components,
+                      mca_base_component_list_item_t) {
+        const mca_base_component_t *bc = cli->cli_component;
+
+        if (1 != bc->mca_type_major_version ||
+            0 != bc->mca_type_minor_version ||
+            0 != bc->mca_type_release_version) {
+            continue;   /* only type version 1.0.0 matches the cast below */
+        }
+
+        const ompi_op_base_component_1_0_0_t *opc =
+            (const ompi_op_base_component_1_0_0_t *) bc;
+
+        if (NULL == opc->opc_session_begin   ||
+            NULL == opc->opc_session_reduce  ||
+            NULL == opc->opc_session_stop    ||
+            NULL == opc->opc_session_restart ||
+            NULL == opc->opc_session_free) {
+            continue;   /* component lacks at least one session hook */
+        }
+
+        ompi_op_gpu_session_t *session = opc->opc_session_begin(op, dtype, dev_id);
+        if (NULL == session) {
+            continue;   /* this component produced no session; try the next one */
+        }
+
+        /* Wire dispatch hooks and pool bookkeeping. */
+        session->reduce_fn  = opc->opc_session_reduce;
+        session->stop_fn    = opc->opc_session_stop;
+        session->restart_fn = opc->opc_session_restart;
+        session->free_fn    = opc->opc_session_free;
+        session->pool_next  = NULL;
+        return session;
+    }
+
     return NULL;
 }
 
+/* --------------------------------------------------------------------------
+ * ompi_op_gpu_session_reduce — forward one reduction to session->reduce_fn.
+ * -------------------------------------------------------------------------- */
 void
 ompi_op_gpu_session_reduce(ompi_op_gpu_session_t *session,
                            const void *src, void *dst, size_t count)
 {
-    /* Must not be called when session is NULL */
-    (void) session;
-    (void) src;
-    (void) dst;
-    (void) count;
+    session->reduce_fn(session, src, dst, count);   /* session must not be NULL */
 }
 
+/* --------------------------------------------------------------------------
+ * ompi_op_gpu_session_end
+ *
+ * Stop the persistent kernel and return the session to the pool so its GPU
+ * stream and managed memory can be reused by the next collective on the same
+ * device.  If the pool is already at SESSION_POOL_MAX, destroy immediately.
+ * -------------------------------------------------------------------------- */
 void
 ompi_op_gpu_session_end(ompi_op_gpu_session_t *session)
 {
-    /* NULL-safe no-op in Phase 1 */
-    (void) session;
+    if (NULL == session) {
+        return;
+    }
+
+    /* Signal the kernel to exit and wait for the stream to drain.
+     * GPU stream and managed memory remain allocated for reuse. */
+    session->stop_fn(session);
+
+    OPAL_THREAD_LOCK(&session_pool_lock);
+    if (session_pool_count < SESSION_POOL_MAX) {   /* room left: push at the head */
+        session->pool_next = session_pool_head;
+        session_pool_head  = session;
+        session_pool_count++;
+        OPAL_THREAD_UNLOCK(&session_pool_lock);
+        return;
+    }
+    OPAL_THREAD_UNLOCK(&session_pool_lock);   /* destroy below runs unlocked */
+
+    /* Pool full — destroy immediately. */
+    session_destroy(session);
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_gpu_session_pool_finalize
+ *
+ * Drain the pool, release all GPU resources, and free session structs.
+ * Called once from ompi_op_base_close() during MPI_Finalize.
+ * -------------------------------------------------------------------------- */
+void
+ompi_op_gpu_session_pool_finalize(void)
+{
+    OPAL_THREAD_LOCK(&session_pool_lock);
+    ompi_op_gpu_session_t *s = session_pool_head;   /* detach the whole freelist */
+    session_pool_head  = NULL;
+    session_pool_count = 0;
+    OPAL_THREAD_UNLOCK(&session_pool_lock);
+
+    while (NULL != s) {
+        ompi_op_gpu_session_t *next = s->pool_next;   /* read before s is freed */
+        session_destroy(s);
+        s = next;
+    }
 }
diff --git a/ompi/op/op_gpu_session.h b/ompi/op/op_gpu_session.h
index 79ff195e18e..2213d921a80 100644
--- a/ompi/op/op_gpu_session.h
+++ b/ompi/op/op_gpu_session.h
@@ -13,6 +13,8 @@
 #define OMPI_OP_GPU_SESSION_H
 
 #include "ompi_config.h"
+#include <stdbool.h>
+#include <stdint.h>
 #include "opal/mca/allocator/allocator.h"
 
 BEGIN_C_DECLS
@@ -20,17 +22,60 @@ BEGIN_C_DECLS
 struct ompi_op_t;
 struct ompi_datatype_t;
 
+/**
+ * Managed-memory command slot shared between the host and the persistent
+ * reduction kernel (accessible by both CPU and GPU via managed/unified memory).
+ *
+ * status lifecycle (per reduction call):
+ *   0 = idle       (initial; host resets after kernel signals done)
+ *   1 = work_ready (host → kernel: pointers and count are valid)
+ *   2 = done       (kernel → host: reduction complete)
+ */
+typedef struct {
+    const void      *src;
+    void            *dst;
+    int64_t          count;
+    volatile int32_t status;
+} ompi_op_gpu_cmd_t;
+
 /**
  * Per-collective GPU reduction session.  Created by ompi_op_gpu_session_begin()
- * before a collective algorithm's reduction loop starts, and destroyed by
- * ompi_op_gpu_session_end() after the loop completes.  When no GPU op
- * component is available or the (op, dtype) combination has no GPU kernel,
- * begin() returns NULL and all callers fall back to ompi_op_reduce().
+ * before a collective algorithm's reduction loop starts, and returned to the
+ * session pool by ompi_op_gpu_session_end() for reuse by a future collective.
+ *
+ * Pool lifecycle: session_end() stops the persistent kernel (GPU resources
+ * remain allocated) and pushes the session onto a freelist.  A future
+ * session_begin() for the same dev_id pops the idle session and calls
+ * restart_fn to reconfigure and relaunch the appropriate kernel — no
+ * cudaMalloc/hipMalloc or stream creation overhead on the reuse path.
+ *
+ * When no GPU op component supports the (op, dtype) combination, begin()
+ * returns NULL and all callers fall back to ompi_op_reduce().
+ *
+ * reduce_fn, stop_fn, restart_fn, free_fn, and pool_next are managed by
+ * op_gpu_session.c — callers must not set them directly.
  */
 typedef struct ompi_op_gpu_session_t {
     int                          dev_id;
     mca_allocator_base_module_t *allocator;  /* GPU scratch allocator for this session */
     void                        *backend;    /* opaque: cuda or rocm session state */
+    /* Dispatch hooks wired at session_begin time. */
+    void (*reduce_fn)(struct ompi_op_gpu_session_t *session,
+                      const void *src, void *dst, size_t count);
+    /* Signal the persistent kernel to exit and synchronize the stream.
+     * GPU stream and managed memory remain allocated for reuse. */
+    void (*stop_fn)(struct ompi_op_gpu_session_t *session);
+    /* Reconfigure an idle session for a new (op, dtype) and relaunch the
+     * persistent kernel.  Returns false if no GPU kernel exists for this
+     * combination (caller must then free the session and return NULL). */
+    bool (*restart_fn)(struct ompi_op_gpu_session_t *session,
+                       struct ompi_op_t *op,
+                       struct ompi_datatype_t *dtype);
+    /* Release managed memory, GPU stream, and backend private state.
+     * Must NOT free the ompi_op_gpu_session_t struct itself. */
+    void (*free_fn)(struct ompi_op_gpu_session_t *session);
+    /* Pool bookkeeping — do not access directly. */
+    struct ompi_op_gpu_session_t *pool_next;
 } ompi_op_gpu_session_t;
 
 /**
@@ -50,11 +95,19 @@ OMPI_DECLSPEC void ompi_op_gpu_session_reduce(ompi_op_gpu_session_t *session,
                                                const void *src, void *dst, size_t count);
 
 /**
- * Shut down the persistent kernel, synchronize the GPU stream, and free all
- * session resources.  NULL-safe.
+ * Stop the persistent kernel and return the session to the pool for reuse.
+ * GPU stream and managed memory remain allocated; a future begin() call for
+ * the same dev_id will relaunch the kernel without allocating new resources.
+ * NULL-safe.
  */
 OMPI_DECLSPEC void ompi_op_gpu_session_end(ompi_op_gpu_session_t *session);
 
+/**
+ * Drain and permanently destroy all pooled sessions.  Must be called once
+ * during MPI finalization (from ompi_op_base_close).
+ */
+OMPI_DECLSPEC void ompi_op_gpu_session_pool_finalize(void);
+
 END_C_DECLS
 
 #endif /* OMPI_OP_GPU_SESSION_H */

From 968b60da3abc09d248945fcb36105260d9f8b8fd Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Mon, 13 Apr 2026 23:12:31 -0400
Subject: [PATCH 05/13] op/rocm: fix build integration

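Move OPAL_VAR_SCOPE_POP after the final AS_IF that tests op_rocm_happy;
the order of popping local variables does matter, I guess.  Also compile
op_rocm_kernels.cpp through libtool so the resulting .lo can be listed in
LIBADD, and extend the hipcc include paths.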

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ompi/mca/op/rocm/Makefile.am  | 21 +++++++++++++--------
 ompi/mca/op/rocm/configure.m4 |  6 ++++--
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/ompi/mca/op/rocm/Makefile.am b/ompi/mca/op/rocm/Makefile.am
index 95f993858c1..a3bdfe08c8a 100644
--- a/ompi/mca/op/rocm/Makefile.am
+++ b/ompi/mca/op/rocm/Makefile.am
@@ -19,8 +19,8 @@ sources = \
 # rule.  The resulting object is appended to LIBADD for both DSO and static
 # builds.
 
-EXTRA_DIST  = op_rocm_kernels.cpp
-CLEANFILES  = op_rocm_kernels.o
+rocm_sources  = op_rocm_kernels.cpp
+CLEANFILES    = op_rocm_kernels.o
 
 # Include paths forwarded to hipcc so it can find ompi_config.h and the
 # op/mca headers.
@@ -28,13 +28,18 @@ HIPCC_INCLUDES = \
     -I$(top_srcdir) \
     -I$(top_builddir) \
     -I$(top_srcdir)/ompi \
-    -I$(top_builddir)/ompi
+    -I$(top_builddir)/ompi \
+    -I$(top_builddir)/opal \
+    -I$(top_builddir)/opal/include \
+    -I$(top_srcdir)/ompi/include \
+    -I$(top_srcdir)/opal/include
 
-op_rocm_kernels.o: $(srcdir)/op_rocm_kernels.cpp \
+op_rocm_kernels.l$(OBJEXT): $(srcdir)/op_rocm_kernels.cpp \
                    $(srcdir)/op_rocm.h
-	$(HIPCC) $(HIPCCFLAGS) $(HIPCC_INCLUDES) \
+	$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile \
+		$(HIPCC) $(HIPCCFLAGS) $(HIPCC_INCLUDES) \
 	    $(op_rocm_CPPFLAGS) \
-	    -c $< -o $@
+	    -c $<
 
 AM_CPPFLAGS = $(op_rocm_CPPFLAGS)
 
@@ -54,7 +59,7 @@ mcacomponent_LTLIBRARIES = $(component_install)
 
 mca_op_rocm_la_SOURCES  = $(sources)
 mca_op_rocm_la_LDFLAGS  = -module -avoid-version $(op_rocm_LDFLAGS)
-mca_op_rocm_la_LIBADD   = $(op_rocm_LIBS) op_rocm_kernels.o
+mca_op_rocm_la_LIBADD   = $(op_rocm_LIBS) $(rocm_sources:.cpp=.lo)
 mca_op_rocm_la_CPPFLAGS = $(op_rocm_CPPFLAGS)
 
 # ----------------------------------------------------------------------------
@@ -64,5 +69,5 @@ noinst_LTLIBRARIES = $(component_noinst)
 
 libmca_op_rocm_la_SOURCES  = $(sources)
 libmca_op_rocm_la_LDFLAGS  = -module -avoid-version $(op_rocm_LDFLAGS)
-libmca_op_rocm_la_LIBADD   = $(op_rocm_LIBS) op_rocm_kernels.o
+libmca_op_rocm_la_LIBADD   = $(op_rocm_LIBS) $(rocm_sources:.cpp=.lo)
 libmca_op_rocm_la_CPPFLAGS = $(op_rocm_CPPFLAGS)
diff --git a/ompi/mca/op/rocm/configure.m4 b/ompi/mca/op/rocm/configure.m4
index eaa6c18458c..6625aa06f7a 100644
--- a/ompi/mca/op/rocm/configure.m4
+++ b/ompi/mca/op/rocm/configure.m4
@@ -50,6 +50,7 @@ AC_DEFUN([MCA_ompi_op_rocm_CONFIG],[
               op_rocm_happy=no])
       ])
 
+    OPAL_SUMMARY_ADD([Accelerators], [ROCm operator support], [], [$op_rocm_happy])
     # Default HIPCCFLAGS if not already set by the user.
     AS_IF([test "$op_rocm_happy" = "yes" && test "x$HIPCCFLAGS" = "x"],
           [HIPCCFLAGS="--offload-arch=gfx906"])
@@ -60,9 +61,10 @@ AC_DEFUN([MCA_ompi_op_rocm_CONFIG],[
     AC_SUBST([HIPCC])
     AC_SUBST([HIPCCFLAGS])
 
-    OPAL_VAR_SCOPE_POP
-
     AS_IF([test "$op_rocm_happy" = "yes"],
           [$1],
           [$2])
+
+    OPAL_VAR_SCOPE_POP
+
 ])dnl

From e59d22ee4a28aeec9808726ae36377374ad6f955 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Mon, 25 May 2026 15:00:19 -0400
Subject: [PATCH 06/13] coll/tuned: Allocate sessions and allocators for
 coll/base calls

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ompi/mca/coll/base/coll_base_allgather.c      |   9 +-
 ompi/mca/coll/base/coll_base_functions.h      |   6 +-
 ompi/mca/coll/base/coll_base_scatter.c        |  11 +-
 ompi/mca/coll/tuned/coll_tuned.h              |   4 +-
 .../tuned/coll_tuned_allgather_decision.c     |   5 +-
 .../coll/tuned/coll_tuned_decision_dynamic.c  | 290 +++++++++++++-----
 .../coll/tuned/coll_tuned_decision_fixed.c    | 122 ++++++--
 .../coll/tuned/coll_tuned_scatter_decision.c  |   5 +-
 ompi/mca/op/cuda/op_cuda_kernels.cu           | 137 +++++----
 ompi/mca/op/cuda/op_cuda_session.c            |  20 +-
 ompi/mca/op/op.h                              |   2 +-
 ompi/mca/op/rocm/op_rocm_kernels.cpp          | 136 ++++----
 ompi/mca/op/rocm/op_rocm_session.c            |  15 +-
 ompi/op/op_gpu_session.c                      |  47 ++-
 ompi/op/op_gpu_session.h                      |  22 +-
 15 files changed, 542 insertions(+), 289 deletions(-)

diff --git a/ompi/mca/coll/base/coll_base_allgather.c b/ompi/mca/coll/base/coll_base_allgather.c
index 6d9bd6fcfc3..5b357b511bc 100644
--- a/ompi/mca/coll/base/coll_base_allgather.c
+++ b/ompi/mca/coll/base/coll_base_allgather.c
@@ -771,7 +771,8 @@ int ompi_coll_base_allgather_intra_k_bruck(const void *sbuf, size_t scount,
                                           struct ompi_datatype_t *rdtype,
                                           struct ompi_communicator_t *comm,
                                           mca_coll_base_module_t *module,
-                                          int radix)
+                                          int radix,
+                                          mca_allocator_base_module_t *allocator)
 {
     int line = -1, rank, size, dst, src, err = MPI_SUCCESS;
     int recvcount, distance;
@@ -796,7 +797,7 @@ int ompi_coll_base_allgather_intra_k_bruck(const void *sbuf, size_t scount,
     if (0 != rank) {
         /* Compute the temporary buffer size, including datatypes empty gaps */
         rsize = opal_datatype_span(&rdtype->super, (size_t)rcount * (size - rank), &rgap);
-        tmp_buf = (char *) malloc(rsize);
+        tmp_buf = (char *) COLL_BASE_ALLOC(allocator, rsize);
         tmp_buf_start = tmp_buf - rgap;
     }
 
@@ -891,7 +892,7 @@ int ompi_coll_base_allgather_intra_k_bruck(const void *sbuf, size_t scount,
         if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
     }
 
-    if(tmp_buf != NULL) free(tmp_buf);
+    if(tmp_buf != NULL) COLL_BASE_FREE(allocator, tmp_buf);
     return MPI_SUCCESS;
 
 err_hndl:
@@ -911,7 +912,7 @@ int ompi_coll_base_allgather_intra_k_bruck(const void *sbuf, size_t scount,
     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,  "%s:%4d\tError occurred %d, rank %2d",
                  __FILE__, line, err, rank));
     if(tmp_buf != NULL) {
-        free(tmp_buf);
+        COLL_BASE_FREE(allocator, tmp_buf);
         tmp_buf = NULL;
         tmp_buf_start = NULL;
     }
diff --git a/ompi/mca/coll/base/coll_base_functions.h b/ompi/mca/coll/base/coll_base_functions.h
index 77e07533f72..b2ab8a16d57 100644
--- a/ompi/mca/coll/base/coll_base_functions.h
+++ b/ompi/mca/coll/base/coll_base_functions.h
@@ -54,7 +54,7 @@
 #define COLL_BASE_REDUCE(session, op, src, dst, count, dtype)                  \
     do {                                                                        \
         if (NULL != (session))                                                  \
-            ompi_op_gpu_session_reduce((session), (src), (dst), (count));      \
+            ompi_op_gpu_session_reduce((session), (src), (dst), (dst), (count)); \
         else                                                                    \
             ompi_op_reduce((op), (src), (dst), (count), (dtype));              \
     } while (0)
@@ -222,7 +222,7 @@ int ompi_coll_base_allgather_intra_ring(ALLGATHER_ARGS);
 int ompi_coll_base_allgather_intra_neighborexchange(ALLGATHER_ARGS);
 int ompi_coll_base_allgather_intra_basic_linear(ALLGATHER_ARGS);
 int ompi_coll_base_allgather_intra_two_procs(ALLGATHER_ARGS);
-int ompi_coll_base_allgather_intra_k_bruck(ALLGATHER_ARGS, int radix);
+int ompi_coll_base_allgather_intra_k_bruck(ALLGATHER_ARGS, int radix, mca_allocator_base_module_t *allocator);
 int ompi_coll_base_allgather_direct_messaging(ALLGATHER_ARGS);
 
 /* All GatherV */
@@ -323,7 +323,7 @@ int ompi_coll_base_scan_intra_linear(SCAN_ARGS);
 
 /* Scatter */
 int ompi_coll_base_scatter_intra_basic_linear(SCATTER_ARGS);
-int ompi_coll_base_scatter_intra_binomial(SCATTER_ARGS);
+int ompi_coll_base_scatter_intra_binomial(SCATTER_ARGS, mca_allocator_base_module_t *allocator);
 int ompi_coll_base_scatter_intra_linear_nb(SCATTER_ARGS, int max_reqs);
 
 /* ScatterV */
diff --git a/ompi/mca/coll/base/coll_base_scatter.c b/ompi/mca/coll/base/coll_base_scatter.c
index 795f79a5c72..87b799acc0e 100644
--- a/ompi/mca/coll/base/coll_base_scatter.c
+++ b/ompi/mca/coll/base/coll_base_scatter.c
@@ -64,7 +64,8 @@ ompi_coll_base_scatter_intra_binomial(
     const void *sbuf, size_t scount, struct ompi_datatype_t *sdtype,
     void *rbuf, size_t rcount, struct ompi_datatype_t *rdtype,
     int root, struct ompi_communicator_t *comm,
-    mca_coll_base_module_t *module)
+    mca_coll_base_module_t *module,
+    mca_allocator_base_module_t *allocator)
 {
     mca_coll_base_module_t *base_module = (mca_coll_base_module_t*)module;
     mca_coll_base_comm_t *data = base_module->base_data;
@@ -110,7 +111,7 @@ ompi_coll_base_scatter_intra_binomial(
             opal_convertor_get_packed_size( &convertor, &packed_sizet );
             packed_size = packed_sizet;
             packed_sizet = packed_sizet / size;
-            ptmp = tempbuf = (char *)malloc(packed_size);
+            ptmp = tempbuf = (char *) COLL_BASE_ALLOC(allocator, packed_size);
             if (NULL == tempbuf) {
                 err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
             }
@@ -147,7 +148,7 @@ ompi_coll_base_scatter_intra_binomial(
             subtree_size = size - vrank;
         packed_size = scount * subtree_size;
 
-        ptmp = tempbuf = (char *)malloc(packed_size);
+        ptmp = tempbuf = (char *) COLL_BASE_ALLOC(allocator, packed_size);
         if (NULL == tempbuf) {
             err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl;
         }
@@ -185,13 +186,13 @@ ompi_coll_base_scatter_intra_binomial(
         curr_count -= send_count;
     }
     if (NULL != tempbuf)
-        free(tempbuf);
+        COLL_BASE_FREE(allocator, tempbuf);
 
     return MPI_SUCCESS;
 
  err_hndl:
     if (NULL != tempbuf)
-        free(tempbuf);
+        COLL_BASE_FREE(allocator, tempbuf);
 
     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,  "%s:%4d\tError occurred %d, rank %2d",
                  __FILE__, line, err, rank));
diff --git a/ompi/mca/coll/tuned/coll_tuned.h b/ompi/mca/coll/tuned/coll_tuned.h
index 59c1d0ee1f8..d0b9f82f288 100644
--- a/ompi/mca/coll/tuned/coll_tuned.h
+++ b/ompi/mca/coll/tuned/coll_tuned.h
@@ -103,7 +103,7 @@ ompi_coll_tuned_comm_query(struct ompi_communicator_t *comm, int *priority);
 /* All Gather */
 int ompi_coll_tuned_allgather_intra_dec_fixed(ALLGATHER_ARGS);
 int ompi_coll_tuned_allgather_intra_dec_dynamic(ALLGATHER_ARGS);
-int ompi_coll_tuned_allgather_intra_do_this(ALLGATHER_ARGS, int algorithm, int faninout, int segsize);
+int ompi_coll_tuned_allgather_intra_do_this(ALLGATHER_ARGS, int algorithm, int faninout, int segsize, mca_allocator_base_module_t *allocator);
 int ompi_coll_tuned_allgather_intra_check_forced_init(coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 /* All GatherV */
@@ -171,7 +171,7 @@ int ompi_coll_tuned_reduce_scatter_block_intra_check_forced_init (coll_tuned_for
 /* Scatter */
 int ompi_coll_tuned_scatter_intra_dec_fixed(SCATTER_ARGS);
 int ompi_coll_tuned_scatter_intra_dec_dynamic(SCATTER_ARGS);
-int ompi_coll_tuned_scatter_intra_do_this(SCATTER_ARGS, int algorithm, int faninout, int segsize);
+int ompi_coll_tuned_scatter_intra_do_this(SCATTER_ARGS, int algorithm, int faninout, int segsize, mca_allocator_base_module_t *allocator);
 int ompi_coll_tuned_scatter_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices);
 
 /* Exscan */
diff --git a/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c b/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c
index 052c1d5f9e4..4e9d167f79a 100644
--- a/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c
@@ -133,7 +133,8 @@ int ompi_coll_tuned_allgather_intra_do_this(const void *sbuf, size_t scount,
                                             struct ompi_datatype_t *rdtype,
                                             struct ompi_communicator_t *comm,
                                             mca_coll_base_module_t *module,
-                                            int algorithm, int faninout, int segsize)
+                                            int algorithm, int faninout, int segsize,
+                                            mca_allocator_base_module_t *allocator)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
                  "coll:tuned:allgather_intra_do_this selected algorithm %d topo faninout %d segsize %d",
@@ -150,7 +151,7 @@ int ompi_coll_tuned_allgather_intra_do_this(const void *sbuf, size_t scount,
     case (2):
         return ompi_coll_base_allgather_intra_k_bruck(sbuf, scount, sdtype,
                                                       rbuf, rcount, rdtype,
-                                                      comm, module, faninout);
+                                                      comm, module, faninout, allocator);
     case (3):
         return ompi_coll_base_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
                                                                 rbuf, rcount, rdtype,
diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c b/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c
index 31204c472bb..17d6786d949 100644
--- a/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c
+++ b/ompi/mca/coll/tuned/coll_tuned_decision_dynamic.c
@@ -28,6 +28,8 @@
 #include "opal/mca/accelerator/accelerator.h"
 #include "opal/mca/accelerator/base/base.h"
 #include "ompi/datatype/ompi_datatype.h"
+#include "ompi/op/op.h"
+#include "ompi/op/op_gpu_session.h"
 #include "ompi/communicator/communicator.h"
 #include "ompi/mca/coll/base/base.h"
 #include "ompi/mca/coll/coll.h"
@@ -65,15 +67,22 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (const void *sbuf, void *rbuf, size_
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "ompi_coll_tuned_allreduce_intra_dec_dynamic"));
 
-    /* session=NULL uses host ompi_op_reduce path. */
-
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[ALLREDUCE].algorithm) {
-        return ompi_coll_tuned_allreduce_intra_do_this(sbuf, rbuf, count, dtype, op, comm, module,
-                                                       tuned_module->user_forced[ALLREDUCE].algorithm,
-                                                       tuned_module->user_forced[ALLREDUCE].tree_fanout,
-                                                       tuned_module->user_forced[ALLREDUCE].segsize,
-                                                       NULL);
+        ompi_op_gpu_session_t *session = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+        }
+        int rc = ompi_coll_tuned_allreduce_intra_do_this(sbuf, rbuf, count, dtype, op, comm, module,
+                                                         tuned_module->user_forced[ALLREDUCE].algorithm,
+                                                         tuned_module->user_forced[ALLREDUCE].tree_fanout,
+                                                         tuned_module->user_forced[ALLREDUCE].segsize,
+                                                         session);
+        ompi_op_gpu_session_end(session);
+        return rc;
     }
 
     /* check to see if we have some filebased rules */
@@ -89,10 +98,18 @@ ompi_coll_tuned_allreduce_intra_dec_dynamic (const void *sbuf, void *rbuf, size_
                                                         dsize, &faninout, &segsize, &ignoreme);
 
         if (alg) {
-            /* we have found a valid choice from the file based rules for this message size */
-            return ompi_coll_tuned_allreduce_intra_do_this (sbuf, rbuf, count, dtype, op,
-                                                            comm, module,
-                                                            alg, faninout, segsize, NULL);
+            ompi_op_gpu_session_t *session = NULL;
+            int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+            uint64_t _flags;
+            if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+                opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+                session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+            }
+            int rc = ompi_coll_tuned_allreduce_intra_do_this(sbuf, rbuf, count, dtype, op,
+                                                             comm, module,
+                                                             alg, faninout, segsize, session);
+            ompi_op_gpu_session_end(session);
+            return rc;
         } /* found a method */
     } /*end if any com rules to check */
 
@@ -322,17 +339,24 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( const void *sbuf, void *rbuf,
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_intra_dec_dynamic"));
 
-    /* session=NULL uses host ompi_op_reduce path. */
-
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[REDUCE].algorithm) {
-        return ompi_coll_tuned_reduce_intra_do_this(sbuf, rbuf, count, dtype,
-                                                    op, root, comm, module,
-                                                    tuned_module->user_forced[REDUCE].algorithm,
-                                                    tuned_module->user_forced[REDUCE].chain_fanout,
-                                                    tuned_module->user_forced[REDUCE].segsize,
-                                                    tuned_module->user_forced[REDUCE].max_requests,
-                                                    NULL);
+        ompi_op_gpu_session_t *session = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+        }
+        int rc = ompi_coll_tuned_reduce_intra_do_this(sbuf, rbuf, count, dtype,
+                                                      op, root, comm, module,
+                                                      tuned_module->user_forced[REDUCE].algorithm,
+                                                      tuned_module->user_forced[REDUCE].chain_fanout,
+                                                      tuned_module->user_forced[REDUCE].segsize,
+                                                      tuned_module->user_forced[REDUCE].max_requests,
+                                                      session);
+        ompi_op_gpu_session_end(session);
+        return rc;
     }
 
     /* check to see if we have some filebased rules */
@@ -349,11 +373,19 @@ int ompi_coll_tuned_reduce_intra_dec_dynamic( const void *sbuf, void *rbuf,
                                                         dsize, &faninout, &segsize, &max_requests);
 
         if (alg) {
-            /* we have found a valid choice from the file based rules for this message size */
-            return  ompi_coll_tuned_reduce_intra_do_this (sbuf, rbuf, count, dtype,
+            ompi_op_gpu_session_t *session = NULL;
+            int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+            uint64_t _flags;
+            if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+                opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+                session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+            }
+            int rc = ompi_coll_tuned_reduce_intra_do_this(sbuf, rbuf, count, dtype,
                                                           op, root, comm, module,
                                                           alg, faninout,
-                                                          segsize, max_requests, NULL);
+                                                          segsize, max_requests, session);
+            ompi_op_gpu_session_end(session);
+            return rc;
         } /* found a method */
     } /*end if any com rules to check */
 
@@ -382,16 +414,23 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(const void *sbuf, void *rbu
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_scatter_intra_dec_dynamic"));
 
-    /* session=NULL uses host ompi_op_reduce path. */
-
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[REDUCESCATTER].algorithm) {
-        return ompi_coll_tuned_reduce_scatter_intra_do_this(sbuf, rbuf, rcounts, dtype,
-                                                            op, comm, module,
-                                                            tuned_module->user_forced[REDUCESCATTER].algorithm,
-                                                            tuned_module->user_forced[REDUCESCATTER].chain_fanout,
-                                                            tuned_module->user_forced[REDUCESCATTER].segsize,
-                                                            NULL);
+        ompi_op_gpu_session_t *session = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+        }
+        int rc = ompi_coll_tuned_reduce_scatter_intra_do_this(sbuf, rbuf, rcounts, dtype,
+                                                              op, comm, module,
+                                                              tuned_module->user_forced[REDUCESCATTER].algorithm,
+                                                              tuned_module->user_forced[REDUCESCATTER].chain_fanout,
+                                                              tuned_module->user_forced[REDUCESCATTER].segsize,
+                                                              session);
+        ompi_op_gpu_session_end(session);
+        return rc;
     }
 
     /* check to see if we have some filebased rules */
@@ -409,10 +448,18 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_dynamic(const void *sbuf, void *rbu
                                                         dsize, &faninout,
                                                         &segsize, &ignoreme);
         if (alg) {
-            /* we have found a valid choice from the file based rules for this message size */
-            return  ompi_coll_tuned_reduce_scatter_intra_do_this (sbuf, rbuf, rcounts, dtype,
+            ompi_op_gpu_session_t *session = NULL;
+            int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+            uint64_t _flags;
+            if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+                opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+                session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+            }
+            int rc = ompi_coll_tuned_reduce_scatter_intra_do_this(sbuf, rbuf, rcounts, dtype,
                                                                   op, comm, module,
-                                                                  alg, faninout, segsize, NULL);
+                                                                  alg, faninout, segsize, session);
+            ompi_op_gpu_session_end(session);
+            return rc;
         } /* found a method */
     } /*end if any com rules to check */
 
@@ -441,16 +488,23 @@ int ompi_coll_tuned_reduce_scatter_block_intra_dec_dynamic(const void *sbuf, voi
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
         "coll:tuned:reduce_scatter_block_intra_dec_dynamic"));
 
-    /* session=NULL uses host ompi_op_reduce path. */
-
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[REDUCESCATTERBLOCK].algorithm) {
-        return ompi_coll_tuned_reduce_scatter_block_intra_do_this(sbuf, rbuf, rcount, dtype,
-                                                                  op, comm, module,
-                                                                  tuned_module->user_forced[REDUCESCATTERBLOCK].algorithm,
-                                                                  tuned_module->user_forced[REDUCESCATTERBLOCK].chain_fanout,
-                                                                  tuned_module->user_forced[REDUCESCATTERBLOCK].segsize,
-                                                                  NULL);
+        ompi_op_gpu_session_t *session = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+        }
+        int rc = ompi_coll_tuned_reduce_scatter_block_intra_do_this(sbuf, rbuf, rcount, dtype,
+                                                                    op, comm, module,
+                                                                    tuned_module->user_forced[REDUCESCATTERBLOCK].algorithm,
+                                                                    tuned_module->user_forced[REDUCESCATTERBLOCK].chain_fanout,
+                                                                    tuned_module->user_forced[REDUCESCATTERBLOCK].segsize,
+                                                                    session);
+        ompi_op_gpu_session_end(session);
+        return rc;
     }
 
     /* check to see if we have some filebased rules */
@@ -467,10 +521,18 @@ int ompi_coll_tuned_reduce_scatter_block_intra_dec_dynamic(const void *sbuf, voi
                                                        dsize, &faninout,
                                                        &segsize, &ignoreme);
         if (alg) {
-            /* we have found a valid choice from the file based rules for this message size */
-            return  ompi_coll_tuned_reduce_scatter_block_intra_do_this (sbuf, rbuf, rcount, dtype,
+            ompi_op_gpu_session_t *session = NULL;
+            int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+            uint64_t _flags;
+            if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+                opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+                session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+            }
+            int rc = ompi_coll_tuned_reduce_scatter_block_intra_do_this(sbuf, rbuf, rcount, dtype,
                                                                         op, comm, module,
-                                                                        alg, faninout, segsize, NULL);
+                                                                        alg, faninout, segsize, session);
+            ompi_op_gpu_session_end(session);
+            return rc;
         } /* found a method */
     } /* end if any com rules to check */
 
@@ -501,13 +563,21 @@ int ompi_coll_tuned_allgather_intra_dec_dynamic(const void *sbuf, size_t scount,
 
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[ALLGATHER].algorithm) {
-        /* User-forced algorithm */
+        mca_allocator_base_module_t *allocator = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE &&
+             opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            allocator = opal_accelerator_base_get_device_allocator(_dev_id);
+        }
         return ompi_coll_tuned_allgather_intra_do_this(sbuf, scount, sdtype,
                                                        rbuf, rcount, rdtype,
                                                        comm, module,
                                                        tuned_module->user_forced[ALLGATHER].algorithm,
                                                        tuned_module->user_forced[ALLGATHER].tree_fanout,
-                                                       tuned_module->user_forced[ALLGATHER].segsize);
+                                                       tuned_module->user_forced[ALLGATHER].segsize,
+                                                       allocator);
     }
 
     if (tuned_module->com_rules[ALLGATHER]) {
@@ -524,12 +594,18 @@ int ompi_coll_tuned_allgather_intra_dec_dynamic(const void *sbuf, size_t scount,
         alg = ompi_coll_tuned_get_target_method_params (tuned_module->com_rules[ALLGATHER],
                                                         dsize, &faninout, &segsize, &ignoreme);
         if (alg) {
-            /* we have found a valid choice from the file based rules for
-               this message size */
-            return ompi_coll_tuned_allgather_intra_do_this (sbuf, scount, sdtype,
-                                                            rbuf, rcount, rdtype,
-                                                            comm, module,
-                                                            alg, faninout, segsize);
+            mca_allocator_base_module_t *allocator = NULL;
+            int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+            uint64_t _flags;
+            if ((sbuf != MPI_IN_PLACE &&
+                 opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+                opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+                allocator = opal_accelerator_base_get_device_allocator(_dev_id);
+            }
+            return ompi_coll_tuned_allgather_intra_do_this(sbuf, scount, sdtype,
+                                                           rbuf, rcount, rdtype,
+                                                           comm, module,
+                                                           alg, faninout, segsize, allocator);
         }
     }
 
@@ -614,11 +690,12 @@ int ompi_coll_tuned_gather_intra_dec_dynamic(const void *sbuf, size_t scount,
                                              mca_coll_base_module_t *module)
 {
     mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
-    mca_allocator_base_module_t *allocator = NULL;
 
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
                  "ompi_coll_tuned_gather_intra_dec_dynamic"));
 
+    mca_allocator_base_module_t *allocator = NULL;
+
     /* Scratch buffer is used for data movement only (no ompi_op_reduce).
      * Use device allocator when user buffers are on device. */
     {
@@ -657,11 +734,10 @@ int ompi_coll_tuned_gather_intra_dec_dynamic(const void *sbuf, size_t scount,
                                                         dsize, &faninout, &segsize, &max_requests);
 
         if (alg) {
-            /* we have found a valid choice from the file based rules for this message size */
-            return ompi_coll_tuned_gather_intra_do_this (sbuf, scount, sdtype,
-                                                         rbuf, rcount, rdtype,
-                                                         root, comm, module,
-                                                         alg, faninout, segsize, allocator);
+            return ompi_coll_tuned_gather_intra_do_this(sbuf, scount, sdtype,
+                                                        rbuf, rcount, rdtype,
+                                                        root, comm, module,
+                                                        alg, faninout, segsize, allocator);
         } /* found a method */
     } /*end if any com rules to check */
 
@@ -682,6 +758,20 @@ int ompi_coll_tuned_scatter_intra_dec_dynamic(const void *sbuf, size_t scount,
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
                  "ompi_coll_tuned_scatter_intra_dec_dynamic"));
 
+    mca_allocator_base_module_t *allocator = NULL;
+
+    /* Scratch buffer is used for data movement only (no ompi_op_reduce).
+     * Use device allocator when user buffers are on device.
+     * For scatter the root may pass MPI_IN_PLACE as rbuf: skip that sentinel too. */
+    {
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            (rbuf != MPI_IN_PLACE && opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0)) {
+            allocator = opal_accelerator_base_get_device_allocator(_dev_id);
+        }
+    }
+
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[SCATTER].algorithm) {
         return ompi_coll_tuned_scatter_intra_do_this(sbuf, scount, sdtype,
@@ -689,7 +779,8 @@ int ompi_coll_tuned_scatter_intra_dec_dynamic(const void *sbuf, size_t scount,
                                                      root, comm, module,
                                                      tuned_module->user_forced[SCATTER].algorithm,
                                                      tuned_module->user_forced[SCATTER].chain_fanout,
-                                                     tuned_module->user_forced[SCATTER].segsize);
+                                                     tuned_module->user_forced[SCATTER].segsize,
+                                                     allocator);
     }
 
     /**
@@ -707,11 +798,10 @@ int ompi_coll_tuned_scatter_intra_dec_dynamic(const void *sbuf, size_t scount,
                                                         dsize, &faninout, &segsize, &max_requests);
 
         if (alg) {
-            /* we have found a valid choice from the file based rules for this message size */
-            return ompi_coll_tuned_scatter_intra_do_this (sbuf, scount, sdtype,
-                                                          rbuf, rcount, rdtype,
-                                                          root, comm, module,
-                                                          alg, faninout, segsize);
+            return ompi_coll_tuned_scatter_intra_do_this(sbuf, scount, sdtype,
+                                                         rbuf, rcount, rdtype,
+                                                         root, comm, module,
+                                                         alg, faninout, segsize, allocator);
         } /* found a method */
     } /*end if any com rules to check */
 
@@ -731,14 +821,21 @@ int ompi_coll_tuned_exscan_intra_dec_dynamic(const void *sbuf, void* rbuf, size_
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
                  "ompi_coll_tuned_exscan_intra_dec_dynamic"));
 
-    /* session=NULL uses host ompi_op_reduce path. */
-
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[EXSCAN].algorithm) {
-        return ompi_coll_tuned_exscan_intra_do_this(sbuf, rbuf, count, dtype,
-                                                    op, comm, module,
-                                                    tuned_module->user_forced[EXSCAN].algorithm,
-                                                    NULL);
+        ompi_op_gpu_session_t *session = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+        }
+        int rc = ompi_coll_tuned_exscan_intra_do_this(sbuf, rbuf, count, dtype,
+                                                      op, comm, module,
+                                                      tuned_module->user_forced[EXSCAN].algorithm,
+                                                      session);
+        ompi_op_gpu_session_end(session);
+        return rc;
     }
 
     /**
@@ -756,10 +853,18 @@ int ompi_coll_tuned_exscan_intra_dec_dynamic(const void *sbuf, void* rbuf, size_
                                                         dsize, &faninout, &segsize, &max_requests);
 
         if (alg) {
-            /* we have found a valid choice from the file based rules for this message size */
-            return ompi_coll_tuned_exscan_intra_do_this (sbuf, rbuf, count, dtype,
-                                                         op, comm, module,
-                                                         alg, NULL);
+            ompi_op_gpu_session_t *session = NULL;
+            int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+            uint64_t _flags;
+            if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+                opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+                session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+            }
+            int rc = ompi_coll_tuned_exscan_intra_do_this(sbuf, rbuf, count, dtype,
+                                                          op, comm, module,
+                                                          alg, session);
+            ompi_op_gpu_session_end(session);
+            return rc;
         } /* found a method */
     } /*end if any com rules to check */
 
@@ -778,14 +883,21 @@ int ompi_coll_tuned_scan_intra_dec_dynamic(const void *sbuf, void* rbuf, size_t
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
                  "ompi_coll_tuned_scan_intra_dec_dynamic"));
 
-    /* session=NULL uses host ompi_op_reduce path. */
-
     /* Check first if an algorithm is set explicitly for this collective */
     if (tuned_module->user_forced[SCAN].algorithm) {
-        return ompi_coll_tuned_scan_intra_do_this(sbuf, rbuf, count, dtype,
-                                                  op, comm, module,
-                                                  tuned_module->user_forced[SCAN].algorithm,
-                                                  NULL);
+        ompi_op_gpu_session_t *session = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+        }
+        int rc = ompi_coll_tuned_scan_intra_do_this(sbuf, rbuf, count, dtype,
+                                                    op, comm, module,
+                                                    tuned_module->user_forced[SCAN].algorithm,
+                                                    session);
+        ompi_op_gpu_session_end(session);
+        return rc;
     }
 
     /**
@@ -803,10 +915,18 @@ int ompi_coll_tuned_scan_intra_dec_dynamic(const void *sbuf, void* rbuf, size_t
                                                         dsize, &faninout, &segsize, &max_requests);
 
         if (alg) {
-            /* we have found a valid choice from the file based rules for this message size */
-            return ompi_coll_tuned_scan_intra_do_this (sbuf, rbuf, count, dtype,
-                                                       op, comm, module,
-                                                       alg, NULL);
+            ompi_op_gpu_session_t *session = NULL;
+            int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+            uint64_t _flags;
+            if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+                opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+                session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+            }
+            int rc = ompi_coll_tuned_scan_intra_do_this(sbuf, rbuf, count, dtype,
+                                                        op, comm, module,
+                                                        alg, session);
+            ompi_op_gpu_session_end(session);
+            return rc;
         } /* found a method */
     } /*end if any com rules to check */
 
diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c
index d4994ba8c4a..99727410629 100644
--- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c
+++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c
@@ -36,6 +36,7 @@
 #include "ompi/mca/coll/coll.h"
 #include "ompi/mca/coll/base/coll_tags.h"
 #include "ompi/op/op.h"
+#include "ompi/op/op_gpu_session.h"
 #include "coll_tuned.h"
 
 /*
@@ -216,9 +217,19 @@ ompi_coll_tuned_allreduce_intra_dec_fixed(const void *sbuf, void *rbuf, size_t c
         }
     }
 
-    /* session=NULL uses host ompi_op_reduce path. */
-    return ompi_coll_tuned_allreduce_intra_do_this (sbuf, rbuf, count, dtype, op,
-                                                    comm, module, alg, 0, 0, NULL);
+    {
+        ompi_op_gpu_session_t *session = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+        }
+        int rc = ompi_coll_tuned_allreduce_intra_do_this(sbuf, rbuf, count, dtype, op,
+                                                         comm, module, alg, 0, 0, session);
+        ompi_op_gpu_session_end(session);
+        return rc;
+    }
 }
 
 
@@ -1076,11 +1087,21 @@ int ompi_coll_tuned_reduce_intra_dec_fixed( const void *sendbuf, void *recvbuf,
         }
     }
 
-    /* session=NULL uses host ompi_op_reduce path. */
-    int faninout = 2;
-    return  ompi_coll_tuned_reduce_intra_do_this (sendbuf, recvbuf, count, datatype,
-                                                  op, root, comm, module,
-                                                  alg, faninout, 0, 0, NULL);
+    {
+        ompi_op_gpu_session_t *session = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sendbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sendbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(recvbuf, &_dev_id, &_flags) > 0) {
+            session = ompi_op_gpu_session_begin(op, datatype, _dev_id);
+        }
+        int faninout = 2;
+        int rc = ompi_coll_tuned_reduce_intra_do_this(sendbuf, recvbuf, count, datatype,
+                                                      op, root, comm, module,
+                                                      alg, faninout, 0, 0, session);
+        ompi_op_gpu_session_end(session);
+        return rc;
+    }
 }
 
 /*
@@ -1227,10 +1248,20 @@ int ompi_coll_tuned_reduce_scatter_intra_dec_fixed( const void *sbuf, void *rbuf
         }
     }
 
-    /* session=NULL uses host ompi_op_reduce path. */
-    return  ompi_coll_tuned_reduce_scatter_intra_do_this (sbuf, rbuf, rcounts, dtype,
-                                                          op, comm, module,
-                                                          alg, 0, 0, NULL);
+    {
+        ompi_op_gpu_session_t *session = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+        }
+        int rc = ompi_coll_tuned_reduce_scatter_intra_do_this(sbuf, rbuf, rcounts, dtype,
+                                                              op, comm, module,
+                                                              alg, 0, 0, session);
+        ompi_op_gpu_session_end(session);
+        return rc;
+    }
 }
 
 /*
@@ -1349,10 +1380,20 @@ int ompi_coll_tuned_reduce_scatter_block_intra_dec_fixed(const void *sbuf, void
         }
     }
 
-    /* session=NULL uses host ompi_op_reduce path. */
-    return  ompi_coll_tuned_reduce_scatter_block_intra_do_this (sbuf, rbuf, rcount, dtype,
-                                                                op, comm, module,
-                                                                alg, 0, 0, NULL);
+    {
+        ompi_op_gpu_session_t *session = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            session = ompi_op_gpu_session_begin(op, dtype, _dev_id);
+        }
+        int rc = ompi_coll_tuned_reduce_scatter_block_intra_do_this(sbuf, rbuf, rcount, dtype,
+                                                                    op, comm, module,
+                                                                    alg, 0, 0, session);
+        ompi_op_gpu_session_end(session);
+        return rc;
+    }
 }
 
 /*
@@ -1497,10 +1538,21 @@ int ompi_coll_tuned_allgather_intra_dec_fixed(const void *sbuf, size_t scount,
         "ompi_coll_tuned_allgather_intra_dec_fixed rank %d com_size %d",
         ompi_comm_rank(comm), communicator_size));
 
-    int faninout = 2;
-    return ompi_coll_tuned_allgather_intra_do_this(sbuf, scount, sdtype,
-                                                   rbuf, rcount, rdtype,
-                                                   comm, module, alg, faninout, 0);
+    {
+        mca_allocator_base_module_t *allocator = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE &&
+             opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
+            allocator = opal_accelerator_base_get_device_allocator(_dev_id);
+        }
+        int faninout = 2;
+        return ompi_coll_tuned_allgather_intra_do_this(sbuf, scount, sdtype,
+                                                       rbuf, rcount, rdtype,
+                                                       comm, module, alg, faninout, 0,
+                                                       allocator);
+    }
 }
 
 /*
@@ -1662,7 +1714,6 @@ int ompi_coll_tuned_gather_intra_dec_fixed(const void *sbuf, size_t scount,
 {
     int communicator_size, alg, rank;
     size_t dsize, total_dsize;
-    mca_allocator_base_module_t *allocator = NULL;
 
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
                  "ompi_coll_tuned_gather_intra_dec_fixed"));
@@ -1727,9 +1778,8 @@ int ompi_coll_tuned_gather_intra_dec_fixed(const void *sbuf, size_t scount,
         alg = 2;
     }
 
-    /* Scratch buffer is used for data movement only (no ompi_op_reduce).
-     * Use device allocator when user buffers are on device. */
     {
+        mca_allocator_base_module_t *allocator = NULL;
         int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
         uint64_t _flags;
         if ((sbuf != MPI_IN_PLACE &&
@@ -1737,11 +1787,11 @@ int ompi_coll_tuned_gather_intra_dec_fixed(const void *sbuf, size_t scount,
             opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0) {
             allocator = opal_accelerator_base_get_device_allocator(_dev_id);
         }
+        return ompi_coll_tuned_gather_intra_do_this(sbuf, scount, sdtype,
+                                                    rbuf, rcount, rdtype,
+                                                    root, comm, module,
+                                                    alg, 0, 0, allocator);
     }
-    return ompi_coll_tuned_gather_intra_do_this (sbuf, scount, sdtype,
-                                                 rbuf, rcount, rdtype,
-                                                 root, comm, module,
-                                                 alg, 0, 0, allocator);
 }
 
 /*
@@ -1843,8 +1893,18 @@ int ompi_coll_tuned_scatter_intra_dec_fixed(const void *sbuf, size_t scount,
         }
     }
 
-    return ompi_coll_tuned_scatter_intra_do_this (sbuf, scount, sdtype,
-                                                  rbuf, rcount, rdtype,
-                                                  root, comm, module,
-                                                  alg, 0, 0);
+    {
+        /* For scatter the root may pass MPI_IN_PLACE as rbuf: never probe the sentinel. */
+        mca_allocator_base_module_t *allocator = NULL;
+        int _dev_id = MCA_ACCELERATOR_NO_DEVICE_ID;
+        uint64_t _flags;
+        if ((sbuf != MPI_IN_PLACE && opal_accelerator.check_addr(sbuf, &_dev_id, &_flags) > 0) ||
+            (rbuf != MPI_IN_PLACE && opal_accelerator.check_addr(rbuf, &_dev_id, &_flags) > 0)) {
+            allocator = opal_accelerator_base_get_device_allocator(_dev_id);
+        }
+        return ompi_coll_tuned_scatter_intra_do_this(sbuf, scount, sdtype,
+                                                     rbuf, rcount, rdtype,
+                                                     root, comm, module,
+                                                     alg, 0, 0, allocator);
+    }
 }
diff --git a/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c b/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c
index b1449b2955c..89a78f6fab6 100644
--- a/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c
+++ b/ompi/mca/coll/tuned/coll_tuned_scatter_decision.c
@@ -163,7 +163,8 @@ ompi_coll_tuned_scatter_intra_do_this(const void *sbuf, size_t scount,
                                       int root,
                                       struct ompi_communicator_t *comm,
                                       mca_coll_base_module_t *module,
-                                      int algorithm, int faninout, int segsize)
+                                      int algorithm, int faninout, int segsize,
+                                      mca_allocator_base_module_t *allocator)
 {
     OPAL_OUTPUT_VERBOSE((COLL_TUNED_TRACING_VERBOSE, ompi_coll_tuned_stream,
                  "coll:tuned:scatter_intra_do_this selected algorithm %d topo faninout %d segsize %d",
@@ -181,7 +182,7 @@ ompi_coll_tuned_scatter_intra_do_this(const void *sbuf, size_t scount,
     case (2):
         return ompi_coll_base_scatter_intra_binomial(sbuf, scount, sdtype,
                                                      rbuf, rcount, rdtype,
-                                                     root, comm, module);
+                                                     root, comm, module, allocator);
     case (3):
         return ompi_coll_base_scatter_intra_linear_nb(sbuf, scount, sdtype,
                                                       rbuf, rcount, rdtype,
diff --git a/ompi/mca/op/cuda/op_cuda_kernels.cu b/ompi/mca/op/cuda/op_cuda_kernels.cu
index 162eeac606d..5708198e9fa 100644
--- a/ompi/mca/op/cuda/op_cuda_kernels.cu
+++ b/ompi/mca/op/cuda/op_cuda_kernels.cu
@@ -31,8 +31,8 @@
  * PERSISTENT_KERNEL(name, ctype, op_expr)
  *
  * Generates __global__ void ompi_op_cuda_persistent_<name>(...).
- * op_expr must be a statement that updates dst[i] in-place using src[i],
- * e.g. "dst[i] += src[i]" or "dst[i] = dst[i] > src[i] ? dst[i] : src[i]".
+ * op_expr must be a statement writing dst[i] from src1[i] and src2[i],
+ * e.g. "dst[i] = src1[i] + src2[i]".  src2 may alias dst for in-place ops.
  * ------------------------------------------------------------------------- */
 #define PERSISTENT_KERNEL(kname, ctype, op_expr)                               \
 __global__ void ompi_op_cuda_persistent_##kname(                               \
@@ -42,8 +42,9 @@ __global__ void ompi_op_cuda_persistent_##kname(                               \
         /* Spin-wait for work; sleep 1 µs between polls to save power */        \
         while (cmd->status != 1 && !*shutdown) { __nanosleep(1000); }          \
         if (*shutdown) break;                                                   \
-        const ctype * __restrict__ src = (const ctype *) cmd->src;             \
-              ctype * __restrict__ dst = (      ctype *) cmd->dst;             \
+        const ctype * __restrict__ src1 = (const ctype *) cmd->src1;           \
+        const ctype *src2 = (const ctype *) cmd->src2; /* may alias dst */     \
+              ctype *dst  = (      ctype *) cmd->dst;  /* hence no restrict */ \
         int64_t n = cmd->count;                                                 \
         for (int64_t i = (int64_t)threadIdx.x; i < n; i += blockDim.x) {      \
             op_expr;                                                            \
@@ -61,82 +62,82 @@ __global__ void ompi_op_cuda_persistent_##kname(                               \
  * ========================================================================= */
 
 /* --- MAX --- */
-PERSISTENT_KERNEL(max_int8,   int8_t,   dst[i] = dst[i] > src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(max_uint8,  uint8_t,  dst[i] = dst[i] > src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(max_int16,  int16_t,  dst[i] = dst[i] > src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(max_uint16, uint16_t, dst[i] = dst[i] > src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(max_int32,  int32_t,  dst[i] = dst[i] > src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(max_uint32, uint32_t, dst[i] = dst[i] > src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(max_int64,  int64_t,  dst[i] = dst[i] > src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(max_uint64, uint64_t, dst[i] = dst[i] > src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(max_float,  float,    dst[i] = dst[i] > src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(max_double, double,   dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(max_int8,   int8_t,   dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_uint8,  uint8_t,  dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_int16,  int16_t,  dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_uint16, uint16_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_int32,  int32_t,  dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_uint32, uint32_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_int64,  int64_t,  dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_uint64, uint64_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_float,  float,    dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_double, double,   dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
 
 /* --- MIN --- */
-PERSISTENT_KERNEL(min_int8,   int8_t,   dst[i] = dst[i] < src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(min_uint8,  uint8_t,  dst[i] = dst[i] < src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(min_int16,  int16_t,  dst[i] = dst[i] < src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(min_uint16, uint16_t, dst[i] = dst[i] < src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(min_int32,  int32_t,  dst[i] = dst[i] < src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(min_uint32, uint32_t, dst[i] = dst[i] < src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(min_int64,  int64_t,  dst[i] = dst[i] < src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(min_uint64, uint64_t, dst[i] = dst[i] < src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(min_float,  float,    dst[i] = dst[i] < src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(min_double, double,   dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(min_int8,   int8_t,   dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_uint8,  uint8_t,  dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_int16,  int16_t,  dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_uint16, uint16_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_int32,  int32_t,  dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_uint32, uint32_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_int64,  int64_t,  dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_uint64, uint64_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_float,  float,    dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_double, double,   dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
 
 /* --- SUM --- */
-PERSISTENT_KERNEL(sum_int8,   int8_t,   dst[i] += src[i])
-PERSISTENT_KERNEL(sum_uint8,  uint8_t,  dst[i] += src[i])
-PERSISTENT_KERNEL(sum_int16,  int16_t,  dst[i] += src[i])
-PERSISTENT_KERNEL(sum_uint16, uint16_t, dst[i] += src[i])
-PERSISTENT_KERNEL(sum_int32,  int32_t,  dst[i] += src[i])
-PERSISTENT_KERNEL(sum_uint32, uint32_t, dst[i] += src[i])
-PERSISTENT_KERNEL(sum_int64,  int64_t,  dst[i] += src[i])
-PERSISTENT_KERNEL(sum_uint64, uint64_t, dst[i] += src[i])
-PERSISTENT_KERNEL(sum_float,  float,    dst[i] += src[i])
-PERSISTENT_KERNEL(sum_double, double,   dst[i] += src[i])
+PERSISTENT_KERNEL(sum_int8,   int8_t,   dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_uint8,  uint8_t,  dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_int16,  int16_t,  dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_uint16, uint16_t, dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_int32,  int32_t,  dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_uint32, uint32_t, dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_int64,  int64_t,  dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_uint64, uint64_t, dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_float,  float,    dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_double, double,   dst[i] = src1[i] + src2[i])
 
 /* --- PROD --- */
-PERSISTENT_KERNEL(prod_int8,   int8_t,   dst[i] *= src[i])
-PERSISTENT_KERNEL(prod_uint8,  uint8_t,  dst[i] *= src[i])
-PERSISTENT_KERNEL(prod_int16,  int16_t,  dst[i] *= src[i])
-PERSISTENT_KERNEL(prod_uint16, uint16_t, dst[i] *= src[i])
-PERSISTENT_KERNEL(prod_int32,  int32_t,  dst[i] *= src[i])
-PERSISTENT_KERNEL(prod_uint32, uint32_t, dst[i] *= src[i])
-PERSISTENT_KERNEL(prod_int64,  int64_t,  dst[i] *= src[i])
-PERSISTENT_KERNEL(prod_uint64, uint64_t, dst[i] *= src[i])
-PERSISTENT_KERNEL(prod_float,  float,    dst[i] *= src[i])
-PERSISTENT_KERNEL(prod_double, double,   dst[i] *= src[i])
+PERSISTENT_KERNEL(prod_int8,   int8_t,   dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_uint8,  uint8_t,  dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_int16,  int16_t,  dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_uint16, uint16_t, dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_int32,  int32_t,  dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_uint32, uint32_t, dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_int64,  int64_t,  dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_uint64, uint64_t, dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_float,  float,    dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_double, double,   dst[i] = src1[i] * src2[i])
 
 /* --- BAND (bitwise AND, integer types only) --- */
-PERSISTENT_KERNEL(band_int8,   int8_t,   dst[i] &= src[i])
-PERSISTENT_KERNEL(band_uint8,  uint8_t,  dst[i] &= src[i])
-PERSISTENT_KERNEL(band_int16,  int16_t,  dst[i] &= src[i])
-PERSISTENT_KERNEL(band_uint16, uint16_t, dst[i] &= src[i])
-PERSISTENT_KERNEL(band_int32,  int32_t,  dst[i] &= src[i])
-PERSISTENT_KERNEL(band_uint32, uint32_t, dst[i] &= src[i])
-PERSISTENT_KERNEL(band_int64,  int64_t,  dst[i] &= src[i])
-PERSISTENT_KERNEL(band_uint64, uint64_t, dst[i] &= src[i])
+PERSISTENT_KERNEL(band_int8,   int8_t,   dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_uint8,  uint8_t,  dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_int16,  int16_t,  dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_uint16, uint16_t, dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_int32,  int32_t,  dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_uint32, uint32_t, dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_int64,  int64_t,  dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_uint64, uint64_t, dst[i] = src1[i] & src2[i])
 
 /* --- BOR (bitwise OR) --- */
-PERSISTENT_KERNEL(bor_int8,   int8_t,   dst[i] |= src[i])
-PERSISTENT_KERNEL(bor_uint8,  uint8_t,  dst[i] |= src[i])
-PERSISTENT_KERNEL(bor_int16,  int16_t,  dst[i] |= src[i])
-PERSISTENT_KERNEL(bor_uint16, uint16_t, dst[i] |= src[i])
-PERSISTENT_KERNEL(bor_int32,  int32_t,  dst[i] |= src[i])
-PERSISTENT_KERNEL(bor_uint32, uint32_t, dst[i] |= src[i])
-PERSISTENT_KERNEL(bor_int64,  int64_t,  dst[i] |= src[i])
-PERSISTENT_KERNEL(bor_uint64, uint64_t, dst[i] |= src[i])
+PERSISTENT_KERNEL(bor_int8,   int8_t,   dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_uint8,  uint8_t,  dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_int16,  int16_t,  dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_uint16, uint16_t, dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_int32,  int32_t,  dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_uint32, uint32_t, dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_int64,  int64_t,  dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_uint64, uint64_t, dst[i] = src1[i] | src2[i])
 
 /* --- BXOR (bitwise XOR) --- */
-PERSISTENT_KERNEL(bxor_int8,   int8_t,   dst[i] ^= src[i])
-PERSISTENT_KERNEL(bxor_uint8,  uint8_t,  dst[i] ^= src[i])
-PERSISTENT_KERNEL(bxor_int16,  int16_t,  dst[i] ^= src[i])
-PERSISTENT_KERNEL(bxor_uint16, uint16_t, dst[i] ^= src[i])
-PERSISTENT_KERNEL(bxor_int32,  int32_t,  dst[i] ^= src[i])
-PERSISTENT_KERNEL(bxor_uint32, uint32_t, dst[i] ^= src[i])
-PERSISTENT_KERNEL(bxor_int64,  int64_t,  dst[i] ^= src[i])
-PERSISTENT_KERNEL(bxor_uint64, uint64_t, dst[i] ^= src[i])
+PERSISTENT_KERNEL(bxor_int8,   int8_t,   dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_uint8,  uint8_t,  dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_int16,  int16_t,  dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_uint16, uint16_t, dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_int32,  int32_t,  dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_uint32, uint32_t, dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_int64,  int64_t,  dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_uint64, uint64_t, dst[i] = src1[i] ^ src2[i])
 
 /* =========================================================================
  * Host-side launcher wrappers — one per kernel, 1 block × 256 threads.
diff --git a/ompi/mca/op/cuda/op_cuda_session.c b/ompi/mca/op/cuda/op_cuda_session.c
index 892fc9d2e10..8720f8ef35d 100644
--- a/ompi/mca/op/cuda/op_cuda_session.c
+++ b/ompi/mca/op/cuda/op_cuda_session.c
@@ -39,6 +39,7 @@
 
 #include <cuda_runtime.h>
 
+#include "opal/mca/accelerator/base/base.h"
 #include "ompi/op/op.h"
 #include "ompi/datatype/ompi_datatype.h"
 #include "ompi/op/op_gpu_session.h"
@@ -100,7 +101,8 @@ ompi_op_cuda_session_begin(struct ompi_op_t *op,
         free(session);
         return NULL;
     }
-    priv->cmd->src    = NULL;
+    priv->cmd->src1   = NULL;
+    priv->cmd->src2   = NULL;
     priv->cmd->dst    = NULL;
     priv->cmd->count  = 0;
     priv->cmd->status = 0;
@@ -140,7 +142,7 @@ ompi_op_cuda_session_begin(struct ompi_op_t *op,
     }
 
     session->dev_id    = dev_id;
-    session->allocator = NULL;   /* scratch allocator wired in Phase 4 */
+    session->allocator = opal_accelerator_base_get_device_allocator(dev_id);
     session->backend   = priv;
 
     return session;
@@ -150,18 +152,21 @@ ompi_op_cuda_session_begin(struct ompi_op_t *op,
  * ompi_op_cuda_session_reduce
  *
  * Posts one reduction command to the persistent kernel and waits for it to
- * complete.  Semantics: dst[i] = dst[i] op src[i] for i in [0, count).
- * Both src and dst must be accessible from the GPU (device or managed mem).
+ * complete.  Semantics: dst[i] = src1[i] op src2[i] for i in [0, count).
+ * src2 may alias dst for in-place operations.  All pointers must be
+ * accessible from the GPU (device or managed memory).
  * -------------------------------------------------------------------------- */
 void
 ompi_op_cuda_session_reduce(ompi_op_gpu_session_t *session,
-                            const void *src, void *dst, size_t count)
+                            const void *src1, const void *src2,
+                            void *dst, size_t count)
 {
     ompi_op_cuda_session_priv_t *priv =
         (ompi_op_cuda_session_priv_t *) session->backend;
 
     /* Write operands before signalling the kernel */
-    priv->cmd->src   = src;
+    priv->cmd->src1  = src1;
+    priv->cmd->src2  = src2;
     priv->cmd->dst   = dst;
     priv->cmd->count = (int64_t) count;
 
@@ -229,7 +234,8 @@ ompi_op_cuda_session_restart(ompi_op_gpu_session_t *session,
 
     /* Reset state for the new kernel */
     *priv->shutdown   = 0;
-    priv->cmd->src    = NULL;
+    priv->cmd->src1   = NULL;
+    priv->cmd->src2   = NULL;
     priv->cmd->dst    = NULL;
     priv->cmd->count  = 0;
     priv->cmd->status = 0;
diff --git a/ompi/mca/op/op.h b/ompi/mca/op/op.h
index e7034959757..f8c3a5bf447 100644
--- a/ompi/mca/op/op.h
+++ b/ompi/mca/op/op.h
@@ -345,7 +345,7 @@ typedef struct ompi_op_gpu_session_t *
  */
 typedef void (*ompi_op_base_component_session_reduce_fn_t)(
                   struct ompi_op_gpu_session_t *session,
-                  const void *src, void *dst, size_t count);
+                  const void *src1, const void *src2, void *dst, size_t count);
 
 /**
  * Optional component hook: signal the persistent kernel to exit and
diff --git a/ompi/mca/op/rocm/op_rocm_kernels.cpp b/ompi/mca/op/rocm/op_rocm_kernels.cpp
index cf42e3ac019..db24237e24d 100644
--- a/ompi/mca/op/rocm/op_rocm_kernels.cpp
+++ b/ompi/mca/op/rocm/op_rocm_kernels.cpp
@@ -30,7 +30,8 @@
  * PERSISTENT_KERNEL(name, ctype, op_expr)
  *
  * Generates __global__ void ompi_op_rocm_persistent_<name>(...).
- * op_expr must be a statement that updates dst[i] in-place using src[i].
+ * op_expr must be a statement writing dst[i] from src1[i] and src2[i],
+ * e.g. "dst[i] = src1[i] + src2[i]".  src2 may alias dst for in-place ops.
  * ------------------------------------------------------------------------- */
 #define PERSISTENT_KERNEL(kname, ctype, op_expr)                               \
 __global__ void ompi_op_rocm_persistent_##kname(                               \
@@ -42,8 +43,9 @@ __global__ void ompi_op_rocm_persistent_##kname(                               \
             __builtin_amdgcn_s_sleep(1);                                       \
         }                                                                       \
         if (*shutdown) break;                                                   \
-        const ctype * __restrict__ src = (const ctype *) cmd->src;             \
-              ctype * __restrict__ dst = (      ctype *) cmd->dst;             \
+        const ctype * __restrict__ src1 = (const ctype *) cmd->src1;           \
+        const ctype *src2 = (const ctype *) cmd->src2; /* may alias dst */     \
+              ctype *dst  = (      ctype *) cmd->dst;  /* hence no restrict */ \
         int64_t n = cmd->count;                                                 \
         for (int64_t i = (int64_t)threadIdx.x; i < n; i += blockDim.x) {      \
             op_expr;                                                            \
@@ -61,82 +63,82 @@ __global__ void ompi_op_rocm_persistent_##kname(                               \
  * ========================================================================= */
 
 /* --- MAX --- */
-PERSISTENT_KERNEL(max_int8,   int8_t,   dst[i] = dst[i] > src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(max_uint8,  uint8_t,  dst[i] = dst[i] > src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(max_int16,  int16_t,  dst[i] = dst[i] > src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(max_uint16, uint16_t, dst[i] = dst[i] > src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(max_int32,  int32_t,  dst[i] = dst[i] > src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(max_uint32, uint32_t, dst[i] = dst[i] > src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(max_int64,  int64_t,  dst[i] = dst[i] > src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(max_uint64, uint64_t, dst[i] = dst[i] > src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(max_float,  float,    dst[i] = dst[i] > src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(max_double, double,   dst[i] = dst[i] > src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(max_int8,   int8_t,   dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_uint8,  uint8_t,  dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_int16,  int16_t,  dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_uint16, uint16_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_int32,  int32_t,  dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_uint32, uint32_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_int64,  int64_t,  dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_uint64, uint64_t, dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_float,  float,    dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(max_double, double,   dst[i] = src1[i] > src2[i] ? src1[i] : src2[i])
 
 /* --- MIN --- */
-PERSISTENT_KERNEL(min_int8,   int8_t,   dst[i] = dst[i] < src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(min_uint8,  uint8_t,  dst[i] = dst[i] < src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(min_int16,  int16_t,  dst[i] = dst[i] < src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(min_uint16, uint16_t, dst[i] = dst[i] < src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(min_int32,  int32_t,  dst[i] = dst[i] < src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(min_uint32, uint32_t, dst[i] = dst[i] < src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(min_int64,  int64_t,  dst[i] = dst[i] < src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(min_uint64, uint64_t, dst[i] = dst[i] < src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(min_float,  float,    dst[i] = dst[i] < src[i] ? dst[i] : src[i])
-PERSISTENT_KERNEL(min_double, double,   dst[i] = dst[i] < src[i] ? dst[i] : src[i])
+PERSISTENT_KERNEL(min_int8,   int8_t,   dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_uint8,  uint8_t,  dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_int16,  int16_t,  dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_uint16, uint16_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_int32,  int32_t,  dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_uint32, uint32_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_int64,  int64_t,  dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_uint64, uint64_t, dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_float,  float,    dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
+PERSISTENT_KERNEL(min_double, double,   dst[i] = src1[i] < src2[i] ? src1[i] : src2[i])
 
 /* --- SUM --- */
-PERSISTENT_KERNEL(sum_int8,   int8_t,   dst[i] += src[i])
-PERSISTENT_KERNEL(sum_uint8,  uint8_t,  dst[i] += src[i])
-PERSISTENT_KERNEL(sum_int16,  int16_t,  dst[i] += src[i])
-PERSISTENT_KERNEL(sum_uint16, uint16_t, dst[i] += src[i])
-PERSISTENT_KERNEL(sum_int32,  int32_t,  dst[i] += src[i])
-PERSISTENT_KERNEL(sum_uint32, uint32_t, dst[i] += src[i])
-PERSISTENT_KERNEL(sum_int64,  int64_t,  dst[i] += src[i])
-PERSISTENT_KERNEL(sum_uint64, uint64_t, dst[i] += src[i])
-PERSISTENT_KERNEL(sum_float,  float,    dst[i] += src[i])
-PERSISTENT_KERNEL(sum_double, double,   dst[i] += src[i])
+PERSISTENT_KERNEL(sum_int8,   int8_t,   dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_uint8,  uint8_t,  dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_int16,  int16_t,  dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_uint16, uint16_t, dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_int32,  int32_t,  dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_uint32, uint32_t, dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_int64,  int64_t,  dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_uint64, uint64_t, dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_float,  float,    dst[i] = src1[i] + src2[i])
+PERSISTENT_KERNEL(sum_double, double,   dst[i] = src1[i] + src2[i])
 
 /* --- PROD --- */
-PERSISTENT_KERNEL(prod_int8,   int8_t,   dst[i] *= src[i])
-PERSISTENT_KERNEL(prod_uint8,  uint8_t,  dst[i] *= src[i])
-PERSISTENT_KERNEL(prod_int16,  int16_t,  dst[i] *= src[i])
-PERSISTENT_KERNEL(prod_uint16, uint16_t, dst[i] *= src[i])
-PERSISTENT_KERNEL(prod_int32,  int32_t,  dst[i] *= src[i])
-PERSISTENT_KERNEL(prod_uint32, uint32_t, dst[i] *= src[i])
-PERSISTENT_KERNEL(prod_int64,  int64_t,  dst[i] *= src[i])
-PERSISTENT_KERNEL(prod_uint64, uint64_t, dst[i] *= src[i])
-PERSISTENT_KERNEL(prod_float,  float,    dst[i] *= src[i])
-PERSISTENT_KERNEL(prod_double, double,   dst[i] *= src[i])
+PERSISTENT_KERNEL(prod_int8,   int8_t,   dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_uint8,  uint8_t,  dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_int16,  int16_t,  dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_uint16, uint16_t, dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_int32,  int32_t,  dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_uint32, uint32_t, dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_int64,  int64_t,  dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_uint64, uint64_t, dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_float,  float,    dst[i] = src1[i] * src2[i])
+PERSISTENT_KERNEL(prod_double, double,   dst[i] = src1[i] * src2[i])
 
 /* --- BAND (bitwise AND, integer types only) --- */
-PERSISTENT_KERNEL(band_int8,   int8_t,   dst[i] &= src[i])
-PERSISTENT_KERNEL(band_uint8,  uint8_t,  dst[i] &= src[i])
-PERSISTENT_KERNEL(band_int16,  int16_t,  dst[i] &= src[i])
-PERSISTENT_KERNEL(band_uint16, uint16_t, dst[i] &= src[i])
-PERSISTENT_KERNEL(band_int32,  int32_t,  dst[i] &= src[i])
-PERSISTENT_KERNEL(band_uint32, uint32_t, dst[i] &= src[i])
-PERSISTENT_KERNEL(band_int64,  int64_t,  dst[i] &= src[i])
-PERSISTENT_KERNEL(band_uint64, uint64_t, dst[i] &= src[i])
+PERSISTENT_KERNEL(band_int8,   int8_t,   dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_uint8,  uint8_t,  dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_int16,  int16_t,  dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_uint16, uint16_t, dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_int32,  int32_t,  dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_uint32, uint32_t, dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_int64,  int64_t,  dst[i] = src1[i] & src2[i])
+PERSISTENT_KERNEL(band_uint64, uint64_t, dst[i] = src1[i] & src2[i])
 
 /* --- BOR (bitwise OR) --- */
-PERSISTENT_KERNEL(bor_int8,   int8_t,   dst[i] |= src[i])
-PERSISTENT_KERNEL(bor_uint8,  uint8_t,  dst[i] |= src[i])
-PERSISTENT_KERNEL(bor_int16,  int16_t,  dst[i] |= src[i])
-PERSISTENT_KERNEL(bor_uint16, uint16_t, dst[i] |= src[i])
-PERSISTENT_KERNEL(bor_int32,  int32_t,  dst[i] |= src[i])
-PERSISTENT_KERNEL(bor_uint32, uint32_t, dst[i] |= src[i])
-PERSISTENT_KERNEL(bor_int64,  int64_t,  dst[i] |= src[i])
-PERSISTENT_KERNEL(bor_uint64, uint64_t, dst[i] |= src[i])
+PERSISTENT_KERNEL(bor_int8,   int8_t,   dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_uint8,  uint8_t,  dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_int16,  int16_t,  dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_uint16, uint16_t, dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_int32,  int32_t,  dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_uint32, uint32_t, dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_int64,  int64_t,  dst[i] = src1[i] | src2[i])
+PERSISTENT_KERNEL(bor_uint64, uint64_t, dst[i] = src1[i] | src2[i])
 
 /* --- BXOR (bitwise XOR) --- */
-PERSISTENT_KERNEL(bxor_int8,   int8_t,   dst[i] ^= src[i])
-PERSISTENT_KERNEL(bxor_uint8,  uint8_t,  dst[i] ^= src[i])
-PERSISTENT_KERNEL(bxor_int16,  int16_t,  dst[i] ^= src[i])
-PERSISTENT_KERNEL(bxor_uint16, uint16_t, dst[i] ^= src[i])
-PERSISTENT_KERNEL(bxor_int32,  int32_t,  dst[i] ^= src[i])
-PERSISTENT_KERNEL(bxor_uint32, uint32_t, dst[i] ^= src[i])
-PERSISTENT_KERNEL(bxor_int64,  int64_t,  dst[i] ^= src[i])
-PERSISTENT_KERNEL(bxor_uint64, uint64_t, dst[i] ^= src[i])
+PERSISTENT_KERNEL(bxor_int8,   int8_t,   dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_uint8,  uint8_t,  dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_int16,  int16_t,  dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_uint16, uint16_t, dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_int32,  int32_t,  dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_uint32, uint32_t, dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_int64,  int64_t,  dst[i] = src1[i] ^ src2[i])
+PERSISTENT_KERNEL(bxor_uint64, uint64_t, dst[i] = src1[i] ^ src2[i])
 
 /* =========================================================================
  * Host-side launcher wrappers — one per kernel, 1 block × 256 threads.
diff --git a/ompi/mca/op/rocm/op_rocm_session.c b/ompi/mca/op/rocm/op_rocm_session.c
index 8dbbe56c87f..68d2f5b5030 100644
--- a/ompi/mca/op/rocm/op_rocm_session.c
+++ b/ompi/mca/op/rocm/op_rocm_session.c
@@ -40,6 +40,7 @@
 
 #include <hip/hip_runtime.h>
 
+#include "opal/mca/accelerator/base/base.h"
 #include "ompi/op/op.h"
 #include "ompi/datatype/ompi_datatype.h"
 #include "ompi/op/op_gpu_session.h"
@@ -97,7 +98,8 @@ ompi_op_rocm_session_begin(struct ompi_op_t *op,
         free(session);
         return NULL;
     }
-    priv->cmd->src    = NULL;
+    priv->cmd->src1   = NULL;
+    priv->cmd->src2   = NULL;
     priv->cmd->dst    = NULL;
     priv->cmd->count  = 0;
     priv->cmd->status = 0;
@@ -137,7 +139,7 @@ ompi_op_rocm_session_begin(struct ompi_op_t *op,
     }
 
     session->dev_id    = dev_id;
-    session->allocator = NULL;   /* scratch allocator wired in Phase 4 */
+    session->allocator = opal_accelerator_base_get_device_allocator(dev_id);
     session->backend   = priv;
 
     return session;
@@ -148,13 +150,15 @@ ompi_op_rocm_session_begin(struct ompi_op_t *op,
  * -------------------------------------------------------------------------- */
 void
 ompi_op_rocm_session_reduce(ompi_op_gpu_session_t *session,
-                             const void *src, void *dst, size_t count)
+                             const void *src1, const void *src2,
+                             void *dst, size_t count)
 {
     ompi_op_rocm_session_priv_t *priv =
         (ompi_op_rocm_session_priv_t *) session->backend;
 
     /* Write operands before signalling the kernel */
-    priv->cmd->src   = src;
+    priv->cmd->src1  = src1;
+    priv->cmd->src2  = src2;
     priv->cmd->dst   = dst;
     priv->cmd->count = (int64_t) count;
 
@@ -222,7 +226,8 @@ ompi_op_rocm_session_restart(ompi_op_gpu_session_t *session,
 
     /* Reset state for the new kernel */
     *priv->shutdown   = 0;
-    priv->cmd->src    = NULL;
+    priv->cmd->src1   = NULL;
+    priv->cmd->src2   = NULL;
     priv->cmd->dst    = NULL;
     priv->cmd->count  = 0;
     priv->cmd->status = 0;
diff --git a/ompi/op/op_gpu_session.c b/ompi/op/op_gpu_session.c
index aec06f44df6..b459fef6bc0 100644
--- a/ompi/op/op_gpu_session.c
+++ b/ompi/op/op_gpu_session.c
@@ -41,6 +41,7 @@
 #include <stdlib.h>
 
 #include "opal/class/opal_list.h"
+#include "opal/mca/accelerator/base/base.h"
 #include "opal/mca/base/base.h"
 #include "opal/mca/threads/mutex.h"
 #include "ompi/mca/op/op.h"
@@ -145,14 +146,49 @@ ompi_op_gpu_session_begin(struct ompi_op_t *op,
     return NULL;
 }
 
+/* --------------------------------------------------------------------------
+ * ompi_op_gpu_session_begin_alloc
+ *
+ * Build an allocation-only session for device dev_id.  It carries the device
+ * allocator but launches no persistent kernel: backend and all kernel hooks
+ * are NULL.  Returns NULL if no allocator is available or malloc fails.
+ * session_end frees the struct directly rather than pooling it.
+ * -------------------------------------------------------------------------- */
+ompi_op_gpu_session_t *
+ompi_op_gpu_session_begin_alloc(int dev_id)
+{
+    ompi_op_gpu_session_t *sess = NULL;
+    mca_allocator_base_module_t *dev_alloc =
+        opal_accelerator_base_get_device_allocator(dev_id);
+
+    /* Only pay for the host allocation once a device allocator exists. */
+    if (NULL != dev_alloc) {
+        sess = malloc(sizeof *sess);
+    }
+    if (NULL == sess) {
+        return NULL;
+    }
+
+    sess->pool_next  = NULL;
+    sess->free_fn    = NULL;
+    sess->restart_fn = NULL;
+    sess->stop_fn    = NULL;   /* session_end treats NULL stop_fn as alloc-only */
+    sess->reduce_fn  = NULL;
+    sess->backend    = NULL;
+    sess->allocator  = dev_alloc;
+    sess->dev_id     = dev_id;
+    return sess;
+}
+
 /* --------------------------------------------------------------------------
  * ompi_op_gpu_session_reduce
  * -------------------------------------------------------------------------- */
 void
 ompi_op_gpu_session_reduce(ompi_op_gpu_session_t *session,
-                           const void *src, void *dst, size_t count)
+                           const void *src1, const void *src2,
+                           void *dst, size_t count)
 {
-    session->reduce_fn(session, src, dst, count);
+    session->reduce_fn(session, src1, src2, dst, count); /* NOTE(review): reduce_fn is NULL for begin_alloc sessions */
 }
 
 /* --------------------------------------------------------------------------
@@ -169,6 +205,13 @@ ompi_op_gpu_session_end(ompi_op_gpu_session_t *session)
         return;
     }
 
+    /* Alloc-only sessions (stop_fn == NULL) hold no kernel resources.
+     * Free the struct immediately; they are not pooled. */
+    if (NULL == session->stop_fn) {
+        free(session);
+        return;
+    }
+
     /* Signal the kernel to exit and wait for the stream to drain.
      * GPU stream and managed memory remain allocated for reuse. */
     session->stop_fn(session);
diff --git a/ompi/op/op_gpu_session.h b/ompi/op/op_gpu_session.h
index 2213d921a80..0050f0b7ed0 100644
--- a/ompi/op/op_gpu_session.h
+++ b/ompi/op/op_gpu_session.h
@@ -32,7 +32,8 @@ struct ompi_datatype_t;
  *   2 = done       (kernel → host: reduction complete)
  */
 typedef struct {
-    const void      *src;
+    const void      *src1;
+    const void      *src2;
     void            *dst;
     int64_t          count;
     volatile int32_t status;
@@ -61,7 +62,7 @@ typedef struct ompi_op_gpu_session_t {
     void                        *backend;    /* opaque: cuda or rocm session state */
     /* Dispatch hooks wired at session_begin time. */
     void (*reduce_fn)(struct ompi_op_gpu_session_t *session,
-                      const void *src, void *dst, size_t count);
+                      const void *src1, const void *src2, void *dst, size_t count);
     /* Signal the persistent kernel to exit and synchronize the stream.
      * GPU stream and managed memory remain allocated for reuse. */
     void (*stop_fn)(struct ompi_op_gpu_session_t *session);
@@ -88,11 +89,22 @@ OMPI_DECLSPEC ompi_op_gpu_session_t *ompi_op_gpu_session_begin(struct ompi_op_t
                                                                 int dev_id);
 
 /**
- * Post one reduction command (src op dst → dst) to the persistent kernel and
- * wait for completion.  Behavior is undefined if session is NULL.
+ * Create a lightweight session that provides GPU scratch-memory allocation only,
+ * without launching a persistent reduction kernel.  Suitable for collective
+ * algorithms that need temporary device memory but perform no GPU reduction.
+ * Returns NULL if no device allocator is available for dev_id.
+ * The returned session is freed by ompi_op_gpu_session_end().
+ */
+OMPI_DECLSPEC ompi_op_gpu_session_t *ompi_op_gpu_session_begin_alloc(int dev_id);
+
+/**
+ * Post one reduction command (src1 op src2 → dst) to the persistent kernel and
+ * wait for completion.  src2 may alias dst for in-place operations.
+ * Behavior is undefined if session is NULL.
  */
 OMPI_DECLSPEC void ompi_op_gpu_session_reduce(ompi_op_gpu_session_t *session,
-                                               const void *src, void *dst, size_t count);
+                                               const void *src1, const void *src2,
+                                               void *dst, size_t count);
 
 /**
  * Stop the persistent kernel and return the session to the pool for reuse.

From be44a1e5f8f6363f267735bd258da76c184c682b Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Mon, 25 May 2026 15:13:16 -0400
Subject: [PATCH 07/13] Cache expensive allocation parts of a GPU session

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ompi/mca/op/base/op_base_frame.c     |   9 +-
 ompi/mca/op/cuda/op_cuda.h           |  12 +-
 ompi/mca/op/cuda/op_cuda_component.c |  27 +--
 ompi/mca/op/cuda/op_cuda_session.c   | 293 +++++++++++++--------------
 ompi/mca/op/op.h                     |  68 +++----
 ompi/mca/op/rocm/op_rocm.h           |  12 +-
 ompi/mca/op/rocm/op_rocm_component.c |  23 +--
 ompi/mca/op/rocm/op_rocm_session.c   | 273 ++++++++++++-------------
 ompi/op/op_gpu_session.c             | 249 ++++++++++-------------
 ompi/op/op_gpu_session.h             |  92 +++++----
 10 files changed, 488 insertions(+), 570 deletions(-)

diff --git a/ompi/mca/op/base/op_base_frame.c b/ompi/mca/op/base/op_base_frame.c
index 89a4e912387..37052782b32 100644
--- a/ompi/mca/op/base/op_base_frame.c
+++ b/ompi/mca/op/base/op_base_frame.c
@@ -60,6 +60,13 @@ OBJ_CLASS_INSTANCE(ompi_op_base_module_t, opal_object_t,
 OBJ_CLASS_INSTANCE(ompi_op_base_module_1_0_0_t, opal_object_t,
                    module_constructor_1_0_0, NULL);
 
+static int
+op_base_open(mca_base_open_flag_t flags)
+{
+    ompi_op_gpu_session_pool_init(); /* hook replaces default open: open components too */
+    return mca_base_framework_components_open(&ompi_op_base_framework, flags);
+}
+
 static int
 op_base_close(void)
 {
@@ -67,5 +74,5 @@ op_base_close(void)
     return OMPI_SUCCESS;
 }
 
-MCA_BASE_FRAMEWORK_DECLARE(ompi, op, NULL, NULL, NULL, op_base_close,
+MCA_BASE_FRAMEWORK_DECLARE(ompi, op, NULL, NULL, op_base_open, op_base_close,
                            mca_op_base_static_components, 0);
diff --git a/ompi/mca/op/cuda/op_cuda.h b/ompi/mca/op/cuda/op_cuda.h
index 58770e435ed..6bdecfcae95 100644
--- a/ompi/mca/op/cuda/op_cuda.h
+++ b/ompi/mca/op/cuda/op_cuda.h
@@ -21,14 +21,14 @@
 BEGIN_C_DECLS
 
 /**
- * Private per-session state owned by the cuda component.
- * Stored in ompi_op_gpu_session_t.backend.
+ * Component-private state stored in ompi_op_gpu_cmd_queue_t.priv.
+ * Holds the GPU stream and shutdown flag; the command slot lives in the
+ * public cmd field of ompi_op_gpu_cmd_queue_t.
  */
 typedef struct {
-    ompi_op_gpu_cmd_t   *cmd;       /* managed-memory command slot        */
-    volatile int32_t    *shutdown;  /* managed-memory shutdown flag        */
-    cudaStream_t         stream;    /* private CUDA stream for this session */
-} ompi_op_cuda_session_priv_t;
+    volatile int32_t *shutdown;  /* managed-memory shutdown flag */
+    cudaStream_t      stream;    /* private CUDA stream for this cmd_queue */
+} ompi_op_cuda_cmd_queue_priv_t;
 
 /**
  * Host-side launcher function type.
diff --git a/ompi/mca/op/cuda/op_cuda_component.c b/ompi/mca/op/cuda/op_cuda_component.c
index 97571fdc4bb..b32a83f72bf 100644
--- a/ompi/mca/op/cuda/op_cuda_component.c
+++ b/ompi/mca/op/cuda/op_cuda_component.c
@@ -20,17 +20,12 @@
 #include "ompi/op/op_gpu_session.h"
 #include "ompi/mca/op/cuda/op_cuda.h"
 
-/* Forward declarations of session hooks (implemented in op_cuda_session.c) */
-ompi_op_gpu_session_t *ompi_op_cuda_session_begin(struct ompi_op_t *op,
-                                                   struct ompi_datatype_t *dtype,
-                                                   int dev_id);
-void ompi_op_cuda_session_reduce(ompi_op_gpu_session_t *session,
-                                 const void *src, void *dst, size_t count);
-void ompi_op_cuda_session_stop(ompi_op_gpu_session_t *session);
-bool ompi_op_cuda_session_restart(ompi_op_gpu_session_t *session,
-                                   struct ompi_op_t *op,
-                                   struct ompi_datatype_t *dtype);
-void ompi_op_cuda_session_free(ompi_op_gpu_session_t *session);
+/* Forward declarations of hooks implemented in op_cuda_session.c */
+ompi_op_gpu_cmd_queue_t *ompi_op_cuda_cmd_queue_alloc(int dev_id);
+void ompi_op_cuda_cmd_queue_free(ompi_op_gpu_cmd_queue_t *queue);
+ompi_op_gpu_session_t *ompi_op_cuda_session_begin(ompi_op_gpu_cmd_queue_t *queue,
+                                                   struct ompi_op_t *op,
+                                                   struct ompi_datatype_t *dtype);
 
 static int cuda_component_open(void);
 static int cuda_component_close(void);
@@ -43,8 +38,8 @@ static struct ompi_op_base_module_1_0_0_t *
  * Public component descriptor.
  *
  * This component does not provide per-op/per-type function pointers
- * (opc_op_query returns NULL).  Its sole contribution is the three session
- * hooks that enable persistent GPU reduction kernels.
+ * (opc_op_query returns NULL).  Its sole contribution is the three GPU
+ * hooks that enable persistent-kernel GPU reduction sessions.
  */
 ompi_op_base_component_1_0_0_t mca_op_cuda_component = {
     .opc_version = {
@@ -64,11 +59,9 @@ ompi_op_base_component_1_0_0_t mca_op_cuda_component = {
     .opc_op_query   = cuda_component_op_query,
 
     /* GPU session hooks */
+    .opc_cmd_queue_alloc = ompi_op_cuda_cmd_queue_alloc,
+    .opc_cmd_queue_free  = ompi_op_cuda_cmd_queue_free,
     .opc_session_begin   = ompi_op_cuda_session_begin,
-    .opc_session_reduce  = ompi_op_cuda_session_reduce,
-    .opc_session_stop    = ompi_op_cuda_session_stop,
-    .opc_session_restart = ompi_op_cuda_session_restart,
-    .opc_session_free    = ompi_op_cuda_session_free,
 };
 MCA_BASE_COMPONENT_INIT(ompi, op, cuda)
 
diff --git a/ompi/mca/op/cuda/op_cuda_session.c b/ompi/mca/op/cuda/op_cuda_session.c
index 8720f8ef35d..06df7cc8dd4 100644
--- a/ompi/mca/op/cuda/op_cuda_session.c
+++ b/ompi/mca/op/cuda/op_cuda_session.c
@@ -12,24 +12,23 @@
 /*
  * Session lifecycle for the CUDA persistent-kernel op component.
  *
- * session_begin:   look up the kernel in the 2D launcher table, allocate
- *                  managed-memory command slot + shutdown flag, create a
- *                  private CUDA stream, and launch the persistent kernel.
+ * cmd_queue_alloc: allocate managed-memory command slot + shutdown flag
+ *                  and create a private CUDA stream.
+ *
+ * cmd_queue_free:  release the CUDA stream, managed memory, and
+ *                  component-private state.
+ *
+ * session_begin:   look up the kernel for (op, dtype), reset the cmd_queue
+ *                  state, and launch the persistent kernel on the existing
+ *                  stream.  Wires all session dispatch hooks and returns the
+ *                  session.  Returns NULL if no kernel exists.
  *
  * session_reduce:  write src/dst/count to the command slot, set status=1
  *                  to wake the kernel, and spin until status==2.
  *
  * session_stop:    signal the persistent kernel to exit and synchronize the
- *                  stream.  GPU stream and managed memory remain allocated
- *                  so the session can be reused via session_restart.
- *
- * session_restart: reconfigure an idle (stopped) session for a new (op, dtype)
- *                  combination and relaunch the appropriate persistent kernel.
- *                  Returns false if no GPU kernel exists for the combination.
- *
- * session_free:    release the CUDA stream, managed memory, and backend
- *                  private state when a session is permanently discarded.
- *                  Does NOT free the ompi_op_gpu_session_t struct.
+ *                  stream.  The cmd_queue's GPU stream and managed memory
+ *                  remain allocated for reuse.
  */
 
 #include "ompi_config.h"
@@ -49,171 +48,116 @@
 /* ompi_op_ddt_map[] maps dtype->id → OMPI_OP_BASE_TYPE_* (-1 if none) */
 extern int ompi_op_ddt_map[OMPI_DATATYPE_MAX_PREDEFINED];
 
+/* Forward declarations of static session hooks referenced from session_begin. */
+static void ompi_op_cuda_session_reduce(ompi_op_gpu_session_t *session,
+                                         const void *src1, const void *src2,
+                                         void *dst, size_t count);
+static void ompi_op_cuda_session_stop(ompi_op_gpu_session_t *session);
+
 /* --------------------------------------------------------------------------
- * ompi_op_cuda_session_begin
+ * ompi_op_cuda_cmd_queue_alloc
  *
- * Called by the component.  Returns a malloc'd ompi_op_gpu_session_t on
- * success, NULL if this (op, type) combination has no GPU kernel or if
- * CUDA resource allocation fails.
+ * Allocate the expensive GPU resources for one device: a managed-memory
+ * command slot, a managed-memory shutdown flag, and a private CUDA stream.
+ * Returns NULL if any allocation fails.
  * -------------------------------------------------------------------------- */
-ompi_op_gpu_session_t *
-ompi_op_cuda_session_begin(struct ompi_op_t *op,
-                           struct ompi_datatype_t *dtype,
-                           int dev_id)
+ompi_op_gpu_cmd_queue_t *
+ompi_op_cuda_cmd_queue_alloc(int dev_id)
 {
-    int op_idx   = op->o_f_to_c_index;
-    int type_idx = (dtype->id < OMPI_DATATYPE_MAX_PREDEFINED)
-                   ? ompi_op_ddt_map[dtype->id] : -1;
-
-    if (op_idx  < 0 || op_idx  >= OMPI_OP_BASE_FORTRAN_OP_MAX ||
-        type_idx < 0 || type_idx >= OMPI_OP_BASE_TYPE_MAX) {
-        return NULL;
-    }
-
-    ompi_op_cuda_launcher_fn_t launcher = ompi_op_cuda_kernel_fns[op_idx][type_idx];
-    if (NULL == launcher) {
-        return NULL;   /* no GPU kernel for this (op, type) combination */
-    }
-
-    /* Allocate the public session struct returned to the caller */
-    ompi_op_gpu_session_t *session =
-        (ompi_op_gpu_session_t *) malloc(sizeof(ompi_op_gpu_session_t));
-    if (NULL == session) {
+    ompi_op_gpu_cmd_queue_t *queue =
+        (ompi_op_gpu_cmd_queue_t *) malloc(sizeof(ompi_op_gpu_cmd_queue_t));
+    if (NULL == queue) {
         return NULL;
     }
+    OBJ_CONSTRUCT(&queue->super, opal_list_item_t);
 
-    /* Allocate component-private state */
-    ompi_op_cuda_session_priv_t *priv =
-        (ompi_op_cuda_session_priv_t *) malloc(sizeof(ompi_op_cuda_session_priv_t));
+    ompi_op_cuda_cmd_queue_priv_t *priv =
+        (ompi_op_cuda_cmd_queue_priv_t *) malloc(sizeof(ompi_op_cuda_cmd_queue_priv_t));
     if (NULL == priv) {
-        free(session);
+        free(queue);
         return NULL;
     }
 
     cudaError_t err;
 
     /* Allocate managed-memory command slot (accessible by both CPU and GPU) */
-    err = cudaMallocManaged((void **) &priv->cmd,
+    err = cudaMallocManaged((void **) &queue->cmd,
                             sizeof(ompi_op_gpu_cmd_t),
                             cudaMemAttachGlobal);
     if (cudaSuccess != err) {
         free(priv);
-        free(session);
+        free(queue);
         return NULL;
     }
-    priv->cmd->src1   = NULL;
-    priv->cmd->src2   = NULL;
-    priv->cmd->dst    = NULL;
-    priv->cmd->count  = 0;
-    priv->cmd->status = 0;
+    queue->cmd->src1   = NULL;
+    queue->cmd->src2   = NULL;
+    queue->cmd->dst    = NULL;
+    queue->cmd->count  = 0;
+    queue->cmd->status = 0;
 
     /* Allocate managed-memory shutdown flag */
     err = cudaMallocManaged((void **) &priv->shutdown,
                             sizeof(int32_t),
                             cudaMemAttachGlobal);
     if (cudaSuccess != err) {
-        cudaFree(priv->cmd);
+        cudaFree(queue->cmd);
         free(priv);
-        free(session);
+        free(queue);
         return NULL;
     }
     *priv->shutdown = 0;
 
-    /* Create a dedicated non-blocking stream for this session */
+    /* Create a dedicated non-blocking stream for this cmd_queue */
     err = cudaStreamCreateWithFlags(&priv->stream, cudaStreamNonBlocking);
     if (cudaSuccess != err) {
         cudaFree(priv->shutdown);
-        cudaFree(priv->cmd);
+        cudaFree(queue->cmd);
         free(priv);
-        free(session);
+        free(queue);
         return NULL;
     }
 
-    /* Launch the persistent kernel (1 block, 256 threads) */
-    launcher(priv->cmd, priv->shutdown, priv->stream);
-    err = cudaGetLastError();
-    if (cudaSuccess != err) {
-        cudaStreamDestroy(priv->stream);
-        cudaFree(priv->shutdown);
-        cudaFree(priv->cmd);
-        free(priv);
-        free(session);
-        return NULL;
-    }
-
-    session->dev_id    = dev_id;
-    session->allocator = opal_accelerator_base_get_device_allocator(dev_id);
-    session->backend   = priv;
-
-    return session;
+    queue->dev_id    = dev_id;
+    queue->allocator = opal_accelerator_base_get_device_allocator(dev_id);
+    queue->priv      = priv;
+    return queue;
 }
 
 /* --------------------------------------------------------------------------
- * ompi_op_cuda_session_reduce
+ * ompi_op_cuda_cmd_queue_free
  *
- * Posts one reduction command to the persistent kernel and waits for it to
- * complete.  Semantics: dst[i] = src1[i] op src2[i] for i in [0, count).
- * src2 may alias dst for in-place operations.  All pointers must be
- * accessible from the GPU (device or managed memory).
+ * Release the CUDA stream, managed memory, and component-private state.
+ * Does NOT free the ompi_op_gpu_cmd_queue_t struct itself.
  * -------------------------------------------------------------------------- */
 void
-ompi_op_cuda_session_reduce(ompi_op_gpu_session_t *session,
-                            const void *src1, const void *src2,
-                            void *dst, size_t count)
+ompi_op_cuda_cmd_queue_free(ompi_op_gpu_cmd_queue_t *queue)
 {
-    ompi_op_cuda_session_priv_t *priv =
-        (ompi_op_cuda_session_priv_t *) session->backend;
-
-    /* Write operands before signalling the kernel */
-    priv->cmd->src1  = src1;
-    priv->cmd->src2  = src2;
-    priv->cmd->dst   = dst;
-    priv->cmd->count = (int64_t) count;
-
-    __atomic_thread_fence(__ATOMIC_SEQ_CST);   /* ensure writes visible to GPU */
-    priv->cmd->status = 1;                     /* wake the kernel */
-
-    /* Spin-wait for the kernel to signal completion */
-    while (2 != priv->cmd->status) {
-        sched_yield();   /* relinquish CPU timeslice while waiting */
+    ompi_op_cuda_cmd_queue_priv_t *priv =
+        (ompi_op_cuda_cmd_queue_priv_t *) queue->priv;
+    if (NULL == priv) {
+        return;
     }
 
-    /* Reset for the next call */
-    priv->cmd->status = 0;
-}
-
-/* --------------------------------------------------------------------------
- * ompi_op_cuda_session_stop
- *
- * Signal the persistent kernel to exit and wait for the stream to drain.
- * The GPU stream and managed memory remain allocated so the session can be
- * recycled via ompi_op_cuda_session_restart.
- * -------------------------------------------------------------------------- */
-void
-ompi_op_cuda_session_stop(ompi_op_gpu_session_t *session)
-{
-    ompi_op_cuda_session_priv_t *priv =
-        (ompi_op_cuda_session_priv_t *) session->backend;
-
-    /* Signal the kernel to exit its loop */
-    *priv->shutdown = 1;
-    __atomic_thread_fence(__ATOMIC_SEQ_CST);
-
-    /* Wait for the kernel to finish; stream remains valid after this */
-    cudaStreamSynchronize(priv->stream);
+    cudaStreamDestroy(priv->stream);
+    cudaFree((void *) priv->shutdown);
+    cudaFree(queue->cmd);
+    free(priv);
+    queue->priv = NULL;
+    queue->cmd  = NULL;
 }
 
 /* --------------------------------------------------------------------------
- * ompi_op_cuda_session_restart
+ * ompi_op_cuda_session_begin
  *
- * Reconfigure an idle (stopped) session for a new (op, dtype) combination
- * and relaunch the appropriate persistent kernel.  Returns false if no GPU
- * kernel exists for this combination.
+ * Look up the GPU kernel for (op, dtype), reset the cmd_queue state, and
+ * launch the persistent kernel on the existing stream.  Wires all session
+ * dispatch hooks before returning.  Returns NULL if no GPU kernel exists
+ * for this combination or if the kernel launch fails.
  * -------------------------------------------------------------------------- */
-bool
-ompi_op_cuda_session_restart(ompi_op_gpu_session_t *session,
-                              struct ompi_op_t *op,
-                              struct ompi_datatype_t *dtype)
+ompi_op_gpu_session_t *
+ompi_op_cuda_session_begin(ompi_op_gpu_cmd_queue_t *queue,
+                            struct ompi_op_t *op,
+                            struct ompi_datatype_t *dtype)
 {
     int op_idx   = op->o_f_to_c_index;
     int type_idx = (dtype->id < OMPI_DATATYPE_MAX_PREDEFINED)
@@ -221,54 +165,93 @@ ompi_op_cuda_session_restart(ompi_op_gpu_session_t *session,
 
     if (op_idx  < 0 || op_idx  >= OMPI_OP_BASE_FORTRAN_OP_MAX ||
         type_idx < 0 || type_idx >= OMPI_OP_BASE_TYPE_MAX) {
-        return false;
+        return NULL;
     }
 
     ompi_op_cuda_launcher_fn_t launcher = ompi_op_cuda_kernel_fns[op_idx][type_idx];
     if (NULL == launcher) {
-        return false;
+        return NULL;
     }
 
-    ompi_op_cuda_session_priv_t *priv =
-        (ompi_op_cuda_session_priv_t *) session->backend;
+    ompi_op_cuda_cmd_queue_priv_t *priv =
+        (ompi_op_cuda_cmd_queue_priv_t *) queue->priv;
 
-    /* Reset state for the new kernel */
-    *priv->shutdown   = 0;
-    priv->cmd->src1   = NULL;
-    priv->cmd->src2   = NULL;
-    priv->cmd->dst    = NULL;
-    priv->cmd->count  = 0;
-    priv->cmd->status = 0;
+    /* Allocate the session struct before launching anything: bailing out
+     * after a successful launch would leave the persistent kernel running
+     * on the queue's stream with no session left to stop it. */
+    ompi_op_gpu_session_t *session =
+        (ompi_op_gpu_session_t *) malloc(sizeof(ompi_op_gpu_session_t));
+    if (NULL == session) {
+        return NULL;
+    }
+
+    /* Reset queue state for the new kernel */
+    *priv->shutdown    = 0;
+    queue->cmd->src1   = NULL;
+    queue->cmd->src2   = NULL;
+    queue->cmd->dst    = NULL;
+    queue->cmd->count  = 0;
+    queue->cmd->status = 0;
 
-    /* Launch the persistent kernel for the new (op, dtype) */
-    launcher(priv->cmd, priv->shutdown, priv->stream);
+    /* Launch the persistent kernel (1 block, 256 threads) */
+    launcher(queue->cmd, priv->shutdown, priv->stream);
     cudaError_t err = cudaGetLastError();
     if (cudaSuccess != err) {
-        return false;
+        free(session);   /* launch failed: release the unused session */
+        return NULL;
     }
 
-    return true;
+    session->queue     = queue;
+    session->allocator = queue->allocator;
+    session->reduce_fn = ompi_op_cuda_session_reduce;
+    session->stop_fn   = ompi_op_cuda_session_stop;
+    return session;
 }
 
 /* --------------------------------------------------------------------------
- * ompi_op_cuda_session_free
- *
- * Free the CUDA stream, managed memory, and backend private state.
- * Does NOT free the ompi_op_gpu_session_t struct (that is the caller's
- * responsibility, done by session_destroy in op_gpu_session.c).
+ * ompi_op_cuda_session_reduce
  * -------------------------------------------------------------------------- */
-void
-ompi_op_cuda_session_free(ompi_op_gpu_session_t *session)
+static void
+ompi_op_cuda_session_reduce(ompi_op_gpu_session_t *session,
+                             const void *src1, const void *src2,
+                             void *dst, size_t count)
 {
-    ompi_op_cuda_session_priv_t *priv =
-        (ompi_op_cuda_session_priv_t *) session->backend;
-    if (NULL == priv) {
-        return;
+    ompi_op_gpu_cmd_t *cmd = session->queue->cmd;
+
+    /* Write operands before signalling the kernel */
+    cmd->src1  = src1;
+    cmd->src2  = src2;
+    cmd->dst   = dst;
+    cmd->count = (int64_t) count;
+
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);   /* ensure writes visible to GPU */
+    cmd->status = 1;                           /* wake the kernel */
+
+    /* Spin-wait for the kernel to signal completion */
+    while (2 != cmd->status) {
+        sched_yield();   /* relinquish CPU timeslice while waiting */
     }
 
-    cudaStreamDestroy(priv->stream);
-    cudaFree((void *) priv->shutdown);
-    cudaFree(priv->cmd);
-    free(priv);
-    session->backend = NULL;
+    /* Reset for the next call */
+    cmd->status = 0;
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_cuda_session_stop
+ *
+ * Signal the persistent kernel to exit and wait for the stream to drain.
+ * The cmd_queue's stream and managed memory remain allocated for reuse.
+ * -------------------------------------------------------------------------- */
+static void
+ompi_op_cuda_session_stop(ompi_op_gpu_session_t *session)
+{
+    ompi_op_cuda_cmd_queue_priv_t *priv =
+        (ompi_op_cuda_cmd_queue_priv_t *) session->queue->priv;
+
+    /* Signal the kernel to exit its loop */
+    *priv->shutdown = 1;
+    __atomic_thread_fence(__ATOMIC_SEQ_CST);
+
+    /* Wait for the kernel to finish; stream remains valid after this */
+    cudaStreamSynchronize(priv->stream);
 }
diff --git a/ompi/mca/op/op.h b/ompi/mca/op/op.h
index f8c3a5bf447..0777aeddfab 100644
--- a/ompi/mca/op/op.h
+++ b/ompi/mca/op/op.h
@@ -326,53 +326,37 @@ typedef struct ompi_op_base_module_1_0_0_t *
   (*ompi_op_base_component_op_query_1_0_0_fn_t)
     (struct ompi_op_t *op, int *priority);
 
-/* Forward declaration for GPU session (defined in ompi/op/op_gpu_session.h) */
+/* Forward declarations for GPU types (defined in ompi/op/op_gpu_session.h) */
+struct ompi_op_gpu_cmd_queue_t;
 struct ompi_op_gpu_session_t;
 
 /**
- * Optional component hook: create a GPU reduction session for the given
- * (op, dtype) on a specific device.  Returns NULL if this component does
- * not support the combination (caller tries the next component).
+ * Optional component hook: allocate the expensive GPU resources for a
+ * cmd_queue on the given device: managed-memory command slot, shutdown flag,
+ * and a private GPU stream.  Returns NULL on allocation failure.
+ * The caller (op_gpu_session.c) wires session_begin_fn and free_fn.
  */
-typedef struct ompi_op_gpu_session_t *
-  (*ompi_op_base_component_session_begin_fn_t)(struct ompi_op_t *op,
-                                               struct ompi_datatype_t *dtype,
-                                               int dev_id);
-
-/**
- * Optional component hook: post one reduction to the persistent kernel and
- * block until done.
- */
-typedef void (*ompi_op_base_component_session_reduce_fn_t)(
-                  struct ompi_op_gpu_session_t *session,
-                  const void *src1, const void *src2, void *dst, size_t count);
+typedef struct ompi_op_gpu_cmd_queue_t *
+  (*ompi_op_base_component_cmd_queue_alloc_fn_t)(int dev_id);
 
 /**
- * Optional component hook: signal the persistent kernel to exit and
- * synchronize the stream.  The session struct and its managed memory remain
- * allocated so the session can be recycled by opc_session_restart.
+ * Optional component hook: release the managed memory, GPU stream, and
+ * component-private state owned by the cmd_queue.
+ * Must NOT free the ompi_op_gpu_cmd_queue_t struct itself.
  */
-typedef void (*ompi_op_base_component_session_stop_fn_t)(
-                  struct ompi_op_gpu_session_t *session);
+typedef void
+  (*ompi_op_base_component_cmd_queue_free_fn_t)(struct ompi_op_gpu_cmd_queue_t *queue);
 
 /**
- * Optional component hook: reconfigure an idle (stopped) session for a new
- * (op, dtype) combination and relaunch the appropriate persistent kernel.
- * Returns true on success; false if no GPU kernel exists for this combination
- * (caller should return the session to the pool and fall back to host path).
+ * Optional component hook: look up the GPU kernel for (op, dtype), reset the
+ * cmd_queue state, and launch the persistent kernel on the queue's stream.
+ * Returns a fully-wired ompi_op_gpu_session_t on success, NULL if no GPU
+ * kernel exists for this (op, dtype) combination or it could not be started.
  */
-typedef bool (*ompi_op_base_component_session_restart_fn_t)(
-                  struct ompi_op_gpu_session_t *session,
-                  struct ompi_op_t *op,
-                  struct ompi_datatype_t *dtype);
-
-/**
- * Optional component hook: free managed memory, GPU stream, and backend
- * private state.  Called when a pooled session is permanently discarded.
- * Must NOT free the ompi_op_gpu_session_t struct itself.
- */
-typedef void (*ompi_op_base_component_session_free_fn_t)(
-                  struct ompi_op_gpu_session_t *session);
+typedef struct ompi_op_gpu_session_t *
+  (*ompi_op_base_component_session_begin_fn_t)(struct ompi_op_gpu_cmd_queue_t *queue,
+                                               struct ompi_op_t *op,
+                                               struct ompi_datatype_t *dtype);
 
 /**
  * Op component interface.
@@ -392,12 +376,10 @@ typedef struct ompi_op_base_component_1_0_0_t {
     /** Query whether component is usable for given op */
     ompi_op_base_component_op_query_1_0_0_fn_t opc_op_query;
 
-    /** Optional: GPU session lifecycle hooks.  NULL in host-only components. */
-    ompi_op_base_component_session_begin_fn_t   opc_session_begin;
-    ompi_op_base_component_session_reduce_fn_t  opc_session_reduce;
-    ompi_op_base_component_session_stop_fn_t    opc_session_stop;
-    ompi_op_base_component_session_restart_fn_t opc_session_restart;
-    ompi_op_base_component_session_free_fn_t    opc_session_free;
+    /** Optional: GPU cmd_queue and session hooks.  NULL in host-only components. */
+    ompi_op_base_component_cmd_queue_alloc_fn_t  opc_cmd_queue_alloc;
+    ompi_op_base_component_cmd_queue_free_fn_t   opc_cmd_queue_free;
+    ompi_op_base_component_session_begin_fn_t    opc_session_begin;
 } ompi_op_base_component_1_0_0_t;
 
 
diff --git a/ompi/mca/op/rocm/op_rocm.h b/ompi/mca/op/rocm/op_rocm.h
index 7974cc82f04..75410d2bbbd 100644
--- a/ompi/mca/op/rocm/op_rocm.h
+++ b/ompi/mca/op/rocm/op_rocm.h
@@ -21,14 +21,14 @@
 BEGIN_C_DECLS
 
 /**
- * Private per-session state owned by the rocm component.
- * Stored in ompi_op_gpu_session_t.backend.
+ * Component-private state stored in ompi_op_gpu_cmd_queue_t.priv.
+ * Holds the GPU stream and shutdown flag; the command slot lives in the
+ * public cmd field of ompi_op_gpu_cmd_queue_t.
  */
 typedef struct {
-    ompi_op_gpu_cmd_t   *cmd;       /* managed-memory command slot         */
-    volatile int32_t    *shutdown;  /* managed-memory shutdown flag         */
-    hipStream_t          stream;    /* private HIP stream for this session  */
-} ompi_op_rocm_session_priv_t;
+    volatile int32_t *shutdown;  /* managed-memory shutdown flag */
+    hipStream_t       stream;    /* private HIP stream for this cmd_queue */
+} ompi_op_rocm_cmd_queue_priv_t;
 
 /**
  * Host-side launcher function type.
diff --git a/ompi/mca/op/rocm/op_rocm_component.c b/ompi/mca/op/rocm/op_rocm_component.c
index 69c801580f0..40d2425079a 100644
--- a/ompi/mca/op/rocm/op_rocm_component.c
+++ b/ompi/mca/op/rocm/op_rocm_component.c
@@ -20,17 +20,12 @@
 #include "ompi/op/op_gpu_session.h"
 #include "ompi/mca/op/rocm/op_rocm.h"
 
-/* Forward declarations of session hooks (implemented in op_rocm_session.c) */
-ompi_op_gpu_session_t *ompi_op_rocm_session_begin(struct ompi_op_t *op,
-                                                   struct ompi_datatype_t *dtype,
-                                                   int dev_id);
-void ompi_op_rocm_session_reduce(ompi_op_gpu_session_t *session,
-                                  const void *src, void *dst, size_t count);
-void ompi_op_rocm_session_stop(ompi_op_gpu_session_t *session);
-bool ompi_op_rocm_session_restart(ompi_op_gpu_session_t *session,
-                                   struct ompi_op_t *op,
-                                   struct ompi_datatype_t *dtype);
-void ompi_op_rocm_session_free(ompi_op_gpu_session_t *session);
+/* Forward declarations of hooks implemented in op_rocm_session.c */
+ompi_op_gpu_cmd_queue_t *ompi_op_rocm_cmd_queue_alloc(int dev_id);
+void ompi_op_rocm_cmd_queue_free(ompi_op_gpu_cmd_queue_t *queue);
+ompi_op_gpu_session_t *ompi_op_rocm_session_begin(ompi_op_gpu_cmd_queue_t *queue,
+                                                   struct ompi_op_t *op,
+                                                   struct ompi_datatype_t *dtype);
 
 static int rocm_component_open(void);
 static int rocm_component_close(void);
@@ -60,11 +55,9 @@ ompi_op_base_component_1_0_0_t mca_op_rocm_component = {
     .opc_op_query   = rocm_component_op_query,
 
     /* GPU session hooks */
+    .opc_cmd_queue_alloc = ompi_op_rocm_cmd_queue_alloc,
+    .opc_cmd_queue_free  = ompi_op_rocm_cmd_queue_free,
     .opc_session_begin   = ompi_op_rocm_session_begin,
-    .opc_session_reduce  = ompi_op_rocm_session_reduce,
-    .opc_session_stop    = ompi_op_rocm_session_stop,
-    .opc_session_restart = ompi_op_rocm_session_restart,
-    .opc_session_free    = ompi_op_rocm_session_free,
 };
 MCA_BASE_COMPONENT_INIT(ompi, op, rocm)
 
diff --git a/ompi/mca/op/rocm/op_rocm_session.c b/ompi/mca/op/rocm/op_rocm_session.c
index 68d2f5b5030..ed3468649c7 100644
--- a/ompi/mca/op/rocm/op_rocm_session.c
+++ b/ompi/mca/op/rocm/op_rocm_session.c
@@ -13,24 +13,23 @@
  * Session lifecycle for the ROCm persistent-kernel op component.
  * Mirrors op_cuda_session.c with hip* API calls in place of cuda*.
  *
- * session_begin:   look up the kernel in the 2D launcher table, allocate
- *                  managed-memory command slot + shutdown flag, create a
- *                  private HIP stream, and launch the persistent kernel.
+ * cmd_queue_alloc: allocate managed-memory command slot + shutdown flag
+ *                  and create a private HIP stream.
+ *
+ * cmd_queue_free:  release the HIP stream, managed memory, and
+ *                  component-private state.
+ *
+ * session_begin:   look up the kernel for (op, dtype), reset the cmd_queue
+ *                  state, and launch the persistent kernel on the existing
+ *                  stream.  Wires all session dispatch hooks and returns the
+ *                  session.  Returns NULL if no kernel exists.
  *
  * session_reduce:  write src/dst/count to the command slot, set status=1
  *                  to wake the kernel, and spin until status==2.
  *
  * session_stop:    signal the persistent kernel to exit and synchronize the
- *                  stream.  GPU stream and managed memory remain allocated
- *                  so the session can be reused via session_restart.
- *
- * session_restart: reconfigure an idle (stopped) session for a new (op, dtype)
- *                  combination and relaunch the appropriate persistent kernel.
- *                  Returns false if no GPU kernel exists for the combination.
- *
- * session_free:    release the HIP stream, managed memory, and backend
- *                  private state when a session is permanently discarded.
- *                  Does NOT free the ompi_op_gpu_session_t struct.
+ *                  stream.  The cmd_queue's HIP stream and managed memory
+ *                  remain allocated for reuse.
  */
 
 #include "ompi_config.h"
@@ -50,142 +49,192 @@
 /* ompi_op_ddt_map[] maps dtype->id → OMPI_OP_BASE_TYPE_* (-1 if none) */
 extern int ompi_op_ddt_map[OMPI_DATATYPE_MAX_PREDEFINED];
 
+/* Forward declarations of static session hooks referenced from session_begin. */
+static void ompi_op_rocm_session_reduce(ompi_op_gpu_session_t *session,
+                                         const void *src1, const void *src2,
+                                         void *dst, size_t count);
+static void ompi_op_rocm_session_stop(ompi_op_gpu_session_t *session);
+
 /* --------------------------------------------------------------------------
- * ompi_op_rocm_session_begin
+ * ompi_op_rocm_cmd_queue_alloc
  * -------------------------------------------------------------------------- */
-ompi_op_gpu_session_t *
-ompi_op_rocm_session_begin(struct ompi_op_t *op,
-                            struct ompi_datatype_t *dtype,
-                            int dev_id)
+ompi_op_gpu_cmd_queue_t *
+ompi_op_rocm_cmd_queue_alloc(int dev_id)
 {
-    int op_idx   = op->o_f_to_c_index;
-    int type_idx = (dtype->id < OMPI_DATATYPE_MAX_PREDEFINED)
-                   ? ompi_op_ddt_map[dtype->id] : -1;
-
-    if (op_idx  < 0 || op_idx  >= OMPI_OP_BASE_FORTRAN_OP_MAX ||
-        type_idx < 0 || type_idx >= OMPI_OP_BASE_TYPE_MAX) {
+    ompi_op_gpu_cmd_queue_t *queue =
+        (ompi_op_gpu_cmd_queue_t *) malloc(sizeof(ompi_op_gpu_cmd_queue_t));
+    if (NULL == queue) {
         return NULL;
     }
+    OBJ_CONSTRUCT(&queue->super, opal_list_item_t);
 
-    ompi_op_rocm_launcher_fn_t launcher = ompi_op_rocm_kernel_fns[op_idx][type_idx];
-    if (NULL == launcher) {
-        return NULL;   /* no GPU kernel for this (op, type) combination */
-    }
-
-    /* Allocate the public session struct returned to the caller */
-    ompi_op_gpu_session_t *session =
-        (ompi_op_gpu_session_t *) malloc(sizeof(ompi_op_gpu_session_t));
-    if (NULL == session) {
-        return NULL;
-    }
-
-    /* Allocate component-private state */
-    ompi_op_rocm_session_priv_t *priv =
-        (ompi_op_rocm_session_priv_t *) malloc(sizeof(ompi_op_rocm_session_priv_t));
+    ompi_op_rocm_cmd_queue_priv_t *priv =
+        (ompi_op_rocm_cmd_queue_priv_t *) malloc(sizeof(ompi_op_rocm_cmd_queue_priv_t));
     if (NULL == priv) {
-        free(session);
+        free(queue);
         return NULL;
     }
 
     hipError_t err;
 
     /* Allocate managed-memory command slot (accessible by both CPU and GPU) */
-    err = hipMallocManaged((void **) &priv->cmd,
+    err = hipMallocManaged((void **) &queue->cmd,
                            sizeof(ompi_op_gpu_cmd_t),
                            hipMemAttachGlobal);
     if (hipSuccess != err) {
         free(priv);
-        free(session);
+        free(queue);
         return NULL;
     }
-    priv->cmd->src1   = NULL;
-    priv->cmd->src2   = NULL;
-    priv->cmd->dst    = NULL;
-    priv->cmd->count  = 0;
-    priv->cmd->status = 0;
+    queue->cmd->src1   = NULL;
+    queue->cmd->src2   = NULL;
+    queue->cmd->dst    = NULL;
+    queue->cmd->count  = 0;
+    queue->cmd->status = 0;
 
     /* Allocate managed-memory shutdown flag */
     err = hipMallocManaged((void **) &priv->shutdown,
                            sizeof(int32_t),
                            hipMemAttachGlobal);
     if (hipSuccess != err) {
-        hipFree(priv->cmd);
+        hipFree(queue->cmd);
         free(priv);
-        free(session);
+        free(queue);
         return NULL;
     }
     *priv->shutdown = 0;
 
-    /* Create a dedicated non-blocking stream for this session */
+    /* Create a dedicated non-blocking stream for this cmd_queue */
     err = hipStreamCreateWithFlags(&priv->stream, hipStreamNonBlocking);
     if (hipSuccess != err) {
         hipFree(priv->shutdown);
-        hipFree(priv->cmd);
+        hipFree(queue->cmd);
         free(priv);
-        free(session);
+        free(queue);
+        return NULL;
+    }
+
+    queue->dev_id    = dev_id;
+    queue->allocator = opal_accelerator_base_get_device_allocator(dev_id);
+    queue->priv      = priv;
+    return queue;
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_rocm_cmd_queue_free
+ *
+ * Release the HIP stream, managed memory, and component-private state.
+ * Does NOT free the ompi_op_gpu_cmd_queue_t struct itself.
+ * -------------------------------------------------------------------------- */
+void
+ompi_op_rocm_cmd_queue_free(ompi_op_gpu_cmd_queue_t *queue)
+{
+    ompi_op_rocm_cmd_queue_priv_t *priv =
+        (ompi_op_rocm_cmd_queue_priv_t *) queue->priv;
+    if (NULL == priv) {
+        return;
+    }
+
+    hipStreamDestroy(priv->stream);
+    hipFree((void *) priv->shutdown);
+    hipFree(queue->cmd);
+    free(priv);
+    queue->priv = NULL;
+    queue->cmd  = NULL;
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_rocm_session_begin
+ * -------------------------------------------------------------------------- */
+ompi_op_gpu_session_t *
+ompi_op_rocm_session_begin(ompi_op_gpu_cmd_queue_t *queue,
+                            struct ompi_op_t *op,
+                            struct ompi_datatype_t *dtype)
+{
+    int op_idx   = op->o_f_to_c_index;
+    int type_idx = (dtype->id < OMPI_DATATYPE_MAX_PREDEFINED)
+                   ? ompi_op_ddt_map[dtype->id] : -1;
+
+    if (op_idx  < 0 || op_idx  >= OMPI_OP_BASE_FORTRAN_OP_MAX ||
+        type_idx < 0 || type_idx >= OMPI_OP_BASE_TYPE_MAX) {
+        return NULL;
+    }
+
+    ompi_op_rocm_launcher_fn_t launcher = ompi_op_rocm_kernel_fns[op_idx][type_idx];
+    if (NULL == launcher) {
         return NULL;
     }
 
+    ompi_op_rocm_cmd_queue_priv_t *priv = (ompi_op_rocm_cmd_queue_priv_t *) queue->priv;
+    /* Allocate the session first: failing after the launch would strand the kernel. */
+    ompi_op_gpu_session_t *session = (ompi_op_gpu_session_t *) malloc(sizeof(*session));
+    if (NULL == session) {
+        return NULL;
+    }
+
+    /* Reset queue state for the new kernel */
+    *priv->shutdown    = 0;
+    queue->cmd->src1   = NULL;
+    queue->cmd->src2   = NULL;
+    queue->cmd->dst    = NULL;
+    queue->cmd->count  = 0;
+    queue->cmd->status = 0;
+
     /* Launch the persistent kernel (1 block, 256 threads) */
-    launcher(priv->cmd, priv->shutdown, priv->stream);
-    err = hipGetLastError();
+    launcher(queue->cmd, priv->shutdown, priv->stream);
+    hipError_t err = hipGetLastError();
     if (hipSuccess != err) {
-        hipStreamDestroy(priv->stream);
-        hipFree(priv->shutdown);
-        hipFree(priv->cmd);
-        free(priv);
         free(session);
         return NULL;
     }
 
-    session->dev_id    = dev_id;
-    session->allocator = opal_accelerator_base_get_device_allocator(dev_id);
-    session->backend   = priv;
+    session->queue     = queue;
+    session->allocator = queue->allocator;
+    session->reduce_fn = ompi_op_rocm_session_reduce;
+    session->stop_fn   = ompi_op_rocm_session_stop;
 
     return session;
 }
 
 /* --------------------------------------------------------------------------
  * ompi_op_rocm_session_reduce
  * -------------------------------------------------------------------------- */
-void
+static void
 ompi_op_rocm_session_reduce(ompi_op_gpu_session_t *session,
                              const void *src1, const void *src2,
                              void *dst, size_t count)
 {
-    ompi_op_rocm_session_priv_t *priv =
-        (ompi_op_rocm_session_priv_t *) session->backend;
+    ompi_op_gpu_cmd_t *cmd = session->queue->cmd;
 
     /* Write operands before signalling the kernel */
-    priv->cmd->src1  = src1;
-    priv->cmd->src2  = src2;
-    priv->cmd->dst   = dst;
-    priv->cmd->count = (int64_t) count;
+    cmd->src1  = src1;
+    cmd->src2  = src2;
+    cmd->dst   = dst;
+    cmd->count = (int64_t) count;
 
     __atomic_thread_fence(__ATOMIC_SEQ_CST);   /* ensure writes visible to GPU */
-    priv->cmd->status = 1;                     /* wake the kernel */
+    cmd->status = 1;                           /* wake the kernel */
 
     /* Spin-wait for the kernel to signal completion */
-    while (2 != priv->cmd->status) {
+    while (2 != cmd->status) {
         sched_yield();   /* relinquish CPU timeslice while waiting */
     }
 
     /* Reset for the next call */
-    priv->cmd->status = 0;
+    cmd->status = 0;
 }
 
 /* --------------------------------------------------------------------------
  * ompi_op_rocm_session_stop
  *
  * Signal the persistent kernel to exit and wait for the stream to drain.
- * The HIP stream and managed memory remain allocated so the session can be
- * recycled via ompi_op_rocm_session_restart.
+ * The cmd_queue's stream and managed memory remain allocated for reuse.
  * -------------------------------------------------------------------------- */
-void
+static void
 ompi_op_rocm_session_stop(ompi_op_gpu_session_t *session)
 {
-    ompi_op_rocm_session_priv_t *priv =
-        (ompi_op_rocm_session_priv_t *) session->backend;
+    ompi_op_rocm_cmd_queue_priv_t *priv =
+        (ompi_op_rocm_cmd_queue_priv_t *) session->queue->priv;
 
     /* Signal the kernel to exit its loop */
     *priv->shutdown = 1;
@@ -194,73 +243,3 @@ ompi_op_rocm_session_stop(ompi_op_gpu_session_t *session)
     /* Wait for the kernel to finish; stream remains valid after this */
     hipStreamSynchronize(priv->stream);
 }
-
-/* --------------------------------------------------------------------------
- * ompi_op_rocm_session_restart
- *
- * Reconfigure an idle (stopped) session for a new (op, dtype) combination
- * and relaunch the appropriate persistent kernel.  Returns false if no GPU
- * kernel exists for this combination.
- * -------------------------------------------------------------------------- */
-bool
-ompi_op_rocm_session_restart(ompi_op_gpu_session_t *session,
-                              struct ompi_op_t *op,
-                              struct ompi_datatype_t *dtype)
-{
-    int op_idx   = op->o_f_to_c_index;
-    int type_idx = (dtype->id < OMPI_DATATYPE_MAX_PREDEFINED)
-                   ? ompi_op_ddt_map[dtype->id] : -1;
-
-    if (op_idx  < 0 || op_idx  >= OMPI_OP_BASE_FORTRAN_OP_MAX ||
-        type_idx < 0 || type_idx >= OMPI_OP_BASE_TYPE_MAX) {
-        return false;
-    }
-
-    ompi_op_rocm_launcher_fn_t launcher = ompi_op_rocm_kernel_fns[op_idx][type_idx];
-    if (NULL == launcher) {
-        return false;
-    }
-
-    ompi_op_rocm_session_priv_t *priv =
-        (ompi_op_rocm_session_priv_t *) session->backend;
-
-    /* Reset state for the new kernel */
-    *priv->shutdown   = 0;
-    priv->cmd->src1   = NULL;
-    priv->cmd->src2   = NULL;
-    priv->cmd->dst    = NULL;
-    priv->cmd->count  = 0;
-    priv->cmd->status = 0;
-
-    /* Launch the persistent kernel for the new (op, dtype) */
-    launcher(priv->cmd, priv->shutdown, priv->stream);
-    hipError_t err = hipGetLastError();
-    if (hipSuccess != err) {
-        return false;
-    }
-
-    return true;
-}
-
-/* --------------------------------------------------------------------------
- * ompi_op_rocm_session_free
- *
- * Free the HIP stream, managed memory, and backend private state.
- * Does NOT free the ompi_op_gpu_session_t struct (that is the caller's
- * responsibility, done by session_destroy in op_gpu_session.c).
- * -------------------------------------------------------------------------- */
-void
-ompi_op_rocm_session_free(ompi_op_gpu_session_t *session)
-{
-    ompi_op_rocm_session_priv_t *priv =
-        (ompi_op_rocm_session_priv_t *) session->backend;
-    if (NULL == priv) {
-        return;
-    }
-
-    hipStreamDestroy(priv->stream);
-    hipFree((void *) priv->shutdown);
-    hipFree(priv->cmd);
-    free(priv);
-    session->backend = NULL;
-}
diff --git a/ompi/op/op_gpu_session.c b/ompi/op/op_gpu_session.c
index b459fef6bc0..f393ef2b43c 100644
--- a/ompi/op/op_gpu_session.c
+++ b/ompi/op/op_gpu_session.c
@@ -10,103 +10,131 @@
  */
 
 /*
- * Dispatcher and freelist pool for GPU reduction sessions.
+ * Dispatcher and cmd_queue pool for GPU reduction sessions.
  *
- * Sessions are expensive to create: each one allocates managed memory and
- * creates a private GPU stream.  Rather than destroy a session at the end
- * of every collective and recreate it at the start of the next, we keep a
- * flat pool of idle sessions keyed by dev_id.
+ * The expensive GPU resources — managed-memory command slot, shutdown flag,
+ * and private GPU stream — are bundled into an ompi_op_gpu_cmd_queue_t and
+ * pooled by dev_id.  Sessions themselves are lightweight structs (function
+ * pointers + a pointer to the cmd_queue) and are allocated fresh for each
+ * collective.
  *
- * Pool lifecycle:
- *   session_end()  — stops the persistent kernel (GPU stream and managed
- *                    memory remain allocated), then pushes the session onto
- *                    the freelist.
- *   session_begin() — if a matching dev_id entry is found, pops it and calls
- *                    restart_fn(session, op, dtype) to reconfigure and relaunch
- *                    the appropriate kernel; no cudaMalloc / hipMalloc overhead.
- *                    On pool miss, iterates op components to allocate fresh.
+ * Pool implementation:
+ *   cmd_queue_pool  — opal_lifo_t providing lock-free thread-safe push/pop.
+ *   cmd_queue_pool_count — atomic counter tracking current pool depth;
+ *                          used to enforce CMD_QUEUE_POOL_MAX without a mutex.
  *
- * Pool layout:
- *   session_pool_head — singly-linked freelist, linked through session->pool_next
- *   session_pool_count — current freelist length (global cap = SESSION_POOL_MAX)
- *   session_pool_lock  — single mutex protecting all pool state
+ * Pool lifecycle:
+ *   session_end()   — stops the persistent kernel (cmd_queue resources remain
+ *                     allocated), then pushes the cmd_queue into the lifo pool.
+ *   session_begin() — pops one entry from the lifo; if its dev_id matches,
+ *                     calls queue->session_begin_fn(queue, op, dtype) to
+ *                     configure and relaunch the kernel; no cudaMalloc overhead.
+ *                     On pool miss, iterates op components to allocate a fresh
+ *                     cmd_queue, then calls session_begin.
+ *                     On pool hit with no matching dev_id, the queue is pushed
+ *                     back and a fresh allocation is attempted.
+ *                     On pool hit with no kernel for (op, dtype), the queue is
+ *                     returned to the pool and NULL is returned.
  *
- * SESSION_POOL_MAX caps the total number of idle sessions.  Sessions beyond
- * the cap are permanently destroyed rather than pooled to bound GPU resource
- * accumulation.
+ * CMD_QUEUE_POOL_MAX caps the total number of idle cmd_queues to bound GPU
+ * resource accumulation.
  */
 
 #include "ompi_config.h"
 
 #include <stdlib.h>
 
+#include "opal/class/opal_lifo.h"
 #include "opal/class/opal_list.h"
 #include "opal/mca/accelerator/base/base.h"
 #include "opal/mca/base/base.h"
-#include "opal/mca/threads/mutex.h"
+#include "opal/sys/atomic.h"
 #include "ompi/mca/op/op.h"
 #include "ompi/mca/op/base/base.h"
 #include "ompi/op/op_gpu_session.h"
 #include "ompi/op/op.h"
 
-/* Maximum number of idle sessions kept in the pool. */
-#define SESSION_POOL_MAX 8
+/* Maximum number of idle cmd_queues kept in the pool. */
+#define CMD_QUEUE_POOL_MAX 16
 
-static ompi_op_gpu_session_t *session_pool_head  = NULL;
-static int                    session_pool_count  = 0;
-static opal_mutex_t           session_pool_lock   = OPAL_MUTEX_STATIC_INIT;
+static opal_lifo_t          cmd_queue_pool;
+static opal_atomic_int32_t  cmd_queue_pool_count = 0;
 
 /* --------------------------------------------------------------------------
- * session_destroy — permanently shut down a session and free all resources.
- * Called when the pool is at capacity or at finalization.
+ * cmd_queue_destroy — permanently release a cmd_queue's GPU resources.
  * -------------------------------------------------------------------------- */
 static void
-session_destroy(ompi_op_gpu_session_t *session)
+cmd_queue_destroy(ompi_op_gpu_cmd_queue_t *queue)
 {
-    session->free_fn(session);   /* component frees stream, managed mem, priv */
-    free(session);
+    queue->free_fn(queue);   /* component frees stream, managed mem, priv */
+    OBJ_DESTRUCT(&queue->super);
+    free(queue);
+}
+
+/* --------------------------------------------------------------------------
+ * cmd_queue_pool_push — return a cmd_queue to the pool.
+ * Destroys the queue instead if the pool is already at capacity.
+ * -------------------------------------------------------------------------- */
+static void
+cmd_queue_pool_push(ompi_op_gpu_cmd_queue_t *queue)
+{
+    if (opal_atomic_add_fetch_32(&cmd_queue_pool_count, 1) <= CMD_QUEUE_POOL_MAX) {
+        opal_lifo_push(&cmd_queue_pool, &queue->super);
+    } else {
+        opal_atomic_add_fetch_32(&cmd_queue_pool_count, -1);
+        cmd_queue_destroy(queue);
+    }
+}
+
+/* --------------------------------------------------------------------------
+ * ompi_op_gpu_session_pool_init
+ * -------------------------------------------------------------------------- */
+void
+ompi_op_gpu_session_pool_init(void)
+{
+    OBJ_CONSTRUCT(&cmd_queue_pool, opal_lifo_t);
 }
 
 /* --------------------------------------------------------------------------
  * ompi_op_gpu_session_begin
  *
- * 1. Walk the pool freelist for a matching dev_id entry.
- * 2. On hit: pop the idle session, call restart_fn to reconfigure for the
- *    new (op, dtype) and relaunch the kernel.  If restart fails (no kernel
- *    for this combination), destroy the session and return NULL.
- * 3. On pool miss: iterate op components to create a new session; wire
- *    dispatch hooks before returning.
+ * 1. Pop one entry from the lifo pool.
+ * 2. If dev_id matches: call queue->session_begin_fn to configure and
+ *    relaunch the kernel.  On success return the session.  On failure
+ *    (no kernel for this op/dtype), push the queue back and return NULL.
+ * 3. If dev_id doesn't match: push the queue back and fall through to
+ *    fresh allocation.
+ * 4. Pool miss: iterate op components to allocate a fresh cmd_queue and
+ *    call opc_session_begin.
  * -------------------------------------------------------------------------- */
 ompi_op_gpu_session_t *
 ompi_op_gpu_session_begin(struct ompi_op_t *op,
                           struct ompi_datatype_t *dtype,
                           int dev_id)
 {
-    /* Check pool for a reusable idle session on this device. */
-    OPAL_THREAD_LOCK(&session_pool_lock);
-    ompi_op_gpu_session_t **pp = &session_pool_head;
-    while (NULL != *pp) {
-        if ((*pp)->dev_id == dev_id) {
-            /* Found a matching idle session — remove from freelist. */
-            ompi_op_gpu_session_t *s = *pp;
-            *pp = s->pool_next;
-            session_pool_count--;
-            OPAL_THREAD_UNLOCK(&session_pool_lock);
-            s->pool_next = NULL;
-
-            /* Reconfigure the session for the new (op, dtype). */
-            if (!s->restart_fn(s, op, dtype)) {
-                /* No GPU kernel for this combination; release and return NULL. */
-                session_destroy(s);
-                return NULL;
+    /* Check pool for a reusable cmd_queue. */
+    opal_list_item_t *item = opal_lifo_pop(&cmd_queue_pool);
+    if (NULL != item) {
+        opal_atomic_add_fetch_32(&cmd_queue_pool_count, -1);
+        ompi_op_gpu_cmd_queue_t *q = (ompi_op_gpu_cmd_queue_t *) item;
+
+        if (q->dev_id == dev_id) {
+            ompi_op_gpu_session_t *s = q->session_begin_fn(q, op, dtype);
+            if (NULL != s) {
+                return s;
             }
-            return s;
+            /* No GPU kernel for this (op, dtype).  Return the cmd_queue to
+             * the pool so it can be reused for a future combination that does
+             * have a kernel.  Caller falls back to ompi_op_reduce(). */
+            cmd_queue_pool_push(q);
+            return NULL;
         }
-        pp = &(*pp)->pool_next;
+
+        /* Wrong device — push back and fall through to fresh allocation. */
+        cmd_queue_pool_push(q);
     }
-    OPAL_THREAD_UNLOCK(&session_pool_lock);
 
-    /* Pool miss — create a fresh session via the first matching component. */
+    /* Pool miss (or wrong device) — allocate a fresh cmd_queue. */
     mca_base_component_list_item_t *cli;
     OPAL_LIST_FOREACH(cli, &ompi_op_base_framework.framework_components,
                       mca_base_component_list_item_t) {
@@ -121,65 +149,34 @@ ompi_op_gpu_session_begin(struct ompi_op_t *op,
         const ompi_op_base_component_1_0_0_t *opc =
             (const ompi_op_base_component_1_0_0_t *) bc;
 
-        if (NULL == opc->opc_session_begin   ||
-            NULL == opc->opc_session_reduce  ||
-            NULL == opc->opc_session_stop    ||
-            NULL == opc->opc_session_restart ||
-            NULL == opc->opc_session_free) {
+        if (NULL == opc->opc_cmd_queue_alloc ||
+            NULL == opc->opc_cmd_queue_free  ||
+            NULL == opc->opc_session_begin) {
+            continue;
+        }
+
+        ompi_op_gpu_cmd_queue_t *q = opc->opc_cmd_queue_alloc(dev_id);
+        if (NULL == q) {
             continue;
         }
 
-        ompi_op_gpu_session_t *session = opc->opc_session_begin(op, dtype, dev_id);
+        /* Wire dispatch hooks into the cmd_queue. */
+        q->session_begin_fn = opc->opc_session_begin;
+        q->free_fn          = opc->opc_cmd_queue_free;
+
+        ompi_op_gpu_session_t *session = opc->opc_session_begin(q, op, dtype);
         if (NULL == session) {
+            /* This component has no kernel for (op, dtype); discard the queue. */
+            cmd_queue_destroy(q);
             continue;
         }
 
-        /* Wire dispatch hooks and pool bookkeeping. */
-        session->reduce_fn  = opc->opc_session_reduce;
-        session->stop_fn    = opc->opc_session_stop;
-        session->restart_fn = opc->opc_session_restart;
-        session->free_fn    = opc->opc_session_free;
-        session->pool_next  = NULL;
         return session;
     }
 
     return NULL;
 }
 
-/* --------------------------------------------------------------------------
- * ompi_op_gpu_session_begin_alloc
- *
- * Create a lightweight session with GPU scratch-memory allocation only.
- * No persistent kernel is launched; reduce_fn and the other kernel hooks are
- * NULL.  The session is freed directly by session_end (not pooled) because
- * it holds no GPU stream or managed memory of its own.
- * -------------------------------------------------------------------------- */
-ompi_op_gpu_session_t *
-ompi_op_gpu_session_begin_alloc(int dev_id)
-{
-    mca_allocator_base_module_t *allocator =
-        opal_accelerator_base_get_device_allocator(dev_id);
-    if (NULL == allocator) {
-        return NULL;
-    }
-
-    ompi_op_gpu_session_t *session =
-        (ompi_op_gpu_session_t *) malloc(sizeof(ompi_op_gpu_session_t));
-    if (NULL == session) {
-        return NULL;
-    }
-
-    session->dev_id     = dev_id;
-    session->allocator  = allocator;
-    session->backend    = NULL;
-    session->reduce_fn  = NULL;
-    session->stop_fn    = NULL;
-    session->restart_fn = NULL;
-    session->free_fn    = NULL;
-    session->pool_next  = NULL;
-    return session;
-}
-
 /* --------------------------------------------------------------------------
  * ompi_op_gpu_session_reduce
  * -------------------------------------------------------------------------- */
@@ -194,9 +191,7 @@ ompi_op_gpu_session_reduce(ompi_op_gpu_session_t *session,
 /* --------------------------------------------------------------------------
  * ompi_op_gpu_session_end
  *
- * Stop the persistent kernel and return the session to the pool so its GPU
- * stream and managed memory can be reused by the next collective on the same
- * device.  If the pool is already at SESSION_POOL_MAX, destroy immediately.
+ * Stop the persistent kernel and return the cmd_queue to the pool.
  * -------------------------------------------------------------------------- */
 void
 ompi_op_gpu_session_end(ompi_op_gpu_session_t *session)
@@ -205,49 +200,27 @@ ompi_op_gpu_session_end(ompi_op_gpu_session_t *session)
         return;
     }
 
-    /* Alloc-only sessions (stop_fn == NULL) hold no kernel resources.
-     * Free the struct immediately; they are not pooled. */
-    if (NULL == session->stop_fn) {
-        free(session);
-        return;
-    }
-
-    /* Signal the kernel to exit and wait for the stream to drain.
-     * GPU stream and managed memory remain allocated for reuse. */
+    /* Signal the kernel to exit and wait for the stream to drain. */
     session->stop_fn(session);
 
-    OPAL_THREAD_LOCK(&session_pool_lock);
-    if (session_pool_count < SESSION_POOL_MAX) {
-        session->pool_next = session_pool_head;
-        session_pool_head  = session;
-        session_pool_count++;
-        OPAL_THREAD_UNLOCK(&session_pool_lock);
-        return;
-    }
-    OPAL_THREAD_UNLOCK(&session_pool_lock);
+    ompi_op_gpu_cmd_queue_t *q = session->queue;
+    free(session);
 
-    /* Pool full — destroy immediately. */
-    session_destroy(session);
+    cmd_queue_pool_push(q);
 }
 
 /* --------------------------------------------------------------------------
  * ompi_op_gpu_session_pool_finalize
  *
- * Drain the pool, release all GPU resources, and free session structs.
+ * Drain the pool, release all GPU resources, and destroy the lifo.
  * Called once from ompi_op_base_close() during MPI_Finalize.
  * -------------------------------------------------------------------------- */
 void
 ompi_op_gpu_session_pool_finalize(void)
 {
-    OPAL_THREAD_LOCK(&session_pool_lock);
-    ompi_op_gpu_session_t *s = session_pool_head;
-    session_pool_head  = NULL;
-    session_pool_count = 0;
-    OPAL_THREAD_UNLOCK(&session_pool_lock);
-
-    while (NULL != s) {
-        ompi_op_gpu_session_t *next = s->pool_next;
-        session_destroy(s);
-        s = next;
+    opal_list_item_t *item;
+    while (NULL != (item = opal_lifo_pop(&cmd_queue_pool))) {
+        cmd_queue_destroy((ompi_op_gpu_cmd_queue_t *) item);
     }
+    OBJ_DESTRUCT(&cmd_queue_pool);
 }
diff --git a/ompi/op/op_gpu_session.h b/ompi/op/op_gpu_session.h
index 0050f0b7ed0..2e5873a14bf 100644
--- a/ompi/op/op_gpu_session.h
+++ b/ompi/op/op_gpu_session.h
@@ -15,6 +15,7 @@
 #include "ompi_config.h"
 #include <stdbool.h>
 #include <stdint.h>
+#include "opal/class/opal_list.h"
 #include "opal/mca/allocator/allocator.h"
 
 BEGIN_C_DECLS
@@ -39,44 +40,58 @@ typedef struct {
     volatile int32_t status;
 } ompi_op_gpu_cmd_t;
 
+/**
+ * The expensive-to-create GPU resources needed by a persistent reduction
+ * kernel: a managed-memory command slot and a private GPU stream.  Pooled
+ * by dev_id so they can be reused across collectives without paying
+ * cudaMallocManaged/hipMallocManaged overhead on every call.
+ *
+ * cmd is public (the host communicates with the kernel through it directly).
+ * priv is component-private and holds the stream and shutdown flag.
+ *
+ * session_begin_fn and free_fn are managed by op_gpu_session.c
+ * and must not be set by callers.
+ */
+typedef struct ompi_op_gpu_cmd_queue_t {
+    opal_list_item_t             super;       /* MUST be first: used by opal_lifo_t pool */
+    int                          dev_id;
+    mca_allocator_base_module_t *allocator;  /* GPU scratch allocator for this device */
+    ompi_op_gpu_cmd_t           *cmd;        /* managed memory — shared with GPU */
+    void                        *priv;       /* component-private: stream, shutdown flag */
+    /* Session creation hook — wired at cmd_queue_alloc time by op_gpu_session.c. */
+    struct ompi_op_gpu_session_t *(*session_begin_fn)(
+        struct ompi_op_gpu_cmd_queue_t *queue,
+        struct ompi_op_t *op,
+        struct ompi_datatype_t *dtype);
+    /* Release managed memory, GPU stream, and priv.
+     * Must NOT free the ompi_op_gpu_cmd_queue_t struct itself. */
+    void (*free_fn)(struct ompi_op_gpu_cmd_queue_t *queue);
+} ompi_op_gpu_cmd_queue_t;
+
 /**
  * Per-collective GPU reduction session.  Created by ompi_op_gpu_session_begin()
- * before a collective algorithm's reduction loop starts, and returned to the
- * session pool by ompi_op_gpu_session_end() for reuse by a future collective.
+ * before a collective algorithm's reduction loop, and destroyed (with its
+ * cmd_queue recycled to the pool) by ompi_op_gpu_session_end().
  *
- * Pool lifecycle: session_end() stops the persistent kernel (GPU resources
- * remain allocated) and pushes the session onto a freelist.  A future
- * session_begin() for the same dev_id pops the idle session and calls
- * restart_fn to reconfigure and relaunch the appropriate kernel — no
- * cudaMalloc/hipMalloc or stream creation overhead on the reuse path.
+ * Sessions are lightweight: all expensive GPU resources (managed memory,
+ * GPU stream) live in the cmd_queue, which is pooled separately.  The session
+ * holds only a pointer to the cmd_queue and the dispatch function pointers.
+ *
+ * The component's opc_session_begin wires queue, allocator, reduce_fn, and
+ * stop_fn.  Callers must not set these fields directly.
  *
  * When no GPU op component supports the (op, dtype) combination, begin()
  * returns NULL and all callers fall back to ompi_op_reduce().
- *
- * reduce_fn, stop_fn, restart_fn, free_fn, and pool_next are managed by
- * op_gpu_session.c — callers must not set them directly.
  */
 typedef struct ompi_op_gpu_session_t {
-    int                          dev_id;
-    mca_allocator_base_module_t *allocator;  /* GPU scratch allocator for this session */
-    void                        *backend;    /* opaque: cuda or rocm session state */
-    /* Dispatch hooks wired at session_begin time. */
+    ompi_op_gpu_cmd_queue_t     *queue;
+    mca_allocator_base_module_t *allocator;  /* GPU scratch allocator (= queue->allocator) */
+    /* Dispatch hooks wired by the component's opc_session_begin. */
     void (*reduce_fn)(struct ompi_op_gpu_session_t *session,
                       const void *src1, const void *src2, void *dst, size_t count);
     /* Signal the persistent kernel to exit and synchronize the stream.
-     * GPU stream and managed memory remain allocated for reuse. */
+     * The cmd_queue's resources remain valid for reuse after this call. */
     void (*stop_fn)(struct ompi_op_gpu_session_t *session);
-    /* Reconfigure an idle session for a new (op, dtype) and relaunch the
-     * persistent kernel.  Returns false if no GPU kernel exists for this
-     * combination (caller must then free the session and return NULL). */
-    bool (*restart_fn)(struct ompi_op_gpu_session_t *session,
-                       struct ompi_op_t *op,
-                       struct ompi_datatype_t *dtype);
-    /* Release managed memory, GPU stream, and backend private state.
-     * Must NOT free the ompi_op_gpu_session_t struct itself. */
-    void (*free_fn)(struct ompi_op_gpu_session_t *session);
-    /* Pool bookkeeping — do not access directly. */
-    struct ompi_op_gpu_session_t *pool_next;
 } ompi_op_gpu_session_t;
 
 /**
@@ -88,15 +103,6 @@ OMPI_DECLSPEC ompi_op_gpu_session_t *ompi_op_gpu_session_begin(struct ompi_op_t
                                                                 struct ompi_datatype_t *dtype,
                                                                 int dev_id);
 
-/**
- * Create a lightweight session that provides GPU scratch-memory allocation only,
- * without launching a persistent reduction kernel.  Suitable for collective
- * algorithms that need temporary device memory but perform no GPU reduction.
- * Returns NULL if no device allocator is available for dev_id.
- * The returned session is freed by ompi_op_gpu_session_end().
- */
-OMPI_DECLSPEC ompi_op_gpu_session_t *ompi_op_gpu_session_begin_alloc(int dev_id);
-
 /**
  * Post one reduction command (src1 op src2 → dst) to the persistent kernel and
  * wait for completion.  src2 may alias dst for in-place operations.
@@ -107,15 +113,21 @@ OMPI_DECLSPEC void ompi_op_gpu_session_reduce(ompi_op_gpu_session_t *session,
                                                void *dst, size_t count);
 
 /**
- * Stop the persistent kernel and return the session to the pool for reuse.
- * GPU stream and managed memory remain allocated; a future begin() call for
- * the same dev_id will relaunch the kernel without allocating new resources.
- * NULL-safe.
+ * Stop the persistent kernel and return the session's cmd_queue to the pool
+ * for reuse.  GPU stream and managed memory remain allocated; a future begin()
+ * call for the same dev_id will relaunch the kernel without allocating new
+ * resources.  NULL-safe.
  */
 OMPI_DECLSPEC void ompi_op_gpu_session_end(ompi_op_gpu_session_t *session);
 
 /**
- * Drain and permanently destroy all pooled sessions.  Must be called once
+ * Initialize the cmd_queue pool.  Must be called once before any session
+ * operations (from ompi_op_base_open via the framework open hook).
+ */
+OMPI_DECLSPEC void ompi_op_gpu_session_pool_init(void);
+
+/**
+ * Drain and permanently destroy all pooled cmd_queues.  Must be called once
  * during MPI finalization (from ompi_op_base_close).
  */
 OMPI_DECLSPEC void ompi_op_gpu_session_pool_finalize(void);

From d8ccda76be1edeae27bcc32cb90acec0cc06ae32 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Mon, 25 May 2026 19:54:57 -0400
Subject: [PATCH 08/13] Fix compile issues

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ompi/mca/coll/base/coll_base_allgather.c        | 2 +-
 ompi/mca/coll/tuned/coll_tuned_decision_fixed.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ompi/mca/coll/base/coll_base_allgather.c b/ompi/mca/coll/base/coll_base_allgather.c
index 5b357b511bc..a9480f09169 100644
--- a/ompi/mca/coll/base/coll_base_allgather.c
+++ b/ompi/mca/coll/base/coll_base_allgather.c
@@ -114,7 +114,7 @@ ompi_coll_base_allgather_intra_recursivedoubling(const void *sbuf, size_t scount
         int k = 2;
         return ompi_coll_base_allgather_intra_k_bruck(sbuf, scount, sdtype,
                                                       rbuf, rcount, rdtype,
-                                                      comm, module, k);
+                                                      comm, module, k, NULL);
     }
 
     OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c
index 99727410629..c8299c68a66 100644
--- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c
+++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c
@@ -416,7 +416,7 @@ ompi_coll_tuned_allreduce_intra_disjoint_dec_fixed(const void *sbuf, void *rbuf,
     }
 
     return ompi_coll_tuned_allreduce_intra_do_this (sbuf, rbuf, count, dtype, op,
-                                                    comm, module, alg, 0, 0);
+                                                    comm, module, alg, 0, 0, NULL);
 }
 
                            

From 2f34ecf86e36f198be0a2e75f9691a58bfb7b219 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Tue, 26 May 2026 08:48:58 -0400
Subject: [PATCH 09/13] Introduce inheritance for op queues

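The CUDA/ROCm cmd_queues inherit from the generic cmd_queue (an OPAL object)
and provide constructor/destructor callbacks; the destructor releases the GPU
resources, replacing the opc_cmd_queue_free component hook.  The generic code
will release the queue with OBJ_RELEASE once done.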

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ompi/mca/op/cuda/op_cuda.h           |  17 +--
 ompi/mca/op/cuda/op_cuda_component.c |   2 -
 ompi/mca/op/cuda/op_cuda_session.c   | 149 +++++++++++++--------------
 ompi/mca/op/op.h                     |   9 --
 ompi/mca/op/rocm/op_rocm.h           |  17 +--
 ompi/mca/op/rocm/op_rocm_component.c |   2 -
 ompi/mca/op/rocm/op_rocm_session.c   | 132 +++++++++++-------------
 ompi/op/op_gpu_session.c             |  12 +--
 ompi/op/op_gpu_session.h             |  16 ++-
 9 files changed, 166 insertions(+), 190 deletions(-)

diff --git a/ompi/mca/op/cuda/op_cuda.h b/ompi/mca/op/cuda/op_cuda.h
index 6bdecfcae95..e6da8c73ecf 100644
--- a/ompi/mca/op/cuda/op_cuda.h
+++ b/ompi/mca/op/cuda/op_cuda.h
@@ -21,14 +21,17 @@
 BEGIN_C_DECLS
 
 /**
- * Component-private state stored in ompi_op_gpu_cmd_queue_t.priv.
- * Holds the GPU stream and shutdown flag; the command slot lives in the
- * public cmd field of ompi_op_gpu_cmd_queue_t.
+ * CUDA-specific cmd_queue.  Inherits ompi_op_gpu_cmd_queue_t by placing it
+ * as the first member named "super".  The CUDA stream and shutdown flag are
+ * stored directly here rather than in a separate priv allocation.
+ * Allocated with OBJ_NEW; the OBJ destructor chain releases GPU resources.
  */
-typedef struct {
-    volatile int32_t *shutdown;  /* managed-memory shutdown flag */
-    cudaStream_t      stream;    /* private CUDA stream for this cmd_queue */
-} ompi_op_cuda_cmd_queue_priv_t;
+typedef struct ompi_op_cuda_cmd_queue_t {
+    ompi_op_gpu_cmd_queue_t  super;       /* MUST be first */
+    volatile int32_t        *shutdown;    /* managed-memory shutdown flag */
+    cudaStream_t             stream;      /* private CUDA stream */
+} ompi_op_cuda_cmd_queue_t;
+OBJ_CLASS_DECLARATION(ompi_op_cuda_cmd_queue_t);
 
 /**
  * Host-side launcher function type.
diff --git a/ompi/mca/op/cuda/op_cuda_component.c b/ompi/mca/op/cuda/op_cuda_component.c
index b32a83f72bf..75b7c642128 100644
--- a/ompi/mca/op/cuda/op_cuda_component.c
+++ b/ompi/mca/op/cuda/op_cuda_component.c
@@ -22,7 +22,6 @@
 
 /* Forward declarations of hooks implemented in op_cuda_session.c */
 ompi_op_gpu_cmd_queue_t *ompi_op_cuda_cmd_queue_alloc(int dev_id);
-void ompi_op_cuda_cmd_queue_free(ompi_op_gpu_cmd_queue_t *queue);
 ompi_op_gpu_session_t *ompi_op_cuda_session_begin(ompi_op_gpu_cmd_queue_t *queue,
                                                    struct ompi_op_t *op,
                                                    struct ompi_datatype_t *dtype);
@@ -60,7 +59,6 @@ ompi_op_base_component_1_0_0_t mca_op_cuda_component = {
 
     /* GPU session hooks */
     .opc_cmd_queue_alloc = ompi_op_cuda_cmd_queue_alloc,
-    .opc_cmd_queue_free  = ompi_op_cuda_cmd_queue_free,
     .opc_session_begin   = ompi_op_cuda_session_begin,
 };
 MCA_BASE_COMPONENT_INIT(ompi, op, cuda)
diff --git a/ompi/mca/op/cuda/op_cuda_session.c b/ompi/mca/op/cuda/op_cuda_session.c
index 06df7cc8dd4..329e20cd441 100644
--- a/ompi/mca/op/cuda/op_cuda_session.c
+++ b/ompi/mca/op/cuda/op_cuda_session.c
@@ -12,11 +12,10 @@
 /*
  * Session lifecycle for the CUDA persistent-kernel op component.
  *
- * cmd_queue_alloc: allocate managed-memory command slot + shutdown flag
- *                  and create a private CUDA stream.
- *
- * cmd_queue_free:  release the CUDA stream, managed memory, and
- *                  component-private state.
+ * ompi_op_cuda_cmd_queue_t inherits ompi_op_gpu_cmd_queue_t.  OBJ_NEW
+ * allocates the object; the OBJ destructor releases the CUDA stream and
+ * managed memory.  The component returns the base pointer from alloc so
+ * callers need no knowledge of the concrete type.
  *
  * session_begin:   look up the kernel for (op, dtype), reset the cmd_queue
  *                  state, and launch the persistent kernel on the existing
@@ -54,96 +53,90 @@ static void ompi_op_cuda_session_reduce(ompi_op_gpu_session_t *session,
                                          void *dst, size_t count);
 static void ompi_op_cuda_session_stop(ompi_op_gpu_session_t *session);
 
+/* --------------------------------------------------------------------------
+ * OBJ constructor / destructor for ompi_op_cuda_cmd_queue_t
+ * -------------------------------------------------------------------------- */
+static void
+ompi_op_cuda_cmd_queue_construct(ompi_op_cuda_cmd_queue_t *q)
+{
+    q->shutdown       = NULL;
+    q->stream         = NULL;
+    q->super.cmd      = NULL;
+    q->super.dev_id   = -1;
+    q->super.allocator = NULL;
+    q->super.session_begin_fn = NULL;
+}
+
+static void
+ompi_op_cuda_cmd_queue_destruct(ompi_op_cuda_cmd_queue_t *q)
+{
+    if (NULL != q->stream) {
+        cudaStreamDestroy(q->stream);
+        q->stream = NULL;
+    }
+    if (NULL != q->shutdown) {
+        cudaFree((void *) q->shutdown);
+        q->shutdown = NULL;
+    }
+    if (NULL != q->super.cmd) {
+        cudaFree(q->super.cmd);
+        q->super.cmd = NULL;
+    }
+}
+
+OBJ_CLASS_INSTANCE(ompi_op_cuda_cmd_queue_t,
+                   ompi_op_gpu_cmd_queue_t,
+                   ompi_op_cuda_cmd_queue_construct,
+                   ompi_op_cuda_cmd_queue_destruct);
+
 /* --------------------------------------------------------------------------
  * ompi_op_cuda_cmd_queue_alloc
  *
  * Allocate the expensive GPU resources for one device: a managed-memory
  * command slot, a managed-memory shutdown flag, and a private CUDA stream.
- * Returns NULL if any allocation fails.
+ * Returns the base pointer (ompi_op_gpu_cmd_queue_t *); NULL on failure.
  * -------------------------------------------------------------------------- */
 ompi_op_gpu_cmd_queue_t *
 ompi_op_cuda_cmd_queue_alloc(int dev_id)
 {
-    ompi_op_gpu_cmd_queue_t *queue =
-        (ompi_op_gpu_cmd_queue_t *) malloc(sizeof(ompi_op_gpu_cmd_queue_t));
-    if (NULL == queue) {
-        return NULL;
-    }
-    OBJ_CONSTRUCT(&queue->super, opal_list_item_t);
-
-    ompi_op_cuda_cmd_queue_priv_t *priv =
-        (ompi_op_cuda_cmd_queue_priv_t *) malloc(sizeof(ompi_op_cuda_cmd_queue_priv_t));
-    if (NULL == priv) {
-        free(queue);
+    ompi_op_cuda_cmd_queue_t *q = OBJ_NEW(ompi_op_cuda_cmd_queue_t);
+    if (NULL == q) {
         return NULL;
     }
 
     cudaError_t err;
 
-    /* Allocate managed-memory command slot (accessible by both CPU and GPU) */
-    err = cudaMallocManaged((void **) &queue->cmd,
+    err = cudaMallocManaged((void **) &q->super.cmd,
                             sizeof(ompi_op_gpu_cmd_t),
                             cudaMemAttachGlobal);
     if (cudaSuccess != err) {
-        free(priv);
-        free(queue);
+        OBJ_RELEASE(q);
         return NULL;
     }
-    queue->cmd->src1   = NULL;
-    queue->cmd->src2   = NULL;
-    queue->cmd->dst    = NULL;
-    queue->cmd->count  = 0;
-    queue->cmd->status = 0;
-
-    /* Allocate managed-memory shutdown flag */
-    err = cudaMallocManaged((void **) &priv->shutdown,
+    q->super.cmd->src1   = NULL;
+    q->super.cmd->src2   = NULL;
+    q->super.cmd->dst    = NULL;
+    q->super.cmd->count  = 0;
+    q->super.cmd->status = 0;
+
+    err = cudaMallocManaged((void **) &q->shutdown,
                             sizeof(int32_t),
                             cudaMemAttachGlobal);
     if (cudaSuccess != err) {
-        cudaFree(queue->cmd);
-        free(priv);
-        free(queue);
+        OBJ_RELEASE(q);
         return NULL;
     }
-    *priv->shutdown = 0;
+    *q->shutdown = 0;
 
-    /* Create a dedicated non-blocking stream for this cmd_queue */
-    err = cudaStreamCreateWithFlags(&priv->stream, cudaStreamNonBlocking);
+    err = cudaStreamCreateWithFlags(&q->stream, cudaStreamNonBlocking);
     if (cudaSuccess != err) {
-        cudaFree(priv->shutdown);
-        cudaFree(queue->cmd);
-        free(priv);
-        free(queue);
+        OBJ_RELEASE(q);
         return NULL;
     }
 
-    queue->dev_id    = dev_id;
-    queue->allocator = opal_accelerator_base_get_device_allocator(dev_id);
-    queue->priv      = priv;
-    return queue;
-}
-
-/* --------------------------------------------------------------------------
- * ompi_op_cuda_cmd_queue_free
- *
- * Release the CUDA stream, managed memory, and component-private state.
- * Does NOT free the ompi_op_gpu_cmd_queue_t struct itself.
- * -------------------------------------------------------------------------- */
-void
-ompi_op_cuda_cmd_queue_free(ompi_op_gpu_cmd_queue_t *queue)
-{
-    ompi_op_cuda_cmd_queue_priv_t *priv =
-        (ompi_op_cuda_cmd_queue_priv_t *) queue->priv;
-    if (NULL == priv) {
-        return;
-    }
-
-    cudaStreamDestroy(priv->stream);
-    cudaFree((void *) priv->shutdown);
-    cudaFree(queue->cmd);
-    free(priv);
-    queue->priv = NULL;
-    queue->cmd  = NULL;
+    q->super.dev_id    = dev_id;
+    q->super.allocator = opal_accelerator_base_get_device_allocator(dev_id);
+    return &q->super;
 }
 
 /* --------------------------------------------------------------------------
@@ -173,19 +166,18 @@ ompi_op_cuda_session_begin(ompi_op_gpu_cmd_queue_t *queue,
         return NULL;
     }
 
-    ompi_op_cuda_cmd_queue_priv_t *priv =
-        (ompi_op_cuda_cmd_queue_priv_t *) queue->priv;
+    ompi_op_cuda_cmd_queue_t *cq = (ompi_op_cuda_cmd_queue_t *) queue;
 
     /* Reset queue state for the new kernel */
-    *priv->shutdown    = 0;
-    queue->cmd->src1   = NULL;
-    queue->cmd->src2   = NULL;
-    queue->cmd->dst    = NULL;
-    queue->cmd->count  = 0;
-    queue->cmd->status = 0;
+    *cq->shutdown        = 0;
+    queue->cmd->src1     = NULL;
+    queue->cmd->src2     = NULL;
+    queue->cmd->dst      = NULL;
+    queue->cmd->count    = 0;
+    queue->cmd->status   = 0;
 
     /* Launch the persistent kernel (1 block, 256 threads) */
-    launcher(queue->cmd, priv->shutdown, priv->stream);
+    launcher(queue->cmd, cq->shutdown, cq->stream);
     cudaError_t err = cudaGetLastError();
     if (cudaSuccess != err) {
         return NULL;
@@ -241,13 +233,12 @@ ompi_op_cuda_session_reduce(ompi_op_gpu_session_t *session,
 static void
 ompi_op_cuda_session_stop(ompi_op_gpu_session_t *session)
 {
-    ompi_op_cuda_cmd_queue_priv_t *priv =
-        (ompi_op_cuda_cmd_queue_priv_t *) session->queue->priv;
+    ompi_op_cuda_cmd_queue_t *cq = (ompi_op_cuda_cmd_queue_t *) session->queue;
 
     /* Signal the kernel to exit its loop */
-    *priv->shutdown = 1;
+    *cq->shutdown = 1;
     __atomic_thread_fence(__ATOMIC_SEQ_CST);
 
     /* Wait for the kernel to finish; stream remains valid after this */
-    cudaStreamSynchronize(priv->stream);
+    cudaStreamSynchronize(cq->stream);
 }
diff --git a/ompi/mca/op/op.h b/ompi/mca/op/op.h
index 0777aeddfab..ffb21da034a 100644
--- a/ompi/mca/op/op.h
+++ b/ompi/mca/op/op.h
@@ -339,14 +339,6 @@ struct ompi_op_gpu_session_t;
 typedef struct ompi_op_gpu_cmd_queue_t *
   (*ompi_op_base_component_cmd_queue_alloc_fn_t)(int dev_id);
 
-/**
- * Optional component hook: release the managed memory, GPU stream, and
- * component-private state owned by the cmd_queue.
- * Must NOT free the ompi_op_gpu_cmd_queue_t struct itself.
- */
-typedef void
-  (*ompi_op_base_component_cmd_queue_free_fn_t)(struct ompi_op_gpu_cmd_queue_t *queue);
-
 /**
  * Optional component hook: look up the GPU kernel for (op, dtype), reset the
  * cmd_queue state, and launch the persistent kernel on the queue's stream.
@@ -378,7 +370,6 @@ typedef struct ompi_op_base_component_1_0_0_t {
 
     /** Optional: GPU cmd_queue and session hooks.  NULL in host-only components. */
     ompi_op_base_component_cmd_queue_alloc_fn_t  opc_cmd_queue_alloc;
-    ompi_op_base_component_cmd_queue_free_fn_t   opc_cmd_queue_free;
     ompi_op_base_component_session_begin_fn_t    opc_session_begin;
 } ompi_op_base_component_1_0_0_t;
 
diff --git a/ompi/mca/op/rocm/op_rocm.h b/ompi/mca/op/rocm/op_rocm.h
index 75410d2bbbd..2c35902879a 100644
--- a/ompi/mca/op/rocm/op_rocm.h
+++ b/ompi/mca/op/rocm/op_rocm.h
@@ -21,14 +21,17 @@
 BEGIN_C_DECLS
 
 /**
- * Component-private state stored in ompi_op_gpu_cmd_queue_t.priv.
- * Holds the GPU stream and shutdown flag; the command slot lives in the
- * public cmd field of ompi_op_gpu_cmd_queue_t.
+ * ROCm-specific cmd_queue.  Inherits ompi_op_gpu_cmd_queue_t by placing it
+ * as the first member named "super".  The HIP stream and shutdown flag are
+ * stored directly here rather than in a separate priv allocation.
+ * Allocated with OBJ_NEW; the OBJ destructor chain releases GPU resources.
  */
-typedef struct {
-    volatile int32_t *shutdown;  /* managed-memory shutdown flag */
-    hipStream_t       stream;    /* private HIP stream for this cmd_queue */
-} ompi_op_rocm_cmd_queue_priv_t;
+typedef struct ompi_op_rocm_cmd_queue_t {
+    ompi_op_gpu_cmd_queue_t  super;       /* MUST be first */
+    volatile int32_t        *shutdown;    /* managed-memory shutdown flag */
+    hipStream_t              stream;      /* private HIP stream */
+} ompi_op_rocm_cmd_queue_t;
+OBJ_CLASS_DECLARATION(ompi_op_rocm_cmd_queue_t);
 
 /**
  * Host-side launcher function type.
diff --git a/ompi/mca/op/rocm/op_rocm_component.c b/ompi/mca/op/rocm/op_rocm_component.c
index 40d2425079a..09c1cbdb08a 100644
--- a/ompi/mca/op/rocm/op_rocm_component.c
+++ b/ompi/mca/op/rocm/op_rocm_component.c
@@ -22,7 +22,6 @@
 
 /* Forward declarations of hooks implemented in op_rocm_session.c */
 ompi_op_gpu_cmd_queue_t *ompi_op_rocm_cmd_queue_alloc(int dev_id);
-void ompi_op_rocm_cmd_queue_free(ompi_op_gpu_cmd_queue_t *queue);
 ompi_op_gpu_session_t *ompi_op_rocm_session_begin(ompi_op_gpu_cmd_queue_t *queue,
                                                    struct ompi_op_t *op,
                                                    struct ompi_datatype_t *dtype);
@@ -56,7 +55,6 @@ ompi_op_base_component_1_0_0_t mca_op_rocm_component = {
 
     /* GPU session hooks */
     .opc_cmd_queue_alloc = ompi_op_rocm_cmd_queue_alloc,
-    .opc_cmd_queue_free  = ompi_op_rocm_cmd_queue_free,
     .opc_session_begin   = ompi_op_rocm_session_begin,
 };
 MCA_BASE_COMPONENT_INIT(ompi, op, rocm)
diff --git a/ompi/mca/op/rocm/op_rocm_session.c b/ompi/mca/op/rocm/op_rocm_session.c
index ed3468649c7..b445d5fc3f5 100644
--- a/ompi/mca/op/rocm/op_rocm_session.c
+++ b/ompi/mca/op/rocm/op_rocm_session.c
@@ -13,11 +13,10 @@
  * Session lifecycle for the ROCm persistent-kernel op component.
  * Mirrors op_cuda_session.c with hip* API calls in place of cuda*.
  *
- * cmd_queue_alloc: allocate managed-memory command slot + shutdown flag
- *                  and create a private HIP stream.
- *
- * cmd_queue_free:  release the HIP stream, managed memory, and
- *                  component-private state.
+ * ompi_op_rocm_cmd_queue_t inherits ompi_op_gpu_cmd_queue_t.  OBJ_NEW
+ * allocates the object; the OBJ destructor releases the HIP stream and
+ * managed memory.  The component returns the base pointer from alloc so
+ * callers need no knowledge of the concrete type.
  *
  * session_begin:   look up the kernel for (op, dtype), reset the cmd_queue
  *                  state, and launch the persistent kernel on the existing
@@ -55,92 +54,89 @@ static void ompi_op_rocm_session_reduce(ompi_op_gpu_session_t *session,
                                          void *dst, size_t count);
 static void ompi_op_rocm_session_stop(ompi_op_gpu_session_t *session);
 
+/* --------------------------------------------------------------------------
+ * OBJ constructor / destructor for ompi_op_rocm_cmd_queue_t
+ * -------------------------------------------------------------------------- */
+static void
+ompi_op_rocm_cmd_queue_construct(ompi_op_rocm_cmd_queue_t *q)
+{
+    q->shutdown       = NULL;
+    q->stream         = NULL;
+    q->super.cmd      = NULL;
+    q->super.dev_id   = -1;
+    q->super.allocator = NULL;
+    q->super.session_begin_fn = NULL;
+}
+
+static void
+ompi_op_rocm_cmd_queue_destruct(ompi_op_rocm_cmd_queue_t *q)
+{
+    if (NULL != q->stream) {
+        hipStreamDestroy(q->stream);
+        q->stream = NULL;
+    }
+    if (NULL != q->shutdown) {
+        hipFree((void *) q->shutdown);
+        q->shutdown = NULL;
+    }
+    if (NULL != q->super.cmd) {
+        hipFree(q->super.cmd);
+        q->super.cmd = NULL;
+    }
+}
+
+OBJ_CLASS_INSTANCE(ompi_op_rocm_cmd_queue_t,
+                   ompi_op_gpu_cmd_queue_t,
+                   ompi_op_rocm_cmd_queue_construct,
+                   ompi_op_rocm_cmd_queue_destruct);
+
 /* --------------------------------------------------------------------------
  * ompi_op_rocm_cmd_queue_alloc
  * -------------------------------------------------------------------------- */
 ompi_op_gpu_cmd_queue_t *
 ompi_op_rocm_cmd_queue_alloc(int dev_id)
 {
-    ompi_op_gpu_cmd_queue_t *queue =
-        (ompi_op_gpu_cmd_queue_t *) malloc(sizeof(ompi_op_gpu_cmd_queue_t));
-    if (NULL == queue) {
-        return NULL;
-    }
-    OBJ_CONSTRUCT(&queue->super, opal_list_item_t);
-
-    ompi_op_rocm_cmd_queue_priv_t *priv =
-        (ompi_op_rocm_cmd_queue_priv_t *) malloc(sizeof(ompi_op_rocm_cmd_queue_priv_t));
-    if (NULL == priv) {
-        free(queue);
+    ompi_op_rocm_cmd_queue_t *q = OBJ_NEW(ompi_op_rocm_cmd_queue_t);
+    if (NULL == q) {
         return NULL;
     }
 
     hipError_t err;
 
     /* Allocate managed-memory command slot (accessible by both CPU and GPU) */
-    err = hipMallocManaged((void **) &queue->cmd,
+    err = hipMallocManaged((void **) &q->super.cmd,
                            sizeof(ompi_op_gpu_cmd_t),
                            hipMemAttachGlobal);
     if (hipSuccess != err) {
-        free(priv);
-        free(queue);
+        OBJ_RELEASE(q);
         return NULL;
     }
-    queue->cmd->src1   = NULL;
-    queue->cmd->src2   = NULL;
-    queue->cmd->dst    = NULL;
-    queue->cmd->count  = 0;
-    queue->cmd->status = 0;
+    q->super.cmd->src1   = NULL;
+    q->super.cmd->src2   = NULL;
+    q->super.cmd->dst    = NULL;
+    q->super.cmd->count  = 0;
+    q->super.cmd->status = 0;
 
     /* Allocate managed-memory shutdown flag */
-    err = hipMallocManaged((void **) &priv->shutdown,
+    err = hipMallocManaged((void **) &q->shutdown,
                            sizeof(int32_t),
                            hipMemAttachGlobal);
     if (hipSuccess != err) {
-        hipFree(queue->cmd);
-        free(priv);
-        free(queue);
+        OBJ_RELEASE(q);
         return NULL;
     }
-    *priv->shutdown = 0;
+    *q->shutdown = 0;
 
     /* Create a dedicated non-blocking stream for this cmd_queue */
-    err = hipStreamCreateWithFlags(&priv->stream, hipStreamNonBlocking);
+    err = hipStreamCreateWithFlags(&q->stream, hipStreamNonBlocking);
     if (hipSuccess != err) {
-        hipFree(priv->shutdown);
-        hipFree(queue->cmd);
-        free(priv);
-        free(queue);
+        OBJ_RELEASE(q);
         return NULL;
     }
 
-    queue->dev_id    = dev_id;
-    queue->allocator = opal_accelerator_base_get_device_allocator(dev_id);
-    queue->priv      = priv;
-    return queue;
-}
-
-/* --------------------------------------------------------------------------
- * ompi_op_rocm_cmd_queue_free
- *
- * Release the HIP stream, managed memory, and component-private state.
- * Does NOT free the ompi_op_gpu_cmd_queue_t struct itself.
- * -------------------------------------------------------------------------- */
-void
-ompi_op_rocm_cmd_queue_free(ompi_op_gpu_cmd_queue_t *queue)
-{
-    ompi_op_rocm_cmd_queue_priv_t *priv =
-        (ompi_op_rocm_cmd_queue_priv_t *) queue->priv;
-    if (NULL == priv) {
-        return;
-    }
-
-    hipStreamDestroy(priv->stream);
-    hipFree((void *) priv->shutdown);
-    hipFree(queue->cmd);
-    free(priv);
-    queue->priv = NULL;
-    queue->cmd  = NULL;
+    q->super.dev_id    = dev_id;
+    q->super.allocator = opal_accelerator_base_get_device_allocator(dev_id);
+    return &q->super;
 }
 
 /* --------------------------------------------------------------------------
@@ -165,11 +161,10 @@ ompi_op_rocm_session_begin(ompi_op_gpu_cmd_queue_t *queue,
         return NULL;
     }
 
-    ompi_op_rocm_cmd_queue_priv_t *priv =
-        (ompi_op_rocm_cmd_queue_priv_t *) queue->priv;
+    ompi_op_rocm_cmd_queue_t *cq = (ompi_op_rocm_cmd_queue_t *) queue;
 
     /* Reset queue state for the new kernel */
-    *priv->shutdown    = 0;
+    *cq->shutdown      = 0;
     queue->cmd->src1   = NULL;
     queue->cmd->src2   = NULL;
     queue->cmd->dst    = NULL;
@@ -177,7 +172,7 @@ ompi_op_rocm_session_begin(ompi_op_gpu_cmd_queue_t *queue,
     queue->cmd->status = 0;
 
     /* Launch the persistent kernel (1 block, 256 threads) */
-    launcher(queue->cmd, priv->shutdown, priv->stream);
+    launcher(queue->cmd, cq->shutdown, cq->stream);
     hipError_t err = hipGetLastError();
     if (hipSuccess != err) {
         return NULL;
@@ -233,13 +228,12 @@ ompi_op_rocm_session_reduce(ompi_op_gpu_session_t *session,
 static void
 ompi_op_rocm_session_stop(ompi_op_gpu_session_t *session)
 {
-    ompi_op_rocm_cmd_queue_priv_t *priv =
-        (ompi_op_rocm_cmd_queue_priv_t *) session->queue->priv;
+    ompi_op_rocm_cmd_queue_t *cq = (ompi_op_rocm_cmd_queue_t *) session->queue;
 
     /* Signal the kernel to exit its loop */
-    *priv->shutdown = 1;
+    *cq->shutdown = 1;
     __atomic_thread_fence(__ATOMIC_SEQ_CST);
 
     /* Wait for the kernel to finish; stream remains valid after this */
-    hipStreamSynchronize(priv->stream);
+    hipStreamSynchronize(cq->stream);
 }
diff --git a/ompi/op/op_gpu_session.c b/ompi/op/op_gpu_session.c
index f393ef2b43c..f9217764e1f 100644
--- a/ompi/op/op_gpu_session.c
+++ b/ompi/op/op_gpu_session.c
@@ -54,6 +54,8 @@
 #include "ompi/op/op_gpu_session.h"
 #include "ompi/op/op.h"
 
+OBJ_CLASS_INSTANCE(ompi_op_gpu_cmd_queue_t, opal_list_item_t, NULL, NULL);
+
 /* Maximum number of idle cmd_queues kept in the pool. */
 #define CMD_QUEUE_POOL_MAX 16
 
@@ -62,13 +64,13 @@ static opal_atomic_int32_t  cmd_queue_pool_count = 0;
 
 /* --------------------------------------------------------------------------
  * cmd_queue_destroy — permanently release a cmd_queue's GPU resources.
+ * OBJ_RELEASE dispatches through the concrete class destructor chain
+ * (e.g. ompi_op_cuda_cmd_queue_t) and frees the allocation.
  * -------------------------------------------------------------------------- */
 static void
 cmd_queue_destroy(ompi_op_gpu_cmd_queue_t *queue)
 {
-    queue->free_fn(queue);   /* component frees stream, managed mem, priv */
-    OBJ_DESTRUCT(&queue->super);
-    free(queue);
+    OBJ_RELEASE(queue);
 }
 
 /* --------------------------------------------------------------------------
@@ -150,7 +152,6 @@ ompi_op_gpu_session_begin(struct ompi_op_t *op,
             (const ompi_op_base_component_1_0_0_t *) bc;
 
         if (NULL == opc->opc_cmd_queue_alloc ||
-            NULL == opc->opc_cmd_queue_free  ||
             NULL == opc->opc_session_begin) {
             continue;
         }
@@ -160,9 +161,8 @@ ompi_op_gpu_session_begin(struct ompi_op_t *op,
             continue;
         }
 
-        /* Wire dispatch hooks into the cmd_queue. */
+        /* Wire session_begin_fn into the cmd_queue. */
         q->session_begin_fn = opc->opc_session_begin;
-        q->free_fn          = opc->opc_cmd_queue_free;
 
         ompi_op_gpu_session_t *session = opc->opc_session_begin(q, op, dtype);
         if (NULL == session) {
diff --git a/ompi/op/op_gpu_session.h b/ompi/op/op_gpu_session.h
index 2e5873a14bf..36cba08fefb 100644
--- a/ompi/op/op_gpu_session.h
+++ b/ompi/op/op_gpu_session.h
@@ -46,27 +46,25 @@ typedef struct {
  * by dev_id so they can be reused across collectives without paying
  * cudaMallocManaged/hipMallocManaged overhead on every call.
  *
- * cmd is public (the host communicates with the kernel through it directly).
- * priv is component-private and holds the stream and shutdown flag.
+ * GPU components (cuda, rocm) inherit from this base by placing it as the
+ * first member named "super" in their own cmd_queue struct, then allocate
+ * with OBJ_NEW and return the base pointer.  Destruction (including GPU
+ * resource cleanup) is dispatched automatically through the OBJ class chain.
  *
- * session_begin_fn and free_fn are managed by op_gpu_session.c
- * and must not be set by callers.
+ * session_begin_fn is wired by ompi_op_gpu_session_begin() in op_gpu_session.c.
  */
 typedef struct ompi_op_gpu_cmd_queue_t {
     opal_list_item_t             super;       /* MUST be first: used by opal_lifo_t pool */
     int                          dev_id;
     mca_allocator_base_module_t *allocator;  /* GPU scratch allocator for this device */
     ompi_op_gpu_cmd_t           *cmd;        /* managed memory — shared with GPU */
-    void                        *priv;       /* component-private: stream, shutdown flag */
-    /* Session creation hook — wired at cmd_queue_alloc time by op_gpu_session.c. */
+    /* Session creation hook — wired by op_gpu_session.c, not by the component. */
     struct ompi_op_gpu_session_t *(*session_begin_fn)(
         struct ompi_op_gpu_cmd_queue_t *queue,
         struct ompi_op_t *op,
         struct ompi_datatype_t *dtype);
-    /* Release managed memory, GPU stream, and priv.
-     * Must NOT free the ompi_op_gpu_cmd_queue_t struct itself. */
-    void (*free_fn)(struct ompi_op_gpu_cmd_queue_t *queue);
 } ompi_op_gpu_cmd_queue_t;
+OBJ_CLASS_DECLARATION(ompi_op_gpu_cmd_queue_t);
 
 /**
  * Per-collective GPU reduction session.  Created by ompi_op_gpu_session_begin()

From 65e65ff532525013ae7d5aafc79213f2bedbdd5e Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Sat, 30 May 2026 10:47:04 -0400
Subject: [PATCH 10/13] Make op/cuda and op/rocm dso-by-default

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 config/opal_mca.m4 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/opal_mca.m4 b/config/opal_mca.m4
index bb51d3bc5f1..7c597166289 100644
--- a/config/opal_mca.m4
+++ b/config/opal_mca.m4
@@ -186,7 +186,7 @@ of type-component pairs.  For example, --enable-mca-no-build=pml-ob1])
     else
        msg=
        if test -z "$enable_mca_dso"; then
-           enable_mca_dso="accelerator-cuda,accelerator-rocm,accelerator-ze"
+           enable_mca_dso="accelerator-cuda,accelerator-rocm,accelerator-ze,op-cuda,op-rocm"
            msg="(default)"
        fi
        DSO_all=0

From 573322685a241ed7d087959de36379dd84314405 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Sat, 30 May 2026 10:48:18 -0400
Subject: [PATCH 11/13] Fix NVCC include paths

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ompi/mca/op/cuda/Makefile.am | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/ompi/mca/op/cuda/Makefile.am b/ompi/mca/op/cuda/Makefile.am
index c826455d9a7..7e66d80d09e 100644
--- a/ompi/mca/op/cuda/Makefile.am
+++ b/ompi/mca/op/cuda/Makefile.am
@@ -28,7 +28,11 @@ NVCC_INCLUDES = \
     -I$(top_srcdir) \
     -I$(top_builddir) \
     -I$(top_srcdir)/ompi \
-    -I$(top_builddir)/ompi
+    -I$(top_builddir)/ompi \
+    -I$(top_builddir)/opal \
+    -I$(top_builddir)/opal/include \
+    -I$(top_srcdir)/ompi/include \
+    -I$(top_srcdir)/opal/include
 
 op_cuda_kernels.o: $(srcdir)/op_cuda_kernels.cu \
                    $(srcdir)/op_cuda.h

From 24af1d248c68c2bdb1a5bf1cc8ad7da015ba0662 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Sun, 31 May 2026 11:02:08 -0400
Subject: [PATCH 12/13] Fix CUDA build, static initializers not supported

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ompi/mca/op/cuda/Makefile.am         |  19 +++-
 ompi/mca/op/cuda/configure.m4        |  55 ++++++---
 ompi/mca/op/cuda/op_cuda.h           |   9 ++
 ompi/mca/op/cuda/op_cuda_component.c |   6 +-
 ompi/mca/op/cuda/op_cuda_kernels.cu  | 160 +++++++++++++--------------
 ompi/mca/op/rocm/op_rocm.h           |   9 ++
 ompi/mca/op/rocm/op_rocm_component.c |   6 +-
 ompi/mca/op/rocm/op_rocm_kernels.cpp | 159 +++++++++++++-------------
 8 files changed, 229 insertions(+), 194 deletions(-)

diff --git a/ompi/mca/op/cuda/Makefile.am b/ompi/mca/op/cuda/Makefile.am
index 7e66d80d09e..b8d8ee71dd7 100644
--- a/ompi/mca/op/cuda/Makefile.am
+++ b/ompi/mca/op/cuda/Makefile.am
@@ -20,7 +20,7 @@ sources = \
 # builds.
 
 EXTRA_DIST  = op_cuda_kernels.cu
-CLEANFILES  = op_cuda_kernels.o
+CLEANFILES  = op_cuda_kernels.o op_cuda_kernels.lo
 
 # Include paths forwarded to nvcc so it can find ompi_config.h and the
 # op/mca headers.
@@ -34,13 +34,24 @@ NVCC_INCLUDES = \
     -I$(top_srcdir)/ompi/include \
     -I$(top_srcdir)/opal/include
 
+# Compile the .cu file with nvcc.  Always pass -fPIC so the same object can
+# be used for both static and DSO builds.  -D_Float16=short papers over a
+# GCC extension (_Float16) that NVCC's host-compiler frontend does not support.
 op_cuda_kernels.o: $(srcdir)/op_cuda_kernels.cu \
                    $(srcdir)/op_cuda.h
 	$(NVCC) $(NVCCFLAGS) $(NVCC_INCLUDES) \
 	    $(op_cuda_CPPFLAGS) \
-	    --compiler-options "$(DEFS)" \
+	    --compiler-options "$(DEFS) -D_Float16=short -fPIC" \
 	    -c $< -o $@
 
+# Wrap the nvcc output in a libtool object file (.lo) so it can be listed in
+# the LIBADD of both the DSO and the static (noinst) libtool libraries.
+op_cuda_kernels.lo: op_cuda_kernels.o
+	@{ echo '# Generated by libtool (CUDA kernel; PIC via --compiler-options)'; \
+	   echo "pic_object='op_cuda_kernels.o'"; \
+	   echo "non_pic_object='op_cuda_kernels.o'"; \
+	} > $@
+
 AM_CPPFLAGS = $(op_cuda_CPPFLAGS)
 
 # ----------------------------------------------------------------------------
@@ -59,7 +70,7 @@ mcacomponent_LTLIBRARIES = $(component_install)
 
 mca_op_cuda_la_SOURCES  = $(sources)
 mca_op_cuda_la_LDFLAGS  = -module -avoid-version $(op_cuda_LDFLAGS)
-mca_op_cuda_la_LIBADD   = $(op_cuda_LIBS) op_cuda_kernels.o
+mca_op_cuda_la_LIBADD   = $(op_cuda_LIBS) op_cuda_kernels.lo
 mca_op_cuda_la_CPPFLAGS = $(op_cuda_CPPFLAGS)
 
 # ----------------------------------------------------------------------------
@@ -69,5 +80,5 @@ noinst_LTLIBRARIES = $(component_noinst)
 
 libmca_op_cuda_la_SOURCES  = $(sources)
 libmca_op_cuda_la_LDFLAGS  = -module -avoid-version $(op_cuda_LDFLAGS)
-libmca_op_cuda_la_LIBADD   = $(op_cuda_LIBS) op_cuda_kernels.o
+libmca_op_cuda_la_LIBADD   = $(op_cuda_LIBS) op_cuda_kernels.lo
 libmca_op_cuda_la_CPPFLAGS = $(op_cuda_CPPFLAGS)
diff --git a/ompi/mca/op/cuda/configure.m4 b/ompi/mca/op/cuda/configure.m4
index 02081ab2090..71096bca74d 100644
--- a/ompi/mca/op/cuda/configure.m4
+++ b/ompi/mca/op/cuda/configure.m4
@@ -12,11 +12,15 @@
 # MCA_ompi_op_cuda_CONFIG([action-if-can-compile],
 #                          [action-if-cant-compile])
 # ------------------------------------------------
-# Build the CUDA persistent-kernel op component only when the CUDA
-# runtime (libcudart + cuda_runtime.h) and nvcc are available.
+# Build the CUDA persistent-kernel op component when the CUDA runtime
+# toolkit (cuda_runtime.h, libcudart, nvcc) is available.
 #
-# Requires that OPAL_CHECK_CUDA has already been called (which sets
-# $CUDA_SUPPORT, $opal_cuda_incdir, and $with_cuda).
+# Deliberately does NOT require CUDA_SUPPORT=1 (which gates on libcuda.so,
+# the GPU driver API library).  The op/cuda component only uses the runtime
+# API and can therefore be compiled in build environments that have the CUDA
+# toolkit installed but no GPU driver (e.g., CI containers, cross-build nodes).
+#
+# Requires --with-cuda[=DIR] to locate the toolkit.
 #
 # Sets:
 #   op_cuda_CPPFLAGS — include path for cuda_runtime.h
@@ -28,36 +32,51 @@
 AC_DEFUN([MCA_ompi_op_cuda_CONFIG],[
     AC_CONFIG_FILES([ompi/mca/op/cuda/Makefile])
 
-    # Ensure the top-level CUDA driver-API check has been performed.
+    # Ensure with_cuda is defined (OPAL_CHECK_CUDA parses --with-cuda).
     AC_REQUIRE([OPAL_CHECK_CUDA])
 
-    OPAL_VAR_SCOPE_PUSH([op_cuda_happy op_cuda_save_CPPFLAGS op_cuda_save_LDFLAGS op_cuda_save_LIBS op_cuda_libdir op_cuda_nvcc_path])
+    OPAL_VAR_SCOPE_PUSH([op_cuda_save_CPPFLAGS op_cuda_save_LDFLAGS op_cuda_save_LIBS op_cuda_libdir op_cuda_nvcc_path op_cuda_incdir])
 
     op_cuda_happy=no
+    op_cuda_incdir=""
 
-    AS_IF([test "x$CUDA_SUPPORT" = "x1"],
+    # Only attempt a build when the user asked for CUDA (--with-cuda[=DIR]).
+    AS_IF([test "x$with_cuda" != "x" && test "$with_cuda" != "no"],
       [
+        # Derive the include directory from $with_cuda, mirroring OPAL_CHECK_CUDA.
+        AS_IF([test -f "${with_cuda}/include/cuda_runtime.h"],
+              [op_cuda_incdir="${with_cuda}/include"],
+              [AS_IF([test -f "${with_cuda}/cuda_runtime.h"],
+                     [op_cuda_incdir="${with_cuda}"],
+                     [AS_IF([test -f "/usr/local/cuda/include/cuda_runtime.h"],
+                            [op_cuda_incdir="/usr/local/cuda/include"])])])
+
         op_cuda_save_CPPFLAGS="$CPPFLAGS"
         op_cuda_save_LDFLAGS="$LDFLAGS"
         op_cuda_save_LIBS="$LIBS"
 
-        CPPFLAGS="-I$opal_cuda_incdir $CPPFLAGS"
+        AS_IF([test -n "$op_cuda_incdir"],
+              [CPPFLAGS="-I$op_cuda_incdir $CPPFLAGS"])
 
-        # Verify that the runtime header is present alongside cuda.h.
+        # Verify the runtime header is present.
         AC_CHECK_HEADER([cuda_runtime.h],
           [op_cuda_happy=yes],
           [AC_MSG_WARN([cuda_runtime.h not found; skipping op/cuda component])
            op_cuda_happy=no])
 
-        # Locate libcudart — prefer lib64, fall back to lib.
+        # Locate libcudart — prefer lib64, fall back to lib, then /usr/local/cuda.
         AS_IF([test "$op_cuda_happy" = "yes"],
           [op_cuda_libdir=""
-           AS_IF([test -d "$with_cuda/lib64"],
-                 [op_cuda_libdir="$with_cuda/lib64"],
-                 [AS_IF([test -d "$with_cuda/lib"],
-                        [op_cuda_libdir="$with_cuda/lib"],
-                        [AS_IF([test -d "/usr/local/cuda/lib64"],
-                               [op_cuda_libdir="/usr/local/cuda/lib64"])])])
+           AS_IF([test "$with_cuda" != "yes"],
+                 [AS_IF([test -d "$with_cuda/lib64"],
+                        [op_cuda_libdir="$with_cuda/lib64"],
+                        [AS_IF([test -d "$with_cuda/lib"],
+                               [op_cuda_libdir="$with_cuda/lib"])])])
+           AS_IF([test -z "$op_cuda_libdir"],
+                 [AS_IF([test -d "/usr/local/cuda/lib64"],
+                        [op_cuda_libdir="/usr/local/cuda/lib64"],
+                        [AS_IF([test -d "/usr/local/cuda/lib"],
+                               [op_cuda_libdir="/usr/local/cuda/lib"])])])
            AS_IF([test -n "$op_cuda_libdir"],
                  [LDFLAGS="-L$op_cuda_libdir $LDFLAGS"])
            AC_CHECK_LIB([cudart], [cudaGetDeviceCount],
@@ -69,7 +88,7 @@ AC_DEFUN([MCA_ompi_op_cuda_CONFIG],[
         # Locate nvcc.
         AS_IF([test "$op_cuda_happy" = "yes"],
           [op_cuda_nvcc_path="$PATH"
-           AS_IF([test -d "$with_cuda/bin"],
+           AS_IF([test "$with_cuda" != "yes" && test -d "$with_cuda/bin"],
                  [op_cuda_nvcc_path="$with_cuda/bin:$PATH"])
            AC_PATH_PROG([NVCC], [nvcc], [not_found], [$op_cuda_nvcc_path])
            AS_IF([test "$NVCC" = "not_found"],
@@ -79,7 +98,7 @@ AC_DEFUN([MCA_ompi_op_cuda_CONFIG],[
 
         # Populate the output variables.
         AS_IF([test "$op_cuda_happy" = "yes"],
-          [op_cuda_CPPFLAGS="-I$opal_cuda_incdir"
+          [op_cuda_CPPFLAGS="-I$op_cuda_incdir"
            AS_IF([test -n "$op_cuda_libdir"],
                  [op_cuda_LDFLAGS="-L$op_cuda_libdir"],
                  [op_cuda_LDFLAGS=""])
diff --git a/ompi/mca/op/cuda/op_cuda.h b/ompi/mca/op/cuda/op_cuda.h
index e6da8c73ecf..45930c8351f 100644
--- a/ompi/mca/op/cuda/op_cuda.h
+++ b/ompi/mca/op/cuda/op_cuda.h
@@ -52,6 +52,15 @@ typedef void (*ompi_op_cuda_launcher_fn_t)(ompi_op_gpu_cmd_t *cmd,
 OMPI_DECLSPEC extern ompi_op_cuda_launcher_fn_t
 ompi_op_cuda_kernel_fns[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
 
+/* Defined in op_cuda_kernels.cu (extern "C") */
+void ompi_op_cuda_kernel_fns_init(void);
+
+/* Defined in op_cuda_session.c */
+ompi_op_gpu_cmd_queue_t *ompi_op_cuda_cmd_queue_alloc(int dev_id);
+ompi_op_gpu_session_t *ompi_op_cuda_session_begin(ompi_op_gpu_cmd_queue_t *queue,
+                                                   struct ompi_op_t *op,
+                                                   struct ompi_datatype_t *dtype);
+
 END_C_DECLS
 
 #endif /* OMPI_MCA_OP_CUDA_H */
diff --git a/ompi/mca/op/cuda/op_cuda_component.c b/ompi/mca/op/cuda/op_cuda_component.c
index 75b7c642128..ea1473c45fb 100644
--- a/ompi/mca/op/cuda/op_cuda_component.c
+++ b/ompi/mca/op/cuda/op_cuda_component.c
@@ -20,11 +20,6 @@
 #include "ompi/op/op_gpu_session.h"
 #include "ompi/mca/op/cuda/op_cuda.h"
 
-/* Forward declarations of hooks implemented in op_cuda_session.c */
-ompi_op_gpu_cmd_queue_t *ompi_op_cuda_cmd_queue_alloc(int dev_id);
-ompi_op_gpu_session_t *ompi_op_cuda_session_begin(ompi_op_gpu_cmd_queue_t *queue,
-                                                   struct ompi_op_t *op,
-                                                   struct ompi_datatype_t *dtype);
 
 static int cuda_component_open(void);
 static int cuda_component_close(void);
@@ -66,6 +61,7 @@ MCA_BASE_COMPONENT_INIT(ompi, op, cuda)
 static int
 cuda_component_open(void)
 {
+    ompi_op_cuda_kernel_fns_init();
     return OMPI_SUCCESS;
 }
 
diff --git a/ompi/mca/op/cuda/op_cuda_kernels.cu b/ompi/mca/op/cuda/op_cuda_kernels.cu
index 5708198e9fa..03b1b11bd7a 100644
--- a/ompi/mca/op/cuda/op_cuda_kernels.cu
+++ b/ompi/mca/op/cuda/op_cuda_kernels.cu
@@ -194,94 +194,92 @@ LAUNCHER(bxor_int64)  LAUNCHER(bxor_uint64)
  *
  * Indexed by OMPI_OP_BASE_FORTRAN_* (rows) × OMPI_OP_BASE_TYPE_* (columns).
  * Zero/NULL entries mean "not supported on GPU" → host fallback.
+ *
+ * Zero-initialized here; filled at component initialization time by
+ * ompi_op_cuda_kernel_fns_init().  The init function uses plain assignment
+ * instead of designated initializers to stay compatible with NVCC's C++
+ * frontend, which does not support non-trivial designated initializers.
  * ========================================================================= */
 ompi_op_cuda_launcher_fn_t
-ompi_op_cuda_kernel_fns[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = {
+ompi_op_cuda_kernel_fns[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
+
+void
+ompi_op_cuda_kernel_fns_init(void)
+{
+#define SET(op, type, fn) \
+    ompi_op_cuda_kernel_fns[OMPI_OP_BASE_FORTRAN_##op][OMPI_OP_BASE_TYPE_##type] = (fn)
 
-    [OMPI_OP_BASE_FORTRAN_MAX] = {
-        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_max_int8,
-        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_max_uint8,
-        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_max_int16,
-        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_max_uint16,
-        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_max_int32,
-        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_max_uint32,
-        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_max_int64,
-        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_max_uint64,
-        [OMPI_OP_BASE_TYPE_FLOAT]    = launch_max_float,
-        [OMPI_OP_BASE_TYPE_DOUBLE]   = launch_max_double,
-    },
+    SET(MAX, INT8_T,   launch_max_int8);
+    SET(MAX, UINT8_T,  launch_max_uint8);
+    SET(MAX, INT16_T,  launch_max_int16);
+    SET(MAX, UINT16_T, launch_max_uint16);
+    SET(MAX, INT32_T,  launch_max_int32);
+    SET(MAX, UINT32_T, launch_max_uint32);
+    SET(MAX, INT64_T,  launch_max_int64);
+    SET(MAX, UINT64_T, launch_max_uint64);
+    SET(MAX, FLOAT,    launch_max_float);
+    SET(MAX, DOUBLE,   launch_max_double);
 
-    [OMPI_OP_BASE_FORTRAN_MIN] = {
-        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_min_int8,
-        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_min_uint8,
-        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_min_int16,
-        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_min_uint16,
-        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_min_int32,
-        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_min_uint32,
-        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_min_int64,
-        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_min_uint64,
-        [OMPI_OP_BASE_TYPE_FLOAT]    = launch_min_float,
-        [OMPI_OP_BASE_TYPE_DOUBLE]   = launch_min_double,
-    },
+    SET(MIN, INT8_T,   launch_min_int8);
+    SET(MIN, UINT8_T,  launch_min_uint8);
+    SET(MIN, INT16_T,  launch_min_int16);
+    SET(MIN, UINT16_T, launch_min_uint16);
+    SET(MIN, INT32_T,  launch_min_int32);
+    SET(MIN, UINT32_T, launch_min_uint32);
+    SET(MIN, INT64_T,  launch_min_int64);
+    SET(MIN, UINT64_T, launch_min_uint64);
+    SET(MIN, FLOAT,    launch_min_float);
+    SET(MIN, DOUBLE,   launch_min_double);
 
-    [OMPI_OP_BASE_FORTRAN_SUM] = {
-        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_sum_int8,
-        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_sum_uint8,
-        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_sum_int16,
-        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_sum_uint16,
-        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_sum_int32,
-        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_sum_uint32,
-        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_sum_int64,
-        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_sum_uint64,
-        [OMPI_OP_BASE_TYPE_FLOAT]    = launch_sum_float,
-        [OMPI_OP_BASE_TYPE_DOUBLE]   = launch_sum_double,
-    },
+    SET(SUM, INT8_T,   launch_sum_int8);
+    SET(SUM, UINT8_T,  launch_sum_uint8);
+    SET(SUM, INT16_T,  launch_sum_int16);
+    SET(SUM, UINT16_T, launch_sum_uint16);
+    SET(SUM, INT32_T,  launch_sum_int32);
+    SET(SUM, UINT32_T, launch_sum_uint32);
+    SET(SUM, INT64_T,  launch_sum_int64);
+    SET(SUM, UINT64_T, launch_sum_uint64);
+    SET(SUM, FLOAT,    launch_sum_float);
+    SET(SUM, DOUBLE,   launch_sum_double);
 
-    [OMPI_OP_BASE_FORTRAN_PROD] = {
-        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_prod_int8,
-        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_prod_uint8,
-        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_prod_int16,
-        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_prod_uint16,
-        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_prod_int32,
-        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_prod_uint32,
-        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_prod_int64,
-        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_prod_uint64,
-        [OMPI_OP_BASE_TYPE_FLOAT]    = launch_prod_float,
-        [OMPI_OP_BASE_TYPE_DOUBLE]   = launch_prod_double,
-    },
+    SET(PROD, INT8_T,   launch_prod_int8);
+    SET(PROD, UINT8_T,  launch_prod_uint8);
+    SET(PROD, INT16_T,  launch_prod_int16);
+    SET(PROD, UINT16_T, launch_prod_uint16);
+    SET(PROD, INT32_T,  launch_prod_int32);
+    SET(PROD, UINT32_T, launch_prod_uint32);
+    SET(PROD, INT64_T,  launch_prod_int64);
+    SET(PROD, UINT64_T, launch_prod_uint64);
+    SET(PROD, FLOAT,    launch_prod_float);
+    SET(PROD, DOUBLE,   launch_prod_double);
 
-    [OMPI_OP_BASE_FORTRAN_BAND] = {
-        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_band_int8,
-        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_band_uint8,
-        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_band_int16,
-        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_band_uint16,
-        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_band_int32,
-        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_band_uint32,
-        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_band_int64,
-        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_band_uint64,
-    },
+    SET(BAND, INT8_T,   launch_band_int8);
+    SET(BAND, UINT8_T,  launch_band_uint8);
+    SET(BAND, INT16_T,  launch_band_int16);
+    SET(BAND, UINT16_T, launch_band_uint16);
+    SET(BAND, INT32_T,  launch_band_int32);
+    SET(BAND, UINT32_T, launch_band_uint32);
+    SET(BAND, INT64_T,  launch_band_int64);
+    SET(BAND, UINT64_T, launch_band_uint64);
 
-    [OMPI_OP_BASE_FORTRAN_BOR] = {
-        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_bor_int8,
-        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_bor_uint8,
-        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_bor_int16,
-        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_bor_uint16,
-        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_bor_int32,
-        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_bor_uint32,
-        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_bor_int64,
-        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_bor_uint64,
-    },
+    SET(BOR, INT8_T,   launch_bor_int8);
+    SET(BOR, UINT8_T,  launch_bor_uint8);
+    SET(BOR, INT16_T,  launch_bor_int16);
+    SET(BOR, UINT16_T, launch_bor_uint16);
+    SET(BOR, INT32_T,  launch_bor_int32);
+    SET(BOR, UINT32_T, launch_bor_uint32);
+    SET(BOR, INT64_T,  launch_bor_int64);
+    SET(BOR, UINT64_T, launch_bor_uint64);
 
-    [OMPI_OP_BASE_FORTRAN_BXOR] = {
-        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_bxor_int8,
-        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_bxor_uint8,
-        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_bxor_int16,
-        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_bxor_uint16,
-        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_bxor_int32,
-        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_bxor_uint32,
-        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_bxor_int64,
-        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_bxor_uint64,
-    },
+    SET(BXOR, INT8_T,   launch_bxor_int8);
+    SET(BXOR, UINT8_T,  launch_bxor_uint8);
+    SET(BXOR, INT16_T,  launch_bxor_int16);
+    SET(BXOR, UINT16_T, launch_bxor_uint16);
+    SET(BXOR, INT32_T,  launch_bxor_int32);
+    SET(BXOR, UINT32_T, launch_bxor_uint32);
+    SET(BXOR, INT64_T,  launch_bxor_int64);
+    SET(BXOR, UINT64_T, launch_bxor_uint64);
 
-    /* LAND, LOR, LXOR, MAXLOC, MINLOC, REPLACE, NO_OP: all NULL → host path */
-};
+    /* LAND, LOR, LXOR, MAXLOC, MINLOC, REPLACE, NO_OP: NULL → host path */
+#undef SET
+}
diff --git a/ompi/mca/op/rocm/op_rocm.h b/ompi/mca/op/rocm/op_rocm.h
index 2c35902879a..df80d4f915e 100644
--- a/ompi/mca/op/rocm/op_rocm.h
+++ b/ompi/mca/op/rocm/op_rocm.h
@@ -52,6 +52,15 @@ typedef void (*ompi_op_rocm_launcher_fn_t)(ompi_op_gpu_cmd_t *cmd,
 OMPI_DECLSPEC extern ompi_op_rocm_launcher_fn_t
 ompi_op_rocm_kernel_fns[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
 
+/* Defined in op_rocm_kernels.cpp (extern "C") */
+void ompi_op_rocm_kernel_fns_init(void);
+
+/* Defined in op_rocm_session.c */
+ompi_op_gpu_cmd_queue_t *ompi_op_rocm_cmd_queue_alloc(int dev_id);
+ompi_op_gpu_session_t *ompi_op_rocm_session_begin(ompi_op_gpu_cmd_queue_t *queue,
+                                                   struct ompi_op_t *op,
+                                                   struct ompi_datatype_t *dtype);
+
 END_C_DECLS
 
 #endif /* OMPI_MCA_OP_ROCM_H */
diff --git a/ompi/mca/op/rocm/op_rocm_component.c b/ompi/mca/op/rocm/op_rocm_component.c
index 09c1cbdb08a..7966a79f1e2 100644
--- a/ompi/mca/op/rocm/op_rocm_component.c
+++ b/ompi/mca/op/rocm/op_rocm_component.c
@@ -20,11 +20,6 @@
 #include "ompi/op/op_gpu_session.h"
 #include "ompi/mca/op/rocm/op_rocm.h"
 
-/* Forward declarations of hooks implemented in op_rocm_session.c */
-ompi_op_gpu_cmd_queue_t *ompi_op_rocm_cmd_queue_alloc(int dev_id);
-ompi_op_gpu_session_t *ompi_op_rocm_session_begin(ompi_op_gpu_cmd_queue_t *queue,
-                                                   struct ompi_op_t *op,
-                                                   struct ompi_datatype_t *dtype);
 
 static int rocm_component_open(void);
 static int rocm_component_close(void);
@@ -62,6 +57,7 @@ MCA_BASE_COMPONENT_INIT(ompi, op, rocm)
 static int
 rocm_component_open(void)
 {
+    ompi_op_rocm_kernel_fns_init();
     return OMPI_SUCCESS;
 }
 
diff --git a/ompi/mca/op/rocm/op_rocm_kernels.cpp b/ompi/mca/op/rocm/op_rocm_kernels.cpp
index db24237e24d..d492957c002 100644
--- a/ompi/mca/op/rocm/op_rocm_kernels.cpp
+++ b/ompi/mca/op/rocm/op_rocm_kernels.cpp
@@ -195,94 +195,91 @@ LAUNCHER(bxor_int64)  LAUNCHER(bxor_uint64)
  *
  * Indexed by OMPI_OP_BASE_FORTRAN_* (rows) × OMPI_OP_BASE_TYPE_* (columns).
  * Zero/NULL entries mean "not supported on GPU" → host fallback.
+ *
+ * Zero-initialized here; filled by ompi_op_rocm_kernel_fns_init() called
+ * from rocm_component_open().  Plain assignment avoids non-trivial designated
+ * initializers which are not supported by all GPU compiler C++ frontends.
  * ========================================================================= */
 ompi_op_rocm_launcher_fn_t
-ompi_op_rocm_kernel_fns[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX] = {
+ompi_op_rocm_kernel_fns[OMPI_OP_BASE_FORTRAN_OP_MAX][OMPI_OP_BASE_TYPE_MAX];
+
+void
+ompi_op_rocm_kernel_fns_init(void)
+{
+#define SET(op, type, fn) \
+    ompi_op_rocm_kernel_fns[OMPI_OP_BASE_FORTRAN_##op][OMPI_OP_BASE_TYPE_##type] = (fn)
 
-    [OMPI_OP_BASE_FORTRAN_MAX] = {
-        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_max_int8,
-        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_max_uint8,
-        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_max_int16,
-        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_max_uint16,
-        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_max_int32,
-        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_max_uint32,
-        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_max_int64,
-        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_max_uint64,
-        [OMPI_OP_BASE_TYPE_FLOAT]    = launch_max_float,
-        [OMPI_OP_BASE_TYPE_DOUBLE]   = launch_max_double,
-    },
+    SET(MAX, INT8_T,   launch_max_int8);
+    SET(MAX, UINT8_T,  launch_max_uint8);
+    SET(MAX, INT16_T,  launch_max_int16);
+    SET(MAX, UINT16_T, launch_max_uint16);
+    SET(MAX, INT32_T,  launch_max_int32);
+    SET(MAX, UINT32_T, launch_max_uint32);
+    SET(MAX, INT64_T,  launch_max_int64);
+    SET(MAX, UINT64_T, launch_max_uint64);
+    SET(MAX, FLOAT,    launch_max_float);
+    SET(MAX, DOUBLE,   launch_max_double);
 
-    [OMPI_OP_BASE_FORTRAN_MIN] = {
-        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_min_int8,
-        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_min_uint8,
-        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_min_int16,
-        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_min_uint16,
-        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_min_int32,
-        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_min_uint32,
-        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_min_int64,
-        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_min_uint64,
-        [OMPI_OP_BASE_TYPE_FLOAT]    = launch_min_float,
-        [OMPI_OP_BASE_TYPE_DOUBLE]   = launch_min_double,
-    },
+    SET(MIN, INT8_T,   launch_min_int8);
+    SET(MIN, UINT8_T,  launch_min_uint8);
+    SET(MIN, INT16_T,  launch_min_int16);
+    SET(MIN, UINT16_T, launch_min_uint16);
+    SET(MIN, INT32_T,  launch_min_int32);
+    SET(MIN, UINT32_T, launch_min_uint32);
+    SET(MIN, INT64_T,  launch_min_int64);
+    SET(MIN, UINT64_T, launch_min_uint64);
+    SET(MIN, FLOAT,    launch_min_float);
+    SET(MIN, DOUBLE,   launch_min_double);
 
-    [OMPI_OP_BASE_FORTRAN_SUM] = {
-        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_sum_int8,
-        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_sum_uint8,
-        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_sum_int16,
-        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_sum_uint16,
-        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_sum_int32,
-        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_sum_uint32,
-        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_sum_int64,
-        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_sum_uint64,
-        [OMPI_OP_BASE_TYPE_FLOAT]    = launch_sum_float,
-        [OMPI_OP_BASE_TYPE_DOUBLE]   = launch_sum_double,
-    },
+    SET(SUM, INT8_T,   launch_sum_int8);
+    SET(SUM, UINT8_T,  launch_sum_uint8);
+    SET(SUM, INT16_T,  launch_sum_int16);
+    SET(SUM, UINT16_T, launch_sum_uint16);
+    SET(SUM, INT32_T,  launch_sum_int32);
+    SET(SUM, UINT32_T, launch_sum_uint32);
+    SET(SUM, INT64_T,  launch_sum_int64);
+    SET(SUM, UINT64_T, launch_sum_uint64);
+    SET(SUM, FLOAT,    launch_sum_float);
+    SET(SUM, DOUBLE,   launch_sum_double);
 
-    [OMPI_OP_BASE_FORTRAN_PROD] = {
-        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_prod_int8,
-        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_prod_uint8,
-        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_prod_int16,
-        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_prod_uint16,
-        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_prod_int32,
-        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_prod_uint32,
-        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_prod_int64,
-        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_prod_uint64,
-        [OMPI_OP_BASE_TYPE_FLOAT]    = launch_prod_float,
-        [OMPI_OP_BASE_TYPE_DOUBLE]   = launch_prod_double,
-    },
+    SET(PROD, INT8_T,   launch_prod_int8);
+    SET(PROD, UINT8_T,  launch_prod_uint8);
+    SET(PROD, INT16_T,  launch_prod_int16);
+    SET(PROD, UINT16_T, launch_prod_uint16);
+    SET(PROD, INT32_T,  launch_prod_int32);
+    SET(PROD, UINT32_T, launch_prod_uint32);
+    SET(PROD, INT64_T,  launch_prod_int64);
+    SET(PROD, UINT64_T, launch_prod_uint64);
+    SET(PROD, FLOAT,    launch_prod_float);
+    SET(PROD, DOUBLE,   launch_prod_double);
 
-    [OMPI_OP_BASE_FORTRAN_BAND] = {
-        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_band_int8,
-        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_band_uint8,
-        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_band_int16,
-        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_band_uint16,
-        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_band_int32,
-        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_band_uint32,
-        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_band_int64,
-        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_band_uint64,
-    },
+    SET(BAND, INT8_T,   launch_band_int8);
+    SET(BAND, UINT8_T,  launch_band_uint8);
+    SET(BAND, INT16_T,  launch_band_int16);
+    SET(BAND, UINT16_T, launch_band_uint16);
+    SET(BAND, INT32_T,  launch_band_int32);
+    SET(BAND, UINT32_T, launch_band_uint32);
+    SET(BAND, INT64_T,  launch_band_int64);
+    SET(BAND, UINT64_T, launch_band_uint64);
 
-    [OMPI_OP_BASE_FORTRAN_BOR] = {
-        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_bor_int8,
-        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_bor_uint8,
-        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_bor_int16,
-        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_bor_uint16,
-        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_bor_int32,
-        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_bor_uint32,
-        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_bor_int64,
-        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_bor_uint64,
-    },
+    SET(BOR, INT8_T,   launch_bor_int8);
+    SET(BOR, UINT8_T,  launch_bor_uint8);
+    SET(BOR, INT16_T,  launch_bor_int16);
+    SET(BOR, UINT16_T, launch_bor_uint16);
+    SET(BOR, INT32_T,  launch_bor_int32);
+    SET(BOR, UINT32_T, launch_bor_uint32);
+    SET(BOR, INT64_T,  launch_bor_int64);
+    SET(BOR, UINT64_T, launch_bor_uint64);
 
-    [OMPI_OP_BASE_FORTRAN_BXOR] = {
-        [OMPI_OP_BASE_TYPE_INT8_T]   = launch_bxor_int8,
-        [OMPI_OP_BASE_TYPE_UINT8_T]  = launch_bxor_uint8,
-        [OMPI_OP_BASE_TYPE_INT16_T]  = launch_bxor_int16,
-        [OMPI_OP_BASE_TYPE_UINT16_T] = launch_bxor_uint16,
-        [OMPI_OP_BASE_TYPE_INT32_T]  = launch_bxor_int32,
-        [OMPI_OP_BASE_TYPE_UINT32_T] = launch_bxor_uint32,
-        [OMPI_OP_BASE_TYPE_INT64_T]  = launch_bxor_int64,
-        [OMPI_OP_BASE_TYPE_UINT64_T] = launch_bxor_uint64,
-    },
+    SET(BXOR, INT8_T,   launch_bxor_int8);
+    SET(BXOR, UINT8_T,  launch_bxor_uint8);
+    SET(BXOR, INT16_T,  launch_bxor_int16);
+    SET(BXOR, UINT16_T, launch_bxor_uint16);
+    SET(BXOR, INT32_T,  launch_bxor_int32);
+    SET(BXOR, UINT32_T, launch_bxor_uint32);
+    SET(BXOR, INT64_T,  launch_bxor_int64);
+    SET(BXOR, UINT64_T, launch_bxor_uint64);
 
-    /* LAND, LOR, LXOR, MAXLOC, MINLOC, REPLACE, NO_OP: all NULL → host path */
-};
+    /* LAND, LOR, LXOR, MAXLOC, MINLOC, REPLACE, NO_OP: NULL → host path */
+#undef SET
+}

From ce49af186e3e6e96cf41540ea31b1dedb0a2c854 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
Date: Mon, 8 Jun 2026 13:51:31 -0400
Subject: [PATCH 13/13] Register launcher callbacks in init_query

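The op/cuda component_open hook is never called, so
ompi_op_cuda_kernel_fns_init() never ran and the launcher table
stayed zero-initialized. Call it from cuda_component_init_query()
instead, after the CUDA device check has succeeded.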

Signed-off-by: Joseph Schuchart <joseph.schuchart@stonybrook.edu>
---
 ompi/mca/op/cuda/op_cuda_component.c | 3 ++-
 ompi/mca/op/cuda/op_cuda_session.c   | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/ompi/mca/op/cuda/op_cuda_component.c b/ompi/mca/op/cuda/op_cuda_component.c
index ea1473c45fb..36638f8a578 100644
--- a/ompi/mca/op/cuda/op_cuda_component.c
+++ b/ompi/mca/op/cuda/op_cuda_component.c
@@ -61,7 +61,6 @@ MCA_BASE_COMPONENT_INIT(ompi, op, cuda)
 static int
 cuda_component_open(void)
 {
-    ompi_op_cuda_kernel_fns_init();
     return OMPI_SUCCESS;
 }
 
@@ -84,6 +83,8 @@ cuda_component_init_query(bool enable_progress_threads,
     if (cudaSuccess != err || device_count <= 0) {
         return OMPI_ERR_NOT_SUPPORTED;
     }
+    /* Register the launchers here; component_open appears never to be called. */
+    ompi_op_cuda_kernel_fns_init();
     return OMPI_SUCCESS;
 }
 
diff --git a/ompi/mca/op/cuda/op_cuda_session.c b/ompi/mca/op/cuda/op_cuda_session.c
index 329e20cd441..c51c47307a8 100644
--- a/ompi/mca/op/cuda/op_cuda_session.c
+++ b/ompi/mca/op/cuda/op_cuda_session.c
@@ -176,7 +176,7 @@ ompi_op_cuda_session_begin(ompi_op_gpu_cmd_queue_t *queue,
     queue->cmd->count    = 0;
     queue->cmd->status   = 0;
 
-    /* Launch the persistent kernel (1 block, 256 threads) */
+    /* Launch the persistent kernel */
     launcher(queue->cmd, cq->shutdown, cq->stream);
     cudaError_t err = cudaGetLastError();
     if (cudaSuccess != err) {