Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion config/opal_mca.m4
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ of type-component pairs. For example, --enable-mca-no-build=pml-ob1])
else
msg=
if test -z "$enable_mca_dso"; then
enable_mca_dso="accelerator-cuda,accelerator-rocm,accelerator-ze"
enable_mca_dso="accelerator-cuda,accelerator-rocm,accelerator-ze,op-cuda,op-rocm"
msg="(default)"
fi
DSO_all=0
Expand Down
30 changes: 15 additions & 15 deletions ompi/mca/coll/acoll/coll_acoll_allreduce.c
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
/* Falling back to recursivedoubling for non-commutative operators to be safe */
if (!ompi_op_is_commute(op)) {
return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op, comm,
module);
module, NULL);
}

/* Obtain the subcomms structure */
Expand All @@ -497,7 +497,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
/* Fallback to redscat_allgather if subc is not obtained */
if (NULL == subc) {
return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, comm,
module);
module, NULL);
}
if (!subc->initialized) {
err = mca_coll_acoll_comm_split_init(comm, acoll_module, subc, 0);
Expand All @@ -513,7 +513,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
if (num_nodes > 1) {
if (total_dsize > 16384) {
return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op,
comm, module);
comm, module, NULL);
}
int use_socket = acoll_module->use_socket != -1 ? acoll_module->use_socket : 0;
coll_acoll_subcomms_t *soc_subc = NULL;
Expand All @@ -525,7 +525,7 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
/* Validate communicator hierarchy before proceeding */
if (NULL == soc_comm || NULL == ldr_comm) {
return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op,
comm, module);
comm, module, NULL);
}

err = check_and_create_subc(soc_comm, acoll_module, &soc_subc);
Expand Down Expand Up @@ -573,10 +573,10 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
if (ompi_comm_size(ldr_comm) > 1 && -1 != ldr_root) {
if ((MPI_IN_PLACE == sbuf)) {
err = ompi_coll_base_allreduce_intra_recursivedoubling(MPI_IN_PLACE, rbuf, count, dtype, op,
ldr_comm, module);
ldr_comm, module, NULL);
} else {
err = ompi_coll_base_allreduce_intra_recursivedoubling(tmp_sbuf, rbuf, count, dtype, op,
ldr_comm, module);
ldr_comm, module, NULL);
}
if (MPI_SUCCESS != err) {
if (NULL != inplacebuf_free) {
Expand Down Expand Up @@ -607,31 +607,31 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
if (1 == num_nodes) {
if (total_dsize < 32) {
return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op,
comm, module);
comm, module, NULL);
} else if ((total_dsize < 512) && is_opt) {
return mca_coll_acoll_allreduce_small_msgs_h(sbuf, rbuf, count, dtype, op, comm, module,
subc, 1);
} else if (total_dsize <= 2048) {
return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op,
comm, module);
comm, module, NULL);
} else if (total_dsize < 65536) {
if (1 == alg) {
return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype,
op, comm, module);
op, comm, module, NULL);
} else if (2 == alg) {
return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype,
op, comm, module);
op, comm, module, NULL);
} else { /*3 == alg */
return ompi_coll_base_allreduce_intra_ring_segmented(sbuf, rbuf, count, dtype, op,
comm, module, 0);
comm, module, 0, NULL);
}
} else if (total_dsize < 4194304) {
if (((0 != subc->smsc_use_sr_buf) || (subc->smsc_buf_size > 2 * total_dsize))
&& (1 != subc->without_smsc) && is_opt) {
return mca_coll_acoll_allreduce_smsc_f(sbuf, rbuf, count, dtype, op, comm, module, subc);
} else {
return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype,
op, comm, module);
op, comm, module, NULL);
}
} else if (total_dsize <= 16777216) {
if (((0 != subc->smsc_use_sr_buf) || (subc->smsc_buf_size > 2 * total_dsize))
Expand All @@ -640,21 +640,21 @@ int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
return mca_coll_acoll_bcast(rbuf, count, dtype, 0, comm, module);
} else {
return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype,
op, comm, module);
op, comm, module, NULL);
}
} else {
if (((0 != subc->smsc_use_sr_buf) || (subc->smsc_buf_size > 2 * total_dsize))
&& (1 != subc->without_smsc) && is_opt) {
return mca_coll_acoll_allreduce_smsc_f(sbuf, rbuf, count, dtype, op, comm, module, subc);
} else {
return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype,
op, comm, module);
op, comm, module, NULL);
}
}

} else {
return ompi_coll_base_allreduce_intra_redscat_allgather(sbuf, rbuf, count, dtype, op, comm,
module);
module, NULL);
}
return MPI_SUCCESS;
}
14 changes: 7 additions & 7 deletions ompi/mca/coll/acoll/coll_acoll_reduce.c
Original file line number Diff line number Diff line change
Expand Up @@ -360,11 +360,11 @@ int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count,
/* Falling back to inorder binary for non-commutative operators to be safe */
if (!ompi_op_is_commute(op)) {
return ompi_coll_base_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype, op, root, comm,
module, 0, 0);
module, 0, 0, NULL);
}
if (0 != root) { // ToDo: support non-zero root
return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype, op, root, comm,
module, 0, 0);
module, 0, 0, NULL);
}

/* Disable shm/xpmem based optimizations if: */
Expand Down Expand Up @@ -396,7 +396,7 @@ int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count,
/* Fallback to knomial if subc is not obtained */
if (NULL == subc) {
return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype, op, root, comm,
module, 0, 0);
module, 0, 0, NULL);
}

if (!subc->initialized || (root != subc->prev_init_root)) {
Expand All @@ -422,10 +422,10 @@ int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count,
comm, module);
} else if (2 == alg) {
return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype, op, root,
comm, module, 0, 0);
comm, module, 0, 0, NULL);
} else { /* either 3 == alg or acoll_module->red_algo is not 0, 1, 2*/
return ompi_coll_base_reduce_intra_in_order_binary(sbuf, rbuf, count, dtype, op,
root, comm, module, 0, 0);
root, comm, module, 0, 0, NULL);
}
} else {
if ((((0 != subc->smsc_use_sr_buf)
Expand All @@ -437,7 +437,7 @@ int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count,
module, subc);
} else {
return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype, op,
root, comm, module, 0, 0);
root, comm, module, 0, 0, NULL);
}
}
} else {
Expand All @@ -446,7 +446,7 @@ int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count,
subc);
} else {
return ompi_coll_base_reduce_intra_binomial(sbuf, rbuf, count, dtype, op, root, comm,
module, 0, 0);
module, 0, 0, NULL);
}
}
return MPI_SUCCESS;
Expand Down
19 changes: 16 additions & 3 deletions ompi/mca/coll/acoll/coll_acoll_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,19 @@ extern int mca_coll_acoll_without_smsc;
extern int mca_coll_acoll_smsc_use_sr_buf;
extern int mca_coll_acoll_barrier_algo;

/**
 * Adapter around ompi_coll_base_allreduce_intra_recursivedoubling().
 *
 * The base routine gained an allocator parameter in coll_base_functions.h,
 * so it can no longer be stored directly as a module function pointer
 * (e.g. c_coll->coll_allreduce). This wrapper keeps the allocator-free
 * signature and forwards every argument unchanged, passing NULL as the
 * allocator.
 *
 * NOTE(review): NULL presumably selects the base routine's default
 * allocation path -- confirm against coll_base_functions.h.
 *
 * Declared static inline, like the other helpers defined in this header, so
 * that translation units which include the header without referencing the
 * wrapper do not end up with an unused static function.
 *
 * @param sbuf   send buffer, forwarded as-is
 * @param rbuf   receive buffer, forwarded as-is
 * @param count  element count, forwarded as-is
 * @param dtype  datatype, forwarded as-is
 * @param op     reduction operation, forwarded as-is
 * @param comm   communicator, forwarded as-is
 * @param module collective module, forwarded as-is
 *
 * @return the value returned by the base recursive-doubling allreduce.
 */
static inline int
ompi_coll_acoll_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, size_t count,
                                                  struct ompi_datatype_t *dtype,
                                                  struct ompi_op_t *op,
                                                  struct ompi_communicator_t *comm,
                                                  mca_coll_base_module_t *module)
{
    return ompi_coll_base_allreduce_intra_recursivedoubling(sbuf, rbuf, count, dtype, op,
                                                            comm, module, NULL);
}

/*
* Hybrid backoff spin-wait with adaptive progress calls.
* Optimized for intra-node shared memory synchronization.
Expand Down Expand Up @@ -440,7 +453,7 @@ static inline int mca_coll_acoll_comm_split_init(ompi_communicator_t *comm,
int rank = ompi_comm_rank(comm);

(comm)->c_coll->coll_allgather = ompi_coll_base_allgather_intra_ring;
(comm)->c_coll->coll_allreduce = ompi_coll_base_allreduce_intra_recursivedoubling;
(comm)->c_coll->coll_allreduce = ompi_coll_acoll_allreduce_intra_recursivedoubling;
(comm)->c_coll->coll_bcast = ompi_coll_base_bcast_intra_basic_linear;
if (!subc->initialized) {
OBJ_CONSTRUCT(&comm_info, opal_info_t);
Expand Down Expand Up @@ -538,14 +551,14 @@ static inline int mca_coll_acoll_comm_split_init(ompi_communicator_t *comm,
coll_bcast_loc = (subc->local_comm)->c_coll->coll_bcast;
(subc->local_comm)->c_coll->coll_allgather = ompi_coll_base_allgather_intra_ring;
(subc->local_comm)->c_coll->coll_allreduce
= ompi_coll_base_allreduce_intra_recursivedoubling;
= ompi_coll_acoll_allreduce_intra_recursivedoubling;
(subc->local_comm)->c_coll->coll_bcast = ompi_coll_base_bcast_intra_basic_linear;
coll_allreduce_soc = (subc->socket_comm)->c_coll->coll_allreduce;
coll_allgather_soc = (subc->socket_comm)->c_coll->coll_allgather;
coll_bcast_soc = (subc->socket_comm)->c_coll->coll_bcast;
(subc->socket_comm)->c_coll->coll_allgather = ompi_coll_base_allgather_intra_ring;
(subc->socket_comm)->c_coll->coll_allreduce
= ompi_coll_base_allreduce_intra_recursivedoubling;
= ompi_coll_acoll_allreduce_intra_recursivedoubling;
(subc->socket_comm)->c_coll->coll_bcast = ompi_coll_base_bcast_intra_basic_linear;
}

Expand Down
11 changes: 6 additions & 5 deletions ompi/mca/coll/base/coll_base_allgather.c
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ ompi_coll_base_allgather_intra_recursivedoubling(const void *sbuf, size_t scount
int k = 2;
return ompi_coll_base_allgather_intra_k_bruck(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
comm, module, k);
comm, module, k, NULL);
}

OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
Expand Down Expand Up @@ -771,7 +771,8 @@ int ompi_coll_base_allgather_intra_k_bruck(const void *sbuf, size_t scount,
struct ompi_datatype_t *rdtype,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module,
int radix)
int radix,
mca_allocator_base_module_t *allocator)
{
int line = -1, rank, size, dst, src, err = MPI_SUCCESS;
int recvcount, distance;
Expand All @@ -796,7 +797,7 @@ int ompi_coll_base_allgather_intra_k_bruck(const void *sbuf, size_t scount,
if (0 != rank) {
/* Compute the temporary buffer size, including datatypes empty gaps */
rsize = opal_datatype_span(&rdtype->super, (size_t)rcount * (size - rank), &rgap);
tmp_buf = (char *) malloc(rsize);
tmp_buf = (char *) COLL_BASE_ALLOC(allocator, rsize);
tmp_buf_start = tmp_buf - rgap;
}

Expand Down Expand Up @@ -891,7 +892,7 @@ int ompi_coll_base_allgather_intra_k_bruck(const void *sbuf, size_t scount,
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
}

if(tmp_buf != NULL) free(tmp_buf);
if(tmp_buf != NULL) COLL_BASE_FREE(allocator, tmp_buf);
return MPI_SUCCESS;

err_hndl:
Expand All @@ -911,7 +912,7 @@ int ompi_coll_base_allgather_intra_k_bruck(const void *sbuf, size_t scount,
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d",
__FILE__, line, err, rank));
if(tmp_buf != NULL) {
free(tmp_buf);
COLL_BASE_FREE(allocator, tmp_buf);
tmp_buf = NULL;
tmp_buf_start = NULL;
}
Expand Down
Loading
Loading