Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 16 additions & 6 deletions apps/qsim_base_custatevecex.cu
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,19 @@ struct Options {
std::string circuit_file;
unsigned maxtime = std::numeric_limits<unsigned>::max();
unsigned seed = 1;
unsigned lbuf = 30;
unsigned verbosity = 0;
};

Options GetOptions(int argc, char* argv[]) {
constexpr char usage[] = "usage:\n ./qsim_base -c circuit -d maxtime "
"-s seed -v verbosity\n";
constexpr char usage[] = "usage:\n ./qsim_base_custatevecex.x -c circuit "
"-d maxtime -s seed -l lbuf -v verbosity\n";

Options opt;

int k;

while ((k = getopt(argc, argv, "c:d:s:v:")) != -1) {
while ((k = getopt(argc, argv, "c:d:s:l:v:")) != -1) {
switch (k) {
case 'c':
opt.circuit_file = optarg;
Expand All @@ -54,6 +55,9 @@ Options GetOptions(int argc, char* argv[]) {
case 's':
opt.seed = std::atoi(optarg);
break;
case 'l':
opt.lbuf = std::atoi(optarg);
break;
case 'v':
opt.verbosity = std::atoi(optarg);
break;
Expand Down Expand Up @@ -112,8 +116,14 @@ int main(int argc, char* argv[]) {
using Simulator = qsim::SimulatorCuStateVecEx<fp_type>;
using StateSpace = Simulator::StateSpace;

explicit Factory(unsigned verbosity = 0) : verbosity(verbosity) {
mp.initialize();
explicit Factory(uint64_t transfer_buffer_size, unsigned verbosity = 0)
: verbosity(verbosity) {
MultiProcessCuStateVecEx::Parameter param = {transfer_buffer_size};
mp.Initialize(param);

if (verbosity > 2 && mp.Initialized()) {
qsim::IO::messagef("# transfer_buf_size=%lu\n", transfer_buffer_size);
}
}

StateSpace CreateStateSpace() const {
Expand All @@ -136,7 +146,7 @@ int main(int argc, char* argv[]) {
using State = StateSpace::State;
using Runner = CuStateVecExRunner<IO, Factory>;

Factory factory(opt.verbosity);
Factory factory(uint64_t{1} << opt.lbuf, opt.verbosity);

StateSpace state_space = factory.CreateStateSpace();
State state = state_space.Create(circuit.num_qubits);
Expand Down
15 changes: 12 additions & 3 deletions docs/cirq_interface.md
Original file line number Diff line number Diff line change
Expand Up @@ -183,20 +183,29 @@ To compile with the NVIDIA cuStateVec library (v1.0.0 or higher is required),
set the environmment variable `CUQUANTUM_ROOT` to the path to the cuStateVec
library.

`QSimOptions` provides five parameters to configure GPU execution. `use_gpu`
`QSimOptions` provides six parameters to configure GPU execution. `use_gpu`
is required to enable GPU execution:
* `use_gpu`: if True, use GPU instead of CPU for simulation.
* `gpu_mode`: use CUDA if set to 0 (default value), use the NVIDIA cuStateVec
if set to 1 or use the NVIDIA cuStateVecEx library if set to any other value.

In the case of the NVIDIA cuStateVecEx library, simulations can be performed
in multi-device / multi-node environments.
in multi-device / multi-node environments. A CUDA-aware MPI library is required
for multi-node. Currently, only Open MPI is supported.

If `use_gpu` is set and `gpu_mode` is set to 0, the remaining parameters can
If `use_gpu` is set and `gpu_mode` is set to 0, two parameters can
optionally be set to fine-tune StateSpace performance for a specific device.
In most cases, the default values provide good performance.
* `gpu_state_threads`: number of threads per CUDA block to use for the GPU
StateSpace. This must be a power of 2 in the range [32, 1024].
* `gpu_data_blocks`: number of data blocks to use for the GPU StateSpace.
Below 16 data blocks, performance is noticeably reduced.

If `use_gpu` is set and `gpu_mode` is set to 2 or greater (cuStateVecEx), two
parameters can be set to adjust the transfer buffer size for MPI communication
or network type.
* `gpu_cusvex_log_buf_size`: log2 of the buffer size. Default value is 30,
i.e. the buffer size is 2^30 bytes.
* `gpu_cusvex_network_type`: Device network type for multi-device:
0=Switch (default), 1=FullMesh. Or layered network type for multi-process:
0=SuperPOD (default), 1=GB200NVL, 2=SwitchTree, 3=Communicator.
27 changes: 16 additions & 11 deletions lib/multiprocess_custatevecex.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,11 @@ struct MultiProcessCuStateVecEx {
};

struct Parameter {
uint64_t transfer_buffer_size = 16777216;
uint64_t transfer_buffer_size = uint64_t{1} << 30;
NetworkType network_type = kSuperPod;
};

MultiProcessCuStateVecEx(Parameter param = Parameter{16777216, kSuperPod})
: param_(param), communicator_(nullptr), initialized_(false) {}
MultiProcessCuStateVecEx() : communicator_(nullptr), initialized_(false) {}

~MultiProcessCuStateVecEx() {
if (communicator_) {
Expand All @@ -55,23 +54,29 @@ struct MultiProcessCuStateVecEx {
custatevecExCommunicatorFinalize(&status);
}

custatevecExCommunicatorDescriptor_t communicator() const {
custatevecExCommunicatorDescriptor_t Communicator() const {
return communicator_;
}

unsigned num_processes() const {
unsigned NumProcesses() const {
return num_processes_;
}

unsigned rank() const {
unsigned Rank() const {
return rank_;
}

bool initialized() const {
static bool ValidNetworkType(unsigned network_type) {
return network_type < 4;
}

bool Initialized() const {
return initialized_;
}

void initialize() {
void Initialize(Parameter param) {
param_ = param;

int argc = 0;
char** argv = nullptr;

Expand Down Expand Up @@ -109,7 +114,7 @@ struct MultiProcessCuStateVecEx {
num_global_qubits_ = get_num_global_qubits(num_processes);

unsigned num_acc_global_qubits = 0;
auto network_layers = get_network_layers(param_.network_type);
auto network_layers = GetNetworkLayers(param_.network_type);

num_global_qubits_per_layer_.reserve(2);
global_index_bit_classes_.reserve(2);
Expand Down Expand Up @@ -149,7 +154,7 @@ struct MultiProcessCuStateVecEx {
initialized_ = true;
}

auto create_sv_config(unsigned num_qubits, cudaDataType_t data_type) const {
auto CreateSVConfig(unsigned num_qubits, cudaDataType_t data_type) const {
custatevecExDictionaryDescriptor_t sv_config = nullptr;

if (!initialized_ ||
Expand Down Expand Up @@ -187,7 +192,7 @@ struct MultiProcessCuStateVecEx {

using NetworkLayers = std::vector<NetworkLayer>;

static NetworkLayers get_network_layers(NetworkType id) {
static NetworkLayers GetNetworkLayers(NetworkType id) {
switch (id) {
case kSuperPod:
return {{CUSTATEVEC_EX_GLOBAL_INDEX_BIT_CLASS_INTERPROC_P2P, 3},
Expand Down
8 changes: 4 additions & 4 deletions lib/statespace_custatevecex.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ class StateSpaceCuStateVecEx :
unsigned required_rank = k / size;

if (state.distr_type() != Base::kMultiProcess
|| Base::mp.rank() == required_rank) {
|| Base::mp.Rank() == required_rank) {
ErrorCheck(custatevecExStateVectorGetState(
state.get(), buf, kStateDataType, k, k + 1, 1));
}
Expand All @@ -139,7 +139,7 @@ class StateSpaceCuStateVecEx :

if (state.distr_type() == Base::kMultiProcess) {
auto cuda_type = GetCudaType<std::complex<fp_type>>();
auto comm = Base::mp.communicator();
auto comm = Base::mp.Communicator();
ErrorCheck(comm->intf->bcast(comm, buf, 1, cuda_type, required_rank));
}

Expand All @@ -161,7 +161,7 @@ class StateSpaceCuStateVecEx :
unsigned required_rank = k / size;

if (state.distr_type() != Base::kMultiProcess
|| Base::mp.rank() == required_rank) {
|| Base::mp.Rank() == required_rank) {
ErrorCheck(custatevecExStateVectorSetState(
state.get(), buf, kStateDataType, k, k + 1, 1));
}
Expand All @@ -183,7 +183,7 @@ class StateSpaceCuStateVecEx :
unsigned required_rank = k / size;

if (state.distr_type() != Base::kMultiProcess
|| Base::mp.rank() == required_rank) {
|| Base::mp.Rank() == required_rank) {
ErrorCheck(custatevecExStateVectorSetState(
state.get(), buf, kStateDataType, k, k + 1, 1));
}
Expand Down
22 changes: 13 additions & 9 deletions lib/vectorspace_custatevecex.h
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ class VectorSpaceCuStateVecEx {
ResultType local_r = callback(k, res);

auto cuda_type = GetCudaType<ResultType>();
auto comm = mp_->communicator();
auto comm = mp_->Communicator();
ErrorCheck(comm->intf->allreduce(comm, &local_r, &r, 1, cuda_type));

return r;
Expand Down Expand Up @@ -338,7 +338,7 @@ class VectorSpaceCuStateVecEx {
ResultType local_r = callback(k, res1, res2);

auto cuda_type = GetCudaType<ResultType>();
auto comm = mp_->communicator();
auto comm = mp_->Communicator();
ErrorCheck(comm->intf->allreduce(comm, &local_r, &r, 1, cuda_type));

return r;
Expand Down Expand Up @@ -385,16 +385,16 @@ class VectorSpaceCuStateVecEx {
Vector Create(unsigned num_qubits) const {
custatevecExStateVectorDescriptor_t state_vec;
custatevecExDictionaryDescriptor_t sv_config
= mp.create_sv_config(num_qubits, kStateDataType);
= mp.CreateSVConfig(num_qubits, kStateDataType);

unsigned num_substates = 1;
DistributionType distr_type = kNoDistr;

if (sv_config != nullptr) {
ErrorCheck(custatevecExStateVectorCreateMultiProcess(
&state_vec, sv_config, nullptr, mp.communicator(), nullptr));
&state_vec, sv_config, nullptr, mp.Communicator(), nullptr));

num_substates = mp.num_processes();
num_substates = mp.NumProcesses();
distr_type = kMultiProcess;

if (param.verbosity > 2) {
Expand Down Expand Up @@ -480,6 +480,10 @@ class VectorSpaceCuStateVecEx {
return vector.get() == nullptr;
}

static bool ValidDeviceNetworkType(unsigned network_type) {
return network_type < 2;
}

bool Copy(const Vector& src, Vector& dest) const {
if (src.num_qubits() != dest.num_qubits()) {
return false;
Expand All @@ -503,15 +507,15 @@ class VectorSpaceCuStateVecEx {
bool Copy(const Vector& src, fp_type* dest) const {
if (src.distr_type() == kMultiProcess) {
uint64_t size = (uint64_t{1} << src.num_qubits()) / src.num_substates();
uint64_t offset = size * mp.rank();
uint64_t offset = size * mp.Rank();

ErrorCheck(custatevecExStateVectorGetState(
src.get(), dest + 2 * offset, kStateDataType,
offset, offset + size, 1));
ErrorCheck(custatevecExStateVectorSynchronize(src.get()));

auto cuda_type = GetCudaType<std::complex<fp_type>>();
auto comm = mp.communicator();
auto comm = mp.Communicator();
ErrorCheck(comm->intf->allgather(
comm, dest + 2 * offset, dest, size, cuda_type));
} else {
Expand All @@ -529,7 +533,7 @@ class VectorSpaceCuStateVecEx {
bool Copy(const fp_type* src, Vector& dest) const {
if (dest.distr_type() == kMultiProcess) {
uint64_t size = (uint64_t{1} << dest.num_qubits()) / dest.num_substates();
uint64_t offset = size * mp.rank();
uint64_t offset = size * mp.Rank();

ErrorCheck(custatevecExStateVectorSetState(
dest.get(), src + 2 * offset, kStateDataType,
Expand Down Expand Up @@ -560,7 +564,7 @@ class VectorSpaceCuStateVecEx {

if (dest.distr_type() == kMultiProcess) {
size /= dest.num_substates();
uint64_t offset = size * mp.rank();
uint64_t offset = size * mp.Rank();

ErrorCheck(custatevecExStateVectorSetState(
dest.get(), src + 2 * offset, kStateDataType,
Expand Down
9 changes: 3 additions & 6 deletions pybind_interface/avx2/pybind_main_avx2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,9 @@ namespace qsim {
using Simulator = SimulatorAVX<For>;

struct Factory {
// num_state_threads and num_dblocks are unused, but kept for consistency
// with the GPU Factory.
Factory(
unsigned num_sim_threads,
unsigned num_state_threads,
unsigned num_dblocks) : num_threads(num_sim_threads) {}
explicit Factory(const py::dict& options) {
num_threads = ParseOptions<unsigned>(options, "t\0");
}

using Simulator = qsim::Simulator<For>;
using StateSpace = Simulator::StateSpace;
Expand Down
9 changes: 3 additions & 6 deletions pybind_interface/avx512/pybind_main_avx512.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,9 @@ namespace qsim {
using Simulator = SimulatorAVX512<For>;

struct Factory {
// num_state_threads and num_dblocks are unused, but kept for consistency
// with the GPU Factory.
Factory(
unsigned num_sim_threads,
unsigned num_state_threads,
unsigned num_dblocks) : num_threads(num_sim_threads) {}
explicit Factory(const py::dict& options) {
num_threads = ParseOptions<unsigned>(options, "t\0");
}

using Simulator = qsim::Simulator<For>;
using StateSpace = Simulator::StateSpace;
Expand Down
9 changes: 3 additions & 6 deletions pybind_interface/basic/pybind_main_basic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,9 @@ namespace qsim {
using Simulator = SimulatorBasic<For>;

struct Factory {
// num_state_threads and num_dblocks are unused, but kept for consistency
// with the GPU Factory.
Factory(
unsigned num_sim_threads,
unsigned num_state_threads,
unsigned num_dblocks) : num_threads(num_sim_threads) {}
explicit Factory(const py::dict& options) {
num_threads = ParseOptions<unsigned>(options, "t\0");
}

using Simulator = qsim::Simulator<For>;
using StateSpace = Simulator::StateSpace;
Expand Down
10 changes: 3 additions & 7 deletions pybind_interface/cuda/pybind_main_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,9 @@ namespace qsim {
using Simulator = SimulatorCUDA<float>;

struct Factory {
Factory(
unsigned num_sim_threads,
unsigned num_state_threads,
unsigned num_dblocks
) {
ss_params.num_threads = num_state_threads;
ss_params.num_dblocks = num_dblocks;
explicit Factory(const py::dict& options) {
ss_params.num_threads = ParseOptions<unsigned>(options, "gsst\0");
ss_params.num_dblocks = ParseOptions<unsigned>(options, "gdb\0");
}

using Simulator = qsim::Simulator;
Expand Down
4 changes: 1 addition & 3 deletions pybind_interface/custatevec/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,10 @@ include_directories($ENV{CUQUANTUM_ROOT}/include)
link_directories($ENV{CUQUANTUM_ROOT}/lib $ENV{CUQUANTUM_ROOT}/lib64)

add_library(qsim_custatevec MODULE pybind_main_custatevec.cpp)
target_link_libraries(qsim_custatevec -lcustatevec -lcublas)

set_target_properties(qsim_custatevec PROPERTIES
PREFIX "${PYTHON_MODULE_PREFIX}"
SUFFIX "${PYTHON_MODULE_EXTENSION}"
)
set_source_files_properties(pybind_main_custatevec.cpp PROPERTIES LANGUAGE CUDA)

target_link_libraries(qsim_custatevec PRIVATE qsim_openmp_config)
target_link_libraries(qsim_custatevec PRIVATE qsim_openmp_config -lcustatevec -lcublas)
6 changes: 1 addition & 5 deletions pybind_interface/custatevec/pybind_main_custatevec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,7 @@ namespace qsim {
using Simulator = SimulatorCuStateVec<float>;

struct Factory {
// num_sim_threads, num_state_threads and num_dblocks are unused, but kept
// for consistency with other factories.
Factory(unsigned num_sim_threads,
unsigned num_state_threads,
unsigned num_dblocks) {
explicit Factory(const py::dict& options) {
ErrorCheck(cublasCreate(&cublas_handle));
ErrorCheck(custatevecCreate(&custatevec_handle));
}
Expand Down
4 changes: 1 addition & 3 deletions pybind_interface/custatevecex/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,10 @@ include_directories($ENV{CUQUANTUM_ROOT}/include)
link_directories($ENV{CUQUANTUM_ROOT}/lib $ENV{CUQUANTUM_ROOT}/lib64)

add_library(qsim_custatevecex MODULE pybind_main_custatevecex.cpp)
target_link_libraries(qsim_custatevecex -lcustatevec -lcublas)

set_target_properties(qsim_custatevecex PROPERTIES
PREFIX "${PYTHON_MODULE_PREFIX}"
SUFFIX "${PYTHON_MODULE_EXTENSION}"
)
set_source_files_properties(pybind_main_custatevecex.cpp PROPERTIES LANGUAGE CUDA)

target_link_libraries(qsim_custatevecex PRIVATE qsim_openmp_config)
target_link_libraries(qsim_custatevecex PRIVATE qsim_openmp_config -lcustatevec -lcublas)
Loading
Loading