quantumlib · sergeisakov · Feb 3, 2026 · Jan 30, 2026 · Feb 1, 2026 · Feb 1, 2026
diff --git a/apps/qsim_base_custatevecex.cu b/apps/qsim_base_custatevecex.cu
@@ -32,18 +32,19 @@ struct Options {
   std::string circuit_file;
   unsigned maxtime = std::numeric_limits<unsigned>::max();
   unsigned seed = 1;
+  unsigned lbuf = 30;
   unsigned verbosity = 0;
 };
 
 Options GetOptions(int argc, char* argv[]) {
-  constexpr char usage[] = "usage:\n  ./qsim_base -c circuit -d maxtime "
-                           "-s seed -v verbosity\n";
+  constexpr char usage[] = "usage:\n  ./qsim_base_custatevecex.x -c circuit "
+                           "-d maxtime -s seed  -l lbuf -v verbosity\n";
 
   Options opt;
 
   int k;
 
-  while ((k = getopt(argc, argv, "c:d:s:v:")) != -1) {
+  while ((k = getopt(argc, argv, "c:d:s:l:v:")) != -1) {
     switch (k) {
       case 'c':
         opt.circuit_file = optarg;
@@ -54,6 +55,9 @@ Options GetOptions(int argc, char* argv[]) {
       case 's':
         opt.seed = std::atoi(optarg);
         break;
+      case 'l':
+        opt.lbuf = std::atoi(optarg);
+        break;
       case 'v':
         opt.verbosity = std::atoi(optarg);
         break;
@@ -112,8 +116,14 @@ int main(int argc, char* argv[]) {
     using Simulator = qsim::SimulatorCuStateVecEx<fp_type>;
     using StateSpace = Simulator::StateSpace;
 
-    explicit Factory(unsigned verbosity = 0) : verbosity(verbosity) {
-      mp.initialize();
+    explicit Factory(uint64_t transfer_buffer_size, unsigned verbosity = 0)
+        : verbosity(verbosity) {
+      MultiProcessCuStateVecEx::Parameter param = {transfer_buffer_size};
+      mp.Initialize(param);
+
+      if (verbosity > 2 && mp.Initialized()) {
+        qsim::IO::messagef("# transfer_buf_size=%lu\n", transfer_buffer_size);
+      }
     }
 
     StateSpace CreateStateSpace() const {
@@ -136,7 +146,7 @@ int main(int argc, char* argv[]) {
   using State = StateSpace::State;
   using Runner = CuStateVecExRunner<IO, Factory>;
 
-  Factory factory(opt.verbosity);
+  Factory factory(uint64_t{1} << opt.lbuf, opt.verbosity);
 
   StateSpace state_space = factory.CreateStateSpace();
   State state = state_space.Create(circuit.num_qubits);

diff --git a/docs/cirq_interface.md b/docs/cirq_interface.md
@@ -183,20 +183,29 @@ To compile with the NVIDIA cuStateVec library (v1.0.0 or higher is required),
 set the environmment variable `CUQUANTUM_ROOT` to the path to the cuStateVec
 library.
 
-`QSimOptions` provides five parameters to configure GPU execution. `use_gpu`
+`QSimOptions` provides six parameters to configure GPU execution. `use_gpu`
 is required to enable GPU execution:
 * `use_gpu`: if True, use GPU instead of CPU for simulation.
 * `gpu_mode`: use CUDA if set to 0 (default value), use the NVIDIA cuStateVec
 if set to 1 or use the NVIDIA cuStateVecEx library if set to any other value.
 
 In the case of the NVIDIA cuStateVecEx library, simulations can be performed
-in multi-device / multi-node environments.
+in multi-device / multi-node environments. A CUDA-aware MPI library is required
+for multi-node. Currently, only Open MPI is supported.
 
-If `use_gpu` is set and `gpu_mode` is set to 0, the remaining parameters can
+If `use_gpu` is set and `gpu_mode` is set to 0, two parameters can
 optionally be set to fine-tune StateSpace performance for a specific device.
 In most cases, the default values provide good performance.
 * `gpu_state_threads`: number of threads per CUDA block to use for the GPU
 StateSpace. This must be a power of 2 in the range [32, 1024].
 * `gpu_data_blocks`: number of data blocks to use for the GPU StateSpace.
 Below 16 data blocks, performance is noticeably reduced.
 
+If `use_gpu` is set and `gpu_mode` is set to 2 or greater (cuStateVecEx), two
+parameters can be set to adjust the transfer buffer size for MPI communication
+or network type.
+* `gpu_cusvex_log_buf_size`: log2 of the buffer size. Default value is 30,
+i.e. the buffer size is 2^30 bytes.
+* `gpu_cusvex_network_type`: Device network type for multi-device:
+0=Switch (default), 1=FullMesh. Or layered network type for multi-process:
+0=SuperPOD (default), 1=GB200NVL, 2=SwitchTree, 3=Communicator.
diff --git a/lib/multiprocess_custatevecex.h b/lib/multiprocess_custatevecex.h
@@ -39,12 +39,11 @@ struct MultiProcessCuStateVecEx {
   };
 
   struct Parameter {
-    uint64_t transfer_buffer_size = 16777216;
+    uint64_t transfer_buffer_size = uint64_t{1} << 30;
     NetworkType network_type = kSuperPod;
   };
 
-  MultiProcessCuStateVecEx(Parameter param = Parameter{16777216, kSuperPod})
-      : param_(param), communicator_(nullptr), initialized_(false) {}
+  MultiProcessCuStateVecEx() : communicator_(nullptr), initialized_(false) {}
 
   ~MultiProcessCuStateVecEx() {
     if (communicator_) {
@@ -55,23 +54,29 @@ struct MultiProcessCuStateVecEx {
     custatevecExCommunicatorFinalize(&status);
   }
 
-  custatevecExCommunicatorDescriptor_t communicator() const {
+  custatevecExCommunicatorDescriptor_t Communicator() const {
     return communicator_;
   }
 
-  unsigned num_processes() const {
+  unsigned NumProcesses() const {
     return num_processes_;
   }
 
-  unsigned rank() const {
+  unsigned Rank() const {
     return rank_;
   }
 
-  bool initialized() const {
+  static bool ValidNetworkType(unsigned network_type) {
+    return network_type < 4;
+  }
+
+  bool Initialized() const {
     return initialized_;
   }
 
-  void initialize() {
+  void Initialize(Parameter param) {
+    param_ = param;
+
     int argc = 0;
     char** argv = nullptr;
 
@@ -109,7 +114,7 @@ struct MultiProcessCuStateVecEx {
     num_global_qubits_ = get_num_global_qubits(num_processes);
 
     unsigned num_acc_global_qubits = 0;
-    auto network_layers = get_network_layers(param_.network_type);
+    auto network_layers = GetNetworkLayers(param_.network_type);
 
     num_global_qubits_per_layer_.reserve(2);
     global_index_bit_classes_.reserve(2);
@@ -149,7 +154,7 @@ struct MultiProcessCuStateVecEx {
     initialized_ = true;
   }
 
-  auto create_sv_config(unsigned num_qubits, cudaDataType_t data_type) const {
+  auto CreateSVConfig(unsigned num_qubits, cudaDataType_t data_type) const {
     custatevecExDictionaryDescriptor_t sv_config = nullptr;
 
     if (!initialized_ ||
@@ -187,7 +192,7 @@ struct MultiProcessCuStateVecEx {
 
   using NetworkLayers = std::vector<NetworkLayer>;
 
-  static NetworkLayers get_network_layers(NetworkType id) {
+  static NetworkLayers GetNetworkLayers(NetworkType id) {
     switch (id) {
     case kSuperPod:
       return {{CUSTATEVEC_EX_GLOBAL_INDEX_BIT_CLASS_INTERPROC_P2P, 3},

diff --git a/lib/statespace_custatevecex.h b/lib/statespace_custatevecex.h
@@ -130,7 +130,7 @@ class StateSpaceCuStateVecEx :
     unsigned required_rank = k / size;
 
     if (state.distr_type() != Base::kMultiProcess
-        || Base::mp.rank() == required_rank) {
+        || Base::mp.Rank() == required_rank) {
       ErrorCheck(custatevecExStateVectorGetState(
           state.get(), buf, kStateDataType, k, k + 1, 1));
     }
@@ -139,7 +139,7 @@ class StateSpaceCuStateVecEx :
 
     if (state.distr_type() == Base::kMultiProcess) {
       auto cuda_type = GetCudaType<std::complex<fp_type>>();
-      auto comm = Base::mp.communicator();
+      auto comm = Base::mp.Communicator();
       ErrorCheck(comm->intf->bcast(comm, buf, 1, cuda_type, required_rank));
     }
 
@@ -161,7 +161,7 @@ class StateSpaceCuStateVecEx :
     unsigned required_rank = k / size;
 
     if (state.distr_type() != Base::kMultiProcess
-        || Base::mp.rank() == required_rank) {
+        || Base::mp.Rank() == required_rank) {
       ErrorCheck(custatevecExStateVectorSetState(
           state.get(), buf, kStateDataType, k, k + 1, 1));
     }
@@ -183,7 +183,7 @@ class StateSpaceCuStateVecEx :
     unsigned required_rank = k / size;
 
     if (state.distr_type() != Base::kMultiProcess
-        || Base::mp.rank() == required_rank) {
+        || Base::mp.Rank() == required_rank) {
       ErrorCheck(custatevecExStateVectorSetState(
           state.get(), buf, kStateDataType, k, k + 1, 1));
     }

diff --git a/lib/vectorspace_custatevecex.h b/lib/vectorspace_custatevecex.h
@@ -245,7 +245,7 @@ class VectorSpaceCuStateVecEx {
         ResultType local_r = callback(k, res);
 
         auto cuda_type = GetCudaType<ResultType>();
-        auto comm = mp_->communicator();
+        auto comm = mp_->Communicator();
         ErrorCheck(comm->intf->allreduce(comm, &local_r, &r, 1, cuda_type));
 
         return r;
@@ -338,7 +338,7 @@ class VectorSpaceCuStateVecEx {
         ResultType local_r = callback(k, res1, res2);
 
         auto cuda_type = GetCudaType<ResultType>();
-        auto comm = mp_->communicator();
+        auto comm = mp_->Communicator();
         ErrorCheck(comm->intf->allreduce(comm, &local_r, &r, 1, cuda_type));
 
         return r;
@@ -385,16 +385,16 @@ class VectorSpaceCuStateVecEx {
   Vector Create(unsigned num_qubits) const {
     custatevecExStateVectorDescriptor_t state_vec;
     custatevecExDictionaryDescriptor_t sv_config
-        = mp.create_sv_config(num_qubits, kStateDataType);
+        = mp.CreateSVConfig(num_qubits, kStateDataType);
 
     unsigned num_substates = 1;
     DistributionType distr_type = kNoDistr;
 
     if (sv_config != nullptr) {
       ErrorCheck(custatevecExStateVectorCreateMultiProcess(
-          &state_vec, sv_config, nullptr, mp.communicator(), nullptr));
+          &state_vec, sv_config, nullptr, mp.Communicator(), nullptr));
 
-      num_substates = mp.num_processes();
+      num_substates = mp.NumProcesses();
       distr_type = kMultiProcess;
 
       if (param.verbosity > 2) {
@@ -480,6 +480,10 @@ class VectorSpaceCuStateVecEx {
     return vector.get() == nullptr;
   }
 
+  static bool ValidDeviceNetworkType(unsigned network_type) {
+    return network_type < 2;
+  }
+
   bool Copy(const Vector& src, Vector& dest) const {
     if (src.num_qubits() != dest.num_qubits()) {
       return false;
@@ -503,15 +507,15 @@ class VectorSpaceCuStateVecEx {
   bool Copy(const Vector& src, fp_type* dest) const {
     if (src.distr_type() == kMultiProcess) {
       uint64_t size = (uint64_t{1} << src.num_qubits()) / src.num_substates();
-      uint64_t offset = size * mp.rank();
+      uint64_t offset = size * mp.Rank();
 
       ErrorCheck(custatevecExStateVectorGetState(
           src.get(), dest + 2 * offset, kStateDataType,
           offset, offset + size, 1));
       ErrorCheck(custatevecExStateVectorSynchronize(src.get()));
 
       auto cuda_type = GetCudaType<std::complex<fp_type>>();
-      auto comm = mp.communicator();
+      auto comm = mp.Communicator();
       ErrorCheck(comm->intf->allgather(
           comm, dest + 2 * offset, dest, size, cuda_type));
     } else {
@@ -529,7 +533,7 @@ class VectorSpaceCuStateVecEx {
   bool Copy(const fp_type* src, Vector& dest) const {
     if (dest.distr_type() == kMultiProcess) {
       uint64_t size = (uint64_t{1} << dest.num_qubits()) / dest.num_substates();
-      uint64_t offset = size * mp.rank();
+      uint64_t offset = size * mp.Rank();
 
       ErrorCheck(custatevecExStateVectorSetState(
           dest.get(), src + 2 * offset, kStateDataType,
@@ -560,7 +564,7 @@ class VectorSpaceCuStateVecEx {
 
     if (dest.distr_type() == kMultiProcess) {
       size /= dest.num_substates();
-      uint64_t offset = size * mp.rank();
+      uint64_t offset = size * mp.Rank();
 
       ErrorCheck(custatevecExStateVectorSetState(
           dest.get(), src + 2 * offset, kStateDataType,

diff --git a/pybind_interface/avx2/pybind_main_avx2.cpp b/pybind_interface/avx2/pybind_main_avx2.cpp
@@ -27,12 +27,9 @@ namespace qsim {
   using Simulator = SimulatorAVX<For>;
 
   struct Factory {
-    // num_state_threads and num_dblocks are unused, but kept for consistency
-    // with the GPU Factory.
-    Factory(
-      unsigned num_sim_threads,
-      unsigned num_state_threads,
-      unsigned num_dblocks) : num_threads(num_sim_threads) {}
+    explicit Factory(const py::dict& options) {
+      num_threads = ParseOptions<unsigned>(options, "t\0");
+    }
 
     using Simulator = qsim::Simulator<For>;
     using StateSpace = Simulator::StateSpace;

diff --git a/pybind_interface/avx512/pybind_main_avx512.cpp b/pybind_interface/avx512/pybind_main_avx512.cpp
@@ -27,12 +27,9 @@ namespace qsim {
   using Simulator = SimulatorAVX512<For>;
 
   struct Factory {
-    // num_state_threads and num_dblocks are unused, but kept for consistency
-    // with the GPU Factory.
-    Factory(
-      unsigned num_sim_threads,
-      unsigned num_state_threads,
-      unsigned num_dblocks) : num_threads(num_sim_threads) {}
+    explicit Factory(const py::dict& options) {
+      num_threads = ParseOptions<unsigned>(options, "t\0");
+    }
 
     using Simulator = qsim::Simulator<For>;
     using StateSpace = Simulator::StateSpace;

diff --git a/pybind_interface/basic/pybind_main_basic.cpp b/pybind_interface/basic/pybind_main_basic.cpp
@@ -27,12 +27,9 @@ namespace qsim {
   using Simulator = SimulatorBasic<For>;
 
   struct Factory {
-    // num_state_threads and num_dblocks are unused, but kept for consistency
-    // with the GPU Factory.
-    Factory(
-      unsigned num_sim_threads,
-      unsigned num_state_threads,
-      unsigned num_dblocks) : num_threads(num_sim_threads) {}
+    explicit Factory(const py::dict& options) {
+      num_threads = ParseOptions<unsigned>(options, "t\0");
+    }
 
     using Simulator = qsim::Simulator<For>;
     using StateSpace = Simulator::StateSpace;

diff --git a/pybind_interface/cuda/pybind_main_cuda.cpp b/pybind_interface/cuda/pybind_main_cuda.cpp
@@ -24,13 +24,9 @@ namespace qsim {
   using Simulator = SimulatorCUDA<float>;
 
   struct Factory {
-    Factory(
-      unsigned num_sim_threads,
-      unsigned num_state_threads,
-      unsigned num_dblocks
-    ) {
-      ss_params.num_threads = num_state_threads;
-      ss_params.num_dblocks = num_dblocks;
+    explicit Factory(const py::dict& options) {
+      ss_params.num_threads = ParseOptions<unsigned>(options, "gsst\0");
+      ss_params.num_dblocks = ParseOptions<unsigned>(options, "gdb\0");
     }
 
     using Simulator = qsim::Simulator;

diff --git a/pybind_interface/custatevec/CMakeLists.txt b/pybind_interface/custatevec/CMakeLists.txt
@@ -45,12 +45,10 @@ include_directories($ENV{CUQUANTUM_ROOT}/include)
 link_directories($ENV{CUQUANTUM_ROOT}/lib $ENV{CUQUANTUM_ROOT}/lib64)
 
 add_library(qsim_custatevec MODULE pybind_main_custatevec.cpp)
-target_link_libraries(qsim_custatevec -lcustatevec -lcublas)
-
 set_target_properties(qsim_custatevec PROPERTIES
     PREFIX "${PYTHON_MODULE_PREFIX}"
     SUFFIX "${PYTHON_MODULE_EXTENSION}"
 )
 set_source_files_properties(pybind_main_custatevec.cpp PROPERTIES LANGUAGE CUDA)
 
-target_link_libraries(qsim_custatevec PRIVATE qsim_openmp_config)
+target_link_libraries(qsim_custatevec PRIVATE qsim_openmp_config -lcustatevec -lcublas)
diff --git a/pybind_interface/custatevec/pybind_main_custatevec.cpp b/pybind_interface/custatevec/pybind_main_custatevec.cpp
@@ -27,11 +27,7 @@ namespace qsim {
   using Simulator = SimulatorCuStateVec<float>;
 
   struct Factory {
-    // num_sim_threads, num_state_threads and num_dblocks are unused, but kept
-    // for consistency with other factories.
-    Factory(unsigned num_sim_threads,
-            unsigned num_state_threads,
-            unsigned num_dblocks) {
+    explicit Factory(const py::dict& options) {
       ErrorCheck(cublasCreate(&cublas_handle));
       ErrorCheck(custatevecCreate(&custatevec_handle));
     }

diff --git a/pybind_interface/custatevecex/CMakeLists.txt b/pybind_interface/custatevecex/CMakeLists.txt
@@ -45,12 +45,10 @@ include_directories($ENV{CUQUANTUM_ROOT}/include)
 link_directories($ENV{CUQUANTUM_ROOT}/lib $ENV{CUQUANTUM_ROOT}/lib64)
 
 add_library(qsim_custatevecex MODULE pybind_main_custatevecex.cpp)
-target_link_libraries(qsim_custatevecex -lcustatevec -lcublas)
-
 set_target_properties(qsim_custatevecex PROPERTIES
     PREFIX "${PYTHON_MODULE_PREFIX}"
     SUFFIX "${PYTHON_MODULE_EXTENSION}"
 )
 set_source_files_properties(pybind_main_custatevecex.cpp PROPERTIES LANGUAGE CUDA)
 
-target_link_libraries(qsim_custatevecex PRIVATE qsim_openmp_config)
+target_link_libraries(qsim_custatevecex PRIVATE qsim_openmp_config -lcustatevec -lcublas)