From c5ecef321a3655758a7f5470496af86334f1cae9 Mon Sep 17 00:00:00 2001
From: Sergei Isakov <54642992+sergeisakov@users.noreply.github.com>
Date: Fri, 30 Jan 2026 17:55:35 +0100
Subject: [PATCH 1/3] Add options for cuStateVecEx.

---
 apps/qsim_base_custatevecex.cu                |  22 +-
 docs/cirq_interface.md                        |  15 +-
 lib/multiprocess_custatevecex.h               |  13 +-
 lib/vectorspace_custatevecex.h                |   4 +
 pybind_interface/avx2/pybind_main_avx2.cpp    |   9 +-
 .../avx512/pybind_main_avx512.cpp             |   9 +-
 pybind_interface/basic/pybind_main_basic.cpp  |   9 +-
 pybind_interface/cuda/pybind_main_cuda.cpp    |  10 +-
 pybind_interface/custatevec/CMakeLists.txt    |   4 +-
 .../custatevec/pybind_main_custatevec.cpp     |   6 +-
 pybind_interface/custatevecex/CMakeLists.txt  |   4 +-
 .../custatevecex/pybind_main_custatevecex.cpp |  55 ++-
 pybind_interface/hip/pybind_main_hip.cpp      |   9 +-
 pybind_interface/pybind_main.cpp              | 371 +++++++-----------
 pybind_interface/pybind_main.h                |  11 +
 pybind_interface/sse/pybind_main_sse.cpp      |   9 +-
 qsimcirq/qsim_simulator.py                    |  14 +-
 tests/hybrid_custatevecex_test.cu             |   2 +-
 tests/qtrajectory_custatevecex_test.cu        |   2 +-
 tests/run_custatevecex_test.cu                |   2 +-
 tests/simulator_custatevecex_test.cu          |   2 +-
 tests/statespace_custatevecex_test.cu         |   2 +-
 22 files changed, 288 insertions(+), 296 deletions(-)
diff --git a/apps/qsim_base_custatevecex.cu b/apps/qsim_base_custatevecex.cu
index 99ce1a283..d7d1180a3 100644
--- a/apps/qsim_base_custatevecex.cu
+++ b/apps/qsim_base_custatevecex.cu
@@ -32,18 +32,19 @@ struct Options {
   std::string circuit_file;
   unsigned maxtime = std::numeric_limits<unsigned>::max();
   unsigned seed = 1;
+  unsigned lbuf = 30;
   unsigned verbosity = 0;
 };
 
 Options GetOptions(int argc, char* argv[]) {
-  constexpr char usage[] = "usage:\n  ./qsim_base -c circuit -d maxtime "
-                           "-s seed -v verbosity\n";
+  constexpr char usage[] = "usage:\n  ./qsim_base_custatevecex.x -c circuit "
+                           "-d maxtime -s seed  -l lbuf -v verbosity\n";
 
   Options opt;
 
   int k;
 
-  while ((k = getopt(argc, argv, "c:d:s:v:")) != -1) {
+  while ((k = getopt(argc, argv, "c:d:s:l:v:")) != -1) {
     switch (k) {
       case 'c':
         opt.circuit_file = optarg;
@@ -54,6 +55,9 @@ Options GetOptions(int argc, char* argv[]) {
       case 's':
         opt.seed = std::atoi(optarg);
         break;
+      case 'l':
+        opt.lbuf = std::atoi(optarg);
+        break;
       case 'v':
         opt.verbosity = std::atoi(optarg);
         break;
@@ -112,8 +116,14 @@ int main(int argc, char* argv[]) {
     using Simulator = qsim::SimulatorCuStateVecEx<fp_type>;
     using StateSpace = Simulator::StateSpace;
 
-    explicit Factory(unsigned verbosity = 0) : verbosity(verbosity) {
-      mp.initialize();
+    explicit Factory(uint64_t transfer_buffer_size, unsigned verbosity = 0)
+        : verbosity(verbosity) {
+      MultiProcessCuStateVecEx::Parameter param = {transfer_buffer_size};
+      mp.initialize(param);
+
+      if (verbosity > 2 && mp.initialized()) {
+        qsim::IO::messagef("# transfer_buf_size=%lu\n", transfer_buffer_size);
+      }
     }
 
     StateSpace CreateStateSpace() const {
@@ -136,7 +146,7 @@ int main(int argc, char* argv[]) {
   using State = StateSpace::State;
   using Runner = CuStateVecExRunner<IO, Factory>;
 
-  Factory factory(opt.verbosity);
+  Factory factory(uint64_t{1} << opt.lbuf, opt.verbosity);
 
   StateSpace state_space = factory.CreateStateSpace();
   State state = state_space.Create(circuit.num_qubits);
diff --git a/docs/cirq_interface.md b/docs/cirq_interface.md
index 5dd1ddb4f..7448d251f 100644
--- a/docs/cirq_interface.md
+++ b/docs/cirq_interface.md
@@ -183,16 +183,17 @@ To compile with the NVIDIA cuStateVec library (v1.0.0 or higher is required),
 set the environmment variable `CUQUANTUM_ROOT` to the path to the cuStateVec
 library.
 
-`QSimOptions` provides five parameters to configure GPU execution. `use_gpu`
+`QSimOptions` provides six parameters to configure GPU execution. `use_gpu`
 is required to enable GPU execution:
 * `use_gpu`: if True, use GPU instead of CPU for simulation.
 * `gpu_mode`: use CUDA if set to 0 (default value), use the NVIDIA cuStateVec
 if set to 1 or use the NVIDIA cuStateVecEx library if set to any other value.
 
 In the case of the NVIDIA cuStateVecEx library, simulations can be performed
-in multi-device / multi-node environments.
+in multi-device / multi-node environments. A CUDA-aware MPI library is required
+for multi-node simulations. Currently, only Open MPI is supported.
 
-If `use_gpu` is set and `gpu_mode` is set to 0, the remaining parameters can
+If `use_gpu` is set and `gpu_mode` is set to 0, two parameters can
 optionally be set to fine-tune StateSpace performance for a specific device.
 In most cases, the default values provide good performance.
 * `gpu_state_threads`: number of threads per CUDA block to use for the GPU
@@ -200,3 +201,11 @@ StateSpace. This must be a power of 2 in the range [32, 1024].
 * `gpu_data_blocks`: number of data blocks to use for the GPU StateSpace.
 Below 16 data blocks, performance is noticeably reduced.
 
+If `use_gpu` is set and `gpu_mode` is set to 2 or greater (cuStateVecEx), two
+parameters can be set to adjust the transfer buffer size used for MPI
+communication and to select the network type.
+* `gpu_cusvex_log_buf_size`: log2 of the transfer buffer size in bytes. The
+default value is 30, i.e. the buffer size is 2^30 bytes.
+* `gpu_cusvex_network_type`: in the multi-device case, the device network
+type: 0=Switch (default), 1=FullMesh. In the multi-process case, the layered
+network type: 0=SuperPOD (default), 1=GB200NVL, 2=SwitchTree, 3=Communicator.
diff --git a/lib/multiprocess_custatevecex.h b/lib/multiprocess_custatevecex.h
index 9c4a13bb1..1ffcd0332 100644
--- a/lib/multiprocess_custatevecex.h
+++ b/lib/multiprocess_custatevecex.h
@@ -39,12 +39,11 @@ struct MultiProcessCuStateVecEx {
   };
 
   struct Parameter {
-    uint64_t transfer_buffer_size = 16777216;
+    uint64_t transfer_buffer_size = uint64_t{1} << 30;
     NetworkType network_type = kSuperPod;
   };
 
-  MultiProcessCuStateVecEx(Parameter param = Parameter{16777216, kSuperPod})
-      : param_(param), communicator_(nullptr), initialized_(false) {}
+  MultiProcessCuStateVecEx() : communicator_(nullptr), initialized_(false) {}
 
   ~MultiProcessCuStateVecEx() {
     if (communicator_) {
@@ -67,11 +66,17 @@ struct MultiProcessCuStateVecEx {
     return rank_;
   }
 
+  static bool valid_network_type(unsigned network_type) {
+    return network_type < 4;  // 0..3; keep in sync with NetworkType.
+  }
+
   bool initialized() const {
     return initialized_;
   }
 
-  void initialize() {
+  void initialize(Parameter param) {
+    param_ = param;
+
     int argc = 0;
     char** argv = nullptr;
 
diff --git a/lib/vectorspace_custatevecex.h b/lib/vectorspace_custatevecex.h
index 3fa26a931..c7b17455a 100644
--- a/lib/vectorspace_custatevecex.h
+++ b/lib/vectorspace_custatevecex.h
@@ -480,6 +480,10 @@ class VectorSpaceCuStateVecEx {
     return vector.get() == nullptr;
   }
 
+  static bool ValidDeviceNetworkType(unsigned network_type) {
+    return network_type < 2;  // 0..1; keep in sync with DeviceNetworkType.
+  }
+
   bool Copy(const Vector& src, Vector& dest) const {
     if (src.num_qubits() != dest.num_qubits()) {
       return false;
diff --git a/pybind_interface/avx2/pybind_main_avx2.cpp b/pybind_interface/avx2/pybind_main_avx2.cpp
index a70f1b0f2..193ed97e3 100644
--- a/pybind_interface/avx2/pybind_main_avx2.cpp
+++ b/pybind_interface/avx2/pybind_main_avx2.cpp
@@ -27,12 +27,9 @@ namespace qsim {
   using Simulator = SimulatorAVX<For>;
 
   struct Factory {
-    // num_state_threads and num_dblocks are unused, but kept for consistency
-    // with the GPU Factory.
-    Factory(
-      unsigned num_sim_threads,
-      unsigned num_state_threads,
-      unsigned num_dblocks) : num_threads(num_sim_threads) {}
+    // Takes the number of simulator threads from the "t" option.
+    explicit Factory(const py::dict& options)
+        : num_threads(ParseOptions<unsigned>(options, "t\0")) {}
 
     using Simulator = qsim::Simulator<For>;
     using StateSpace = Simulator::StateSpace;
diff --git a/pybind_interface/avx512/pybind_main_avx512.cpp b/pybind_interface/avx512/pybind_main_avx512.cpp
index 548bf40bf..b730a0d6d 100644
--- a/pybind_interface/avx512/pybind_main_avx512.cpp
+++ b/pybind_interface/avx512/pybind_main_avx512.cpp
@@ -27,12 +27,9 @@ namespace qsim {
   using Simulator = SimulatorAVX512<For>;
 
   struct Factory {
-    // num_state_threads and num_dblocks are unused, but kept for consistency
-    // with the GPU Factory.
-    Factory(
-      unsigned num_sim_threads,
-      unsigned num_state_threads,
-      unsigned num_dblocks) : num_threads(num_sim_threads) {}
+    // Takes the number of simulator threads from the "t" option.
+    explicit Factory(const py::dict& options)
+        : num_threads(ParseOptions<unsigned>(options, "t\0")) {}
 
     using Simulator = qsim::Simulator<For>;
     using StateSpace = Simulator::StateSpace;
diff --git a/pybind_interface/basic/pybind_main_basic.cpp b/pybind_interface/basic/pybind_main_basic.cpp
index 9b37191b5..6db3f5a5e 100644
--- a/pybind_interface/basic/pybind_main_basic.cpp
+++ b/pybind_interface/basic/pybind_main_basic.cpp
@@ -27,12 +27,9 @@ namespace qsim {
   using Simulator = SimulatorBasic<For>;
 
   struct Factory {
-    // num_state_threads and num_dblocks are unused, but kept for consistency
-    // with the GPU Factory.
-    Factory(
-      unsigned num_sim_threads,
-      unsigned num_state_threads,
-      unsigned num_dblocks) : num_threads(num_sim_threads) {}
+    // Takes the number of simulator threads from the "t" option.
+    explicit Factory(const py::dict& options)
+        : num_threads(ParseOptions<unsigned>(options, "t\0")) {}
 
     using Simulator = qsim::Simulator<For>;
     using StateSpace = Simulator::StateSpace;
diff --git a/pybind_interface/cuda/pybind_main_cuda.cpp b/pybind_interface/cuda/pybind_main_cuda.cpp
index df9a293ee..d399de4c8 100644
--- a/pybind_interface/cuda/pybind_main_cuda.cpp
+++ b/pybind_interface/cuda/pybind_main_cuda.cpp
@@ -24,13 +24,9 @@ namespace qsim {
   using Simulator = SimulatorCUDA<float>;
 
   struct Factory {
-    Factory(
-      unsigned num_sim_threads,
-      unsigned num_state_threads,
-      unsigned num_dblocks
-    ) {
-      ss_params.num_threads = num_state_threads;
-      ss_params.num_dblocks = num_dblocks;
+    explicit Factory(const py::dict& options) {
+      ss_params.num_threads = ParseOptions<unsigned>(options, "gsst\0");
+      ss_params.num_dblocks = ParseOptions<unsigned>(options, "gdb\0");
     }
 
     using Simulator = qsim::Simulator;
diff --git a/pybind_interface/custatevec/CMakeLists.txt b/pybind_interface/custatevec/CMakeLists.txt
index 34bda5e48..9afecb6bc 100644
--- a/pybind_interface/custatevec/CMakeLists.txt
+++ b/pybind_interface/custatevec/CMakeLists.txt
@@ -45,12 +45,10 @@ include_directories($ENV{CUQUANTUM_ROOT}/include)
 link_directories($ENV{CUQUANTUM_ROOT}/lib $ENV{CUQUANTUM_ROOT}/lib64)
 
 add_library(qsim_custatevec MODULE pybind_main_custatevec.cpp)
-target_link_libraries(qsim_custatevec -lcustatevec -lcublas)
-
 set_target_properties(qsim_custatevec PROPERTIES
     PREFIX "${PYTHON_MODULE_PREFIX}"
     SUFFIX "${PYTHON_MODULE_EXTENSION}"
 )
 set_source_files_properties(pybind_main_custatevec.cpp PROPERTIES LANGUAGE CUDA)
 
-target_link_libraries(qsim_custatevec PRIVATE qsim_openmp_config)
+target_link_libraries(qsim_custatevec PRIVATE qsim_openmp_config -lcustatevec -lcublas)
diff --git a/pybind_interface/custatevec/pybind_main_custatevec.cpp b/pybind_interface/custatevec/pybind_main_custatevec.cpp
index b4523c8ff..6e61831cf 100644
--- a/pybind_interface/custatevec/pybind_main_custatevec.cpp
+++ b/pybind_interface/custatevec/pybind_main_custatevec.cpp
@@ -27,11 +27,7 @@ namespace qsim {
   using Simulator = SimulatorCuStateVec<float>;
 
   struct Factory {
-    // num_sim_threads, num_state_threads and num_dblocks are unused, but kept
-    // for consistency with other factories.
-    Factory(unsigned num_sim_threads,
-            unsigned num_state_threads,
-            unsigned num_dblocks) {
+    explicit Factory(const py::dict& options) {  // options is unused.
       ErrorCheck(cublasCreate(&cublas_handle));
       ErrorCheck(custatevecCreate(&custatevec_handle));
     }
diff --git a/pybind_interface/custatevecex/CMakeLists.txt b/pybind_interface/custatevecex/CMakeLists.txt
index 93d66ac33..0b4d49439 100644
--- a/pybind_interface/custatevecex/CMakeLists.txt
+++ b/pybind_interface/custatevecex/CMakeLists.txt
@@ -45,12 +45,10 @@ include_directories($ENV{CUQUANTUM_ROOT}/include)
 link_directories($ENV{CUQUANTUM_ROOT}/lib $ENV{CUQUANTUM_ROOT}/lib64)
 
 add_library(qsim_custatevecex MODULE pybind_main_custatevecex.cpp)
-target_link_libraries(qsim_custatevecex -lcustatevec -lcublas)
-
 set_target_properties(qsim_custatevecex PROPERTIES
     PREFIX "${PYTHON_MODULE_PREFIX}"
     SUFFIX "${PYTHON_MODULE_EXTENSION}"
 )
 set_source_files_properties(pybind_main_custatevecex.cpp PROPERTIES LANGUAGE CUDA)
 
-target_link_libraries(qsim_custatevecex PRIVATE qsim_openmp_config)
+target_link_libraries(qsim_custatevecex PRIVATE qsim_openmp_config -lcustatevec -lcublas)
diff --git a/pybind_interface/custatevecex/pybind_main_custatevecex.cpp b/pybind_interface/custatevecex/pybind_main_custatevecex.cpp
index c29a608a6..e8b9e7f6d 100644
--- a/pybind_interface/custatevecex/pybind_main_custatevecex.cpp
+++ b/pybind_interface/custatevecex/pybind_main_custatevecex.cpp
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <stdexcept>
+
 #include <custatevecEx.h>
 
 #include "pybind_main_custatevecex.h"
@@ -33,19 +35,43 @@ namespace qsim {
   using Simulator = SimulatorCuStateVecEx<float>;
 
   struct Factory {
-    // num_sim_threads, num_state_threads and num_dblocks are unused, but kept
-    // for consistency with other factories.
-    Factory(unsigned num_sim_threads,
-            unsigned num_state_threads,
-            unsigned num_dblocks) {
+    using Simulator = qsim::Simulator;
+    using StateSpace = Simulator::StateSpace;
+
+    explicit Factory(const py::dict& options) {
+      verbosity = ParseOptions<unsigned>(options, "v\0");
+      nwt = ParseOptions<unsigned>(options, "gnwt\0");
+
+      if (!mp.initialized()) {
+        using MP = qsim::MultiProcessCuStateVecEx;
+
+        if (!mp.valid_network_type(nwt)) {
+          throw std::invalid_argument("Invalid network type.");
+        }
+
+        // The buffer size is 2^l bytes; shifting by 64 or more is undefined.
+        unsigned l = ParseOptions<unsigned>(options, "glbuf\0");
+        if (l >= 64) {
+          throw std::invalid_argument("Invalid log2 of buffer size.");
+        }
+
+        uint64_t buffer_size = uint64_t{1} << l;
+        MP::Parameter param{buffer_size, static_cast<MP::NetworkType>(nwt)};
+
+        mp.initialize(param);
+
+        if (verbosity > 2 && mp.initialized()) {
+          qsim::IO::messagef("transfer_buf_size=%lu\n", buffer_size);
+        }
+      }
+
       if (!mp.initialized()) {
-        mp.initialize();
+        if (!StateSpace::ValidDeviceNetworkType(nwt)) {
+          throw std::invalid_argument("Invalid device network type.");
+        }
       }
     }
 
-    using Simulator = qsim::Simulator;
-    using StateSpace = Simulator::StateSpace;
-
     using Gate = Cirq::GateCirq<float>;
     using Runner = CuStateVecExRunner<IO, Factory>;
     struct RunnerParameter : public Runner::Parameter {
@@ -59,12 +85,21 @@ namespace qsim {
     };
 
     StateSpace CreateStateSpace() const {
-      return StateSpace{mp};
+      using NetworkType = StateSpace::DeviceNetworkType;
+
+      StateSpace::Parameter param;
+      param.device_network_type = static_cast<NetworkType>(nwt);
+      param.verbosity = verbosity;
+
+      return StateSpace{mp, param};
     }
 
     Simulator CreateSimulator() const {
       return Simulator{};
     }
+
+    unsigned verbosity = 0;
+    unsigned nwt = 0;
   };
 
   inline void SetFlushToZeroAndDenormalsAreZeros() {}
diff --git a/pybind_interface/hip/pybind_main_hip.cpp b/pybind_interface/hip/pybind_main_hip.cpp
index 98a9e3fcd..726048bcd 100644
--- a/pybind_interface/hip/pybind_main_hip.cpp
+++ b/pybind_interface/hip/pybind_main_hip.cpp
@@ -24,11 +24,10 @@ namespace qsim {
   using Simulator = SimulatorCUDA<float>;
 
   struct Factory {
-    Factory(
-      unsigned num_sim_threads,
-      unsigned num_state_threads,
-      unsigned num_dblocks
-    ) : ss_params{num_state_threads, num_dblocks} {}
+    explicit Factory(const py::dict& options) {
+      ss_params.num_threads = ParseOptions<unsigned>(options, "gsst\0");
+      ss_params.num_dblocks = ParseOptions<unsigned>(options, "gdb\0");
+    }
 
     using Simulator = qsim::Simulator;
     using StateSpace = Simulator::StateSpace;
diff --git a/pybind_interface/pybind_main.cpp b/pybind_interface/pybind_main.cpp
index 5f6644469..1efba5484 100644
--- a/pybind_interface/pybind_main.cpp
+++ b/pybind_interface/pybind_main.cpp
@@ -34,16 +34,6 @@ using namespace qsim;
 
 namespace {
 
-template <typename T>
-T parseOptions(const py::dict &options, const char *key) {
-  if (!options.contains(key)) {
-    std::string msg = std::string("Argument ") + key + " is not provided.\n";
-    throw std::invalid_argument(msg);
-  }
-  const auto &value = options[key];
-  return value.cast<T>();
-}
-
 Circuit<Factory::Gate> getCircuit(const py::dict &options) {
   try {
     return options["c\0"].cast<Circuit<Factory::Gate>>();
@@ -63,7 +53,7 @@ NoisyCircuit<Factory::Gate> getNoisyCircuit(const py::dict &options) {
 std::vector<Bitstring> getBitstrings(const py::dict &options, int num_qubits) {
   std::string bitstrings_str;
   try {
-    bitstrings_str = parseOptions<std::string>(options, "i\0");
+    bitstrings_str = ParseOptions<std::string>(options, "i\0");
   } catch (const std::invalid_argument &exp) {
     throw;
   }
@@ -389,40 +379,28 @@ std::vector<std::complex<float>> qsim_simulate(const py::dict &options) {
     }
   };
 
-  bool use_gpu;
   bool denormals_are_zeros;
-  unsigned gpu_mode;
-  unsigned num_sim_threads = 0;
-  unsigned num_state_threads = 0;
-  unsigned num_dblocks = 0;
   RunnerParameter param;
+
   try {
-    use_gpu = parseOptions<unsigned>(options, "g\0");
-    gpu_mode = parseOptions<unsigned>(options, "gmode\0");
-    denormals_are_zeros = parseOptions<unsigned>(options, "z\0");
-    if (use_gpu == 0) {
-      num_sim_threads = parseOptions<unsigned>(options, "t\0");
-    } else if (gpu_mode == 0) {
-      num_state_threads = parseOptions<unsigned>(options, "gsst\0");
-      num_dblocks = parseOptions<unsigned>(options, "gdb\0");
+    denormals_are_zeros = ParseOptions<unsigned>(options, "z\0");
+    param.max_fused_size = ParseOptions<unsigned>(options, "f\0");
+    param.verbosity = ParseOptions<unsigned>(options, "v\0");
+    param.seed = ParseOptions<unsigned>(options, "s\0");
+
+    if (denormals_are_zeros) {
+      SetFlushToZeroAndDenormalsAreZeros();
+    } else {
+      ClearFlushToZeroAndDenormalsAreZeros();
     }
-    param.max_fused_size = parseOptions<unsigned>(options, "f\0");
-    param.verbosity = parseOptions<unsigned>(options, "v\0");
-    param.seed = parseOptions<unsigned>(options, "s\0");
+
+    Factory factory(options);
+    Runner::Run(param, factory, circuit, measure);
   } catch (const std::invalid_argument &exp) {
     IO::errorf("%s", exp.what());
     return {};
   }
 
-  if (denormals_are_zeros) {
-    SetFlushToZeroAndDenormalsAreZeros();
-  } else {
-    ClearFlushToZeroAndDenormalsAreZeros();
-  }
-
-  Runner::Run(
-    param, Factory(num_sim_threads, num_state_threads, num_dblocks), circuit,
-    measure);
   return amplitudes;
 }
 
@@ -450,54 +428,42 @@ std::vector<std::complex<float>> qtrajectory_simulate(const py::dict &options) {
   amplitudes.reserve(bitstrings.size());
 
   NoisyRunnerParameter param;
-  bool use_gpu;
   bool denormals_are_zeros;
-  unsigned gpu_mode;
-  unsigned num_sim_threads = 0;
-  unsigned num_state_threads = 0;
-  unsigned num_dblocks = 0;
   uint64_t seed;
 
   try {
-    use_gpu = parseOptions<unsigned>(options, "g\0");
-    gpu_mode = parseOptions<unsigned>(options, "gmode\0");
-    denormals_are_zeros = parseOptions<unsigned>(options, "z\0");
-    if (use_gpu == 0) {
-      num_sim_threads = parseOptions<unsigned>(options, "t\0");
-    } else if (gpu_mode == 0) {
-      num_state_threads = parseOptions<unsigned>(options, "gsst\0");
-      num_dblocks = parseOptions<unsigned>(options, "gdb\0");
-    }
-    param.max_fused_size = parseOptions<unsigned>(options, "f\0");
-    param.verbosity = parseOptions<unsigned>(options, "v\0");
-    seed = parseOptions<unsigned>(options, "s\0");
-  } catch (const std::invalid_argument &exp) {
-    IO::errorf("%s", exp.what());
-    return {};
-  }
+    denormals_are_zeros = ParseOptions<unsigned>(options, "z\0");
+    param.max_fused_size = ParseOptions<unsigned>(options, "f\0");
+    param.verbosity = ParseOptions<unsigned>(options, "v\0");
+    seed = ParseOptions<unsigned>(options, "s\0");
 
-  Factory factory(num_sim_threads, num_state_threads, num_dblocks);
-  Simulator simulator = factory.CreateSimulator();
-  StateSpace state_space = factory.CreateStateSpace();
+    Factory factory(options);
+    Simulator simulator = factory.CreateSimulator();
+    StateSpace state_space = factory.CreateStateSpace();
 
-  auto measure = [&bitstrings, &amplitudes, &state_space](
-                  unsigned k, const State &state, NoisyRunner::Stat& stat) {
-    for (const auto &b : bitstrings) {
-      amplitudes.push_back(state_space.GetAmpl(state, b));
-    }
-  };
+    auto measure = [&bitstrings, &amplitudes, &state_space](
+                    unsigned k, const State &state, NoisyRunner::Stat& stat) {
+      for (const auto &b : bitstrings) {
+        amplitudes.push_back(state_space.GetAmpl(state, b));
+      }
+    };
 
-  if (denormals_are_zeros) {
-    SetFlushToZeroAndDenormalsAreZeros();
-  } else {
-    ClearFlushToZeroAndDenormalsAreZeros();
-  }
+    if (denormals_are_zeros) {
+      SetFlushToZeroAndDenormalsAreZeros();
+    } else {
+      ClearFlushToZeroAndDenormalsAreZeros();
+    }
 
-  if (!NoisyRunner::RunBatch(param, ncircuit, seed, seed + 1, state_space,
-                             simulator, measure)) {
-    IO::errorf("qtrajectory simulation of the circuit errored out.\n");
+    if (!NoisyRunner::RunBatch(param, ncircuit, seed, seed + 1, state_space,
+                               simulator, measure)) {
+      IO::errorf("qtrajectory simulation of the circuit errored out.\n");
+      return {};
+    }
+  } catch (const std::invalid_argument &exp) {
+    IO::errorf("%s", exp.what());
     return {};
   }
+
   return amplitudes;
 }
 
@@ -665,48 +631,38 @@ class SimulatorHelper {
 
  private:
   SimulatorHelper(const py::dict &options, bool noisy)
-      : factory(Factory(1, 1, 1)),
+      : factory(Factory(options)),
         state(StateSpace::Null()),
         scratch(StateSpace::Null()) {
     bool denormals_are_zeros;
     is_valid = false;
     is_noisy = noisy;
+
     try {
       if (is_noisy) {
         ncircuit = getNoisyCircuit(options);
         num_qubits = ncircuit.num_qubits;
-        noisy_reps = parseOptions<unsigned>(options, "r\0");
+        noisy_reps = ParseOptions<unsigned>(options, "r\0");
       } else {
         circuit = getCircuit(options);
         num_qubits = circuit.num_qubits;
       }
 
-      use_gpu = parseOptions<unsigned>(options, "g\0");
-      gpu_mode = parseOptions<unsigned>(options, "gmode\0");
-      denormals_are_zeros = parseOptions<unsigned>(options, "z\0");
-      if (use_gpu == 0) {
-        num_sim_threads = parseOptions<unsigned>(options, "t\0");
-      } else if (gpu_mode == 0) {
-        num_state_threads = parseOptions<unsigned>(options, "gsst\0");
-        num_dblocks = parseOptions<unsigned>(options, "gdb\0");
-      }
-      max_fused_size = parseOptions<unsigned>(options, "f\0");
-      verbosity = parseOptions<unsigned>(options, "v\0");
-      seed = parseOptions<unsigned>(options, "s\0");
-
-      if (use_gpu == 0 || gpu_mode == 0) {
-        factory = Factory(num_sim_threads, num_state_threads, num_dblocks);
-      }
+      denormals_are_zeros = ParseOptions<unsigned>(options, "z\0");
+      max_fused_size = ParseOptions<unsigned>(options, "f\0");
+      verbosity = ParseOptions<unsigned>(options, "v\0");
+      seed = ParseOptions<unsigned>(options, "s\0");
 
       StateSpace state_space = factory.CreateStateSpace();
       state = state_space.Create(num_qubits);
-      is_valid = true;
 
       if (denormals_are_zeros) {
         SetFlushToZeroAndDenormalsAreZeros();
       } else {
         ClearFlushToZeroAndDenormalsAreZeros();
       }
+
+      is_valid = true;
     } catch (const std::invalid_argument &exp) {
       // If this triggers, is_valid is false.
       IO::errorf("%s", exp.what());
@@ -848,12 +804,7 @@ class SimulatorHelper {
   State state;
   State scratch;
 
-  bool use_gpu;
-  unsigned gpu_mode;
   unsigned num_qubits;
-  unsigned num_sim_threads;
-  unsigned num_state_threads;
-  unsigned num_dblocks;
   unsigned noisy_reps;
   unsigned max_fused_size;
   unsigned verbosity;
@@ -999,53 +950,43 @@ std::vector<unsigned> qsim_sample(const py::dict &options) {
   using State = StateSpace::State;
   using MeasurementResult = StateSpace::MeasurementResult;
 
-  bool use_gpu;
   bool denormals_are_zeros;
-  unsigned gpu_mode;
-  unsigned num_sim_threads = 0;
-  unsigned num_state_threads = 0;
-  unsigned num_dblocks = 0;
   RunnerParameter param;
+
+  std::vector<unsigned> result_bits;
+
   try {
-    use_gpu = parseOptions<unsigned>(options, "g\0");
-    gpu_mode = parseOptions<unsigned>(options, "gmode\0");
-    denormals_are_zeros = parseOptions<unsigned>(options, "z\0");
-    if (use_gpu == 0) {
-      num_sim_threads = parseOptions<unsigned>(options, "t\0");
-    } else if (gpu_mode == 0) {
-      num_state_threads = parseOptions<unsigned>(options, "gsst\0");
-      num_dblocks = parseOptions<unsigned>(options, "gdb\0");
-    }
-    param.max_fused_size = parseOptions<unsigned>(options, "f\0");
-    param.verbosity = parseOptions<unsigned>(options, "v\0");
-    param.seed = parseOptions<unsigned>(options, "s\0");
-  } catch (const std::invalid_argument &exp) {
-    IO::errorf("%s", exp.what());
-    return {};
-  }
+    denormals_are_zeros = ParseOptions<unsigned>(options, "z\0");
+    param.max_fused_size = ParseOptions<unsigned>(options, "f\0");
+    param.verbosity = ParseOptions<unsigned>(options, "v\0");
+    param.seed = ParseOptions<unsigned>(options, "s\0");
 
-  std::vector<MeasurementResult> results;
-  Factory factory(num_sim_threads, num_state_threads, num_dblocks);
-  StateSpace state_space = factory.CreateStateSpace();
-  State state = state_space.Create(circuit.num_qubits);
-  state_space.SetStateZero(state);
+    std::vector<MeasurementResult> results;
+    Factory factory(options);
+    StateSpace state_space = factory.CreateStateSpace();
+    State state = state_space.Create(circuit.num_qubits);
+    state_space.SetStateZero(state);
 
-  if (denormals_are_zeros) {
-    SetFlushToZeroAndDenormalsAreZeros();
-  } else {
-    ClearFlushToZeroAndDenormalsAreZeros();
-  }
+    if (denormals_are_zeros) {
+      SetFlushToZeroAndDenormalsAreZeros();
+    } else {
+      ClearFlushToZeroAndDenormalsAreZeros();
+    }
 
-  if (!Runner::Run(param, factory, circuit, state, results)) {
-    IO::errorf("qsim sampling of the circuit errored out.\n");
+    if (!Runner::Run(param, factory, circuit, state, results)) {
+      IO::errorf("qsim sampling of the circuit errored out.\n");
+      return {};
+    }
+
+    for (const auto& result : results) {
+      result_bits.insert(result_bits.end(), result.bitstring.begin(),
+                         result.bitstring.end());
+    }
+  } catch (const std::invalid_argument &exp) {
+    IO::errorf("%s", exp.what());
     return {};
   }
 
-  std::vector<unsigned> result_bits;
-  for (const auto& result : results) {
-    result_bits.insert(result_bits.end(), result.bitstring.begin(),
-                       result.bitstring.end());
-  }
   return result_bits;
 }
 
@@ -1064,79 +1005,68 @@ std::vector<unsigned> qtrajectory_sample(const py::dict &options) {
   using StateSpace = Simulator::StateSpace;
   using State = StateSpace::State;
 
-  NoisyRunnerParameter param;
-  bool use_gpu;
   bool denormals_are_zeros;
-  unsigned gpu_mode;
-  unsigned num_sim_threads = 0;
-  unsigned num_state_threads = 0;
-  unsigned num_dblocks = 0;
+  NoisyRunnerParameter param;
   uint64_t seed;
 
+  std::vector<unsigned> result_bits;
+
   try {
-    use_gpu = parseOptions<unsigned>(options, "g\0");
-    gpu_mode = parseOptions<unsigned>(options, "gmode\0");
-    denormals_are_zeros = parseOptions<unsigned>(options, "z\0");
-    if (use_gpu == 0) {
-      num_sim_threads = parseOptions<unsigned>(options, "t\0");
-    } else if (gpu_mode == 0) {
-      num_state_threads = parseOptions<unsigned>(options, "gsst\0");
-      num_dblocks = parseOptions<unsigned>(options, "gdb\0");
-    }
-    param.max_fused_size = parseOptions<unsigned>(options, "f\0");
-    param.verbosity = parseOptions<unsigned>(options, "v\0");
-    seed = parseOptions<unsigned>(options, "s\0");
+    denormals_are_zeros = ParseOptions<unsigned>(options, "z\0");
+    param.max_fused_size = ParseOptions<unsigned>(options, "f\0");
+    param.verbosity = ParseOptions<unsigned>(options, "v\0");
+    seed = ParseOptions<unsigned>(options, "s\0");
     param.collect_mea_stat = true;
-  } catch (const std::invalid_argument &exp) {
-    IO::errorf("%s", exp.what());
-    return {};
-  }
 
-  Factory factory(num_sim_threads, num_state_threads, num_dblocks);
-  Simulator simulator = factory.CreateSimulator();
-  StateSpace state_space = factory.CreateStateSpace();
-
-  std::vector<std::vector<unsigned>> results;
-
-  auto measure = [&results, &ncircuit](
-                  unsigned k, const State& state, NoisyRunner::Stat& stat) {
-    // Converts stat (which matches the MeasurementResult 'bits' field) into
-    // bitstrings matching the MeasurementResult 'bitstring' field.
-    unsigned idx = 0;
-    for (const auto& channel : ncircuit.channels) {
-      if (channel[0].kind != gate::kMeasurement)
-        continue;
-      for (const auto& op : channel[0].ops) {
-        std::vector<unsigned> bitstring;
-        uint64_t val = stat.samples[idx];
-        for (const auto& q : op.qubits) {
-          bitstring.push_back((val >> q) & 1);
-        }
-        results.push_back(bitstring);
+    Factory factory(options);
+    Simulator simulator = factory.CreateSimulator();
+    StateSpace state_space = factory.CreateStateSpace();
 
-        idx += 1;
-        if (idx >= stat.samples.size())
-          return;
+    std::vector<std::vector<unsigned>> results;
+
+    auto measure = [&results, &ncircuit](
+                    unsigned k, const State& state, NoisyRunner::Stat& stat) {
+      // Converts stat (which matches the MeasurementResult 'bits' field) into
+      // bitstrings matching the MeasurementResult 'bitstring' field.
+      unsigned idx = 0;
+      for (const auto& channel : ncircuit.channels) {
+        if (channel[0].kind != gate::kMeasurement)
+          continue;
+        for (const auto& op : channel[0].ops) {
+          std::vector<unsigned> bitstring;
+          uint64_t val = stat.samples[idx];
+          for (const auto& q : op.qubits) {
+            bitstring.push_back((val >> q) & 1);
+          }
+          results.push_back(bitstring);
+
+          idx += 1;
+          if (idx >= stat.samples.size())
+            return;
+        }
       }
+    };
+
+    if (denormals_are_zeros) {
+      SetFlushToZeroAndDenormalsAreZeros();
+    } else {
+      ClearFlushToZeroAndDenormalsAreZeros();
     }
-  };
 
-  if (denormals_are_zeros) {
-    SetFlushToZeroAndDenormalsAreZeros();
-  } else {
-    ClearFlushToZeroAndDenormalsAreZeros();
-  }
+    if (!NoisyRunner::RunBatch(param, ncircuit, seed, seed + 1,
+                               state_space, simulator, measure)) {
+      IO::errorf("qtrajectory sampling of the circuit errored out.\n");
+      return {};
+    }
 
-  if (!NoisyRunner::RunBatch(param, ncircuit, seed, seed + 1,
-                             state_space, simulator, measure)) {
-    IO::errorf("qtrajectory sampling of the circuit errored out.\n");
+    for (const auto& bitstring : results) {
+      result_bits.insert(result_bits.end(), bitstring.begin(), bitstring.end());
+    }
+  } catch (const std::invalid_argument &exp) {
+    IO::errorf("%s", exp.what());
     return {};
   }
 
-  std::vector<unsigned> result_bits;
-  for (const auto& bitstring : results) {
-    result_bits.insert(result_bits.end(), bitstring.begin(), bitstring.end());
-  }
   return result_bits;
 }
 
@@ -1155,36 +1085,37 @@ std::vector<std::complex<float>> qsimh_simulate(const py::dict &options) {
   try {
     circuit = getCircuit(options);
     bitstrings = getBitstrings(options, circuit.num_qubits);
-    dense_parts = parseOptions<py::list>(options, "k\0");
-    param.prefix = parseOptions<uint64_t>(options, "w\0");
-    param.num_prefix_gatexs = parseOptions<unsigned>(options, "p\0");
-    param.num_root_gatexs = parseOptions<unsigned>(options, "r\0");
-    param.num_threads = parseOptions<unsigned>(options, "t\0");
-    param.max_fused_size = parseOptions<unsigned>(options, "f\0");
-    param.verbosity = parseOptions<unsigned>(options, "v\0");
-  } catch (const std::invalid_argument &exp) {
-    IO::errorf("%s", exp.what());
-    return {};
-  }
-
-  std::vector<unsigned> parts(circuit.num_qubits, 0);
-  for (auto i : dense_parts) {
-    unsigned idx = i.cast<unsigned>();
-    if (idx >= circuit.num_qubits) {
-      IO::errorf("Invalid arguments are provided for arg k.\n");
-      return {};
+    dense_parts = ParseOptions<py::list>(options, "k\0");
+    param.prefix = ParseOptions<uint64_t>(options, "w\0");
+    param.num_prefix_gatexs = ParseOptions<unsigned>(options, "p\0");
+    param.num_root_gatexs = ParseOptions<unsigned>(options, "r\0");
+    param.num_threads = ParseOptions<unsigned>(options, "t\0");
+    param.max_fused_size = ParseOptions<unsigned>(options, "f\0");
+    param.verbosity = ParseOptions<unsigned>(options, "v\0");
+
+    std::vector<unsigned> parts(circuit.num_qubits, 0);
+    for (auto i : dense_parts) {
+      unsigned idx = i.cast<unsigned>();
+      if (idx >= circuit.num_qubits) {
+        IO::errorf("Invalid arguments are provided for arg k.\n");
+        return {};
+      }
+      parts[i.cast<unsigned>()] = 1;
     }
-    parts[i.cast<unsigned>()] = 1;
-  }
 
-  // Define container for amplitudes
-  std::vector<std::complex<float>> amplitudes(bitstrings.size(), 0);
+    // Define container for amplitudes
+    std::vector<std::complex<float>> amplitudes(bitstrings.size(), 0);
 
-  Factory factory(param.num_threads, 0, 0);
+    Factory factory(options);
 
-  if (Runner::Run(param, factory, circuit, parts, bitstrings, amplitudes)) {
-    return amplitudes;
+    if (Runner::Run(param, factory, circuit, parts, bitstrings, amplitudes)) {
+      return amplitudes;
+    }
+  } catch (const std::invalid_argument &exp) {
+    IO::errorf("%s", exp.what());
+    return {};
   }
+
   IO::errorf("qsimh simulation of the circuit errored out.\n");
   return {};
 }
diff --git a/pybind_interface/pybind_main.h b/pybind_interface/pybind_main.h
index 1e4395473..325b9a91e 100644
--- a/pybind_interface/pybind_main.h
+++ b/pybind_interface/pybind_main.h
@@ -23,6 +23,7 @@
 namespace py = pybind11;
 
 #include <map>
+#include <stdexcept>
 #include <vector>
 
 #include "../lib/circuit.h"
@@ -180,6 +181,16 @@ qtrajectory_simulate_moment_expectation_values(
 // Hybrid simulator.
 std::vector<std::complex<float>> qsimh_simulate(const py::dict &options);
 
+// Returns options[key] as T; throws std::invalid_argument if key is missing.
+template <typename T>
+T ParseOptions(const py::dict& options, const char* key) {
+  if (options.contains(key)) {
+    return options[key].cast<T>();
+  }
+  throw std::invalid_argument(
+      std::string("Argument ") + key + " is not provided.\n");
+}
+
 #define MODULE_BINDINGS                                                               \
       m.doc() = "pybind11 plugin";  /* optional module docstring */                   \
       /* Methods for returning amplitudes */                                          \
diff --git a/pybind_interface/sse/pybind_main_sse.cpp b/pybind_interface/sse/pybind_main_sse.cpp
index f04925135..21287d31f 100644
--- a/pybind_interface/sse/pybind_main_sse.cpp
+++ b/pybind_interface/sse/pybind_main_sse.cpp
@@ -27,12 +27,9 @@ namespace qsim {
   using Simulator = SimulatorSSE<For>;
 
   struct Factory {
-    // num_state_threads and num_dblocks are unused, but kept for consistency
-    // with the GPU Factory.
-    Factory(
-      unsigned num_sim_threads,
-      unsigned num_state_threads,
-      unsigned num_dblocks) : num_threads(num_sim_threads) {}
+    explicit Factory(const py::dict& options) {
+      num_threads = ParseOptions<unsigned>(options, "t\0");
+    }
 
     using Simulator = qsim::Simulator<For>;
     using StateSpace = Simulator::StateSpace;
diff --git a/qsimcirq/qsim_simulator.py b/qsimcirq/qsim_simulator.py
index 240715639..417f67072 100644
--- a/qsimcirq/qsim_simulator.py
+++ b/qsimcirq/qsim_simulator.py
@@ -63,11 +63,19 @@ class QSimOptions:
         gpu_mode: use CUDA if set to 0 (default value), use the NVIDIA
             cuStateVec library if set to 1 or use the NVIDIA cuStateVecEx
             library if set to any other value. The "gpu_*" arguments below are
-            only considered if this is set to 0.
+            only considered if this is set to 0. The "gpu_cusvex_*" arguments
+            below are only considered if this is neither 0 nor 1.
         gpu_state_threads: number of threads per CUDA block to use for the GPU
             StateSpace. This must be a power of 2 in the range [32, 1024].
         gpu_data_blocks: number of data blocks to use for the GPU StateSpace.
             Below 16 data blocks, performance is noticeably reduced.
+        gpu_cusvex_log_buf_size: log2 of the transfer buffer size that is used
+            for MPI communication. Default value is 30, i.e. the buffer size is
+            2^30 bytes.
+        gpu_cusvex_network_type: network type. For multi-device simulation,
+            this is the device network type: 0=Switch (default), 1=FullMesh.
+            For multi-process simulation, this is the layered network type:
+            0=SuperPOD (default), 1=GB200NVL, 2=SwitchTree, 3=Communicator.
         verbosity: Logging verbosity.
         denormals_are_zeros: if true, set flush-to-zero and denormals-are-zeros
             MXCSR control flags. This prevents rare cases of performance
@@ -81,6 +89,8 @@ class QSimOptions:
     gpu_mode: int = 0
     gpu_state_threads: int = 512
     gpu_data_blocks: int = 16
+    gpu_cusvex_log_buf_size: int = 30
+    gpu_cusvex_network_type: int = 0
     verbosity: int = 0
     denormals_are_zeros: bool = False
 
@@ -97,6 +107,8 @@ def as_dict(self):
             "gmode": self.gpu_mode,
             "gsst": self.gpu_state_threads,
             "gdb": self.gpu_data_blocks,
+            "glbuf": self.gpu_cusvex_log_buf_size,
+            "gnwt": self.gpu_cusvex_network_type,
             "v": self.verbosity,
             "z": self.denormals_are_zeros,
         }
diff --git a/tests/hybrid_custatevecex_test.cu b/tests/hybrid_custatevecex_test.cu
index a0c75b031..96b1e27a2 100644
--- a/tests/hybrid_custatevecex_test.cu
+++ b/tests/hybrid_custatevecex_test.cu
@@ -53,7 +53,7 @@ TEST(HybridCuStateVecExTest, Hybrid4) {
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
 
-  qsim::mp.initialize();
+  qsim::mp.initialize(qsim::MultiProcessCuStateVecEx::Parameter{});
 
   return RUN_ALL_TESTS();
 }
diff --git a/tests/qtrajectory_custatevecex_test.cu b/tests/qtrajectory_custatevecex_test.cu
index 8d70bfc00..d97c5b6d1 100644
--- a/tests/qtrajectory_custatevecex_test.cu
+++ b/tests/qtrajectory_custatevecex_test.cu
@@ -82,7 +82,7 @@ TEST(QTrajectoryCuStateVecExTest, UncomputeFinalState) {
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
 
-  qsim::mp.initialize();
+  qsim::mp.initialize(qsim::MultiProcessCuStateVecEx::Parameter{});
 
   return RUN_ALL_TESTS();
 }
diff --git a/tests/run_custatevecex_test.cu b/tests/run_custatevecex_test.cu
index 079fd2696..e2ada42f0 100644
--- a/tests/run_custatevecex_test.cu
+++ b/tests/run_custatevecex_test.cu
@@ -256,7 +256,7 @@ TEST(RunQSimTest, CirqGates) {
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
 
-  qsim::mp.initialize();
+  qsim::mp.initialize(qsim::MultiProcessCuStateVecEx::Parameter{});
 
   return RUN_ALL_TESTS();
 }
diff --git a/tests/simulator_custatevecex_test.cu b/tests/simulator_custatevecex_test.cu
index dcf9eaf65..cfb3df377 100644
--- a/tests/simulator_custatevecex_test.cu
+++ b/tests/simulator_custatevecex_test.cu
@@ -99,7 +99,7 @@ TYPED_TEST(SimulatorCuStateVecExTest, ExpectationValue2) {
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
 
-  qsim::mp.initialize();
+  qsim::mp.initialize(qsim::MultiProcessCuStateVecEx::Parameter{});
 
   return RUN_ALL_TESTS();
 }
diff --git a/tests/statespace_custatevecex_test.cu b/tests/statespace_custatevecex_test.cu
index db840d7c9..36d2f7df3 100644
--- a/tests/statespace_custatevecex_test.cu
+++ b/tests/statespace_custatevecex_test.cu
@@ -113,7 +113,7 @@ TYPED_TEST(StateSpaceCuStateVecExTest, BulkSetAmplDefault) {
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
 
-  qsim::mp.initialize();
+  qsim::mp.initialize(qsim::MultiProcessCuStateVecEx::Parameter{});
 
   return RUN_ALL_TESTS();
 }

From ca89f998571c9181c509c05badbfac88622e6dc1 Mon Sep 17 00:00:00 2001
From: Sergei Isakov <54642992+sergeisakov@users.noreply.github.com>
Date: Sun, 1 Feb 2026 15:20:25 +0100
Subject: [PATCH 2/3] Restrict the number of devices to pass tests.

---
 tests/simulator_custatevecex_test.cu | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/simulator_custatevecex_test.cu b/tests/simulator_custatevecex_test.cu
index cfb3df377..fe39f8cc0 100644
--- a/tests/simulator_custatevecex_test.cu
+++ b/tests/simulator_custatevecex_test.cu
@@ -28,8 +28,7 @@ namespace qsim {
 template <class T>
 class SimulatorCuStateVecExTest : public testing::Test {};
 
-//using fp_impl = ::testing::Types<float, double>;
-using fp_impl = ::testing::Types<float>;
+using fp_impl = ::testing::Types<float, double>;
 
 TYPED_TEST_SUITE(SimulatorCuStateVecExTest, fp_impl);
 
@@ -41,7 +40,9 @@ struct Factory {
   using StateSpace = typename Simulator::StateSpace;
 
   StateSpace CreateStateSpace() const {
-    return StateSpace{mp};
+    typename StateSpace::Parameter param;
+    param.num_devices = 2;
+    return StateSpace{mp, param};
   }
 
   Simulator CreateSimulator() const {

From 4a9be974c8b21afc38a8632b701d3582ca3d7586 Mon Sep 17 00:00:00 2001
From: Sergei Isakov <54642992+sergeisakov@users.noreply.github.com>
Date: Sun, 1 Feb 2026 17:09:55 +0100
Subject: [PATCH 3/3] Make names consistent.

---
 apps/qsim_base_custatevecex.cu                 |  4 ++--
 lib/multiprocess_custatevecex.h                | 18 +++++++++---------
 lib/statespace_custatevecex.h                  |  8 ++++----
 lib/vectorspace_custatevecex.h                 | 18 +++++++++---------
 .../custatevecex/pybind_main_custatevecex.cpp  | 10 +++++-----
 tests/hybrid_custatevecex_test.cu              |  2 +-
 tests/qtrajectory_custatevecex_test.cu         |  2 +-
 tests/run_custatevecex_test.cu                 |  2 +-
 tests/simulator_custatevecex_test.cu           |  2 +-
 tests/statespace_custatevecex_test.cu          |  2 +-
 10 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/apps/qsim_base_custatevecex.cu b/apps/qsim_base_custatevecex.cu
index d7d1180a3..041cfba37 100644
--- a/apps/qsim_base_custatevecex.cu
+++ b/apps/qsim_base_custatevecex.cu
@@ -119,9 +119,9 @@ int main(int argc, char* argv[]) {
     explicit Factory(uint64_t transfer_buffer_size, unsigned verbosity = 0)
         : verbosity(verbosity) {
       MultiProcessCuStateVecEx::Parameter param = {transfer_buffer_size};
-      mp.initialize(param);
+      mp.Initialize(param);
 
-      if (verbosity > 2 && mp.initialized()) {
+      if (verbosity > 2 && mp.Initialized()) {
         qsim::IO::messagef("# transfer_buf_size=%lu\n", transfer_buffer_size);
       }
     }
diff --git a/lib/multiprocess_custatevecex.h b/lib/multiprocess_custatevecex.h
index 1ffcd0332..4636b461b 100644
--- a/lib/multiprocess_custatevecex.h
+++ b/lib/multiprocess_custatevecex.h
@@ -54,27 +54,27 @@ struct MultiProcessCuStateVecEx {
     custatevecExCommunicatorFinalize(&status);
   }
 
-  custatevecExCommunicatorDescriptor_t communicator() const {
+  custatevecExCommunicatorDescriptor_t Communicator() const {
     return communicator_;
   }
 
-  unsigned num_processes() const {
+  unsigned NumProcesses() const {
     return num_processes_;
   }
 
-  unsigned rank() const {
+  unsigned Rank() const {
     return rank_;
   }
 
-  static bool valid_network_type(unsigned network_type) {
+  static bool ValidNetworkType(unsigned network_type) {
     return network_type < 4;
   }
 
-  bool initialized() const {
+  bool Initialized() const {
     return initialized_;
   }
 
-  void initialize(Parameter param) {
+  void Initialize(Parameter param) {
     param_ = param;
 
     int argc = 0;
@@ -114,7 +114,7 @@ struct MultiProcessCuStateVecEx {
     num_global_qubits_ = get_num_global_qubits(num_processes);
 
     unsigned num_acc_global_qubits = 0;
-    auto network_layers = get_network_layers(param_.network_type);
+    auto network_layers = GetNetworkLayers(param_.network_type);
 
     num_global_qubits_per_layer_.reserve(2);
     global_index_bit_classes_.reserve(2);
@@ -154,7 +154,7 @@ struct MultiProcessCuStateVecEx {
     initialized_ = true;
   }
 
-  auto create_sv_config(unsigned num_qubits, cudaDataType_t data_type) const {
+  auto CreateSVConfig(unsigned num_qubits, cudaDataType_t data_type) const {
     custatevecExDictionaryDescriptor_t sv_config = nullptr;
 
     if (!initialized_ ||
@@ -192,7 +192,7 @@ struct MultiProcessCuStateVecEx {
 
   using NetworkLayers = std::vector<NetworkLayer>;
 
-  static NetworkLayers get_network_layers(NetworkType id) {
+  static NetworkLayers GetNetworkLayers(NetworkType id) {
     switch (id) {
     case kSuperPod:
       return {{CUSTATEVEC_EX_GLOBAL_INDEX_BIT_CLASS_INTERPROC_P2P, 3},
diff --git a/lib/statespace_custatevecex.h b/lib/statespace_custatevecex.h
index ce5cb0c3e..8aa6167f2 100644
--- a/lib/statespace_custatevecex.h
+++ b/lib/statespace_custatevecex.h
@@ -130,7 +130,7 @@ class StateSpaceCuStateVecEx :
     unsigned required_rank = k / size;
 
     if (state.distr_type() != Base::kMultiProcess
-        || Base::mp.rank() == required_rank) {
+        || Base::mp.Rank() == required_rank) {
       ErrorCheck(custatevecExStateVectorGetState(
           state.get(), buf, kStateDataType, k, k + 1, 1));
     }
@@ -139,7 +139,7 @@ class StateSpaceCuStateVecEx :
 
     if (state.distr_type() == Base::kMultiProcess) {
       auto cuda_type = GetCudaType<std::complex<fp_type>>();
-      auto comm = Base::mp.communicator();
+      auto comm = Base::mp.Communicator();
       ErrorCheck(comm->intf->bcast(comm, buf, 1, cuda_type, required_rank));
     }
 
@@ -161,7 +161,7 @@ class StateSpaceCuStateVecEx :
     unsigned required_rank = k / size;
 
     if (state.distr_type() != Base::kMultiProcess
-        || Base::mp.rank() == required_rank) {
+        || Base::mp.Rank() == required_rank) {
       ErrorCheck(custatevecExStateVectorSetState(
           state.get(), buf, kStateDataType, k, k + 1, 1));
     }
@@ -183,7 +183,7 @@ class StateSpaceCuStateVecEx :
     unsigned required_rank = k / size;
 
     if (state.distr_type() != Base::kMultiProcess
-        || Base::mp.rank() == required_rank) {
+        || Base::mp.Rank() == required_rank) {
       ErrorCheck(custatevecExStateVectorSetState(
           state.get(), buf, kStateDataType, k, k + 1, 1));
     }
diff --git a/lib/vectorspace_custatevecex.h b/lib/vectorspace_custatevecex.h
index c7b17455a..2d80f28de 100644
--- a/lib/vectorspace_custatevecex.h
+++ b/lib/vectorspace_custatevecex.h
@@ -245,7 +245,7 @@ class VectorSpaceCuStateVecEx {
         ResultType local_r = callback(k, res);
 
         auto cuda_type = GetCudaType<ResultType>();
-        auto comm = mp_->communicator();
+        auto comm = mp_->Communicator();
         ErrorCheck(comm->intf->allreduce(comm, &local_r, &r, 1, cuda_type));
 
         return r;
@@ -338,7 +338,7 @@ class VectorSpaceCuStateVecEx {
         ResultType local_r = callback(k, res1, res2);
 
         auto cuda_type = GetCudaType<ResultType>();
-        auto comm = mp_->communicator();
+        auto comm = mp_->Communicator();
         ErrorCheck(comm->intf->allreduce(comm, &local_r, &r, 1, cuda_type));
 
         return r;
@@ -385,16 +385,16 @@ class VectorSpaceCuStateVecEx {
   Vector Create(unsigned num_qubits) const {
     custatevecExStateVectorDescriptor_t state_vec;
     custatevecExDictionaryDescriptor_t sv_config
-        = mp.create_sv_config(num_qubits, kStateDataType);
+        = mp.CreateSVConfig(num_qubits, kStateDataType);
 
     unsigned num_substates = 1;
     DistributionType distr_type = kNoDistr;
 
     if (sv_config != nullptr) {
       ErrorCheck(custatevecExStateVectorCreateMultiProcess(
-          &state_vec, sv_config, nullptr, mp.communicator(), nullptr));
+          &state_vec, sv_config, nullptr, mp.Communicator(), nullptr));
 
-      num_substates = mp.num_processes();
+      num_substates = mp.NumProcesses();
       distr_type = kMultiProcess;
 
       if (param.verbosity > 2) {
@@ -507,7 +507,7 @@ class VectorSpaceCuStateVecEx {
   bool Copy(const Vector& src, fp_type* dest) const {
     if (src.distr_type() == kMultiProcess) {
       uint64_t size = (uint64_t{1} << src.num_qubits()) / src.num_substates();
-      uint64_t offset = size * mp.rank();
+      uint64_t offset = size * mp.Rank();
 
       ErrorCheck(custatevecExStateVectorGetState(
           src.get(), dest + 2 * offset, kStateDataType,
@@ -515,7 +515,7 @@ class VectorSpaceCuStateVecEx {
       ErrorCheck(custatevecExStateVectorSynchronize(src.get()));
 
       auto cuda_type = GetCudaType<std::complex<fp_type>>();
-      auto comm = mp.communicator();
+      auto comm = mp.Communicator();
       ErrorCheck(comm->intf->allgather(
           comm, dest + 2 * offset, dest, size, cuda_type));
     } else {
@@ -533,7 +533,7 @@ class VectorSpaceCuStateVecEx {
   bool Copy(const fp_type* src, Vector& dest) const {
     if (dest.distr_type() == kMultiProcess) {
       uint64_t size = (uint64_t{1} << dest.num_qubits()) / dest.num_substates();
-      uint64_t offset = size * mp.rank();
+      uint64_t offset = size * mp.Rank();
 
       ErrorCheck(custatevecExStateVectorSetState(
           dest.get(), src + 2 * offset, kStateDataType,
@@ -564,7 +564,7 @@ class VectorSpaceCuStateVecEx {
 
     if (dest.distr_type() == kMultiProcess) {
       size /= dest.num_substates();
-      uint64_t offset = size * mp.rank();
+      uint64_t offset = size * mp.Rank();
 
       ErrorCheck(custatevecExStateVectorSetState(
           dest.get(), src + 2 * offset, kStateDataType,
diff --git a/pybind_interface/custatevecex/pybind_main_custatevecex.cpp b/pybind_interface/custatevecex/pybind_main_custatevecex.cpp
index e8b9e7f6d..400baa220 100644
--- a/pybind_interface/custatevecex/pybind_main_custatevecex.cpp
+++ b/pybind_interface/custatevecex/pybind_main_custatevecex.cpp
@@ -42,10 +42,10 @@ namespace qsim {
       verbosity = ParseOptions<unsigned>(options, "v\0");
       nwt = ParseOptions<unsigned>(options, "gnwt\0");
 
-      if (!mp.initialized()) {
+      if (!mp.Initialized()) {
         using MP = qsim::MultiProcessCuStateVecEx;
 
-        if (!mp.valid_network_type(nwt)) {
+        if (!mp.ValidNetworkType(nwt)) {
           throw std::invalid_argument("Invalid network type.");
         }
 
@@ -58,14 +58,14 @@ namespace qsim {
         param.transfer_buffer_size = buffer_size;
         param.network_type = network_type;
 
-        mp.initialize(param);
+        mp.Initialize(param);
 
-        if (verbosity > 2 && mp.initialized()) {
+        if (verbosity > 2 && mp.Initialized()) {
           qsim::IO::messagef("transfer_buf_size=%lu\n", buffer_size);
         }
       }
 
-      if (!mp.initialized()) {
+      if (!mp.Initialized()) {
         if (!StateSpace::ValidDeviceNetworkType(nwt)) {
           throw std::invalid_argument("Invalid device network type.");
         }
diff --git a/tests/hybrid_custatevecex_test.cu b/tests/hybrid_custatevecex_test.cu
index 96b1e27a2..142ad747b 100644
--- a/tests/hybrid_custatevecex_test.cu
+++ b/tests/hybrid_custatevecex_test.cu
@@ -53,7 +53,7 @@ TEST(HybridCuStateVecExTest, Hybrid4) {
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
 
-  qsim::mp.initialize(qsim::MultiProcessCuStateVecEx::Parameter{});
+  qsim::mp.Initialize(qsim::MultiProcessCuStateVecEx::Parameter{});
 
   return RUN_ALL_TESTS();
 }
diff --git a/tests/qtrajectory_custatevecex_test.cu b/tests/qtrajectory_custatevecex_test.cu
index d97c5b6d1..f46c1562f 100644
--- a/tests/qtrajectory_custatevecex_test.cu
+++ b/tests/qtrajectory_custatevecex_test.cu
@@ -82,7 +82,7 @@ TEST(QTrajectoryCuStateVecExTest, UncomputeFinalState) {
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
 
-  qsim::mp.initialize(qsim::MultiProcessCuStateVecEx::Parameter{});
+  qsim::mp.Initialize(qsim::MultiProcessCuStateVecEx::Parameter{});
 
   return RUN_ALL_TESTS();
 }
diff --git a/tests/run_custatevecex_test.cu b/tests/run_custatevecex_test.cu
index e2ada42f0..823df7dbd 100644
--- a/tests/run_custatevecex_test.cu
+++ b/tests/run_custatevecex_test.cu
@@ -256,7 +256,7 @@ TEST(RunQSimTest, CirqGates) {
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
 
-  qsim::mp.initialize(qsim::MultiProcessCuStateVecEx::Parameter{});
+  qsim::mp.Initialize(qsim::MultiProcessCuStateVecEx::Parameter{});
 
   return RUN_ALL_TESTS();
 }
diff --git a/tests/simulator_custatevecex_test.cu b/tests/simulator_custatevecex_test.cu
index fe39f8cc0..5d8d1ea68 100644
--- a/tests/simulator_custatevecex_test.cu
+++ b/tests/simulator_custatevecex_test.cu
@@ -100,7 +100,7 @@ TYPED_TEST(SimulatorCuStateVecExTest, ExpectationValue2) {
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
 
-  qsim::mp.initialize(qsim::MultiProcessCuStateVecEx::Parameter{});
+  qsim::mp.Initialize(qsim::MultiProcessCuStateVecEx::Parameter{});
 
   return RUN_ALL_TESTS();
 }
diff --git a/tests/statespace_custatevecex_test.cu b/tests/statespace_custatevecex_test.cu
index 36d2f7df3..55993459a 100644
--- a/tests/statespace_custatevecex_test.cu
+++ b/tests/statespace_custatevecex_test.cu
@@ -113,7 +113,7 @@ TYPED_TEST(StateSpaceCuStateVecExTest, BulkSetAmplDefault) {
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
 
-  qsim::mp.initialize(qsim::MultiProcessCuStateVecEx::Parameter{});
+  qsim::mp.Initialize(qsim::MultiProcessCuStateVecEx::Parameter{});
 
   return RUN_ALL_TESTS();
 }