quantumlib · sun-9545sunoj · Feb 13, 2026 · Feb 13, 2026 · Feb 13, 2026 · Feb 13, 2026
diff --git a/apps/qsimh_base_cuda.cu b/apps/qsimh_base_cuda.cu
@@ -0,0 +1,220 @@
+// Copyright 2026 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <complex>
+#include <iomanip>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <unistd.h>
+#include <vector>
+
+#include "../lib/bitstring.h"
+#include "../lib/circuit_qsim_parser.h"
+#include "../lib/formux.h"
+#include "../lib/fuser_basic.h"
+#include "../lib/gates_qsim.h"
+#include "../lib/io_file.h"
+#include "../lib/run_qsimh.h"
+#include "../lib/simmux.h"
+#include "../lib/simulator_cuda.h"
+#include "../lib/util.h"
+#include "../lib/util_cpu.h"
+
+constexpr char usage[] =
+    "usage:\n  ./qsimh_base_cuda.x -c circuit_file "
+    "-d maximum_time -k part1_qubits "
+    "-w prefix -p num_prefix_gates -r num_root_gates "
+    "-t num_threads -n num_dblocks -v verbosity -z\n";
+
+struct Options {
+  std::string circuit_file;
+  std::vector<unsigned> part1;
+  uint64_t prefix;
+  unsigned maxtime = std::numeric_limits<unsigned>::max();
+  unsigned num_prefix_gatexs = 0;
+  unsigned num_root_gatexs = 0;
+  unsigned num_threads = 256;
+  unsigned num_dblocks = 16;
+  unsigned verbosity = 0;
+  bool denormals_are_zeros = false;
+};
+
+Options GetOptions(int argc, char* argv[]) {
+  Options opt;
+
+  int k;
+
+  auto to_int = [](const std::string& word) -> unsigned {
+    return std::stoul(word);
+  };
+
+  while ((k = getopt(argc, argv, "c:d:k:w:p:r:t:n:v:z")) != -1) {
+    switch (k) {
+      case 'c':
+        opt.circuit_file = optarg;
+        break;
+      case 'd':
+        opt.maxtime = std::stoul(optarg);
+        break;
+      case 'k':
+        qsim::SplitString(optarg, ',', to_int, opt.part1);
+        break;
+      case 'w':
+        opt.prefix = std::stoull(optarg);
+        break;
+      case 'p':
+        opt.num_prefix_gatexs = std::stoul(optarg);
+        break;
+      case 'r':
+        opt.num_root_gatexs = std::stoul(optarg);
+        break;
+      case 't':
+        opt.num_threads = std::stoul(optarg);
+        break;
+      case 'n':
+        opt.num_dblocks = std::stoul(optarg);
+        break;
+      case 'v':
+        opt.verbosity = std::stoul(optarg);
+        break;
+      case 'z':
+        opt.denormals_are_zeros = true;
+        break;
+      default:
+        qsim::IO::errorf(usage);
+        exit(1);
+    }
+  }
+
+  return opt;
+}
+
+bool ValidateOptions(const Options& opt) {
+  if (opt.circuit_file.empty()) {
+    qsim::IO::errorf("circuit file is not provided.\n");
+    qsim::IO::errorf(usage);
+    return false;
+  }
+
+  return true;
+}
+
+bool ValidatePart1(unsigned num_qubits, const std::vector<unsigned>& part1) {
+  for (std::size_t i = 0; i < part1.size(); ++i) {
+    if (part1[i] >= num_qubits) {
+      qsim::IO::errorf("part 1 qubit indices are too large.\n");
+      return false;
+    }
+  }
+
+  return true;
+}
+
+std::vector<unsigned> GetParts(
+    unsigned num_qubits, const std::vector<unsigned>& part1) {
+  std::vector<unsigned> parts(num_qubits, 0);
+
+  for (std::size_t i = 0; i < part1.size(); ++i) {
+    parts[part1[i]] = 1;
+  }
+
+  return parts;
+}
+
+int main(int argc, char* argv[]) {
+  using namespace qsim;
+
+  auto opt = GetOptions(argc, argv);
+  if (!ValidateOptions(opt)) {
+    return 1;
+  }
+
+  Circuit<GateQSim<float>> circuit;
+  if (!CircuitQsimParser<IOFile>::FromFile(
+          opt.maxtime, opt.circuit_file, circuit)) {
+    return 1;
+  }
+
+  if (!ValidatePart1(circuit.num_qubits, opt.part1)) {
+    return 1;
+  }
+  auto parts = GetParts(circuit.num_qubits, opt.part1);
+
+  if (opt.denormals_are_zeros) {
+    SetFlushToZeroAndDenormalsAreZeros();
+  }
+
+  uint64_t num_bitstrings =
+      std::min(uint64_t{8}, uint64_t{1} << circuit.num_qubits);
+
+  std::vector<Bitstring> bitstrings;
+  bitstrings.reserve(num_bitstrings);
+  for (std::size_t i = 0; i < num_bitstrings; ++i) {
+    bitstrings.push_back(i);
+  }
+
+  struct Factory {
+    using Simulator = qsim::SimulatorCUDA<float>;
+    using StateSpace = Simulator::StateSpace;
+    using fp_type = Simulator::fp_type;
+
+    Factory(const StateSpace::Parameter& param) : param(param) {}
+
+    StateSpace CreateStateSpace() const { return StateSpace(param); }
+
+    Simulator CreateSimulator() const { return Simulator(); }
+
+    const StateSpace::Parameter& param;
+  };
+
+  using HybridSimulator =
+      HybridSimulator<IO, GateQSim<float>, BasicGateFuser, For>;
+  using Runner = QSimHRunner<IO, HybridSimulator>;
+
+  Runner::Parameter param;
+  param.prefix = opt.prefix;
+  param.num_prefix_gatexs = opt.num_prefix_gatexs;
+  param.num_root_gatexs = opt.num_root_gatexs;
+  param.num_threads =
+      opt.num_threads;  // This is reused for StateSpaceCUDA params implicitly
+                        // if not careful, but here we separate.
+  param.verbosity = opt.verbosity;
+
+  std::vector<std::complex<Factory::fp_type>> results(num_bitstrings, 0);
+
+  // Setup CUDA parameters
+  Factory::StateSpace::Parameter cuda_param;
+  cuda_param.num_threads = opt.num_threads;
+  cuda_param.num_dblocks = opt.num_dblocks;
+
+  Factory factory(cuda_param);
+
+  if (Runner::Run(param, factory, circuit, parts, bitstrings, results)) {
+    static constexpr char const* bits[8] = {
+        "000", "001", "010", "011", "100", "101", "110", "111",
+    };
+
+    unsigned s = 3 - std::min(unsigned{3}, circuit.num_qubits);
+
+    for (std::size_t i = 0; i < num_bitstrings; ++i) {
+      const auto& a = results[i];
+      qsim::IO::messagef(
+          "%s:%16.8g%16.8g%16.8g\n", bits[i] + s, std::real(a), std::imag(a),
+          std::norm(a));
+    }
+  }
+
+  return 0;
+}
diff --git a/lib/simulator_cuda.h b/lib/simulator_cuda.h
@@ -350,8 +350,8 @@ class SimulatorCUDA final {
 
     IndicesH<G> d_i(d_ws);
 
-    ApplyGateH_Kernel<G><<<blocks, threads>>>(
-        (fp_type*) d_ws, d_i.xss, d_i.ms, state.get());
+    ApplyGateH_Kernel<G><<<CreateGrid(blocks), threads>>>(
+        (fp_type*)d_ws, d_i.xss, d_i.ms, state.get());
   }
 
   template <unsigned G>
@@ -374,8 +374,8 @@ class SimulatorCUDA final {
 
     IndicesL<G> d_i(d_ws);
 
-    ApplyGateL_Kernel<G><<<blocks, threads>>>(
-        (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
+    ApplyGateL_Kernel<G><<<CreateGrid(blocks), threads>>>(
+        (fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
         1 << num_effective_qs, state.get());
   }
 
@@ -407,8 +407,8 @@ class SimulatorCUDA final {
 
     IndicesH<G> d_i(d_ws);
 
-    ApplyControlledGateH_Kernel<G><<<blocks, threads>>>(
-        (fp_type*) d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get());
+    ApplyControlledGateH_Kernel<G><<<CreateGrid(blocks), threads>>>(
+        (fp_type*)d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get());
   }
 
   template <unsigned G>
@@ -432,9 +432,9 @@ class SimulatorCUDA final {
 
     IndicesL<G> d_i(d_ws);
 
-    ApplyControlledGateLH_Kernel<G><<<blocks, threads>>>(
-        (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
-        d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, state.get());
+    ApplyControlledGateLH_Kernel<G><<<CreateGrid(blocks), threads>>>(
+        (fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d.num_aqs + 1,
+        d.cvalsh, 1 << d.num_effective_qs, state.get());
   }
 
   template <unsigned G>
@@ -458,8 +458,8 @@ class SimulatorCUDA final {
 
     IndicesLC<G> d_i(d_ws);
 
-    ApplyControlledGateL_Kernel<G><<<blocks, threads>>>(
-        (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis,
+    ApplyControlledGateL_Kernel<G><<<CreateGrid(blocks), threads>>>(
+        (fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis,
         d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs,
         1 << (5 - d.remaining_low_cqs), state.get());
   }
@@ -493,9 +493,9 @@ class SimulatorCUDA final {
 
     IndicesH<G> d_i(d_ws);
 
-    ExpectationValueH_Kernel<G><<<blocks, threads>>>(
-        (fp_type*) d_ws, d_i.xss, d_i.ms, num_iterations_per_block,
-        state.get(), Plus<double>(), d_res1);
+    ExpectationValueH_Kernel<G><<<CreateGrid(blocks), threads>>>(
+        (fp_type*)d_ws, d_i.xss, d_i.ms, num_iterations_per_block, state.get(),
+        Plus<double>(), d_res1);
 
     double mul = size == 1 ? 0.5 : 1.0;
 
@@ -531,8 +531,8 @@ class SimulatorCUDA final {
 
     IndicesL<G> d_i(d_ws);
 
-    ExpectationValueL_Kernel<G><<<blocks, threads>>>(
-        (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
+    ExpectationValueL_Kernel<G><<<CreateGrid(blocks), threads>>>(
+        (fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
         num_iterations_per_block, state.get(), Plus<double>(), d_res1);
 
     double mul = double(1 << (5 + num_effective_qs - G)) / 32;
@@ -895,6 +895,12 @@ class SimulatorCUDA final {
     return {cvalsh, num_aqs, num_effective_qs, remaining_low_cqs};
   }
 
+  static dim3 CreateGrid(uint64_t blocks) {
+    if (blocks <= 65535) return dim3(blocks);
+    uint32_t x = 65535;
+    uint32_t y = (blocks + x - 1) / x;
+    return dim3(x, y);
+  }
 
   void* AllocScratch(uint64_t size) const {
     if (size > scratch_size_) {