diff --git a/Cargo.lock b/Cargo.lock index 23b752b4..f5f8018c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2979,6 +2979,23 @@ dependencies = [ "tiny-skia", ] +[[package]] +name = "sdot" +version = "0.1.0" +dependencies = [ + "blastoff", + "cuda_builder", + "cust", + "rand 0.9.2", +] + +[[package]] +name = "sdot_kernels" +version = "0.1.0" +dependencies = [ + "cuda_std", +] + [[package]] name = "semver" version = "0.6.0" diff --git a/Cargo.toml b/Cargo.toml index 632d3460..c27b794f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -47,6 +47,8 @@ members = [ "examples/gemm/kernels", "examples/i128_demo", "examples/i128_demo/kernels", + "examples/sdot", + "examples/sdot/kernels", "examples/sha2_crates_io", "examples/sha2_crates_io/kernels", "examples/vecadd", diff --git a/examples/README.md b/examples/README.md index eca937e4..d2705059 100644 --- a/examples/README.md +++ b/examples/README.md @@ -21,3 +21,6 @@ which runs on CPU or GPU, with the additional option of running OptiX denoising. The Path Tracer uses cuda_builder to compile the core path tracer for the GPU and GPU (hardware raytracing), and uses the core path tracer as a normal crate for CPU rendering and sharing structures. + +### [sdot](sdot) +Example of computes the dot product of two single-precision (f32) vectors \ No newline at end of file diff --git a/examples/sdot/Cargo.toml b/examples/sdot/Cargo.toml new file mode 100644 index 00000000..0d9b1511 --- /dev/null +++ b/examples/sdot/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "sdot" +version = "0.1.0" +edition = "2024" +description = "Example of computes the dot product of two f32 vectors" + +[dependencies] +cust = { path = "../../crates/cust" } +blastoff = { path = "../../crates/blastoff" } +rand = "0.9.*" + +[build-dependencies] +cuda_builder = { workspace = true, default-features = false } diff --git a/examples/sdot/build.rs b/examples/sdot/build.rs new file mode 100644 index 00000000..dacebdec --- /dev/null +++ b/examples/sdot/build.rs @@ -0,0 +1,63 @@ +use std::env; +use std::path; +use std::process::{Command, Stdio}; + +use cuda_builder::CudaBuilder; + +fn main() { + println!("cargo::rerun-if-changed=build.rs"); + println!("cargo::rerun-if-changed=kernels"); + + let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap()); + let kernels_path = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()).join("kernels"); + + CudaBuilder::new(kernels_path.as_path()) + .copy_to(out_path.join("kernels.ptx")) + .build() + .unwrap(); + + // Generate PTX from native CUDA kernels + let cuda_kernel_path = kernels_path.join("cuda/sdot.cu"); + + println!("cargo::rerun-if-changed={}", cuda_kernel_path.display()); + + let cuda_ptx = out_path.join("kernels_cuda_mangles.ptx"); + let mut nvcc = Command::new("nvcc"); + nvcc.arg("--ptx") + .args(["--Werror", "all-warnings"]) + .args(["--output-directory", out_path.as_os_str().to_str().unwrap()]) + .args(["-o", cuda_ptx.as_os_str().to_str().unwrap()]) + .arg(cuda_kernel_path.as_path()); + + let build = nvcc + .stderr(Stdio::inherit()) + .output() + .expect("failed to execute nvcc kernel build"); + + assert!(build.status.success()); + + // Decodes (demangles) low-level identifiers + let cat_out = Command::new("cat") + .arg(cuda_ptx) + .stdout(Stdio::piped()) + .stderr(Stdio::inherit()) + .spawn() + .expect("Failed to start cat process") + .stdout + .expect("Failed to open cat stdout"); + + let outputs = std::fs::File::create(out_path.join("kernels_cuda.ptx")) + .expect("Can not open output ptc kernel file"); + + let filt_out = Command::new("cu++filt") + .arg("-p") + .stdin(Stdio::from(cat_out)) + .stdout(Stdio::from(outputs)) + .stderr(Stdio::inherit()) + .spawn() + .expect("Failed to start cu++filt process") + .wait_with_output() + .expect("Failed to wait on cu++filt"); + + assert!(filt_out.status.success()); +} diff --git a/examples/sdot/kernels/Cargo.toml b/examples/sdot/kernels/Cargo.toml new file mode 100644 index 00000000..b4ee4357 --- /dev/null +++ b/examples/sdot/kernels/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "sdot_kernels" +version = "0.1.0" +edition = "2024" + +[dependencies] +cuda_std = { path = "../../../crates/cuda_std" } + +[lib] +crate-type = ["cdylib", "rlib"] diff --git a/examples/sdot/kernels/cuda/sdot.cu b/examples/sdot/kernels/cuda/sdot.cu new file mode 100644 index 00000000..ff354921 --- /dev/null +++ b/examples/sdot/kernels/cuda/sdot.cu @@ -0,0 +1,36 @@ +// Here we use `unsigned long` to match the Rust version `usize`. +__global__ void sdot(const float *x, unsigned long x_n, const float *y, unsigned long y_n, float *out) +{ + + extern __shared__ float shared_sum[]; + unsigned int i; + + unsigned int num_threads = gridDim.x * blockDim.x; + unsigned int start_ind = blockDim.x * blockIdx.x; + unsigned int tid = threadIdx.x; + + float sum = 0.0f; + for (i = start_ind + tid; i < x_n; i += num_threads) + { + // Rust checks emulation + if (i >= y_n) + __trap(); + + sum += x[i] * y[i]; + } + shared_sum[tid] = sum; + + for (i = blockDim.x >> 1; i > 0; i >>= 1) + { + __syncthreads(); + if (tid < i) + { + shared_sum[tid] += shared_sum[tid + i]; + } + } + + if (tid == 0) + { + out[blockIdx.x] = shared_sum[tid]; + } +} \ No newline at end of file diff --git a/examples/sdot/kernels/src/lib.rs b/examples/sdot/kernels/src/lib.rs new file mode 100644 index 00000000..da2c825b --- /dev/null +++ b/examples/sdot/kernels/src/lib.rs @@ -0,0 +1,37 @@ +use cuda_std::{kernel, shared, thread}; + +#[kernel] +#[allow(improper_ctypes_definitions, clippy::missing_safety_doc)] +pub unsafe fn sdot(x: &[f32], y: &[f32], out: *mut f32) { + let shared_sum = shared::dynamic_shared_mem::(); + + let num_threads = (thread::grid_dim_x() as usize) * (thread::block_dim_x() as usize); + let start_ind = (thread::block_dim_x() as usize) * (thread::block_idx_x() as usize); + let tid = thread::thread_idx_x() as usize; + + let mut sum = 0f32; + for i in ((start_ind + tid)..x.len()).step_by(num_threads) { + sum += x[i] * y[i]; + } + unsafe { + *shared_sum.add(tid) = sum; + } + + let mut i = (thread::block_dim_x() >> 1) as usize; + while i > 0 { + thread::sync_threads(); + if tid < i { + unsafe { + *shared_sum.add(tid) += *shared_sum.add(tid + i); + } + } + + i >>= 1; + } + + if tid == 0 { + unsafe { + *out.add(thread::block_idx_x() as usize) = *shared_sum.add(tid); + } + } +} diff --git a/examples/sdot/src/main.rs b/examples/sdot/src/main.rs new file mode 100644 index 00000000..92d7de00 --- /dev/null +++ b/examples/sdot/src/main.rs @@ -0,0 +1,242 @@ +use std::error::Error; +use std::fmt; +use std::time::Duration; + +use rand::Rng; + +use blastoff::CublasContext; +use cust::event; +use cust::function; +use cust::launch; +use cust::memory::{self, CopyDestination as _}; +use cust::module::Module; +use cust::stream; +use cust::util::SliceExt as _; + +const VECTORS_LEN: usize = 10_000_000; +const NUM_WARMUP: usize = 100; +const NUM_RUNS: usize = 1000; +const BLOCK_SIZE: u32 = 1024; +const GRID_SIZE: u32 = 80; + +static PTX_NATIVE: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels_cuda.ptx")); +static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx")); + +struct KernelLaunchStats { + /// The size of the block grid + pub grid_size: u32, + /// Number of threads per block + pub block_size: u32, + /// The amount of dynamically allocated shared memory + pub shared_mem_size: u32, + /// Number of registers used + pub num_regs: u32, +} + +impl fmt::Display for KernelLaunchStats { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "Kernel launch stats:")?; + writeln!(f, " grid_size: {}", self.grid_size)?; + writeln!(f, " block_size: {}", self.block_size)?; + writeln!(f, " shared_mem_size: {}", self.shared_mem_size)?; + writeln!(f, " num_regs: {}", self.num_regs) + } +} + +/// Launch statistics and outputs +struct RunResult { + /// The average value of the result + pub res_average: f64, + /// Duration of one iteration + pub run_duration: Duration, + /// Statistics of the running kernel + pub kernel_launch_stats: Option, +} + +impl fmt::Display for RunResult { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "result: {}", self.res_average)?; + writeln!( + f, + "Duration of one iteration: {} ms", + self.run_duration.as_secs_f64() * 1000f64 + )?; + if let Some(kernel_lunch_stats) = &self.kernel_launch_stats { + write!(f, "{}", kernel_lunch_stats)?; + } + Ok(()) + } +} + +fn main() -> Result<(), Box> { + // Initialize CUDA + let _ctx = cust::quick_init()?; + + // make a CUDA stream to issue calls to. + let stream = stream::Stream::new(stream::StreamFlags::NON_BLOCKING, None)?; + + // Generate input hosts vectors A and B + let mut rng = rand::rng(); + let x_host = (0..VECTORS_LEN) + .map(|_i| rng.random_range(0.0..=1.0f32)) + .collect::>(); + let y_host = (0..VECTORS_LEN) + .map(|_i| rng.random_range(0.0..=1.0f32)) + .collect::>(); + + // Allocate the GPU memory + let x_gpu = x_host.as_slice().as_dbuf()?; + let y_gpu = y_host.as_slice().as_dbuf()?; + + // Run cuBLAS test + let blas_res = run_cublas_sdot_test(&stream, &x_gpu, &y_gpu)?; + println!("cuBLAS:\n{}", blas_res); + + // Run native CUDA kernel test + let module_native = Module::from_ptx(PTX_NATIVE, &[])?; + let sdot_native = module_native.get_function("sdot")?; + + let native_res = + run_cuda_sdot_test(&stream, sdot_native, &x_gpu, &y_gpu, GRID_SIZE, BLOCK_SIZE)?; + println!("Native CUDA:\n{}", native_res); + + // Run Rust CUDA kernel test + let module_rust = Module::from_ptx(PTX, &[])?; + let sdot_rust = module_rust.get_function("sdot")?; + + let rust_res = run_cuda_sdot_test(&stream, sdot_rust, &x_gpu, &y_gpu, GRID_SIZE, BLOCK_SIZE)?; + println!("Rust CUDA:\n{}", rust_res); + + Ok(()) +} + +/// Runs cuBLAS dot product of two vectors +/// +/// Runs the scalar product calculations several times, before warming up +/// and calculating the execution time. +fn run_cublas_sdot_test( + stream: &stream::Stream, + x: &memory::DeviceBuffer, + y: &memory::DeviceBuffer, +) -> Result> { + let mut ctx = CublasContext::new()?; + + // WarmUp + for _ in 0..NUM_WARMUP { + let mut result = memory::DeviceBox::new(&0.0)?; + ctx.dot(stream, x.len(), x, y, &mut result)?; + stream.synchronize()?; + let _res = result.as_host_value()?; + } + + // Run bench + let mut res_average = 0f64; + let begin = event::Event::new(event::EventFlags::DEFAULT)?; + let end = event::Event::new(event::EventFlags::DEFAULT)?; + begin.record(stream)?; + + for _ in 0..NUM_RUNS { + let mut result = memory::DeviceBox::new(&0.0)?; + ctx.dot(stream, x.len(), x, y, &mut result)?; + stream.synchronize()?; + res_average += result.as_host_value()? as f64; + } + + end.record(stream)?; + begin.synchronize()?; + end.synchronize()?; + + res_average /= NUM_RUNS as f64; + let run_duration = end.elapsed(&begin)?.div_f64(NUM_RUNS as f64); + + let stats = RunResult { + res_average, + run_duration, + kernel_launch_stats: None, + }; + Ok(stats) +} + +/// Runs CUDA kernel test: dot product of two vectors +/// +/// Runs the scalar product calculations several times, before warming up +/// and calculating the execution time. +fn run_cuda_sdot_test( + stream: &stream::Stream, + sdot_fun: function::Function, + x: &memory::DeviceBuffer, + y: &memory::DeviceBuffer, + grid_size: u32, + block_size: u32, +) -> Result> { + // Allocate memory to collect results, one per thread block + let mut out_host = vec![0.0f32; grid_size as _]; + let out_gpu = memory::DeviceBuffer::zeroed(grid_size as _)?; + + // Shared memory size per thread block + let shared_mem_size = block_size * (std::mem::size_of::() as u32); + + // WarmUp + for _ in 0..NUM_WARMUP { + unsafe { + launch!( + sdot_fun<<>>( + x.as_device_ptr(), + x.len(), + y.as_device_ptr(), + y.len(), + out_gpu.as_device_ptr(), + ) + )?; + } + stream.synchronize()?; + out_gpu.copy_to(&mut out_host)?; + let _res: f64 = out_host.iter().map(|e| *e as f64).sum(); + } + + // Run bench + let mut res_average = 0f64; + let begin = event::Event::new(event::EventFlags::DEFAULT)?; + let end = event::Event::new(event::EventFlags::DEFAULT)?; + begin.record(stream)?; + + for _ in 0..NUM_RUNS { + unsafe { + launch!( + sdot_fun<<>>( + x.as_device_ptr(), + x.len(), + y.as_device_ptr(), + y.len(), + out_gpu.as_device_ptr(), + ) + )?; + } + stream.synchronize()?; + out_gpu.copy_to(&mut out_host)?; + let res: f64 = out_host.iter().map(|e| *e as f64).sum(); + res_average += res; + } + + let kernel_launch_stats = KernelLaunchStats { + grid_size, + block_size, + shared_mem_size, + num_regs: sdot_fun.get_attribute(function::FunctionAttribute::NumRegisters)? as u32, + }; + + end.record(stream)?; + begin.synchronize()?; + end.synchronize()?; + + res_average /= NUM_RUNS as f64; + let run_duration = end.elapsed(&begin)?.div_f64(NUM_RUNS as f64); + + let stats = RunResult { + res_average, + run_duration, + kernel_launch_stats: Some(kernel_launch_stats), + }; + + Ok(stats) +}