diff --git a/Cargo.lock b/Cargo.lock
index 23b752b4..f5f8018c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2979,6 +2979,23 @@ dependencies = [
  "tiny-skia",
 ]
 
+[[package]]
+name = "sdot"
+version = "0.1.0"
+dependencies = [
+ "blastoff",
+ "cuda_builder",
+ "cust",
+ "rand 0.9.2",
+]
+
+[[package]]
+name = "sdot_kernels"
+version = "0.1.0"
+dependencies = [
+ "cuda_std",
+]
+
 [[package]]
 name = "semver"
 version = "0.6.0"
diff --git a/Cargo.toml b/Cargo.toml
index 632d3460..c27b794f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -47,6 +47,8 @@ members = [
   "examples/gemm/kernels",
   "examples/i128_demo",
   "examples/i128_demo/kernels",
+  "examples/sdot",
+  "examples/sdot/kernels",
   "examples/sha2_crates_io",
   "examples/sha2_crates_io/kernels",
   "examples/vecadd",
diff --git a/examples/README.md b/examples/README.md
index eca937e4..d2705059 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -21,3 +21,6 @@ which runs on CPU or GPU, with the additional option of running OptiX denoising.
 
 The Path Tracer uses cuda_builder to compile the core path tracer for the GPU and GPU (hardware raytracing), and uses the core path tracer as a normal crate
 for CPU rendering and sharing structures.
+
+### [sdot](sdot)
+Example of computes the dot product of two single-precision (f32) vectors
\ No newline at end of file
diff --git a/examples/sdot/Cargo.toml b/examples/sdot/Cargo.toml
new file mode 100644
index 00000000..0d9b1511
--- /dev/null
+++ b/examples/sdot/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "sdot"
+version = "0.1.0"
+edition = "2024"
+description = "Example of computes the dot product of two f32 vectors"
+
+[dependencies]
+cust = { path = "../../crates/cust" }
+blastoff = { path = "../../crates/blastoff" }
+rand = "0.9.*"
+
+[build-dependencies]
+cuda_builder = { workspace = true, default-features = false }
diff --git a/examples/sdot/build.rs b/examples/sdot/build.rs
new file mode 100644
index 00000000..dacebdec
--- /dev/null
+++ b/examples/sdot/build.rs
@@ -0,0 +1,63 @@
+use std::env;
+use std::path;
+use std::process::{Command, Stdio};
+
+use cuda_builder::CudaBuilder;
+
+fn main() {
+    println!("cargo::rerun-if-changed=build.rs");
+    println!("cargo::rerun-if-changed=kernels");
+
+    let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
+    let kernels_path = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()).join("kernels");
+
+    CudaBuilder::new(kernels_path.as_path())
+        .copy_to(out_path.join("kernels.ptx"))
+        .build()
+        .unwrap();
+
+    // Generate PTX from native CUDA kernels
+    let cuda_kernel_path = kernels_path.join("cuda/sdot.cu");
+
+    println!("cargo::rerun-if-changed={}", cuda_kernel_path.display());
+
+    let cuda_ptx = out_path.join("kernels_cuda_mangles.ptx");
+    let mut nvcc = Command::new("nvcc");
+    nvcc.arg("--ptx")
+        .args(["--Werror", "all-warnings"])
+        .args(["--output-directory", out_path.as_os_str().to_str().unwrap()])
+        .args(["-o", cuda_ptx.as_os_str().to_str().unwrap()])
+        .arg(cuda_kernel_path.as_path());
+
+    let build = nvcc
+        .stderr(Stdio::inherit())
+        .output()
+        .expect("failed to execute nvcc kernel build");
+
+    assert!(build.status.success());
+
+    // Decodes (demangles) low-level identifiers
+    let cat_out = Command::new("cat")
+        .arg(cuda_ptx)
+        .stdout(Stdio::piped())
+        .stderr(Stdio::inherit())
+        .spawn()
+        .expect("Failed to start cat process")
+        .stdout
+        .expect("Failed to open cat stdout");
+
+    let outputs = std::fs::File::create(out_path.join("kernels_cuda.ptx"))
+        .expect("Can not open output ptc kernel file");
+
+    let filt_out = Command::new("cu++filt")
+        .arg("-p")
+        .stdin(Stdio::from(cat_out))
+        .stdout(Stdio::from(outputs))
+        .stderr(Stdio::inherit())
+        .spawn()
+        .expect("Failed to start cu++filt process")
+        .wait_with_output()
+        .expect("Failed to wait on cu++filt");
+
+    assert!(filt_out.status.success());
+}
diff --git a/examples/sdot/kernels/Cargo.toml b/examples/sdot/kernels/Cargo.toml
new file mode 100644
index 00000000..b4ee4357
--- /dev/null
+++ b/examples/sdot/kernels/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "sdot_kernels"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+cuda_std = { path = "../../../crates/cuda_std" }
+
+[lib]
+crate-type = ["cdylib", "rlib"]
diff --git a/examples/sdot/kernels/cuda/sdot.cu b/examples/sdot/kernels/cuda/sdot.cu
new file mode 100644
index 00000000..ff354921
--- /dev/null
+++ b/examples/sdot/kernels/cuda/sdot.cu
@@ -0,0 +1,36 @@
+// Here we use `unsigned long` to match the Rust version `usize`.
+__global__ void sdot(const float *x, unsigned long x_n, const float *y, unsigned long y_n, float *out)
+{
+
+    extern __shared__ float shared_sum[];
+    unsigned int i;
+
+    unsigned int num_threads = gridDim.x * blockDim.x;
+    unsigned int start_ind = blockDim.x * blockIdx.x;
+    unsigned int tid = threadIdx.x;
+
+    float sum = 0.0f;
+    for (i = start_ind + tid; i < x_n; i += num_threads)
+    {
+        // Rust checks emulation
+        if (i >= y_n)
+            __trap();
+
+        sum += x[i] * y[i];
+    }
+    shared_sum[tid] = sum;
+
+    for (i = blockDim.x >> 1; i > 0; i >>= 1)
+    {
+        __syncthreads();
+        if (tid < i)
+        {
+            shared_sum[tid] += shared_sum[tid + i];
+        }
+    }
+
+    if (tid == 0)
+    {
+        out[blockIdx.x] = shared_sum[tid];
+    }
+}
\ No newline at end of file
diff --git a/examples/sdot/kernels/src/lib.rs b/examples/sdot/kernels/src/lib.rs
new file mode 100644
index 00000000..da2c825b
--- /dev/null
+++ b/examples/sdot/kernels/src/lib.rs
@@ -0,0 +1,37 @@
+use cuda_std::{kernel, shared, thread};
+
+#[kernel]
+#[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
+pub unsafe fn sdot(x: &[f32], y: &[f32], out: *mut f32) {
+    let shared_sum = shared::dynamic_shared_mem::<f32>();
+
+    let num_threads = (thread::grid_dim_x() as usize) * (thread::block_dim_x() as usize);
+    let start_ind = (thread::block_dim_x() as usize) * (thread::block_idx_x() as usize);
+    let tid = thread::thread_idx_x() as usize;
+
+    let mut sum = 0f32;
+    for i in ((start_ind + tid)..x.len()).step_by(num_threads) {
+        sum += x[i] * y[i];
+    }
+    unsafe {
+        *shared_sum.add(tid) = sum;
+    }
+
+    let mut i = (thread::block_dim_x() >> 1) as usize;
+    while i > 0 {
+        thread::sync_threads();
+        if tid < i {
+            unsafe {
+                *shared_sum.add(tid) += *shared_sum.add(tid + i);
+            }
+        }
+
+        i >>= 1;
+    }
+
+    if tid == 0 {
+        unsafe {
+            *out.add(thread::block_idx_x() as usize) = *shared_sum.add(tid);
+        }
+    }
+}
diff --git a/examples/sdot/src/main.rs b/examples/sdot/src/main.rs
new file mode 100644
index 00000000..92d7de00
--- /dev/null
+++ b/examples/sdot/src/main.rs
@@ -0,0 +1,242 @@
+use std::error::Error;
+use std::fmt;
+use std::time::Duration;
+
+use rand::Rng;
+
+use blastoff::CublasContext;
+use cust::event;
+use cust::function;
+use cust::launch;
+use cust::memory::{self, CopyDestination as _};
+use cust::module::Module;
+use cust::stream;
+use cust::util::SliceExt as _;
+
+const VECTORS_LEN: usize = 10_000_000;
+const NUM_WARMUP: usize = 100;
+const NUM_RUNS: usize = 1000;
+const BLOCK_SIZE: u32 = 1024;
+const GRID_SIZE: u32 = 80;
+
+static PTX_NATIVE: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels_cuda.ptx"));
+static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
+
+struct KernelLaunchStats {
+    /// The size of the block grid
+    pub grid_size: u32,
+    /// Number of threads per block
+    pub block_size: u32,
+    /// The amount of dynamically allocated shared memory
+    pub shared_mem_size: u32,
+    /// Number of registers used
+    pub num_regs: u32,
+}
+
+impl fmt::Display for KernelLaunchStats {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        writeln!(f, "Kernel launch stats:")?;
+        writeln!(f, "    grid_size: {}", self.grid_size)?;
+        writeln!(f, "    block_size: {}", self.block_size)?;
+        writeln!(f, "    shared_mem_size: {}", self.shared_mem_size)?;
+        writeln!(f, "    num_regs: {}", self.num_regs)
+    }
+}
+
+/// Launch statistics and outputs
+struct RunResult {
+    /// The average value of the result
+    pub res_average: f64,
+    /// Duration of one iteration
+    pub run_duration: Duration,
+    /// Statistics of the running kernel
+    pub kernel_launch_stats: Option<KernelLaunchStats>,
+}
+
+impl fmt::Display for RunResult {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        writeln!(f, "result: {}", self.res_average)?;
+        writeln!(
+            f,
+            "Duration of one iteration: {} ms",
+            self.run_duration.as_secs_f64() * 1000f64
+        )?;
+        if let Some(kernel_lunch_stats) = &self.kernel_launch_stats {
+            write!(f, "{}", kernel_lunch_stats)?;
+        }
+        Ok(())
+    }
+}
+
+fn main() -> Result<(), Box<dyn Error>> {
+    // Initialize CUDA
+    let _ctx = cust::quick_init()?;
+
+    // make a CUDA stream to issue calls to.
+    let stream = stream::Stream::new(stream::StreamFlags::NON_BLOCKING, None)?;
+
+    // Generate input hosts vectors A and B
+    let mut rng = rand::rng();
+    let x_host = (0..VECTORS_LEN)
+        .map(|_i| rng.random_range(0.0..=1.0f32))
+        .collect::<Vec<_>>();
+    let y_host = (0..VECTORS_LEN)
+        .map(|_i| rng.random_range(0.0..=1.0f32))
+        .collect::<Vec<_>>();
+
+    // Allocate the GPU memory
+    let x_gpu = x_host.as_slice().as_dbuf()?;
+    let y_gpu = y_host.as_slice().as_dbuf()?;
+
+    // Run cuBLAS test
+    let blas_res = run_cublas_sdot_test(&stream, &x_gpu, &y_gpu)?;
+    println!("cuBLAS:\n{}", blas_res);
+
+    // Run native CUDA kernel test
+    let module_native = Module::from_ptx(PTX_NATIVE, &[])?;
+    let sdot_native = module_native.get_function("sdot")?;
+
+    let native_res =
+        run_cuda_sdot_test(&stream, sdot_native, &x_gpu, &y_gpu, GRID_SIZE, BLOCK_SIZE)?;
+    println!("Native CUDA:\n{}", native_res);
+
+    // Run Rust CUDA kernel test
+    let module_rust = Module::from_ptx(PTX, &[])?;
+    let sdot_rust = module_rust.get_function("sdot")?;
+
+    let rust_res = run_cuda_sdot_test(&stream, sdot_rust, &x_gpu, &y_gpu, GRID_SIZE, BLOCK_SIZE)?;
+    println!("Rust CUDA:\n{}", rust_res);
+
+    Ok(())
+}
+
+/// Runs cuBLAS dot product of two vectors
+///
+/// Runs the scalar product calculations several times, before warming up
+/// and calculating the execution time.
+fn run_cublas_sdot_test(
+    stream: &stream::Stream,
+    x: &memory::DeviceBuffer<f32>,
+    y: &memory::DeviceBuffer<f32>,
+) -> Result<RunResult, Box<dyn Error>> {
+    let mut ctx = CublasContext::new()?;
+
+    // WarmUp
+    for _ in 0..NUM_WARMUP {
+        let mut result = memory::DeviceBox::new(&0.0)?;
+        ctx.dot(stream, x.len(), x, y, &mut result)?;
+        stream.synchronize()?;
+        let _res = result.as_host_value()?;
+    }
+
+    // Run bench
+    let mut res_average = 0f64;
+    let begin = event::Event::new(event::EventFlags::DEFAULT)?;
+    let end = event::Event::new(event::EventFlags::DEFAULT)?;
+    begin.record(stream)?;
+
+    for _ in 0..NUM_RUNS {
+        let mut result = memory::DeviceBox::new(&0.0)?;
+        ctx.dot(stream, x.len(), x, y, &mut result)?;
+        stream.synchronize()?;
+        res_average += result.as_host_value()? as f64;
+    }
+
+    end.record(stream)?;
+    begin.synchronize()?;
+    end.synchronize()?;
+
+    res_average /= NUM_RUNS as f64;
+    let run_duration = end.elapsed(&begin)?.div_f64(NUM_RUNS as f64);
+
+    let stats = RunResult {
+        res_average,
+        run_duration,
+        kernel_launch_stats: None,
+    };
+    Ok(stats)
+}
+
+/// Runs CUDA kernel test: dot product of two vectors
+///
+/// Runs the scalar product calculations several times, before warming up
+/// and calculating the execution time.
+fn run_cuda_sdot_test(
+    stream: &stream::Stream,
+    sdot_fun: function::Function,
+    x: &memory::DeviceBuffer<f32>,
+    y: &memory::DeviceBuffer<f32>,
+    grid_size: u32,
+    block_size: u32,
+) -> Result<RunResult, Box<dyn Error>> {
+    // Allocate memory to collect results, one per thread block
+    let mut out_host = vec![0.0f32; grid_size as _];
+    let out_gpu = memory::DeviceBuffer::zeroed(grid_size as _)?;
+
+    // Shared memory size per thread block
+    let shared_mem_size = block_size * (std::mem::size_of::<f32>() as u32);
+
+    // WarmUp
+    for _ in 0..NUM_WARMUP {
+        unsafe {
+            launch!(
+                sdot_fun<<<grid_size, block_size, shared_mem_size, stream>>>(
+                    x.as_device_ptr(),
+                    x.len(),
+                    y.as_device_ptr(),
+                    y.len(),
+                    out_gpu.as_device_ptr(),
+                )
+            )?;
+        }
+        stream.synchronize()?;
+        out_gpu.copy_to(&mut out_host)?;
+        let _res: f64 = out_host.iter().map(|e| *e as f64).sum();
+    }
+
+    // Run bench
+    let mut res_average = 0f64;
+    let begin = event::Event::new(event::EventFlags::DEFAULT)?;
+    let end = event::Event::new(event::EventFlags::DEFAULT)?;
+    begin.record(stream)?;
+
+    for _ in 0..NUM_RUNS {
+        unsafe {
+            launch!(
+                sdot_fun<<<grid_size, block_size, shared_mem_size, stream>>>(
+                    x.as_device_ptr(),
+                    x.len(),
+                    y.as_device_ptr(),
+                    y.len(),
+                    out_gpu.as_device_ptr(),
+                )
+            )?;
+        }
+        stream.synchronize()?;
+        out_gpu.copy_to(&mut out_host)?;
+        let res: f64 = out_host.iter().map(|e| *e as f64).sum();
+        res_average += res;
+    }
+
+    let kernel_launch_stats = KernelLaunchStats {
+        grid_size,
+        block_size,
+        shared_mem_size,
+        num_regs: sdot_fun.get_attribute(function::FunctionAttribute::NumRegisters)? as u32,
+    };
+
+    end.record(stream)?;
+    begin.synchronize()?;
+    end.synchronize()?;
+
+    res_average /= NUM_RUNS as f64;
+    let run_duration = end.elapsed(&begin)?.div_f64(NUM_RUNS as f64);
+
+    let stats = RunResult {
+        res_average,
+        run_duration,
+        kernel_launch_stats: Some(kernel_launch_stats),
+    };
+
+    Ok(stats)
+}