diff --git a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py index d67f180fe0..540c9b4c11 100644 --- a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py +++ b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py @@ -1,6 +1,13 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# ################################################################################ +# +# This example demonstrates using the device clock for kernel timing via +# NVRTC-compiled CUDA code. +# +# ################################################################################ + import platform import numpy as np diff --git a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py index 5d764509ce..c92d33e975 100644 --- a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py @@ -1,6 +1,12 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# ################################################################################ +# +# This example demonstrates cubemap texture sampling and transformation. +# +# ################################################################################ + import ctypes import sys import time diff --git a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py index 09dafa1be1..1b21166de2 100644 --- a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py @@ -1,6 +1,13 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# ################################################################################ +# +# This example demonstrates peer-to-peer memory access and data transfer +# between multiple GPUs. +# +# ################################################################################ + import ctypes import platform import sys diff --git a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py index d4bf44e19a..e4dc439b9b 100644 --- a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py +++ b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py @@ -1,6 +1,13 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# ################################################################################ +# +# This example demonstrates vector addition using zero-copy (mapped) host +# memory, allowing the GPU to access CPU memory directly. +# +# ################################################################################ + import ctypes import math import platform diff --git a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py index 94a356101f..ed4a13e686 100644 --- a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py +++ b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py @@ -1,6 +1,12 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# ################################################################################ +# +# This example demonstrates system-wide atomic operations on managed memory. +# +# ################################################################################ + import ctypes import os import sys diff --git a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py index 8c70aadd3a..0a29b8c0ca 100644 --- a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py +++ b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py @@ -1,6 +1,13 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# ################################################################################ +# +# This example demonstrates vector addition using the CUDA Driver API with +# unified virtual addressing. +# +# ################################################################################ + import ctypes import math import sys diff --git a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py index d5e2e3d26f..55178f1abd 100644 --- a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py +++ b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py @@ -1,6 +1,13 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# ################################################################################ +# +# This example demonstrates vector addition using multi-device memory +# mapping (cuMemCreate, cuMemMap) with virtual address management. +# +# ################################################################################ + import ctypes import math import platform diff --git a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py index f26dd2dabe..407079ad43 100644 --- a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py +++ b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py @@ -1,6 +1,13 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# ################################################################################ +# +# This example demonstrates stream-ordered memory allocation (cudaMallocAsync +# / cudaFreeAsync) and memory pool release thresholds. +# +# ################################################################################ + import ctypes import math import platform diff --git a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py index 722d19dcb5..00ed5cdfd4 100644 --- a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py +++ b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py @@ -1,6 +1,13 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# ################################################################################ +# +# This example demonstrates asynchronous copy from global to shared memory +# (memcpy_async) in matrix multiplication kernels. +# +# ################################################################################ + import ctypes import math import platform diff --git a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py index b08da3edc0..9fff51767e 100644 --- a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py +++ b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py @@ -1,6 +1,13 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# ################################################################################ +# +# This example demonstrates CUDA Graphs for capture and replay of GPU +# workloads, including manual graph construction and stream capture. +# +# ################################################################################ + import ctypes import random as rnd diff --git a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py index 8ef5506257..a2d4cdca40 100644 --- a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py +++ b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py @@ -1,6 +1,13 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# ################################################################################ +# +# This example demonstrates a conjugate gradient solver using cooperative +# groups and multi-block grid synchronization. +# +# ################################################################################ + import ctypes import math import platform @@ -350,3 +357,7 @@ def main(): if math.sqrt(dot_result_local) >= tol: print("conjugateGradientMultiBlockCG FAILED", file=sys.stderr) sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/cuda_bindings/examples/extra/isoFDModelling_test.py b/cuda_bindings/examples/extra/isoFDModelling_test.py index 21303664ac..f21877b2db 100644 --- a/cuda_bindings/examples/extra/isoFDModelling_test.py +++ b/cuda_bindings/examples/extra/isoFDModelling_test.py @@ -1,6 +1,13 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# ################################################################################ +# +# This example demonstrates isotropic finite-difference wave propagation +# modelling across multiple GPUs with peer-to-peer halo exchange. +# +# ################################################################################ + import time import numpy as np diff --git a/cuda_bindings/examples/extra/jit_program_test.py b/cuda_bindings/examples/extra/jit_program_test.py index 80e7e73376..892776dfd9 100644 --- a/cuda_bindings/examples/extra/jit_program_test.py +++ b/cuda_bindings/examples/extra/jit_program_test.py @@ -1,6 +1,13 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +# ################################################################################ +# +# This example demonstrates JIT compilation of CUDA kernels using NVRTC +# and the Driver API (saxpy kernel). +# +# ################################################################################ + import ctypes import numpy as np diff --git a/cuda_core/examples/cuda_graphs.py b/cuda_core/examples/cuda_graphs.py index c6233dd5d9..be23067200 100644 --- a/cuda_core/examples/cuda_graphs.py +++ b/cuda_core/examples/cuda_graphs.py @@ -4,9 +4,9 @@ # ################################################################################ # -# This demo illustrates how to use CUDA graphs to capture and execute -# multiple kernel launches with minimal overhead. The graph performs a -# sequence of vector operations: add, multiply, and subtract. +# This example demonstrates CUDA graphs to capture and execute multiple +# kernel launches with minimal overhead. The graph performs a sequence of +# vector operations: add, multiply, and subtract. # # ################################################################################ diff --git a/cuda_core/examples/gl_interop_plasma.py b/cuda_core/examples/gl_interop_plasma.py index 46fa59ee3f..3d881a90f2 100644 --- a/cuda_core/examples/gl_interop_plasma.py +++ b/cuda_core/examples/gl_interop_plasma.py @@ -4,10 +4,12 @@ # ################################################################################ # -# Real-time Plasma Effect -- CUDA/OpenGL Interop with cuda.core.GraphicsResource +# This example demonstrates cuda.core.GraphicsResource for CUDA/OpenGL +# interop: a CUDA kernel writes pixels directly into an OpenGL PBO with +# zero copies through the CPU. Requires pyglet. # # ################################################################################ -# + # What this example teaches # ========================= # How to use cuda.core.GraphicsResource to let a CUDA kernel write pixels @@ -18,12 +20,12 @@ # Normally, getting CUDA results onto the screen would require: # CUDA -> CPU memory -> OpenGL (two slow copies across the PCIe bus) # -# GraphicsResource eliminates the CPU round-trip. The pixel data stays +# GraphicsResource eliminates the CPU round-trip. The pixel data stays # on the GPU the entire time: # # 1. OpenGL allocates a PBO (Pixel Buffer Object) -- a raw GPU buffer. # 2. GraphicsResource.from_gl_buffer() registers that PBO with CUDA. -# Now both CUDA and OpenGL have access to the *same* GPU memory. +# Now both CUDA and OpenGL have access to the same GPU memory. # # +----------------------+ +---------------------+ # | OpenGL PBO | | GraphicsResource | @@ -39,23 +41,21 @@ # 4. glTexSubImage2D -- OpenGL copies PBO into a texture (GPU-to-GPU) # 5. draw -- OpenGL renders the texture to the window # -# Why is there a copy in step 4? OpenGL can only render from a -# "texture" object, not from a raw buffer. The glTexSubImage2D step +# Why is there a copy in step 4? OpenGL can only render from a +# texture object, not from a raw buffer. The glTexSubImage2D step # copies the PBO bytes into a texture, but this happens entirely on # the GPU and it is very fast. The big win from GraphicsResource is -# that we never copy pixels from the CPU to the GPU and then and back. +# that we never copy pixels from the CPU to the GPU and then back. # # What you should see # =================== -# A window showing smoothly animated, colorful swirling patterns (a "plasma" -# effect popular in the demoscene). The window title shows the current FPS. +# A window showing smoothly animated, colorful swirling patterns (a plasma +# effect popular in the demoscene). The window title shows the current FPS. # Close the window or press Escape to exit. # # Requirements # ============ # pip install pyglet -# -# ################################################################################ import ctypes import sys diff --git a/cuda_core/examples/jit_lto_fractal.py b/cuda_core/examples/jit_lto_fractal.py index cfaa1d6707..acf96be0f0 100644 --- a/cuda_core/examples/jit_lto_fractal.py +++ b/cuda_core/examples/jit_lto_fractal.py @@ -4,20 +4,11 @@ # ################################################################################ # -# This demo illustrates: -# -# 1. How to use the JIT LTO feature provided by the Linker class to link multiple objects together -# 2. That linking allows for libraries to modify workflows dynamically at runtime -# -# This demo mimics a relationship between a library and a user. The user's sole responsibility is to -# provide device code that generates some art. Whereas the library is responsible for all steps involved in -# setting up the device, launch configurations and arguments, as well as linking the provided device code. -# -# Two algorithms are implemented: -# 1. A Mandelbrot set -# 2. A Julia set -# -# The user can choose which algorithm to use at runtime and generate the resulting image. +# This example demonstrates the JIT LTO feature of the Linker class to link +# multiple objects together, allowing libraries to modify workflows at runtime. +# It mimics a library-user relationship: the user provides device code that +# generates art (Mandelbrot or Julia set), while the library handles device +# setup, launch config, and linking. # # ################################################################################ diff --git a/cuda_core/examples/memory_ops.py b/cuda_core/examples/memory_ops.py index cda486015e..a53f33d2df 100644 --- a/cuda_core/examples/memory_ops.py +++ b/cuda_core/examples/memory_ops.py @@ -4,11 +4,9 @@ # ################################################################################ # -# This demo illustrates: -# -# 1. How to use different memory resources to allocate and manage memory -# 2. How to copy data between different memory types -# 3. How to use DLPack to interoperate with other libraries +# This example demonstrates memory resources for allocation and management, +# copying data between device and pinned memory, and DLPack interop. Requires +# NumPy 2.1.0+. # # ################################################################################ @@ -26,10 +24,6 @@ launch, ) -if np.__version__ < "2.1.0": - print("This example requires NumPy 2.1.0 or later", file=sys.stderr) - sys.exit(1) - # Kernel for memory operations code = """ extern "C" @@ -47,95 +41,105 @@ } """ -dev = Device() -dev.set_current() -stream = dev.create_stream() -# tell CuPy to use our stream as the current stream: -cp.cuda.ExternalStream(int(stream.handle)).use() -device_buffer = None -pinned_buffer = None -new_device_buffer = None - -try: - # Compile kernel - program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}") - prog = Program(code, code_type="c++", options=program_options) - mod = prog.compile("cubin") - kernel = mod.get_kernel("memory_ops") - - # Create different memory resources - device_mr = dev.memory_resource - pinned_mr = LegacyPinnedMemoryResource() - - # Allocate different types of memory - size = 1024 - dtype = cp.float32 - element_size = dtype().itemsize - total_size = size * element_size - - # 1. Device Memory (GPU-only) - device_buffer = device_mr.allocate(total_size, stream=stream) - device_array = cp.from_dlpack(device_buffer).view(dtype=dtype) - - # 2. Pinned Memory (CPU memory, GPU accessible) - pinned_buffer = pinned_mr.allocate(total_size, stream=stream) - pinned_array = np.from_dlpack(pinned_buffer).view(dtype=dtype) - - # Initialize data - rng = cp.random.default_rng() - device_array[:] = rng.random(size, dtype=dtype) - pinned_array[:] = rng.random(size, dtype=dtype).get() - - # Store original values for verification - device_original = device_array.copy() - pinned_original = pinned_array.copy() - - # Sync before kernel launch - stream.sync() - - # Launch kernel - block = 256 - grid = (size + block - 1) // block - config = LaunchConfig(grid=grid, block=block) - - launch(stream, config, kernel, device_buffer, pinned_buffer, cp.uint64(size)) - stream.sync() - - # Verify kernel operations - assert cp.allclose(device_array, device_original + 1.0), "Device memory operation failed" - assert cp.allclose(pinned_array, pinned_original * 3.0), "Pinned memory operation failed" - - # Copy data between different memory types - print("\nCopying data between memory types...", file=sys.stderr) - - # Copy from device to pinned memory - device_buffer.copy_to(pinned_buffer, stream=stream) - stream.sync() - - # Verify the copy operation - assert cp.allclose(pinned_array, device_array), "Device to pinned copy failed" - - # Create a new device buffer and copy from pinned - new_device_buffer = device_mr.allocate(total_size, stream=stream) - new_device_array = cp.from_dlpack(new_device_buffer).view(dtype=dtype) - - pinned_buffer.copy_to(new_device_buffer, stream=stream) - stream.sync() - - # Verify the copy operation - assert cp.allclose(new_device_array, pinned_array), "Pinned to device copy failed" - - print("Memory management example completed!") -finally: - # Clean up resources even if verification fails. - if new_device_buffer is not None: - new_device_buffer.close(stream) - assert new_device_buffer.handle == 0, "New device buffer should be closed" - if pinned_buffer is not None: - pinned_buffer.close(stream) - assert pinned_buffer.handle == 0, "Pinned buffer should be closed" - if device_buffer is not None: - device_buffer.close(stream) - assert device_buffer.handle == 0, "Device buffer should be closed" - stream.close() - cp.cuda.Stream.null.use() # reset CuPy's current stream to the null stream + +def main(): + if np.__version__ < "2.1.0": + print("This example requires NumPy 2.1.0 or later", file=sys.stderr) + sys.exit(1) + + dev = Device() + dev.set_current() + stream = dev.create_stream() + # tell CuPy to use our stream as the current stream: + cp.cuda.ExternalStream(int(stream.handle)).use() + device_buffer = None + pinned_buffer = None + new_device_buffer = None + + try: + # Compile kernel + program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}") + prog = Program(code, code_type="c++", options=program_options) + mod = prog.compile("cubin") + kernel = mod.get_kernel("memory_ops") + + # Create different memory resources + device_mr = dev.memory_resource + pinned_mr = LegacyPinnedMemoryResource() + + # Allocate different types of memory + size = 1024 + dtype = cp.float32 + element_size = dtype().itemsize + total_size = size * element_size + + # 1. Device Memory (GPU-only) + device_buffer = device_mr.allocate(total_size, stream=stream) + device_array = cp.from_dlpack(device_buffer).view(dtype=dtype) + + # 2. Pinned Memory (CPU memory, GPU accessible) + pinned_buffer = pinned_mr.allocate(total_size, stream=stream) + pinned_array = np.from_dlpack(pinned_buffer).view(dtype=dtype) + + # Initialize data + rng = cp.random.default_rng() + device_array[:] = rng.random(size, dtype=dtype) + pinned_array[:] = rng.random(size, dtype=dtype).get() + + # Store original values for verification + device_original = device_array.copy() + pinned_original = pinned_array.copy() + + # Sync before kernel launch + stream.sync() + + # Launch kernel + block = 256 + grid = (size + block - 1) // block + config = LaunchConfig(grid=grid, block=block) + + launch(stream, config, kernel, device_buffer, pinned_buffer, cp.uint64(size)) + stream.sync() + + # Verify kernel operations + assert cp.allclose(device_array, device_original + 1.0), "Device memory operation failed" + assert cp.allclose(pinned_array, pinned_original * 3.0), "Pinned memory operation failed" + + # Copy data between different memory types + print("\nCopying data between memory types...", file=sys.stderr) + + # Copy from device to pinned memory + device_buffer.copy_to(pinned_buffer, stream=stream) + stream.sync() + + # Verify the copy operation + assert cp.allclose(pinned_array, device_array), "Device to pinned copy failed" + + # Create a new device buffer and copy from pinned + new_device_buffer = device_mr.allocate(total_size, stream=stream) + new_device_array = cp.from_dlpack(new_device_buffer).view(dtype=dtype) + + pinned_buffer.copy_to(new_device_buffer, stream=stream) + stream.sync() + + # Verify the copy operation + assert cp.allclose(new_device_array, pinned_array), "Pinned to device copy failed" + + print("Memory management example completed!") + finally: + # Clean up resources even if verification fails. + if new_device_buffer is not None: + new_device_buffer.close(stream) + assert new_device_buffer.handle == 0, "New device buffer should be closed" + if pinned_buffer is not None: + pinned_buffer.close(stream) + assert pinned_buffer.handle == 0, "Pinned buffer should be closed" + if device_buffer is not None: + device_buffer.close(stream) + assert device_buffer.handle == 0, "Device buffer should be closed" + stream.close() + cp.cuda.Stream.null.use() # reset CuPy's current stream to the null stream + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/pytorch_example.py b/cuda_core/examples/pytorch_example.py index 4e3bfcceb5..6909272b4d 100644 --- a/cuda_core/examples/pytorch_example.py +++ b/cuda_core/examples/pytorch_example.py @@ -4,11 +4,8 @@ # ################################################################################ # -# This demo illustrates how to use `cuda.core` to compile a CUDA kernel -# and launch it using PyTorch tensors as inputs. -# -# ## Usage: pip install "cuda-core[cu12]" -# ## python pytorch_example.py +# This example demonstrates how to use cuda.core to compile a CUDA kernel +# and launch it using PyTorch tensors as inputs. Requires PyTorch with CUDA. # # ################################################################################ @@ -30,13 +27,6 @@ } """ -dev = Device() -dev.set_current() - -# Get PyTorch's current stream -pt_stream = torch.cuda.current_stream() -print(f"PyTorch stream: {pt_stream}", file=sys.stderr) - # Create a wrapper class that implements __cuda_stream__ class PyTorchStreamWrapper: @@ -48,66 +38,77 @@ def __cuda_stream__(self): return (0, stream_id) # Return format required by CUDA Python -stream = dev.create_stream(PyTorchStreamWrapper(pt_stream)) - -try: - # prepare program - program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}") - prog = Program(code, code_type="c++", options=program_options) - mod = prog.compile( - "cubin", - logs=sys.stdout, - name_expressions=("saxpy_kernel", "saxpy_kernel"), - ) - - # Run in single precision - kernel = mod.get_kernel("saxpy_kernel") - dtype = torch.float32 - - # prepare input/output - size = 64 - # Use a single element tensor for 'a' - a = torch.tensor([10.0], dtype=dtype, device="cuda") - x = torch.rand(size, dtype=dtype, device="cuda") - y = torch.rand(size, dtype=dtype, device="cuda") - out = torch.empty_like(x) - - # prepare launch - block = 32 - grid = int((size + block - 1) // block) - config = LaunchConfig(grid=grid, block=block) - kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) - - # launch kernel on our stream - launch(stream, config, kernel, *kernel_args) - - # check result - assert torch.allclose(out, a.item() * x + y) - - # let's repeat again with double precision - kernel = mod.get_kernel("saxpy_kernel") - dtype = torch.float64 - - # prepare input - size = 128 - # Use a single element tensor for 'a' - a = torch.tensor([42.0], dtype=dtype, device="cuda") - x = torch.rand(size, dtype=dtype, device="cuda") - y = torch.rand(size, dtype=dtype, device="cuda") - - # prepare output - out = torch.empty_like(x) - - # prepare launch - block = 64 - grid = int((size + block - 1) // block) - config = LaunchConfig(grid=grid, block=block) - kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) - - # launch kernel on PyTorch's stream - launch(stream, config, kernel, *kernel_args) - - # check result - assert torch.allclose(out, a * x + y) -finally: - stream.close() +def main(): + dev = Device() + dev.set_current() + + pt_stream = torch.cuda.current_stream() + print(f"PyTorch stream: {pt_stream}", file=sys.stderr) + + stream = dev.create_stream(PyTorchStreamWrapper(pt_stream)) + + try: + # prepare program + program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}") + prog = Program(code, code_type="c++", options=program_options) + mod = prog.compile( + "cubin", + logs=sys.stdout, + name_expressions=("saxpy_kernel", "saxpy_kernel"), + ) + + # Run in single precision + kernel = mod.get_kernel("saxpy_kernel") + dtype = torch.float32 + + # prepare input/output + size = 64 + # Use a single element tensor for 'a' + a = torch.tensor([10.0], dtype=dtype, device="cuda") + x = torch.rand(size, dtype=dtype, device="cuda") + y = torch.rand(size, dtype=dtype, device="cuda") + out = torch.empty_like(x) + + # prepare launch + block = 32 + grid = int((size + block - 1) // block) + config = LaunchConfig(grid=grid, block=block) + kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) + + # launch kernel on our stream + launch(stream, config, kernel, *kernel_args) + + # check result + assert torch.allclose(out, a.item() * x + y) + + # let's repeat again with double precision + kernel = mod.get_kernel("saxpy_kernel") + dtype = torch.float64 + + # prepare input + size = 128 + # Use a single element tensor for 'a' + a = torch.tensor([42.0], dtype=dtype, device="cuda") + x = torch.rand(size, dtype=dtype, device="cuda") + y = torch.rand(size, dtype=dtype, device="cuda") + + # prepare output + out = torch.empty_like(x) + + # prepare launch + block = 64 + grid = int((size + block - 1) // block) + config = LaunchConfig(grid=grid, block=block) + kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) + + # launch kernel on PyTorch's stream + launch(stream, config, kernel, *kernel_args) + + # check result + assert torch.allclose(out, a * x + y) + finally: + stream.close() + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py index 548af802be..6e5b320f90 100644 --- a/cuda_core/examples/saxpy.py +++ b/cuda_core/examples/saxpy.py @@ -4,10 +4,9 @@ # ################################################################################ # -# This demo illustrates how to use `cuda.core` to compile a templated CUDA kernel -# and launch it using `cupy` arrays as inputs. This is a simple example of a -# templated kernel, where the kernel is instantiated for both `float` and `double` -# data types. +# This example demonstrates a templated CUDA kernel (SAXPY) compiled and +# launched with cuda.core, using CuPy arrays. The kernel is instantiated +# for both float and double. # # ################################################################################ @@ -33,87 +32,92 @@ """ -dev = Device() -dev.set_current() -stream = dev.create_stream() -buf = None - -try: - # prepare program - program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}") - prog = Program(code, code_type="c++", options=program_options) - - # Note the use of the `name_expressions` argument to specify the template - # instantiations of the kernel that we will use. For non-templated kernels, - # `name_expressions` will simply contain the name of the kernels. - mod = prog.compile( - "cubin", - logs=sys.stdout, - name_expressions=("saxpy", "saxpy"), - ) - - # run in single precision - kernel = mod.get_kernel("saxpy") - dtype = cp.float32 - - # prepare input/output - size = cp.uint64(64) - a = dtype(10) - rng = cp.random.default_rng() - x = rng.random(size, dtype=dtype) - y = rng.random(size, dtype=dtype) - out = cp.empty_like(x) - dev.sync() # cupy runs on a different stream from stream, so sync before accessing - - # prepare launch - block = 32 - grid = int((size + block - 1) // block) - config = LaunchConfig(grid=grid, block=block) - kernel_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size) - - # launch kernel on stream - launch(stream, config, kernel, *kernel_args) - stream.sync() - - # check result - assert cp.allclose(out, a * x + y) - - # let's repeat again, this time allocates our own out buffer instead of cupy's - # run in double precision - kernel = mod.get_kernel("saxpy") - dtype = cp.float64 - - # prepare input - size = cp.uint64(128) - a = dtype(42) - x = rng.random(size, dtype=dtype) - y = rng.random(size, dtype=dtype) - dev.sync() - - # prepare output - buf = dev.allocate( - size * 8, # = dtype.itemsize - stream=stream, - ) - - # prepare launch - block = 64 - grid = int((size + block - 1) // block) - config = LaunchConfig(grid=grid, block=block) - kernel_args = (a, x.data.ptr, y.data.ptr, buf, size) - - # launch kernel on stream - launch(stream, config, kernel, *kernel_args) - stream.sync() - - # check result - # we wrap output buffer as a cupy array for simplicity - out = cp.ndarray( - size, dtype=dtype, memptr=cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(int(buf.handle), buf.size, buf), 0) - ) - assert cp.allclose(out, a * x + y) -finally: - # cupy cleans up automatically the rest - if buf is not None: - buf.close(stream) - stream.close() +def main(): + dev = Device() + dev.set_current() + stream = dev.create_stream() + buf = None + + try: + # prepare program + program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}") + prog = Program(code, code_type="c++", options=program_options) + + # Note the use of the `name_expressions` argument to specify the template + # instantiations of the kernel that we will use. For non-templated kernels, + # `name_expressions` will simply contain the name of the kernels. + mod = prog.compile( + "cubin", + logs=sys.stdout, + name_expressions=("saxpy", "saxpy"), + ) + + # run in single precision + kernel = mod.get_kernel("saxpy") + dtype = cp.float32 + + # prepare input/output + size = cp.uint64(64) + a = dtype(10) + rng = cp.random.default_rng() + x = rng.random(size, dtype=dtype) + y = rng.random(size, dtype=dtype) + out = cp.empty_like(x) + dev.sync() # cupy runs on a different stream from stream, so sync before accessing + + # prepare launch + block = 32 + grid = int((size + block - 1) // block) + config = LaunchConfig(grid=grid, block=block) + kernel_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size) + + # launch kernel on stream + launch(stream, config, kernel, *kernel_args) + stream.sync() + + # check result + assert cp.allclose(out, a * x + y) + + # let's repeat again, this time allocates our own out buffer instead of cupy's + # run in double precision + kernel = mod.get_kernel("saxpy") + dtype = cp.float64 + + # prepare input + size = cp.uint64(128) + a = dtype(42) + x = rng.random(size, dtype=dtype) + y = rng.random(size, dtype=dtype) + dev.sync() + + # prepare output + buf = dev.allocate( + size * 8, # = dtype.itemsize + stream=stream, + ) + + # prepare launch + block = 64 + grid = int((size + block - 1) // block) + config = LaunchConfig(grid=grid, block=block) + kernel_args = (a, x.data.ptr, y.data.ptr, buf, size) + + # launch kernel on stream + launch(stream, config, kernel, *kernel_args) + stream.sync() + + # check result + # we wrap output buffer as a cupy array for simplicity + out = cp.ndarray( + size, dtype=dtype, memptr=cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(int(buf.handle), buf.size, buf), 0) + ) + assert cp.allclose(out, a * x + y) + finally: + # cupy cleans up automatically the rest + if buf is not None: + buf.close(stream) + stream.close() + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/show_device_properties.py b/cuda_core/examples/show_device_properties.py index baf86ebc03..093b89b331 100644 --- a/cuda_core/examples/show_device_properties.py +++ b/cuda_core/examples/show_device_properties.py @@ -4,7 +4,7 @@ # ################################################################################ # -# This demo illustrates how to use `cuda.core` to show the properties of the +# This example demonstrates how to use cuda.core to show the properties of # CUDA devices in the system. # # ################################################################################ @@ -236,8 +236,12 @@ def show_device_properties(): print("*****************************************************\n\n") -if __name__ == "__main__": +def main(): if len(sys.argv) != 1: print("no command-line arguments expected", file=sys.stderr) sys.exit(1) show_device_properties() + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py index 882ce8bbb3..236a1cca20 100644 --- a/cuda_core/examples/simple_multi_gpu_example.py +++ b/cuda_core/examples/simple_multi_gpu_example.py @@ -4,8 +4,8 @@ # ################################################################################ # -# This demo illustrates how to use `cuda.core` to compile and launch kernels -# on multiple GPUs. +# This example demonstrates how to use cuda.core to compile and launch +# kernels on multiple GPUs. Requires at least 2 GPUs. # # ################################################################################ @@ -15,10 +15,6 @@ from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch, system -if system.get_num_devices() < 2: - print("this example requires at least 2 GPUs", file=sys.stderr) - sys.exit(1) - dtype = cp.float32 size = 50000 @@ -34,17 +30,22 @@ def __cuda_stream__(self): return (0, self.obj.ptr) -# Set GPU 0 -dev0 = Device(0) -dev0.set_current() -stream0 = dev0.create_stream() -stream1 = None -cp_stream0 = None -cp_stream1 = None +def main(): + if system.get_num_devices() < 2: + print("this example requires at least 2 GPUs", file=sys.stderr) + sys.exit(1) -try: - # Compile a kernel targeting GPU 0 to compute c = a + b - code_add = """ + # Set GPU 0 + dev0 = Device(0) + dev0.set_current() + stream0 = dev0.create_stream() + stream1 = None + cp_stream0 = None + cp_stream1 = None + + try: + # Compile a kernel targeting GPU 0 to compute c = a + b + code_add = """ extern "C" __global__ void vector_add(const float* A, const float* B, @@ -56,17 +57,17 @@ def __cuda_stream__(self): } } """ - prog_add = Program(code_add, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}")) - mod_add = prog_add.compile("cubin") - add_kernel = mod_add.get_kernel("vector_add") + prog_add = Program(code_add, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}")) + mod_add = prog_add.compile("cubin") + add_kernel = mod_add.get_kernel("vector_add") - # Set GPU 1 - dev1 = Device(1) - dev1.set_current() - stream1 = dev1.create_stream() + # Set GPU 1 + dev1 = Device(1) + dev1.set_current() + stream1 = dev1.create_stream() - # Compile a kernel targeting GPU 1 to compute c = a - b - code_sub = """ + # Compile a kernel targeting GPU 1 to compute c = a - b + code_sub = """ extern "C" __global__ void vector_sub(const float* A, const float* B, @@ -78,62 +79,66 @@ def __cuda_stream__(self): } } """ - prog_sub = Program(code_sub, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}")) - mod_sub = prog_sub.compile("cubin") - sub_kernel = mod_sub.get_kernel("vector_sub") - - # Create launch configs for each kernel that will be executed on the respective - # CUDA streams. - block = 256 - grid = (size + block - 1) // block - config0 = LaunchConfig(grid=grid, block=block) - config1 = LaunchConfig(grid=grid, block=block) - - # Allocate memory on GPU 0 - # Note: This runs on CuPy's current stream for GPU 0 - dev0.set_current() - rng = cp.random.default_rng() - a = rng.random(size, dtype=dtype) - b = rng.random(size, dtype=dtype) - c = cp.empty_like(a) - cp_stream0 = dev0.create_stream(StreamAdaptor(cp.cuda.get_current_stream())) - - # Establish a stream order to ensure that memory has been initialized before - # accessed by the kernel. - stream0.wait(cp_stream0) - - # Launch the add kernel on GPU 0 / stream 0 - launch(stream0, config0, add_kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) - - # Allocate memory on GPU 1 - # Note: This runs on CuPy's current stream for GPU 1. - dev1.set_current() - rng = cp.random.default_rng() - x = rng.random(size, dtype=dtype) - y = rng.random(size, dtype=dtype) - z = cp.empty_like(a) - cp_stream1 = dev1.create_stream(StreamAdaptor(cp.cuda.get_current_stream())) - - # Establish a stream order - stream1.wait(cp_stream1) - - # Launch the subtract kernel on GPU 1 / stream 1 - launch(stream1, config1, sub_kernel, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size)) - - # Synchronize both GPUs are validate the results - dev0.set_current() - stream0.sync() - assert cp.allclose(c, a + b) - dev1.set_current() - stream1.sync() - assert cp.allclose(z, x - y) - - print("done") -finally: - if cp_stream1 is not None: - cp_stream1.close() - if cp_stream0 is not None: - cp_stream0.close() - if stream1 is not None: - stream1.close() - stream0.close() + prog_sub = Program(code_sub, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}")) + mod_sub = prog_sub.compile("cubin") + sub_kernel = mod_sub.get_kernel("vector_sub") + + # Create launch configs for each kernel that will be executed on the respective + # CUDA streams. + block = 256 + grid = (size + block - 1) // block + config0 = LaunchConfig(grid=grid, block=block) + config1 = LaunchConfig(grid=grid, block=block) + + # Allocate memory on GPU 0 + # Note: This runs on CuPy's current stream for GPU 0 + dev0.set_current() + rng = cp.random.default_rng() + a = rng.random(size, dtype=dtype) + b = rng.random(size, dtype=dtype) + c = cp.empty_like(a) + cp_stream0 = dev0.create_stream(StreamAdaptor(cp.cuda.get_current_stream())) + + # Establish a stream order to ensure that memory has been initialized before + # accessed by the kernel. + stream0.wait(cp_stream0) + + # Launch the add kernel on GPU 0 / stream 0 + launch(stream0, config0, add_kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) + + # Allocate memory on GPU 1 + # Note: This runs on CuPy's current stream for GPU 1. + dev1.set_current() + rng = cp.random.default_rng() + x = rng.random(size, dtype=dtype) + y = rng.random(size, dtype=dtype) + z = cp.empty_like(a) + cp_stream1 = dev1.create_stream(StreamAdaptor(cp.cuda.get_current_stream())) + + # Establish a stream order + stream1.wait(cp_stream1) + + # Launch the subtract kernel on GPU 1 / stream 1 + launch(stream1, config1, sub_kernel, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size)) + + # Synchronize both GPUs and validate the results + dev0.set_current() + stream0.sync() + assert cp.allclose(c, a + b) + dev1.set_current() + stream1.sync() + assert cp.allclose(z, x - y) + + print("done") + finally: + if cp_stream1 is not None: + cp_stream1.close() + if cp_stream0 is not None: + cp_stream0.close() + if stream1 is not None: + stream1.close() + stream0.close() + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/strided_memory_view_cpu.py b/cuda_core/examples/strided_memory_view_cpu.py index f973a813b9..8482021c45 100644 --- a/cuda_core/examples/strided_memory_view_cpu.py +++ b/cuda_core/examples/strided_memory_view_cpu.py @@ -4,13 +4,8 @@ # ################################################################################ # -# This demo illustrates: -# -# 1. The similarity between CPU and GPU JIT-compilation with C++ sources -# 2. How to use StridedMemoryView to interface with foreign C/C++ functions -# -# This demo uses cffi (https://cffi.readthedocs.io/) for the CPU path, which can be -# easily installed from pip or conda following their instructions. +# This example demonstrates StridedMemoryView for interfacing with foreign +# C/C++ functions, using JIT-compiled CPU code via cffi. Requires cffi. # # ################################################################################ @@ -124,11 +119,11 @@ def _run_example(cpu_prog, cpu_func): assert np.allclose(arr_cpu, np.arange(1024, dtype=np.int32)) -def run(): +def main(): cpu_prog = _create_cpu_program() with tempfile.TemporaryDirectory() as temp_dir, _compiled_cpu_func(cpu_prog, temp_dir) as cpu_func: _run_example(cpu_prog, cpu_func) if __name__ == "__main__": - run() + main() diff --git a/cuda_core/examples/strided_memory_view_gpu.py b/cuda_core/examples/strided_memory_view_gpu.py index 9d4e4aacff..0abf5d086e 100644 --- a/cuda_core/examples/strided_memory_view_gpu.py +++ b/cuda_core/examples/strided_memory_view_gpu.py @@ -4,13 +4,8 @@ # ################################################################################ # -# This demo illustrates: -# -# 1. The similarity between CPU and GPU JIT-compilation with C++ sources -# 2. How to use StridedMemoryView to interface with foreign C/C++ functions -# -# This demo uses cffi (https://cffi.readthedocs.io/) for the CPU path, which can be -# easily installed from pip or conda following their instructions. +# This example demonstrates StridedMemoryView for interfacing with foreign +# C/C++ functions, using JIT-compiled GPU code. Requires cupy. # # ################################################################################ @@ -84,7 +79,7 @@ def my_func(arr, work_stream, kernel): work_stream.sync() -def run(): +def main(): global my_func # Here is a concrete (very naive!) implementation on GPU: gpu_code = string.Template(r""" @@ -122,4 +117,4 @@ def run(): if __name__ == "__main__": - run() + main() diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py index a5f50d4189..495fe882a9 100644 --- a/cuda_core/examples/thread_block_cluster.py +++ b/cuda_core/examples/thread_block_cluster.py @@ -4,8 +4,9 @@ # ################################################################################ # -# This demo illustrates the use of thread block clusters in the CUDA launch +# This example demonstrates thread block clusters in the CUDA launch # configuration and verifies that the correct grid size is passed to the kernel. +# Requires compute capability >= 9.0 and CUDA_PATH. # # ################################################################################ @@ -23,24 +24,6 @@ launch, ) -if np.lib.NumpyVersion(np.__version__) < "2.2.5": - print("This example requires NumPy 2.2.5 or later", file=sys.stderr) - sys.exit(1) - -# prepare include -cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME")) -if cuda_path is None: - print("this demo requires a valid CUDA_PATH environment variable set", file=sys.stderr) - sys.exit(1) -cuda_include = os.path.join(cuda_path, "include") -if not os.path.isdir(cuda_include): - print(f"CUDA include directory not found: {cuda_include}", file=sys.stderr) - sys.exit(1) -include_path = [cuda_include] -cccl_include = os.path.join(cuda_include, "cccl") -if os.path.isdir(cccl_include): - include_path.insert(0, cccl_include) - # print cluster info using a kernel and store results in pinned memory code = r""" #include @@ -76,77 +59,100 @@ } """ -dev = Device() -arch = dev.compute_capability -if arch < (9, 0): - print( - "this demo requires compute capability >= 9.0 (since thread block cluster is a hardware feature)", - file=sys.stderr, - ) - sys.exit(1) -arch = "".join(f"{i}" for i in arch) - -# prepare program & compile kernel -dev.set_current() -prog = Program( - code, - code_type="c++", - options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=include_path), -) -mod = prog.compile(target_type="cubin") -kernel = mod.get_kernel("check_cluster_info") - -# prepare launch config -grid = 4 -cluster = 2 -block = 32 -config = LaunchConfig(grid=grid, cluster=cluster, block=block) - -# allocate pinned memory to store kernel results -pinned_mr = LegacyPinnedMemoryResource() -element_size = np.dtype(np.uint32).itemsize -grid_buffer = None -cluster_buffer = None -block_buffer = None - -try: - # allocate 3 uint32 values each for grid, cluster, and block dimensions - grid_buffer = pinned_mr.allocate(3 * element_size) - cluster_buffer = pinned_mr.allocate(3 * element_size) - block_buffer = pinned_mr.allocate(3 * element_size) - - # create NumPy arrays from the pinned memory - grid_dims = np.from_dlpack(grid_buffer).view(dtype=np.uint32) - cluster_dims = np.from_dlpack(cluster_buffer).view(dtype=np.uint32) - block_dims = np.from_dlpack(block_buffer).view(dtype=np.uint32) - - # initialize arrays to zero - grid_dims[:] = 0 - cluster_dims[:] = 0 - block_dims[:] = 0 - - # launch kernel on the default stream - launch(dev.default_stream, config, kernel, grid_buffer, cluster_buffer, block_buffer) - dev.sync() - - # verify results - print("\nResults stored in pinned memory:") - print(f"Grid dimensions (blocks): {tuple(grid_dims)}") - print(f"Cluster dimensions: {tuple(cluster_dims)}") - print(f"Block dimensions (threads): {tuple(block_dims)}") - - # verify that grid conversion worked correctly: - # LaunchConfig(grid=4, cluster=2) should result in 8 total blocks (4 clusters * 2 blocks/cluster) - expected_grid_blocks = grid * cluster # 4 * 2 = 8 - actual_grid_blocks = grid_dims[0] - - assert actual_grid_blocks == expected_grid_blocks, ( - f"Grid conversion failed: expected {expected_grid_blocks} total blocks, got {actual_grid_blocks}" + +def main(): + if np.lib.NumpyVersion(np.__version__) < "2.2.5": + print("This example requires NumPy 2.2.5 or later", file=sys.stderr) + sys.exit(1) + + cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME")) + if cuda_path is None: + print("this example requires a valid CUDA_PATH environment variable set", file=sys.stderr) + sys.exit(1) + cuda_include = os.path.join(cuda_path, "include") + if not os.path.isdir(cuda_include): + print(f"CUDA include directory not found: {cuda_include}", file=sys.stderr) + sys.exit(1) + include_path = [cuda_include] + cccl_include = os.path.join(cuda_include, "cccl") + if os.path.isdir(cccl_include): + include_path.insert(0, cccl_include) + + dev = Device() + arch = dev.compute_capability + if arch < (9, 0): + print( + "this example requires compute capability >= 9.0 (since thread block cluster is a hardware feature)", + file=sys.stderr, + ) + sys.exit(1) + arch = "".join(f"{i}" for i in arch) + + # prepare program & compile kernel + dev.set_current() + prog = Program( + code, + code_type="c++", + options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=include_path), ) -finally: - if block_buffer is not None: - block_buffer.close() - if cluster_buffer is not None: - cluster_buffer.close() - if grid_buffer is not None: - grid_buffer.close() + mod = prog.compile(target_type="cubin") + kernel = mod.get_kernel("check_cluster_info") + + # prepare launch config + grid = 4 + cluster = 2 + block = 32 + config = LaunchConfig(grid=grid, cluster=cluster, block=block) + + # allocate pinned memory to store kernel results + pinned_mr = LegacyPinnedMemoryResource() + element_size = np.dtype(np.uint32).itemsize + grid_buffer = None + cluster_buffer = None + block_buffer = None + + try: + # allocate 3 uint32 values each for grid, cluster, and block dimensions + grid_buffer = pinned_mr.allocate(3 * element_size) + cluster_buffer = pinned_mr.allocate(3 * element_size) + block_buffer = pinned_mr.allocate(3 * element_size) + + # create NumPy arrays from the pinned memory + grid_dims = np.from_dlpack(grid_buffer).view(dtype=np.uint32) + cluster_dims = np.from_dlpack(cluster_buffer).view(dtype=np.uint32) + block_dims = np.from_dlpack(block_buffer).view(dtype=np.uint32) + + # initialize arrays to zero + grid_dims[:] = 0 + cluster_dims[:] = 0 + block_dims[:] = 0 + + # launch kernel on the default stream + launch(dev.default_stream, config, kernel, grid_buffer, cluster_buffer, block_buffer) + dev.sync() + + # verify results + print("\nResults stored in pinned memory:") + print(f"Grid dimensions (blocks): {tuple(grid_dims)}") + print(f"Cluster dimensions: {tuple(cluster_dims)}") + print(f"Block dimensions (threads): {tuple(block_dims)}") + + # verify that grid conversion worked correctly: + # LaunchConfig(grid=4, cluster=2) should result in 8 total blocks (4 clusters * 2 blocks/cluster) + expected_grid_blocks = grid * cluster # 4 * 2 = 8 + actual_grid_blocks = grid_dims[0] + + assert actual_grid_blocks == expected_grid_blocks, ( + f"Grid conversion failed: expected {expected_grid_blocks} total blocks, got {actual_grid_blocks}" + ) + finally: + if block_buffer is not None: + block_buffer.close() + if cluster_buffer is not None: + cluster_buffer.close() + if grid_buffer is not None: + grid_buffer.close() + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py index e648a3846f..3adf04882e 100644 --- a/cuda_core/examples/vector_add.py +++ b/cuda_core/examples/vector_add.py @@ -4,8 +4,8 @@ # ################################################################################ # -# This demo illustrates how to use `cuda.core` to compile and launch a simple -# vector addition kernel. +# This example demonstrates how to use cuda.core to compile and launch a +# simple vector addition kernel. # # ################################################################################ @@ -28,40 +28,45 @@ """ -dev = Device() -dev.set_current() -stream = dev.create_stream() +def main(): + dev = Device() + dev.set_current() + stream = dev.create_stream() -try: - # prepare program - program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}") - prog = Program(code, code_type="c++", options=program_options) - mod = prog.compile("cubin", name_expressions=("vector_add",)) + try: + # prepare program + program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}") + prog = Program(code, code_type="c++", options=program_options) + mod = prog.compile("cubin", name_expressions=("vector_add",)) - # run in single precision - kernel = mod.get_kernel("vector_add") - dtype = cp.float32 + # run in single precision + kernel = mod.get_kernel("vector_add") + dtype = cp.float32 - # prepare input/output - size = 50000 - rng = cp.random.default_rng() - a = rng.random(size, dtype=dtype) - b = rng.random(size, dtype=dtype) - c = cp.empty_like(a) + # prepare input/output + size = 50000 + rng = cp.random.default_rng() + a = rng.random(size, dtype=dtype) + b = rng.random(size, dtype=dtype) + c = cp.empty_like(a) - # cupy runs on a different stream from stream, so sync before accessing - dev.sync() + # cupy runs on a different stream from stream, so sync before accessing + dev.sync() - # prepare launch - block = 256 - grid = (size + block - 1) // block - config = LaunchConfig(grid=grid, block=block) + # prepare launch + block = 256 + grid = (size + block - 1) // block + config = LaunchConfig(grid=grid, block=block) - # launch kernel on stream - launch(stream, config, kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) - stream.sync() + # launch kernel on stream + launch(stream, config, kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) + stream.sync() - # check result - assert cp.allclose(c, a + b) -finally: - stream.close() + # check result + assert cp.allclose(c, a + b) + finally: + stream.close() + + +if __name__ == "__main__": + main()