diff --git a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py
index d67f180fe0..540c9b4c11 100644
--- a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py
+++ b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates using the device clock for kernel timing via
+# NVRTC-compiled CUDA code.
+#
+# ################################################################################
+
 import platform
 
 import numpy as np
diff --git a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py
index 5d764509ce..c92d33e975 100644
--- a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py
+++ b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py
@@ -1,6 +1,12 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates cubemap texture sampling and transformation.
+#
+# ################################################################################
+
 import ctypes
 import sys
 import time
diff --git a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py
index 09dafa1be1..1b21166de2 100644
--- a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py
+++ b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates peer-to-peer memory access and data transfer
+# between multiple GPUs.
+#
+# ################################################################################
+
 import ctypes
 import platform
 import sys
diff --git a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py
index d4bf44e19a..e4dc439b9b 100644
--- a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py
+++ b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates vector addition using zero-copy (mapped) host
+# memory, allowing the GPU to access CPU memory directly.
+#
+# ################################################################################
+
 import ctypes
 import math
 import platform
diff --git a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py
index 94a356101f..ed4a13e686 100644
--- a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py
+++ b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py
@@ -1,6 +1,12 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates system-wide atomic operations on managed memory.
+#
+# ################################################################################
+
 import ctypes
 import os
 import sys
diff --git a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py
index 8c70aadd3a..0a29b8c0ca 100644
--- a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py
+++ b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates vector addition using the CUDA Driver API with
+# unified virtual addressing.
+#
+# ################################################################################
+
 import ctypes
 import math
 import sys
diff --git a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py
index d5e2e3d26f..55178f1abd 100644
--- a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py
+++ b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates vector addition using multi-device memory
+# mapping (cuMemCreate, cuMemMap) with virtual address management.
+#
+# ################################################################################
+
 import ctypes
 import math
 import platform
diff --git a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py
index f26dd2dabe..407079ad43 100644
--- a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py
+++ b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates stream-ordered memory allocation (cudaMallocAsync
+# / cudaFreeAsync) and memory pool release thresholds.
+#
+# ################################################################################
+
 import ctypes
 import math
 import platform
diff --git a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py
index 722d19dcb5..00ed5cdfd4 100644
--- a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py
+++ b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates asynchronous copy from global to shared memory
+# (memcpy_async) in matrix multiplication kernels.
+#
+# ################################################################################
+
 import ctypes
 import math
 import platform
diff --git a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py
index b08da3edc0..9fff51767e 100644
--- a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py
+++ b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates CUDA Graphs for capture and replay of GPU
+# workloads, including manual graph construction and stream capture.
+#
+# ################################################################################
+
 import ctypes
 import random as rnd
 
diff --git a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py
index 8ef5506257..a2d4cdca40 100644
--- a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py
+++ b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates a conjugate gradient solver using cooperative
+# groups and multi-block grid synchronization.
+#
+# ################################################################################
+
 import ctypes
 import math
 import platform
@@ -350,3 +357,7 @@ def main():
     if math.sqrt(dot_result_local) >= tol:
         print("conjugateGradientMultiBlockCG FAILED", file=sys.stderr)
         sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_bindings/examples/extra/isoFDModelling_test.py b/cuda_bindings/examples/extra/isoFDModelling_test.py
index 21303664ac..f21877b2db 100644
--- a/cuda_bindings/examples/extra/isoFDModelling_test.py
+++ b/cuda_bindings/examples/extra/isoFDModelling_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates isotropic finite-difference wave propagation
+# modelling across multiple GPUs with peer-to-peer halo exchange.
+#
+# ################################################################################
+
 import time
 
 import numpy as np
diff --git a/cuda_bindings/examples/extra/jit_program_test.py b/cuda_bindings/examples/extra/jit_program_test.py
index 80e7e73376..892776dfd9 100644
--- a/cuda_bindings/examples/extra/jit_program_test.py
+++ b/cuda_bindings/examples/extra/jit_program_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates JIT compilation of CUDA kernels using NVRTC
+# and the Driver API (saxpy kernel).
+#
+# ################################################################################
+
 import ctypes
 
 import numpy as np
diff --git a/cuda_core/examples/cuda_graphs.py b/cuda_core/examples/cuda_graphs.py
index c6233dd5d9..be23067200 100644
--- a/cuda_core/examples/cuda_graphs.py
+++ b/cuda_core/examples/cuda_graphs.py
@@ -4,9 +4,9 @@
 
 # ################################################################################
 #
-# This demo illustrates how to use CUDA graphs to capture and execute
-# multiple kernel launches with minimal overhead. The graph performs a
-# sequence of vector operations: add, multiply, and subtract.
+# This example demonstrates CUDA graphs to capture and execute multiple
+# kernel launches with minimal overhead. The graph performs a sequence of
+# vector operations: add, multiply, and subtract.
 #
 # ################################################################################
 
diff --git a/cuda_core/examples/gl_interop_plasma.py b/cuda_core/examples/gl_interop_plasma.py
index 46fa59ee3f..3d881a90f2 100644
--- a/cuda_core/examples/gl_interop_plasma.py
+++ b/cuda_core/examples/gl_interop_plasma.py
@@ -4,10 +4,12 @@
 
 # ################################################################################
 #
-# Real-time Plasma Effect -- CUDA/OpenGL Interop with cuda.core.GraphicsResource
+# This example demonstrates cuda.core.GraphicsResource for CUDA/OpenGL
+# interop: a CUDA kernel writes pixels directly into an OpenGL PBO with
+# zero copies through the CPU. Requires pyglet.
 #
 # ################################################################################
-#
+
 # What this example teaches
 # =========================
 # How to use cuda.core.GraphicsResource to let a CUDA kernel write pixels
@@ -18,12 +20,12 @@
 # Normally, getting CUDA results onto the screen would require:
 #   CUDA -> CPU memory -> OpenGL  (two slow copies across the PCIe bus)
 #
-# GraphicsResource eliminates the CPU round-trip.  The pixel data stays
+# GraphicsResource eliminates the CPU round-trip. The pixel data stays
 # on the GPU the entire time:
 #
 #   1. OpenGL allocates a PBO (Pixel Buffer Object) -- a raw GPU buffer.
 #   2. GraphicsResource.from_gl_buffer() registers that PBO with CUDA.
-#      Now both CUDA and OpenGL have access to the *same* GPU memory.
+#      Now both CUDA and OpenGL have access to the same GPU memory.
 #
 #   +----------------------+       +---------------------+
 #   |    OpenGL PBO        |       |  GraphicsResource   |
@@ -39,23 +41,21 @@
 #   4. glTexSubImage2D -- OpenGL copies PBO into a texture (GPU-to-GPU)
 #   5. draw            -- OpenGL renders the texture to the window
 #
-#   Why is there a copy in step 4?  OpenGL can only render from a
-#   "texture" object, not from a raw buffer. The glTexSubImage2D step
+#   Why is there a copy in step 4? OpenGL can only render from a
+#   texture object, not from a raw buffer. The glTexSubImage2D step
 #   copies the PBO bytes into a texture, but this happens entirely on
 #   the GPU and it is very fast. The big win from GraphicsResource is
-#   that we never copy pixels from the CPU to the GPU and then and back.
+#   that we never copy pixels from the CPU to the GPU and then back.
 #
 # What you should see
 # ===================
-# A window showing smoothly animated, colorful swirling patterns (a "plasma"
-# effect popular in the demoscene).  The window title shows the current FPS.
+# A window showing smoothly animated, colorful swirling patterns (a plasma
+# effect popular in the demoscene). The window title shows the current FPS.
 # Close the window or press Escape to exit.
 #
 # Requirements
 # ============
 #   pip install pyglet
-#
-# ################################################################################
 
 import ctypes
 import sys
diff --git a/cuda_core/examples/jit_lto_fractal.py b/cuda_core/examples/jit_lto_fractal.py
index cfaa1d6707..acf96be0f0 100644
--- a/cuda_core/examples/jit_lto_fractal.py
+++ b/cuda_core/examples/jit_lto_fractal.py
@@ -4,20 +4,11 @@
 
 # ################################################################################
 #
-# This demo illustrates:
-#
-#   1. How to use the JIT LTO feature provided by the Linker class to link multiple objects together
-#   2. That linking allows for libraries to modify workflows dynamically at runtime
-#
-# This demo mimics a relationship between a library and a user. The user's sole responsibility is to
-# provide device code that generates some art. Whereas the library is responsible for all steps involved in
-# setting up the device, launch configurations and arguments, as well as linking the provided device code.
-#
-# Two algorithms are implemented:
-#   1. A Mandelbrot set
-#   2. A Julia set
-#
-# The user can choose which algorithm to use at runtime and generate the resulting image.
+# This example demonstrates the JIT LTO feature of the Linker class to link
+# multiple objects together, allowing libraries to modify workflows at runtime.
+# It mimics a library-user relationship: the user provides device code that
+# generates art (Mandelbrot or Julia set), while the library handles device
+# setup, launch config, and linking.
 #
 # ################################################################################
 
diff --git a/cuda_core/examples/memory_ops.py b/cuda_core/examples/memory_ops.py
index cda486015e..a53f33d2df 100644
--- a/cuda_core/examples/memory_ops.py
+++ b/cuda_core/examples/memory_ops.py
@@ -4,11 +4,9 @@
 
 # ################################################################################
 #
-# This demo illustrates:
-#
-#   1. How to use different memory resources to allocate and manage memory
-#   2. How to copy data between different memory types
-#   3. How to use DLPack to interoperate with other libraries
+# This example demonstrates memory resources for allocation and management,
+# copying data between device and pinned memory, and DLPack interop. Requires
+# NumPy 2.1.0+.
 #
 # ################################################################################
 
@@ -26,10 +24,6 @@
     launch,
 )
 
-if np.__version__ < "2.1.0":
-    print("This example requires NumPy 2.1.0 or later", file=sys.stderr)
-    sys.exit(1)
-
 # Kernel for memory operations
 code = """
 extern "C"
@@ -47,95 +41,105 @@
 }
 """
 
-dev = Device()
-dev.set_current()
-stream = dev.create_stream()
-# tell CuPy to use our stream as the current stream:
-cp.cuda.ExternalStream(int(stream.handle)).use()
-device_buffer = None
-pinned_buffer = None
-new_device_buffer = None
-
-try:
-    # Compile kernel
-    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
-    prog = Program(code, code_type="c++", options=program_options)
-    mod = prog.compile("cubin")
-    kernel = mod.get_kernel("memory_ops")
-
-    # Create different memory resources
-    device_mr = dev.memory_resource
-    pinned_mr = LegacyPinnedMemoryResource()
-
-    # Allocate different types of memory
-    size = 1024
-    dtype = cp.float32
-    element_size = dtype().itemsize
-    total_size = size * element_size
-
-    # 1. Device Memory (GPU-only)
-    device_buffer = device_mr.allocate(total_size, stream=stream)
-    device_array = cp.from_dlpack(device_buffer).view(dtype=dtype)
-
-    # 2. Pinned Memory (CPU memory, GPU accessible)
-    pinned_buffer = pinned_mr.allocate(total_size, stream=stream)
-    pinned_array = np.from_dlpack(pinned_buffer).view(dtype=dtype)
-
-    # Initialize data
-    rng = cp.random.default_rng()
-    device_array[:] = rng.random(size, dtype=dtype)
-    pinned_array[:] = rng.random(size, dtype=dtype).get()
-
-    # Store original values for verification
-    device_original = device_array.copy()
-    pinned_original = pinned_array.copy()
-
-    # Sync before kernel launch
-    stream.sync()
-
-    # Launch kernel
-    block = 256
-    grid = (size + block - 1) // block
-    config = LaunchConfig(grid=grid, block=block)
-
-    launch(stream, config, kernel, device_buffer, pinned_buffer, cp.uint64(size))
-    stream.sync()
-
-    # Verify kernel operations
-    assert cp.allclose(device_array, device_original + 1.0), "Device memory operation failed"
-    assert cp.allclose(pinned_array, pinned_original * 3.0), "Pinned memory operation failed"
-
-    # Copy data between different memory types
-    print("\nCopying data between memory types...", file=sys.stderr)
-
-    # Copy from device to pinned memory
-    device_buffer.copy_to(pinned_buffer, stream=stream)
-    stream.sync()
-
-    # Verify the copy operation
-    assert cp.allclose(pinned_array, device_array), "Device to pinned copy failed"
-
-    # Create a new device buffer and copy from pinned
-    new_device_buffer = device_mr.allocate(total_size, stream=stream)
-    new_device_array = cp.from_dlpack(new_device_buffer).view(dtype=dtype)
-
-    pinned_buffer.copy_to(new_device_buffer, stream=stream)
-    stream.sync()
-
-    # Verify the copy operation
-    assert cp.allclose(new_device_array, pinned_array), "Pinned to device copy failed"
-
-    print("Memory management example completed!")
-finally:
-    # Clean up resources even if verification fails.
-    if new_device_buffer is not None:
-        new_device_buffer.close(stream)
-        assert new_device_buffer.handle == 0, "New device buffer should be closed"
-    if pinned_buffer is not None:
-        pinned_buffer.close(stream)
-        assert pinned_buffer.handle == 0, "Pinned buffer should be closed"
-    if device_buffer is not None:
-        device_buffer.close(stream)
-        assert device_buffer.handle == 0, "Device buffer should be closed"
-    stream.close()
-    cp.cuda.Stream.null.use()  # reset CuPy's current stream to the null stream
+
+def main():
+    if np.__version__ < "2.1.0":
+        print("This example requires NumPy 2.1.0 or later", file=sys.stderr)
+        sys.exit(1)
+
+    dev = Device()
+    dev.set_current()
+    stream = dev.create_stream()
+    # tell CuPy to use our stream as the current stream:
+    cp.cuda.ExternalStream(int(stream.handle)).use()
+    device_buffer = None
+    pinned_buffer = None
+    new_device_buffer = None
+
+    try:
+        # Compile kernel
+        program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+        prog = Program(code, code_type="c++", options=program_options)
+        mod = prog.compile("cubin")
+        kernel = mod.get_kernel("memory_ops")
+
+        # Create different memory resources
+        device_mr = dev.memory_resource
+        pinned_mr = LegacyPinnedMemoryResource()
+
+        # Allocate different types of memory
+        size = 1024
+        dtype = cp.float32
+        element_size = dtype().itemsize
+        total_size = size * element_size
+
+        # 1. Device Memory (GPU-only)
+        device_buffer = device_mr.allocate(total_size, stream=stream)
+        device_array = cp.from_dlpack(device_buffer).view(dtype=dtype)
+
+        # 2. Pinned Memory (CPU memory, GPU accessible)
+        pinned_buffer = pinned_mr.allocate(total_size, stream=stream)
+        pinned_array = np.from_dlpack(pinned_buffer).view(dtype=dtype)
+
+        # Initialize data
+        rng = cp.random.default_rng()
+        device_array[:] = rng.random(size, dtype=dtype)
+        pinned_array[:] = rng.random(size, dtype=dtype).get()
+
+        # Store original values for verification
+        device_original = device_array.copy()
+        pinned_original = pinned_array.copy()
+
+        # Sync before kernel launch
+        stream.sync()
+
+        # Launch kernel
+        block = 256
+        grid = (size + block - 1) // block
+        config = LaunchConfig(grid=grid, block=block)
+
+        launch(stream, config, kernel, device_buffer, pinned_buffer, cp.uint64(size))
+        stream.sync()
+
+        # Verify kernel operations
+        assert cp.allclose(device_array, device_original + 1.0), "Device memory operation failed"
+        assert cp.allclose(pinned_array, pinned_original * 3.0), "Pinned memory operation failed"
+
+        # Copy data between different memory types
+        print("\nCopying data between memory types...", file=sys.stderr)
+
+        # Copy from device to pinned memory
+        device_buffer.copy_to(pinned_buffer, stream=stream)
+        stream.sync()
+
+        # Verify the copy operation
+        assert cp.allclose(pinned_array, device_array), "Device to pinned copy failed"
+
+        # Create a new device buffer and copy from pinned
+        new_device_buffer = device_mr.allocate(total_size, stream=stream)
+        new_device_array = cp.from_dlpack(new_device_buffer).view(dtype=dtype)
+
+        pinned_buffer.copy_to(new_device_buffer, stream=stream)
+        stream.sync()
+
+        # Verify the copy operation
+        assert cp.allclose(new_device_array, pinned_array), "Pinned to device copy failed"
+
+        print("Memory management example completed!")
+    finally:
+        # Clean up resources even if verification fails.
+        if new_device_buffer is not None:
+            new_device_buffer.close(stream)
+            assert new_device_buffer.handle == 0, "New device buffer should be closed"
+        if pinned_buffer is not None:
+            pinned_buffer.close(stream)
+            assert pinned_buffer.handle == 0, "Pinned buffer should be closed"
+        if device_buffer is not None:
+            device_buffer.close(stream)
+            assert device_buffer.handle == 0, "Device buffer should be closed"
+        stream.close()
+        cp.cuda.Stream.null.use()  # reset CuPy's current stream to the null stream
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/pytorch_example.py b/cuda_core/examples/pytorch_example.py
index 4e3bfcceb5..6909272b4d 100644
--- a/cuda_core/examples/pytorch_example.py
+++ b/cuda_core/examples/pytorch_example.py
@@ -4,11 +4,8 @@
 
 # ################################################################################
 #
-# This demo illustrates how to use `cuda.core` to compile a CUDA kernel
-# and launch it using PyTorch tensors as inputs.
-#
-# ## Usage: pip install "cuda-core[cu12]"
-# ## python pytorch_example.py
+# This example demonstrates how to use cuda.core to compile a CUDA kernel
+# and launch it using PyTorch tensors as inputs. Requires PyTorch with CUDA.
 #
 # ################################################################################
 
@@ -30,13 +27,6 @@
 }
 """
 
-dev = Device()
-dev.set_current()
-
-# Get PyTorch's current stream
-pt_stream = torch.cuda.current_stream()
-print(f"PyTorch stream: {pt_stream}", file=sys.stderr)
-
 
 # Create a wrapper class that implements __cuda_stream__
 class PyTorchStreamWrapper:
@@ -48,66 +38,77 @@ def __cuda_stream__(self):
         return (0, stream_id)  # Return format required by CUDA Python
 
 
-stream = dev.create_stream(PyTorchStreamWrapper(pt_stream))
-
-try:
-    # prepare program
-    program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
-    prog = Program(code, code_type="c++", options=program_options)
-    mod = prog.compile(
-        "cubin",
-        logs=sys.stdout,
-        name_expressions=("saxpy_kernel<float>", "saxpy_kernel<double>"),
-    )
-
-    # Run in single precision
-    kernel = mod.get_kernel("saxpy_kernel<float>")
-    dtype = torch.float32
-
-    # prepare input/output
-    size = 64
-    # Use a single element tensor for 'a'
-    a = torch.tensor([10.0], dtype=dtype, device="cuda")
-    x = torch.rand(size, dtype=dtype, device="cuda")
-    y = torch.rand(size, dtype=dtype, device="cuda")
-    out = torch.empty_like(x)
-
-    # prepare launch
-    block = 32
-    grid = int((size + block - 1) // block)
-    config = LaunchConfig(grid=grid, block=block)
-    kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
-
-    # launch kernel on our stream
-    launch(stream, config, kernel, *kernel_args)
-
-    # check result
-    assert torch.allclose(out, a.item() * x + y)
-
-    # let's repeat again with double precision
-    kernel = mod.get_kernel("saxpy_kernel<double>")
-    dtype = torch.float64
-
-    # prepare input
-    size = 128
-    # Use a single element tensor for 'a'
-    a = torch.tensor([42.0], dtype=dtype, device="cuda")
-    x = torch.rand(size, dtype=dtype, device="cuda")
-    y = torch.rand(size, dtype=dtype, device="cuda")
-
-    # prepare output
-    out = torch.empty_like(x)
-
-    # prepare launch
-    block = 64
-    grid = int((size + block - 1) // block)
-    config = LaunchConfig(grid=grid, block=block)
-    kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
-
-    # launch kernel on PyTorch's stream
-    launch(stream, config, kernel, *kernel_args)
-
-    # check result
-    assert torch.allclose(out, a * x + y)
-finally:
-    stream.close()
+def main():
+    dev = Device()
+    dev.set_current()
+
+    pt_stream = torch.cuda.current_stream()
+    print(f"PyTorch stream: {pt_stream}", file=sys.stderr)
+
+    stream = dev.create_stream(PyTorchStreamWrapper(pt_stream))
+
+    try:
+        # prepare program
+        program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
+        prog = Program(code, code_type="c++", options=program_options)
+        mod = prog.compile(
+            "cubin",
+            logs=sys.stdout,
+            name_expressions=("saxpy_kernel<float>", "saxpy_kernel<double>"),
+        )
+
+        # Run in single precision
+        kernel = mod.get_kernel("saxpy_kernel<float>")
+        dtype = torch.float32
+
+        # prepare input/output
+        size = 64
+        # Use a single element tensor for 'a'
+        a = torch.tensor([10.0], dtype=dtype, device="cuda")
+        x = torch.rand(size, dtype=dtype, device="cuda")
+        y = torch.rand(size, dtype=dtype, device="cuda")
+        out = torch.empty_like(x)
+
+        # prepare launch
+        block = 32
+        grid = int((size + block - 1) // block)
+        config = LaunchConfig(grid=grid, block=block)
+        kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
+
+        # launch kernel on our stream
+        launch(stream, config, kernel, *kernel_args)
+
+        # check result
+        assert torch.allclose(out, a.item() * x + y)
+
+        # let's repeat again with double precision
+        kernel = mod.get_kernel("saxpy_kernel<double>")
+        dtype = torch.float64
+
+        # prepare input
+        size = 128
+        # Use a single element tensor for 'a'
+        a = torch.tensor([42.0], dtype=dtype, device="cuda")
+        x = torch.rand(size, dtype=dtype, device="cuda")
+        y = torch.rand(size, dtype=dtype, device="cuda")
+
+        # prepare output
+        out = torch.empty_like(x)
+
+        # prepare launch
+        block = 64
+        grid = int((size + block - 1) // block)
+        config = LaunchConfig(grid=grid, block=block)
+        kernel_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
+
+        # launch kernel on PyTorch's stream
+        launch(stream, config, kernel, *kernel_args)
+
+        # check result
+        assert torch.allclose(out, a * x + y)
+    finally:
+        stream.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py
index 548af802be..6e5b320f90 100644
--- a/cuda_core/examples/saxpy.py
+++ b/cuda_core/examples/saxpy.py
@@ -4,10 +4,9 @@
 
 # ################################################################################
 #
-# This demo illustrates how to use `cuda.core` to compile a templated CUDA kernel
-# and launch it using `cupy` arrays as inputs. This is a simple example of a
-# templated kernel, where the kernel is instantiated for both `float` and `double`
-# data types.
+# This example demonstrates a templated CUDA kernel (SAXPY) compiled and
+# launched with cuda.core, using CuPy arrays. The kernel is instantiated
+# for both float and double.
 #
 # ################################################################################
 
@@ -33,87 +32,92 @@
 """
 
 
-dev = Device()
-dev.set_current()
-stream = dev.create_stream()
-buf = None
-
-try:
-    # prepare program
-    program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
-    prog = Program(code, code_type="c++", options=program_options)
-
-    # Note the use of the `name_expressions` argument to specify the template
-    # instantiations of the kernel that we will use. For non-templated kernels,
-    # `name_expressions` will simply contain the name of the kernels.
-    mod = prog.compile(
-        "cubin",
-        logs=sys.stdout,
-        name_expressions=("saxpy<float>", "saxpy<double>"),
-    )
-
-    # run in single precision
-    kernel = mod.get_kernel("saxpy<float>")
-    dtype = cp.float32
-
-    # prepare input/output
-    size = cp.uint64(64)
-    a = dtype(10)
-    rng = cp.random.default_rng()
-    x = rng.random(size, dtype=dtype)
-    y = rng.random(size, dtype=dtype)
-    out = cp.empty_like(x)
-    dev.sync()  # cupy runs on a different stream from stream, so sync before accessing
-
-    # prepare launch
-    block = 32
-    grid = int((size + block - 1) // block)
-    config = LaunchConfig(grid=grid, block=block)
-    kernel_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)
-
-    # launch kernel on stream
-    launch(stream, config, kernel, *kernel_args)
-    stream.sync()
-
-    # check result
-    assert cp.allclose(out, a * x + y)
-
-    # let's repeat again, this time allocates our own out buffer instead of cupy's
-    # run in double precision
-    kernel = mod.get_kernel("saxpy<double>")
-    dtype = cp.float64
-
-    # prepare input
-    size = cp.uint64(128)
-    a = dtype(42)
-    x = rng.random(size, dtype=dtype)
-    y = rng.random(size, dtype=dtype)
-    dev.sync()
-
-    # prepare output
-    buf = dev.allocate(
-        size * 8,  # = dtype.itemsize
-        stream=stream,
-    )
-
-    # prepare launch
-    block = 64
-    grid = int((size + block - 1) // block)
-    config = LaunchConfig(grid=grid, block=block)
-    kernel_args = (a, x.data.ptr, y.data.ptr, buf, size)
-
-    # launch kernel on stream
-    launch(stream, config, kernel, *kernel_args)
-    stream.sync()
-
-    # check result
-    # we wrap output buffer as a cupy array for simplicity
-    out = cp.ndarray(
-        size, dtype=dtype, memptr=cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(int(buf.handle), buf.size, buf), 0)
-    )
-    assert cp.allclose(out, a * x + y)
-finally:
-    # cupy cleans up automatically the rest
-    if buf is not None:
-        buf.close(stream)
-    stream.close()
+def main():
+    dev = Device()
+    dev.set_current()
+    stream = dev.create_stream()
+    buf = None
+
+    try:
+        # prepare program
+        program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
+        prog = Program(code, code_type="c++", options=program_options)
+
+        # Note the use of the `name_expressions` argument to specify the template
+        # instantiations of the kernel that we will use. For non-templated kernels,
+        # `name_expressions` will simply contain the name of the kernels.
+        mod = prog.compile(
+            "cubin",
+            logs=sys.stdout,
+            name_expressions=("saxpy<float>", "saxpy<double>"),
+        )
+
+        # run in single precision
+        kernel = mod.get_kernel("saxpy<float>")
+        dtype = cp.float32
+
+        # prepare input/output
+        size = cp.uint64(64)
+        a = dtype(10)
+        rng = cp.random.default_rng()
+        x = rng.random(size, dtype=dtype)
+        y = rng.random(size, dtype=dtype)
+        out = cp.empty_like(x)
+        dev.sync()  # cupy runs on a different stream from stream, so sync before accessing
+
+        # prepare launch
+        block = 32
+        grid = int((size + block - 1) // block)
+        config = LaunchConfig(grid=grid, block=block)
+        kernel_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)
+
+        # launch kernel on stream
+        launch(stream, config, kernel, *kernel_args)
+        stream.sync()
+
+        # check result
+        assert cp.allclose(out, a * x + y)
+
+        # let's repeat again, this time allocates our own out buffer instead of cupy's
+        # run in double precision
+        kernel = mod.get_kernel("saxpy<double>")
+        dtype = cp.float64
+
+        # prepare input
+        size = cp.uint64(128)
+        a = dtype(42)
+        x = rng.random(size, dtype=dtype)
+        y = rng.random(size, dtype=dtype)
+        dev.sync()
+
+        # prepare output
+        buf = dev.allocate(
+            size * 8,  # = dtype.itemsize
+            stream=stream,
+        )
+
+        # prepare launch
+        block = 64
+        grid = int((size + block - 1) // block)
+        config = LaunchConfig(grid=grid, block=block)
+        kernel_args = (a, x.data.ptr, y.data.ptr, buf, size)
+
+        # launch kernel on stream
+        launch(stream, config, kernel, *kernel_args)
+        stream.sync()
+
+        # check result
+        # we wrap output buffer as a cupy array for simplicity
+        out = cp.ndarray(
+            size, dtype=dtype, memptr=cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(int(buf.handle), buf.size, buf), 0)
+        )
+        assert cp.allclose(out, a * x + y)
+    finally:
+        # cupy cleans up automatically the rest
+        if buf is not None:
+            buf.close(stream)
+        stream.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/show_device_properties.py b/cuda_core/examples/show_device_properties.py
index baf86ebc03..093b89b331 100644
--- a/cuda_core/examples/show_device_properties.py
+++ b/cuda_core/examples/show_device_properties.py
@@ -4,7 +4,7 @@
 
 # ################################################################################
 #
-# This demo illustrates how to use `cuda.core` to show the properties of the
+# This example demonstrates how to use cuda.core to show the properties of
 # CUDA devices in the system.
 #
 # ################################################################################
@@ -236,8 +236,12 @@ def show_device_properties():
         print("*****************************************************\n\n")
 
 
-if __name__ == "__main__":
+def main():
     if len(sys.argv) != 1:
         print("no command-line arguments expected", file=sys.stderr)
         sys.exit(1)
     show_device_properties()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py
index 882ce8bbb3..236a1cca20 100644
--- a/cuda_core/examples/simple_multi_gpu_example.py
+++ b/cuda_core/examples/simple_multi_gpu_example.py
@@ -4,8 +4,8 @@
 
 # ################################################################################
 #
-# This demo illustrates how to use `cuda.core` to compile and launch kernels
-# on multiple GPUs.
+# This example demonstrates how to use cuda.core to compile and launch
+# kernels on multiple GPUs. Requires at least 2 GPUs.
 #
 # ################################################################################
 
@@ -15,10 +15,6 @@
 
 from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch, system
 
-if system.get_num_devices() < 2:
-    print("this example requires at least 2 GPUs", file=sys.stderr)
-    sys.exit(1)
-
 dtype = cp.float32
 size = 50000
 
@@ -34,17 +30,22 @@ def __cuda_stream__(self):
         return (0, self.obj.ptr)
 
 
-# Set GPU 0
-dev0 = Device(0)
-dev0.set_current()
-stream0 = dev0.create_stream()
-stream1 = None
-cp_stream0 = None
-cp_stream1 = None
+def main():
+    if system.get_num_devices() < 2:
+        print("this example requires at least 2 GPUs", file=sys.stderr)
+        sys.exit(1)
 
-try:
-    # Compile a kernel targeting GPU 0 to compute c = a + b
-    code_add = """
+    # Set GPU 0
+    dev0 = Device(0)
+    dev0.set_current()
+    stream0 = dev0.create_stream()
+    stream1 = None
+    cp_stream0 = None
+    cp_stream1 = None
+
+    try:
+        # Compile a kernel targeting GPU 0 to compute c = a + b
+        code_add = """
 extern "C"
 __global__ void vector_add(const float* A,
                            const float* B,
@@ -56,17 +57,17 @@ def __cuda_stream__(self):
     }
 }
 """
-    prog_add = Program(code_add, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}"))
-    mod_add = prog_add.compile("cubin")
-    add_kernel = mod_add.get_kernel("vector_add")
+        prog_add = Program(code_add, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev0.arch}"))
+        mod_add = prog_add.compile("cubin")
+        add_kernel = mod_add.get_kernel("vector_add")
 
-    # Set GPU 1
-    dev1 = Device(1)
-    dev1.set_current()
-    stream1 = dev1.create_stream()
+        # Set GPU 1
+        dev1 = Device(1)
+        dev1.set_current()
+        stream1 = dev1.create_stream()
 
-    # Compile a kernel targeting GPU 1 to compute c = a - b
-    code_sub = """
+        # Compile a kernel targeting GPU 1 to compute c = a - b
+        code_sub = """
 extern "C"
 __global__ void vector_sub(const float* A,
                            const float* B,
@@ -78,62 +79,66 @@ def __cuda_stream__(self):
     }
 }
 """
-    prog_sub = Program(code_sub, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}"))
-    mod_sub = prog_sub.compile("cubin")
-    sub_kernel = mod_sub.get_kernel("vector_sub")
-
-    # Create launch configs for each kernel that will be executed on the respective
-    # CUDA streams.
-    block = 256
-    grid = (size + block - 1) // block
-    config0 = LaunchConfig(grid=grid, block=block)
-    config1 = LaunchConfig(grid=grid, block=block)
-
-    # Allocate memory on GPU 0
-    # Note: This runs on CuPy's current stream for GPU 0
-    dev0.set_current()
-    rng = cp.random.default_rng()
-    a = rng.random(size, dtype=dtype)
-    b = rng.random(size, dtype=dtype)
-    c = cp.empty_like(a)
-    cp_stream0 = dev0.create_stream(StreamAdaptor(cp.cuda.get_current_stream()))
-
-    # Establish a stream order to ensure that memory has been initialized before
-    # accessed by the kernel.
-    stream0.wait(cp_stream0)
-
-    # Launch the add kernel on GPU 0 / stream 0
-    launch(stream0, config0, add_kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
-
-    # Allocate memory on GPU 1
-    # Note: This runs on CuPy's current stream for GPU 1.
-    dev1.set_current()
-    rng = cp.random.default_rng()
-    x = rng.random(size, dtype=dtype)
-    y = rng.random(size, dtype=dtype)
-    z = cp.empty_like(a)
-    cp_stream1 = dev1.create_stream(StreamAdaptor(cp.cuda.get_current_stream()))
-
-    # Establish a stream order
-    stream1.wait(cp_stream1)
-
-    # Launch the subtract kernel on GPU 1 / stream 1
-    launch(stream1, config1, sub_kernel, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
-
-    # Synchronize both GPUs are validate the results
-    dev0.set_current()
-    stream0.sync()
-    assert cp.allclose(c, a + b)
-    dev1.set_current()
-    stream1.sync()
-    assert cp.allclose(z, x - y)
-
-    print("done")
-finally:
-    if cp_stream1 is not None:
-        cp_stream1.close()
-    if cp_stream0 is not None:
-        cp_stream0.close()
-    if stream1 is not None:
-        stream1.close()
-    stream0.close()
+        prog_sub = Program(code_sub, code_type="c++", options=ProgramOptions(std="c++17", arch=f"sm_{dev1.arch}"))
+        mod_sub = prog_sub.compile("cubin")
+        sub_kernel = mod_sub.get_kernel("vector_sub")
+
+        # Create launch configs for each kernel that will be executed on the respective
+        # CUDA streams.
+        block = 256
+        grid = (size + block - 1) // block
+        config0 = LaunchConfig(grid=grid, block=block)
+        config1 = LaunchConfig(grid=grid, block=block)
+
+        # Allocate memory on GPU 0
+        # Note: This runs on CuPy's current stream for GPU 0
+        dev0.set_current()
+        rng = cp.random.default_rng()
+        a = rng.random(size, dtype=dtype)
+        b = rng.random(size, dtype=dtype)
+        c = cp.empty_like(a)
+        cp_stream0 = dev0.create_stream(StreamAdaptor(cp.cuda.get_current_stream()))
+
+        # Establish a stream order to ensure that memory has been initialized before
+        # accessed by the kernel.
+        stream0.wait(cp_stream0)
+
+        # Launch the add kernel on GPU 0 / stream 0
+        launch(stream0, config0, add_kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
+
+        # Allocate memory on GPU 1
+        # Note: This runs on CuPy's current stream for GPU 1.
+        dev1.set_current()
+        rng = cp.random.default_rng()
+        x = rng.random(size, dtype=dtype)
+        y = rng.random(size, dtype=dtype)
+        z = cp.empty_like(a)
+        cp_stream1 = dev1.create_stream(StreamAdaptor(cp.cuda.get_current_stream()))
+
+        # Establish a stream order
+        stream1.wait(cp_stream1)
+
+        # Launch the subtract kernel on GPU 1 / stream 1
+        launch(stream1, config1, sub_kernel, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
+
+        # Synchronize both GPUs and validate the results
+        dev0.set_current()
+        stream0.sync()
+        assert cp.allclose(c, a + b)
+        dev1.set_current()
+        stream1.sync()
+        assert cp.allclose(z, x - y)
+
+        print("done")
+    finally:
+        if cp_stream1 is not None:
+            cp_stream1.close()
+        if cp_stream0 is not None:
+            cp_stream0.close()
+        if stream1 is not None:
+            stream1.close()
+        stream0.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/strided_memory_view_cpu.py b/cuda_core/examples/strided_memory_view_cpu.py
index f973a813b9..8482021c45 100644
--- a/cuda_core/examples/strided_memory_view_cpu.py
+++ b/cuda_core/examples/strided_memory_view_cpu.py
@@ -4,13 +4,8 @@
 
 # ################################################################################
 #
-# This demo illustrates:
-#
-#   1. The similarity between CPU and GPU JIT-compilation with C++ sources
-#   2. How to use StridedMemoryView to interface with foreign C/C++ functions
-#
-# This demo uses cffi (https://cffi.readthedocs.io/) for the CPU path, which can be
-# easily installed from pip or conda following their instructions.
+# This example demonstrates StridedMemoryView for interfacing with foreign
+# C/C++ functions, using JIT-compiled CPU code via cffi. Requires cffi.
 #
 # ################################################################################
 
@@ -124,11 +119,11 @@ def _run_example(cpu_prog, cpu_func):
     assert np.allclose(arr_cpu, np.arange(1024, dtype=np.int32))
 
 
-def run():
+def main():
     cpu_prog = _create_cpu_program()
     with tempfile.TemporaryDirectory() as temp_dir, _compiled_cpu_func(cpu_prog, temp_dir) as cpu_func:
         _run_example(cpu_prog, cpu_func)
 
 
 if __name__ == "__main__":
-    run()
+    main()
diff --git a/cuda_core/examples/strided_memory_view_gpu.py b/cuda_core/examples/strided_memory_view_gpu.py
index 9d4e4aacff..0abf5d086e 100644
--- a/cuda_core/examples/strided_memory_view_gpu.py
+++ b/cuda_core/examples/strided_memory_view_gpu.py
@@ -4,13 +4,8 @@
 
 # ################################################################################
 #
-# This demo illustrates:
-#
-#   1. The similarity between CPU and GPU JIT-compilation with C++ sources
-#   2. How to use StridedMemoryView to interface with foreign C/C++ functions
-#
-# This demo uses cffi (https://cffi.readthedocs.io/) for the CPU path, which can be
-# easily installed from pip or conda following their instructions.
+# This example demonstrates StridedMemoryView for interfacing with foreign
+# C/C++ functions, using JIT-compiled GPU code. Requires cupy.
 #
 # ################################################################################
 
@@ -84,7 +79,7 @@ def my_func(arr, work_stream, kernel):
     work_stream.sync()
 
 
-def run():
+def main():
     global my_func
     # Here is a concrete (very naive!) implementation on GPU:
     gpu_code = string.Template(r"""
@@ -122,4 +117,4 @@ def run():
 
 
 if __name__ == "__main__":
-    run()
+    main()
diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py
index a5f50d4189..495fe882a9 100644
--- a/cuda_core/examples/thread_block_cluster.py
+++ b/cuda_core/examples/thread_block_cluster.py
@@ -4,8 +4,9 @@
 
 # ################################################################################
 #
-# This demo illustrates the use of thread block clusters in the CUDA launch
+# This example demonstrates thread block clusters in the CUDA launch
 # configuration and verifies that the correct grid size is passed to the kernel.
+# Requires compute capability >= 9.0 and CUDA_PATH.
 #
 # ################################################################################
 
@@ -23,24 +24,6 @@
     launch,
 )
 
-if np.lib.NumpyVersion(np.__version__) < "2.2.5":
-    print("This example requires NumPy 2.2.5 or later", file=sys.stderr)
-    sys.exit(1)
-
-# prepare include
-cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME"))
-if cuda_path is None:
-    print("this demo requires a valid CUDA_PATH environment variable set", file=sys.stderr)
-    sys.exit(1)
-cuda_include = os.path.join(cuda_path, "include")
-if not os.path.isdir(cuda_include):
-    print(f"CUDA include directory not found: {cuda_include}", file=sys.stderr)
-    sys.exit(1)
-include_path = [cuda_include]
-cccl_include = os.path.join(cuda_include, "cccl")
-if os.path.isdir(cccl_include):
-    include_path.insert(0, cccl_include)
-
 # print cluster info using a kernel and store results in pinned memory
 code = r"""
 #include <cooperative_groups.h>
@@ -76,77 +59,100 @@
 }
 """
 
-dev = Device()
-arch = dev.compute_capability
-if arch < (9, 0):
-    print(
-        "this demo requires compute capability >= 9.0 (since thread block cluster is a hardware feature)",
-        file=sys.stderr,
-    )
-    sys.exit(1)
-arch = "".join(f"{i}" for i in arch)
-
-# prepare program & compile kernel
-dev.set_current()
-prog = Program(
-    code,
-    code_type="c++",
-    options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=include_path),
-)
-mod = prog.compile(target_type="cubin")
-kernel = mod.get_kernel("check_cluster_info")
-
-# prepare launch config
-grid = 4
-cluster = 2
-block = 32
-config = LaunchConfig(grid=grid, cluster=cluster, block=block)
-
-# allocate pinned memory to store kernel results
-pinned_mr = LegacyPinnedMemoryResource()
-element_size = np.dtype(np.uint32).itemsize
-grid_buffer = None
-cluster_buffer = None
-block_buffer = None
-
-try:
-    # allocate 3 uint32 values each for grid, cluster, and block dimensions
-    grid_buffer = pinned_mr.allocate(3 * element_size)
-    cluster_buffer = pinned_mr.allocate(3 * element_size)
-    block_buffer = pinned_mr.allocate(3 * element_size)
-
-    # create NumPy arrays from the pinned memory
-    grid_dims = np.from_dlpack(grid_buffer).view(dtype=np.uint32)
-    cluster_dims = np.from_dlpack(cluster_buffer).view(dtype=np.uint32)
-    block_dims = np.from_dlpack(block_buffer).view(dtype=np.uint32)
-
-    # initialize arrays to zero
-    grid_dims[:] = 0
-    cluster_dims[:] = 0
-    block_dims[:] = 0
-
-    # launch kernel on the default stream
-    launch(dev.default_stream, config, kernel, grid_buffer, cluster_buffer, block_buffer)
-    dev.sync()
-
-    # verify results
-    print("\nResults stored in pinned memory:")
-    print(f"Grid dimensions (blocks): {tuple(grid_dims)}")
-    print(f"Cluster dimensions: {tuple(cluster_dims)}")
-    print(f"Block dimensions (threads): {tuple(block_dims)}")
-
-    # verify that grid conversion worked correctly:
-    # LaunchConfig(grid=4, cluster=2) should result in 8 total blocks (4 clusters * 2 blocks/cluster)
-    expected_grid_blocks = grid * cluster  # 4 * 2 = 8
-    actual_grid_blocks = grid_dims[0]
-
-    assert actual_grid_blocks == expected_grid_blocks, (
-        f"Grid conversion failed: expected {expected_grid_blocks} total blocks, got {actual_grid_blocks}"
+
+def main():
+    if np.lib.NumpyVersion(np.__version__) < "2.2.5":
+        print("This example requires NumPy 2.2.5 or later", file=sys.stderr)
+        sys.exit(1)
+
+    cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME"))
+    if cuda_path is None:
+        print("this example requires a valid CUDA_PATH environment variable set", file=sys.stderr)
+        sys.exit(1)
+    cuda_include = os.path.join(cuda_path, "include")
+    if not os.path.isdir(cuda_include):
+        print(f"CUDA include directory not found: {cuda_include}", file=sys.stderr)
+        sys.exit(1)
+    include_path = [cuda_include]
+    cccl_include = os.path.join(cuda_include, "cccl")
+    if os.path.isdir(cccl_include):
+        include_path.insert(0, cccl_include)
+
+    dev = Device()
+    arch = dev.compute_capability
+    if arch < (9, 0):
+        print(
+            "this example requires compute capability >= 9.0 (since thread block cluster is a hardware feature)",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+    arch = "".join(f"{i}" for i in arch)
+
+    # prepare program & compile kernel
+    dev.set_current()
+    prog = Program(
+        code,
+        code_type="c++",
+        options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=include_path),
     )
-finally:
-    if block_buffer is not None:
-        block_buffer.close()
-    if cluster_buffer is not None:
-        cluster_buffer.close()
-    if grid_buffer is not None:
-        grid_buffer.close()
+    mod = prog.compile(target_type="cubin")
+    kernel = mod.get_kernel("check_cluster_info")
+
+    # prepare launch config
+    grid = 4
+    cluster = 2
+    block = 32
+    config = LaunchConfig(grid=grid, cluster=cluster, block=block)
+
+    # allocate pinned memory to store kernel results
+    pinned_mr = LegacyPinnedMemoryResource()
+    element_size = np.dtype(np.uint32).itemsize
+    grid_buffer = None
+    cluster_buffer = None
+    block_buffer = None
+
+    try:
+        # allocate 3 uint32 values each for grid, cluster, and block dimensions
+        grid_buffer = pinned_mr.allocate(3 * element_size)
+        cluster_buffer = pinned_mr.allocate(3 * element_size)
+        block_buffer = pinned_mr.allocate(3 * element_size)
+
+        # create NumPy arrays from the pinned memory
+        grid_dims = np.from_dlpack(grid_buffer).view(dtype=np.uint32)
+        cluster_dims = np.from_dlpack(cluster_buffer).view(dtype=np.uint32)
+        block_dims = np.from_dlpack(block_buffer).view(dtype=np.uint32)
+
+        # initialize arrays to zero
+        grid_dims[:] = 0
+        cluster_dims[:] = 0
+        block_dims[:] = 0
+
+        # launch kernel on the default stream
+        launch(dev.default_stream, config, kernel, grid_buffer, cluster_buffer, block_buffer)
+        dev.sync()
+
+        # verify results
+        print("\nResults stored in pinned memory:")
+        print(f"Grid dimensions (blocks): {tuple(grid_dims)}")
+        print(f"Cluster dimensions: {tuple(cluster_dims)}")
+        print(f"Block dimensions (threads): {tuple(block_dims)}")
+
+        # verify that grid conversion worked correctly:
+        # LaunchConfig(grid=4, cluster=2) should result in 8 total blocks (4 clusters * 2 blocks/cluster)
+        expected_grid_blocks = grid * cluster  # 4 * 2 = 8
+        actual_grid_blocks = grid_dims[0]
+
+        assert actual_grid_blocks == expected_grid_blocks, (
+            f"Grid conversion failed: expected {expected_grid_blocks} total blocks, got {actual_grid_blocks}"
+        )
+    finally:
+        if block_buffer is not None:
+            block_buffer.close()
+        if cluster_buffer is not None:
+            cluster_buffer.close()
+        if grid_buffer is not None:
+            grid_buffer.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py
index e648a3846f..3adf04882e 100644
--- a/cuda_core/examples/vector_add.py
+++ b/cuda_core/examples/vector_add.py
@@ -4,8 +4,8 @@
 
 # ################################################################################
 #
-# This demo illustrates how to use `cuda.core` to compile and launch a simple
-# vector addition kernel.
+# This example demonstrates how to use cuda.core to compile and launch a
+# simple vector addition kernel.
 #
 # ################################################################################
 
@@ -28,40 +28,45 @@
 """
 
 
-dev = Device()
-dev.set_current()
-stream = dev.create_stream()
+def main():
+    dev = Device()
+    dev.set_current()
+    stream = dev.create_stream()
 
-try:
-    # prepare program
-    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
-    prog = Program(code, code_type="c++", options=program_options)
-    mod = prog.compile("cubin", name_expressions=("vector_add<float>",))
+    try:
+        # prepare program
+        program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+        prog = Program(code, code_type="c++", options=program_options)
+        mod = prog.compile("cubin", name_expressions=("vector_add<float>",))
 
-    # run in single precision
-    kernel = mod.get_kernel("vector_add<float>")
-    dtype = cp.float32
+        # run in single precision
+        kernel = mod.get_kernel("vector_add<float>")
+        dtype = cp.float32
 
-    # prepare input/output
-    size = 50000
-    rng = cp.random.default_rng()
-    a = rng.random(size, dtype=dtype)
-    b = rng.random(size, dtype=dtype)
-    c = cp.empty_like(a)
+        # prepare input/output
+        size = 50000
+        rng = cp.random.default_rng()
+        a = rng.random(size, dtype=dtype)
+        b = rng.random(size, dtype=dtype)
+        c = cp.empty_like(a)
 
-    # cupy runs on a different stream from stream, so sync before accessing
-    dev.sync()
+        # cupy runs on a different stream from stream, so sync before accessing
+        dev.sync()
 
-    # prepare launch
-    block = 256
-    grid = (size + block - 1) // block
-    config = LaunchConfig(grid=grid, block=block)
+        # prepare launch
+        block = 256
+        grid = (size + block - 1) // block
+        config = LaunchConfig(grid=grid, block=block)
 
-    # launch kernel on stream
-    launch(stream, config, kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
-    stream.sync()
+        # launch kernel on stream
+        launch(stream, config, kernel, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
+        stream.sync()
 
-    # check result
-    assert cp.allclose(c, a + b)
-finally:
-    stream.close()
+        # check result
+        assert cp.allclose(c, a + b)
+    finally:
+        stream.close()
+
+
+if __name__ == "__main__":
+    main()