NVIDIA · cpcloud · Mar 17, 2026 · Mar 16, 2026 · Mar 17, 2026
diff --git a/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py b/cuda_bindings/examples/0_Introduction/clock_nvrtc_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates using the device clock for kernel timing via
+# NVRTC-compiled CUDA code.
+#
+# ################################################################################
+
 import platform
 
 import numpy as np

diff --git a/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py b/cuda_bindings/examples/0_Introduction/simpleCubemapTexture_test.py
@@ -1,6 +1,12 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates cubemap texture sampling and transformation.
+#
+# ################################################################################
+
 import ctypes
 import sys
 import time

diff --git a/cuda_bindings/examples/0_Introduction/simpleP2P_test.py b/cuda_bindings/examples/0_Introduction/simpleP2P_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates peer-to-peer memory access and data transfer
+# between multiple GPUs.
+#
+# ################################################################################
+
 import ctypes
 import platform
 import sys

diff --git a/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py b/cuda_bindings/examples/0_Introduction/simpleZeroCopy_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates vector addition using zero-copy (mapped) host
+# memory, allowing the GPU to access CPU memory directly.
+#
+# ################################################################################
+
 import ctypes
 import math
 import platform

diff --git a/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py b/cuda_bindings/examples/0_Introduction/systemWideAtomics_test.py
@@ -1,6 +1,12 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates system-wide atomic operations on managed memory.
+#
+# ################################################################################
+
 import ctypes
 import os
 import sys

diff --git a/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py b/cuda_bindings/examples/0_Introduction/vectorAddDrv_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates vector addition using the CUDA Driver API with
+# unified virtual addressing.
+#
+# ################################################################################
+
 import ctypes
 import math
 import sys

diff --git a/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py b/cuda_bindings/examples/0_Introduction/vectorAddMMAP_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates vector addition using multi-device memory
+# mapping (cuMemCreate, cuMemMap) with virtual address management.
+#
+# ################################################################################
+
 import ctypes
 import math
 import platform

diff --git a/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py b/cuda_bindings/examples/2_Concepts_and_Techniques/streamOrderedAllocation_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates stream-ordered memory allocation (cudaMallocAsync
+# / cudaFreeAsync) and memory pool release thresholds.
+#
+# ################################################################################
+
 import ctypes
 import math
 import platform

diff --git a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates asynchronous copy from global to shared memory
+# (memcpy_async) in matrix multiplication kernels.
+#
+# ################################################################################
+
 import ctypes
 import math
 import platform

diff --git a/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py b/cuda_bindings/examples/3_CUDA_Features/simpleCudaGraphs_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates CUDA Graphs for capture and replay of GPU
+# workloads, including manual graph construction and stream capture.
+#
+# ################################################################################
+
 import ctypes
 import random as rnd
 

diff --git a/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py b/cuda_bindings/examples/4_CUDA_Libraries/conjugateGradientMultiBlockCG_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates a conjugate gradient solver using cooperative
+# groups and multi-block grid synchronization.
+#
+# ################################################################################
+
 import ctypes
 import math
 import platform
@@ -350,3 +357,7 @@ def main():
     if math.sqrt(dot_result_local) >= tol:
         print("conjugateGradientMultiBlockCG FAILED", file=sys.stderr)
         sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_bindings/examples/extra/isoFDModelling_test.py b/cuda_bindings/examples/extra/isoFDModelling_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates isotropic finite-difference wave propagation
+# modelling across multiple GPUs with peer-to-peer halo exchange.
+#
+# ################################################################################
+
 import time
 
 import numpy as np

diff --git a/cuda_bindings/examples/extra/jit_program_test.py b/cuda_bindings/examples/extra/jit_program_test.py
@@ -1,6 +1,13 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+# ################################################################################
+#
+# This example demonstrates JIT compilation of CUDA kernels using NVRTC
+# and the Driver API (saxpy kernel).
+#
+# ################################################################################
+
 import ctypes
 
 import numpy as np

diff --git a/cuda_core/examples/cuda_graphs.py b/cuda_core/examples/cuda_graphs.py
@@ -4,9 +4,9 @@
 
 # ################################################################################
 #
-# This demo illustrates how to use CUDA graphs to capture and execute
-# multiple kernel launches with minimal overhead. The graph performs a
-# sequence of vector operations: add, multiply, and subtract.
+# This example demonstrates using CUDA graphs to capture and execute
+# multiple kernel launches with minimal overhead. The graph performs a
+# sequence of vector operations: add, multiply, and subtract.
 #
 # ################################################################################
 

diff --git a/cuda_core/examples/gl_interop_plasma.py b/cuda_core/examples/gl_interop_plasma.py
@@ -4,10 +4,12 @@
 
 # ################################################################################
 #
-# Real-time Plasma Effect -- CUDA/OpenGL Interop with cuda.core.GraphicsResource
+# This example demonstrates cuda.core.GraphicsResource for CUDA/OpenGL
+# interop: a CUDA kernel writes pixels directly into an OpenGL PBO with
+# zero copies through the CPU. Requires pyglet.
 #
 # ################################################################################
-#
+
 # What this example teaches
 # =========================
 # How to use cuda.core.GraphicsResource to let a CUDA kernel write pixels
@@ -18,12 +20,12 @@
 # Normally, getting CUDA results onto the screen would require:
 #   CUDA -> CPU memory -> OpenGL  (two slow copies across the PCIe bus)
 #
-# GraphicsResource eliminates the CPU round-trip.  The pixel data stays
+# GraphicsResource eliminates the CPU round-trip. The pixel data stays
 # on the GPU the entire time:
 #
 #   1. OpenGL allocates a PBO (Pixel Buffer Object) -- a raw GPU buffer.
 #   2. GraphicsResource.from_gl_buffer() registers that PBO with CUDA.
-#      Now both CUDA and OpenGL have access to the *same* GPU memory.
+#      Now both CUDA and OpenGL have access to the same GPU memory.
 #
 #   +----------------------+       +---------------------+
 #   |    OpenGL PBO        |       |  GraphicsResource   |
@@ -39,23 +41,21 @@
 #   4. glTexSubImage2D -- OpenGL copies PBO into a texture (GPU-to-GPU)
 #   5. draw            -- OpenGL renders the texture to the window
 #
-#   Why is there a copy in step 4?  OpenGL can only render from a
-#   "texture" object, not from a raw buffer. The glTexSubImage2D step
+#   Why is there a copy in step 4? OpenGL can only render from a
+#   texture object, not from a raw buffer. The glTexSubImage2D step
 #   copies the PBO bytes into a texture, but this happens entirely on
 #   the GPU and it is very fast. The big win from GraphicsResource is
-#   that we never copy pixels from the CPU to the GPU and then and back.
+#   that we never copy pixels from the GPU to the CPU and then back.
 #
 # What you should see
 # ===================
-# A window showing smoothly animated, colorful swirling patterns (a "plasma"
-# effect popular in the demoscene).  The window title shows the current FPS.
+# A window showing smoothly animated, colorful swirling patterns (a plasma
+# effect popular in the demoscene). The window title shows the current FPS.
 # Close the window or press Escape to exit.
 #
 # Requirements
 # ============
 #   pip install pyglet
-#
-# ################################################################################
 
 import ctypes
 import sys

diff --git a/cuda_core/examples/jit_lto_fractal.py b/cuda_core/examples/jit_lto_fractal.py
@@ -4,20 +4,11 @@
 
 # ################################################################################
 #
-# This demo illustrates:
-#
-#   1. How to use the JIT LTO feature provided by the Linker class to link multiple objects together
-#   2. That linking allows for libraries to modify workflows dynamically at runtime
-#
-# This demo mimics a relationship between a library and a user. The user's sole responsibility is to
-# provide device code that generates some art. Whereas the library is responsible for all steps involved in
-# setting up the device, launch configurations and arguments, as well as linking the provided device code.
-#
-# Two algorithms are implemented:
-#   1. A Mandelbrot set
-#   2. A Julia set
-#
-# The user can choose which algorithm to use at runtime and generate the resulting image.
+# This example demonstrates using the JIT LTO feature of the Linker class to
+# link multiple objects together, allowing libraries to modify workflows at
+# runtime. It mimics a library-user relationship: the user provides device
+# code that generates art (Mandelbrot or Julia set), while the library
+# handles device setup, launch configuration, and linking.
 #
 # ################################################################################