Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 0 additions & 14 deletions cuda_core/cuda/core/_launch_config.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@

from libc.string cimport memset

from cuda.core._device import Device
from cuda.core._utils.cuda_utils import (
CUDAError,
cast_to_3_tuple,
driver,
)
Expand Down Expand Up @@ -70,16 +68,7 @@ cdef class LaunchConfig:
self.grid = cast_to_3_tuple("LaunchConfig.grid", grid)
self.block = cast_to_3_tuple("LaunchConfig.block", block)

# FIXME: Calling Device() strictly speaking is not quite right; we should instead
# look up the device from stream. We probably need to defer the checks related to
# device compute capability or attributes.
# thread block clusters are supported starting H100
if cluster is not None:
cc = Device().compute_capability
if cc < (9, 0):
raise CUDAError(
f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
)
self.cluster = cast_to_3_tuple("LaunchConfig.cluster", cluster)
else:
self.cluster = None
Expand All @@ -92,9 +81,6 @@ cdef class LaunchConfig:

self.is_cooperative = is_cooperative

if self.is_cooperative and not Device().properties.cooperative_launch:
raise CUDAError("cooperative kernels are not supported on this device")

def _identity(self):
    """Return the values of every launch-config attribute as a tuple.

    Used as the canonical value-identity of this config (e.g. for
    equality/hash comparisons over ``_LAUNCH_CONFIG_ATTRS``).
    """
    values = [getattr(self, name) for name in _LAUNCH_CONFIG_ATTRS]
    return tuple(values)

Expand Down
13 changes: 13 additions & 0 deletions cuda_core/cuda/core/_launcher.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ from cuda.core._utils.cuda_utils cimport (
)
from cuda.core._module import Kernel
from cuda.core._stream import Stream
from cuda.core._utils.cuda_utils import CUDAError
from math import prod


Expand Down Expand Up @@ -52,14 +53,26 @@ def launch(stream: Stream | GraphBuilder | IsStreamType, config: LaunchConfig, k

drv_cfg = conf._to_native_launch_config()
drv_cfg.hStream = as_cu(s._h_stream)
if conf.cluster is not None:
_check_cluster_launch(conf, s)
if conf.is_cooperative:
_check_cooperative_launch(kernel, conf, s)
with nogil:
HANDLE_RETURN(cydriver.cuLaunchKernelEx(&drv_cfg, func_handle, args_ptr, NULL))


cdef _check_cluster_launch(config: LaunchConfig, stream: Stream):
    """Validate that the launch target supports thread block clusters.

    Thread block clusters are supported starting with compute capability
    9.0 (H100); raises CUDAError when the stream's device is older.
    """
    capability = stream.device.compute_capability
    if not (capability >= (9, 0)):
        raise CUDAError(
            f"thread block clusters are not supported on devices with compute capability < 9.0 (got {capability})"
        )


cdef _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream):
dev = stream.device
if not dev.properties.cooperative_launch:
raise CUDAError("cooperative kernels are not supported on this device")
num_sm = dev.properties.multiprocessor_count
max_grid_size = (
kernel.occupancy.max_active_blocks_per_multiprocessor(prod(config.block), config.shmem_size) * num_sm
Expand Down
91 changes: 41 additions & 50 deletions cuda_core/tests/test_launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
launch,
)
from cuda.core._memory._legacy import _SynchronousMemoryResource
from cuda.core._utils.cuda_utils import CUDAError
from cuda.core.typing import ObjectCodeFormatType, SourceCodeType


Expand Down Expand Up @@ -63,66 +62,58 @@ def test_launch_config_shmem_size():
assert config.shmem_size == 0


def test_launch_config_cluster_grid_conversion():
    """Test that LaunchConfig preserves original grid values and conversion happens in native config."""
    # Test case 1: 1D - Issue #867 example
    config = LaunchConfig(grid=4, cluster=2, block=32)
    assert config.grid == (4, 1, 1), f"Expected (4, 1, 1), got {config.grid}"
    assert config.cluster == (2, 1, 1), f"Expected (2, 1, 1), got {config.cluster}"
    assert config.block == (32, 1, 1), f"Expected (32, 1, 1), got {config.block}"

    # Test case 2: 2D grid and cluster
    config = LaunchConfig(grid=(2, 3), cluster=(2, 2), block=32)
    assert config.grid == (2, 3, 1), f"Expected (2, 3, 1), got {config.grid}"
    assert config.cluster == (2, 2, 1), f"Expected (2, 2, 1), got {config.cluster}"

    # Test case 3: 3D full specification
    config = LaunchConfig(grid=(2, 2, 2), cluster=(3, 3, 3), block=(8, 8, 8))
    assert config.grid == (2, 2, 2), f"Expected (2, 2, 2), got {config.grid}"
    assert config.cluster == (3, 3, 3), f"Expected (3, 3, 3), got {config.cluster}"

    # Test case 4: Identity case
    config = LaunchConfig(grid=1, cluster=1, block=32)
    assert config.grid == (1, 1, 1), f"Expected (1, 1, 1), got {config.grid}"

    # Test case 5: No cluster (should not convert grid)
    config = LaunchConfig(grid=4, block=32)
    assert config.grid == (4, 1, 1), f"Expected (4, 1, 1), got {config.grid}"
    assert config.cluster is None


def test_launch_config_native_conversion(init_cuda):
    """Test that _to_native_launch_config correctly converts grid from cluster units to block units."""
    from cuda.core._launch_config import _to_native_launch_config

    # Test case 1: 1D - Issue #867 example
    config = LaunchConfig(grid=4, cluster=2, block=32)
    native_config = _to_native_launch_config(config)
    assert native_config.gridDimX == 8, f"Expected gridDimX=8, got {native_config.gridDimX}"
    assert native_config.gridDimY == 1, f"Expected gridDimY=1, got {native_config.gridDimY}"
    assert native_config.gridDimZ == 1, f"Expected gridDimZ=1, got {native_config.gridDimZ}"

    # Test case 2: 2D grid and cluster
    config = LaunchConfig(grid=(2, 3), cluster=(2, 2), block=32)
    native_config = _to_native_launch_config(config)
    assert native_config.gridDimX == 4, f"Expected gridDimX=4, got {native_config.gridDimX}"
    assert native_config.gridDimY == 6, f"Expected gridDimY=6, got {native_config.gridDimY}"
    assert native_config.gridDimZ == 1, f"Expected gridDimZ=1, got {native_config.gridDimZ}"

    # Test case 3: No cluster (should not convert grid)
    config = LaunchConfig(grid=4, block=32)
    native_config = _to_native_launch_config(config)
    assert native_config.gridDimX == 4, f"Expected gridDimX=4, got {native_config.gridDimX}"
    assert native_config.gridDimY == 1, f"Expected gridDimY=1, got {native_config.gridDimY}"
    assert native_config.gridDimZ == 1, f"Expected gridDimZ=1, got {native_config.gridDimZ}"


def test_launch_invalid_values(init_cuda):
Expand Down
Loading