diff --git a/cuda_core/cuda/core/_launch_config.pyx b/cuda_core/cuda/core/_launch_config.pyx index b1a9a96cb2..03e76dbbe3 100644 --- a/cuda_core/cuda/core/_launch_config.pyx +++ b/cuda_core/cuda/core/_launch_config.pyx @@ -4,9 +4,7 @@ from libc.string cimport memset -from cuda.core._device import Device from cuda.core._utils.cuda_utils import ( - CUDAError, cast_to_3_tuple, driver, ) @@ -70,16 +68,7 @@ cdef class LaunchConfig: self.grid = cast_to_3_tuple("LaunchConfig.grid", grid) self.block = cast_to_3_tuple("LaunchConfig.block", block) - # FIXME: Calling Device() strictly speaking is not quite right; we should instead - # look up the device from stream. We probably need to defer the checks related to - # device compute capability or attributes. - # thread block clusters are supported starting H100 if cluster is not None: - cc = Device().compute_capability - if cc < (9, 0): - raise CUDAError( - f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})" - ) self.cluster = cast_to_3_tuple("LaunchConfig.cluster", cluster) else: self.cluster = None @@ -92,9 +81,6 @@ cdef class LaunchConfig: self.is_cooperative = is_cooperative - if self.is_cooperative and not Device().properties.cooperative_launch: - raise CUDAError("cooperative kernels are not supported on this device") - def _identity(self): return tuple(getattr(self, attr) for attr in _LAUNCH_CONFIG_ATTRS) diff --git a/cuda_core/cuda/core/_launcher.pyx b/cuda_core/cuda/core/_launcher.pyx index e6a07ad28e..3148c0ef89 100644 --- a/cuda_core/cuda/core/_launcher.pyx +++ b/cuda_core/cuda/core/_launcher.pyx @@ -17,6 +17,7 @@ from cuda.core._utils.cuda_utils cimport ( ) from cuda.core._module import Kernel from cuda.core._stream import Stream +from cuda.core._utils.cuda_utils import CUDAError from math import prod @@ -52,14 +53,26 @@ def launch(stream: Stream | GraphBuilder | IsStreamType, config: LaunchConfig, k drv_cfg = conf._to_native_launch_config() drv_cfg.hStream = as_cu(s._h_stream) + if conf.cluster is not None: + _check_cluster_launch(conf, s) if conf.is_cooperative: _check_cooperative_launch(kernel, conf, s) with nogil: HANDLE_RETURN(cydriver.cuLaunchKernelEx(&drv_cfg, func_handle, args_ptr, NULL)) +cdef _check_cluster_launch(config: LaunchConfig, stream: Stream): + cc = stream.device.compute_capability + if cc < (9, 0): + raise CUDAError( + f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})" + ) + + cdef _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream): dev = stream.device + if not dev.properties.cooperative_launch: + raise CUDAError("cooperative kernels are not supported on this device") num_sm = dev.properties.multiprocessor_count max_grid_size = ( kernel.occupancy.max_active_blocks_per_multiprocessor(prod(config.block), config.shmem_size) * num_sm diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index f4858cdaef..6b5ed64998 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -25,7 +25,6 @@ launch, ) from cuda.core._memory._legacy import _SynchronousMemoryResource -from cuda.core._utils.cuda_utils import CUDAError from cuda.core.typing import ObjectCodeFormatType, SourceCodeType @@ -63,66 +62,58 @@ def test_launch_config_shmem_size(): assert config.shmem_size == 0 -def test_launch_config_cluster_grid_conversion(init_cuda): +def test_launch_config_cluster_grid_conversion(): """Test that LaunchConfig preserves original grid values and conversion happens in native config.""" - try: - # Test case 1: 1D - Issue #867 example - config = LaunchConfig(grid=4, cluster=2, block=32) - assert config.grid == (4, 1, 1), f"Expected (4, 1, 1), got {config.grid}" - assert config.cluster == (2, 1, 1), f"Expected (2, 1, 1), got {config.cluster}" - assert config.block == (32, 1, 1), f"Expected (32, 1, 1), got {config.block}" + # Test case 1: 1D - Issue #867 example + config = LaunchConfig(grid=4, cluster=2, block=32) + assert config.grid == (4, 1, 1), f"Expected (4, 1, 1), got {config.grid}" + assert config.cluster == (2, 1, 1), f"Expected (2, 1, 1), got {config.cluster}" + assert config.block == (32, 1, 1), f"Expected (32, 1, 1), got {config.block}" - # Test case 2: 2D grid and cluster - config = LaunchConfig(grid=(2, 3), cluster=(2, 2), block=32) - assert config.grid == (2, 3, 1), f"Expected (2, 3, 1), got {config.grid}" - assert config.cluster == (2, 2, 1), f"Expected (2, 2, 1), got {config.cluster}" + # Test case 2: 2D grid and cluster + config = LaunchConfig(grid=(2, 3), cluster=(2, 2), block=32) + assert config.grid == (2, 3, 1), f"Expected (2, 3, 1), got {config.grid}" + assert config.cluster == (2, 2, 1), f"Expected (2, 2, 1), got {config.cluster}" - # Test case 3: 3D full specification - config = LaunchConfig(grid=(2, 2, 2), cluster=(3, 3, 3), block=(8, 8, 8)) - assert config.grid == (2, 2, 2), f"Expected (2, 2, 2), got {config.grid}" - assert config.cluster == (3, 3, 3), f"Expected (3, 3, 3), got {config.cluster}" + # Test case 3: 3D full specification + config = LaunchConfig(grid=(2, 2, 2), cluster=(3, 3, 3), block=(8, 8, 8)) + assert config.grid == (2, 2, 2), f"Expected (2, 2, 2), got {config.grid}" + assert config.cluster == (3, 3, 3), f"Expected (3, 3, 3), got {config.cluster}" - # Test case 4: Identity case - config = LaunchConfig(grid=1, cluster=1, block=32) - assert config.grid == (1, 1, 1), f"Expected (1, 1, 1), got {config.grid}" + # Test case 4: Identity case + config = LaunchConfig(grid=1, cluster=1, block=32) + assert config.grid == (1, 1, 1), f"Expected (1, 1, 1), got {config.grid}" - # Test case 5: No cluster (should not convert grid) - config = LaunchConfig(grid=4, block=32) - assert config.grid == (4, 1, 1), f"Expected (4, 1, 1), got {config.grid}" - assert config.cluster is None - - except CUDAError: - pytest.skip("Driver or GPU not new enough for thread block clusters") + # Test case 5: No cluster (should not convert grid) + config = LaunchConfig(grid=4, block=32) + assert config.grid == (4, 1, 1), f"Expected (4, 1, 1), got {config.grid}" + assert config.cluster is None def test_launch_config_native_conversion(init_cuda): """Test that _to_native_launch_config correctly converts grid from cluster units to block units.""" from cuda.core._launch_config import _to_native_launch_config - try: - # Test case 1: 1D - Issue #867 example - config = LaunchConfig(grid=4, cluster=2, block=32) - native_config = _to_native_launch_config(config) - assert native_config.gridDimX == 8, f"Expected gridDimX=8, got {native_config.gridDimX}" - assert native_config.gridDimY == 1, f"Expected gridDimY=1, got {native_config.gridDimY}" - assert native_config.gridDimZ == 1, f"Expected gridDimZ=1, got {native_config.gridDimZ}" - - # Test case 2: 2D grid and cluster - config = LaunchConfig(grid=(2, 3), cluster=(2, 2), block=32) - native_config = _to_native_launch_config(config) - assert native_config.gridDimX == 4, f"Expected gridDimX=4, got {native_config.gridDimX}" - assert native_config.gridDimY == 6, f"Expected gridDimY=6, got {native_config.gridDimY}" - assert native_config.gridDimZ == 1, f"Expected gridDimZ=1, got {native_config.gridDimZ}" - - # Test case 3: No cluster (should not convert grid) - config = LaunchConfig(grid=4, block=32) - native_config = _to_native_launch_config(config) - assert native_config.gridDimX == 4, f"Expected gridDimX=4, got {native_config.gridDimX}" - assert native_config.gridDimY == 1, f"Expected gridDimY=1, got {native_config.gridDimY}" - assert native_config.gridDimZ == 1, f"Expected gridDimZ=1, got {native_config.gridDimZ}" - - except CUDAError: - pytest.skip("Driver or GPU not new enough for thread block clusters") + # Test case 1: 1D - Issue #867 example + config = LaunchConfig(grid=4, cluster=2, block=32) + native_config = _to_native_launch_config(config) + assert native_config.gridDimX == 8, f"Expected gridDimX=8, got {native_config.gridDimX}" + assert native_config.gridDimY == 1, f"Expected gridDimY=1, got {native_config.gridDimY}" + assert native_config.gridDimZ == 1, f"Expected gridDimZ=1, got {native_config.gridDimZ}" + + # Test case 2: 2D grid and cluster + config = LaunchConfig(grid=(2, 3), cluster=(2, 2), block=32) + native_config = _to_native_launch_config(config) + assert native_config.gridDimX == 4, f"Expected gridDimX=4, got {native_config.gridDimX}" + assert native_config.gridDimY == 6, f"Expected gridDimY=6, got {native_config.gridDimY}" + assert native_config.gridDimZ == 1, f"Expected gridDimZ=1, got {native_config.gridDimZ}" + + # Test case 3: No cluster (should not convert grid) + config = LaunchConfig(grid=4, block=32) + native_config = _to_native_launch_config(config) + assert native_config.gridDimX == 4, f"Expected gridDimX=4, got {native_config.gridDimX}" + assert native_config.gridDimY == 1, f"Expected gridDimY=1, got {native_config.gridDimY}" + assert native_config.gridDimZ == 1, f"Expected gridDimZ=1, got {native_config.gridDimZ}" def test_launch_invalid_values(init_cuda):