Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 0 additions & 14 deletions cuda_core/cuda/core/_launch_config.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@

from libc.string cimport memset

from cuda.core._device import Device
from cuda.core._utils.cuda_utils import (
CUDAError,
cast_to_3_tuple,
driver,
)
Expand Down Expand Up @@ -70,16 +68,7 @@ cdef class LaunchConfig:
self.grid = cast_to_3_tuple("LaunchConfig.grid", grid)
self.block = cast_to_3_tuple("LaunchConfig.block", block)

# FIXME: Calling Device() strictly speaking is not quite right; we should instead
# look up the device from stream. We probably need to defer the checks related to
# device compute capability or attributes.
# thread block clusters are supported starting H100
if cluster is not None:
cc = Device().compute_capability
if cc < (9, 0):
raise CUDAError(
f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
)
self.cluster = cast_to_3_tuple("LaunchConfig.cluster", cluster)
else:
self.cluster = None
Expand All @@ -92,9 +81,6 @@ cdef class LaunchConfig:

self.is_cooperative = is_cooperative

if self.is_cooperative and not Device().properties.cooperative_launch:
raise CUDAError("cooperative kernels are not supported on this device")

def _identity(self):
    """Return the values of every launch-config attribute as a tuple.

    Used as the canonical value-identity of this config (e.g. for
    equality/hash comparisons over ``_LAUNCH_CONFIG_ATTRS``).
    """
    values = [getattr(self, name) for name in _LAUNCH_CONFIG_ATTRS]
    return tuple(values)

Expand Down
13 changes: 13 additions & 0 deletions cuda_core/cuda/core/_launcher.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ from cuda.core._utils.cuda_utils cimport (
)
from cuda.core._module import Kernel
from cuda.core._stream import Stream
from cuda.core._utils.cuda_utils import CUDAError
from math import prod


Expand Down Expand Up @@ -52,14 +53,26 @@ def launch(stream: Stream | GraphBuilder | IsStreamType, config: LaunchConfig, k

drv_cfg = conf._to_native_launch_config()
drv_cfg.hStream = as_cu(s._h_stream)
if conf.cluster is not None:
_check_cluster_launch(conf, s)
if conf.is_cooperative:
_check_cooperative_launch(kernel, conf, s)
with nogil:
HANDLE_RETURN(cydriver.cuLaunchKernelEx(&drv_cfg, func_handle, args_ptr, NULL))


cdef _check_cluster_launch(config: LaunchConfig, stream: Stream):
    """Validate that the launch target supports thread block clusters.

    Thread block clusters are supported starting with compute capability
    9.0 (H100); raises CUDAError when the stream's device is older.
    """
    capability = stream.device.compute_capability
    if not (capability >= (9, 0)):
        raise CUDAError(
            f"thread block clusters are not supported on devices with compute capability < 9.0 (got {capability})"
        )


cdef _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream):
dev = stream.device
if not dev.properties.cooperative_launch:
raise CUDAError("cooperative kernels are not supported on this device")
num_sm = dev.properties.multiprocessor_count
max_grid_size = (
kernel.occupancy.max_active_blocks_per_multiprocessor(prod(config.block), config.shmem_size) * num_sm
Expand Down
91 changes: 41 additions & 50 deletions cuda_core/tests/test_launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
launch,
)
from cuda.core._memory._legacy import _SynchronousMemoryResource
from cuda.core._utils.cuda_utils import CUDAError
from cuda.core.typing import ObjectCodeFormatType, SourceCodeType


Expand Down Expand Up @@ -63,66 +62,58 @@ def test_launch_config_shmem_size():
assert config.shmem_size == 0


def test_launch_config_cluster_grid_conversion():
    """Test that LaunchConfig preserves original grid values and conversion happens in native config."""
    # Test case 1: 1D - Issue #867 example
    config = LaunchConfig(grid=4, cluster=2, block=32)
    assert config.grid == (4, 1, 1), f"Expected (4, 1, 1), got {config.grid}"
    assert config.cluster == (2, 1, 1), f"Expected (2, 1, 1), got {config.cluster}"
    assert config.block == (32, 1, 1), f"Expected (32, 1, 1), got {config.block}"

    # Test case 2: 2D grid and cluster
    config = LaunchConfig(grid=(2, 3), cluster=(2, 2), block=32)
    assert config.grid == (2, 3, 1), f"Expected (2, 3, 1), got {config.grid}"
    assert config.cluster == (2, 2, 1), f"Expected (2, 2, 1), got {config.cluster}"

    # Test case 3: 3D full specification
    config = LaunchConfig(grid=(2, 2, 2), cluster=(3, 3, 3), block=(8, 8, 8))
    assert config.grid == (2, 2, 2), f"Expected (2, 2, 2), got {config.grid}"
    assert config.cluster == (3, 3, 3), f"Expected (3, 3, 3), got {config.cluster}"

    # Test case 4: Identity case
    config = LaunchConfig(grid=1, cluster=1, block=32)
    assert config.grid == (1, 1, 1), f"Expected (1, 1, 1), got {config.grid}"

    # Test case 5: No cluster (should not convert grid)
    config = LaunchConfig(grid=4, block=32)
    assert config.grid == (4, 1, 1), f"Expected (4, 1, 1), got {config.grid}"
    assert config.cluster is None


def test_launch_config_native_conversion(init_cuda):
    """Test that _to_native_launch_config correctly converts grid from cluster units to block units."""
    from cuda.core._launch_config import _to_native_launch_config

    # Test case 1: 1D - Issue #867 example
    config = LaunchConfig(grid=4, cluster=2, block=32)
    native_config = _to_native_launch_config(config)
    assert native_config.gridDimX == 8, f"Expected gridDimX=8, got {native_config.gridDimX}"
    assert native_config.gridDimY == 1, f"Expected gridDimY=1, got {native_config.gridDimY}"
    assert native_config.gridDimZ == 1, f"Expected gridDimZ=1, got {native_config.gridDimZ}"

    # Test case 2: 2D grid and cluster
    config = LaunchConfig(grid=(2, 3), cluster=(2, 2), block=32)
    native_config = _to_native_launch_config(config)
    assert native_config.gridDimX == 4, f"Expected gridDimX=4, got {native_config.gridDimX}"
    assert native_config.gridDimY == 6, f"Expected gridDimY=6, got {native_config.gridDimY}"
    assert native_config.gridDimZ == 1, f"Expected gridDimZ=1, got {native_config.gridDimZ}"

    # Test case 3: No cluster (should not convert grid)
    config = LaunchConfig(grid=4, block=32)
    native_config = _to_native_launch_config(config)
    assert native_config.gridDimX == 4, f"Expected gridDimX=4, got {native_config.gridDimX}"
    assert native_config.gridDimY == 1, f"Expected gridDimY=1, got {native_config.gridDimY}"
    assert native_config.gridDimZ == 1, f"Expected gridDimZ=1, got {native_config.gridDimZ}"


def test_launch_invalid_values(init_cuda):
Expand Down
Loading