diff --git a/cuda_core/cuda/core/_launch_config.pxd b/cuda_core/cuda/core/_launch_config.pxd
index 112007b9cf..9a5c0854b8 100644
--- a/cuda_core/cuda/core/_launch_config.pxd
+++ b/cuda_core/cuda/core/_launch_config.pxd
@@ -10,13 +10,15 @@ from cuda.bindings cimport cydriver

 cdef class LaunchConfig:
     """Customizable launch options."""
     cdef:
-        public tuple grid
-        public tuple cluster
-        public tuple block
-        public int shmem_size
-        public bint is_cooperative
+        readonly tuple grid
+        readonly tuple cluster
+        readonly tuple block
+        readonly int shmem_size
+        readonly bint is_cooperative
         vector[cydriver.CUlaunchAttribute] _attrs
+        cydriver.CUlaunchConfig _cached_drv_cfg
+        readonly bint _cache_valid
         object __weakref__

     cdef cydriver.CUlaunchConfig _to_native_launch_config(self)
diff --git a/cuda_core/cuda/core/_launch_config.pyx b/cuda_core/cuda/core/_launch_config.pyx
index b1a9a96cb2..328072a9de 100644
--- a/cuda_core/cuda/core/_launch_config.pyx
+++ b/cuda_core/cuda/core/_launch_config.pyx
@@ -91,6 +91,7 @@ cdef class LaunchConfig:
         self.shmem_size = shmem_size
         self.is_cooperative = is_cooperative
+        self._cache_valid = False

         if self.is_cooperative and not Device().properties.cooperative_launch:
             raise CUDAError("cooperative kernels are not supported on this device")

@@ -112,19 +113,19 @@
         return hash(self._identity())

     cdef cydriver.CUlaunchConfig _to_native_launch_config(self):
+        if self._cache_valid:
+            return self._cached_drv_cfg
+
         cdef cydriver.CUlaunchConfig drv_cfg
         cdef cydriver.CUlaunchAttribute attr
         memset(&drv_cfg, 0, sizeof(drv_cfg))
         self._attrs.resize(0)

-        # Handle grid dimensions and cluster configuration
         if self.cluster is not None:
-            # Convert grid from cluster units to block units
             drv_cfg.gridDimX = self.grid[0] * self.cluster[0]
             drv_cfg.gridDimY = self.grid[1] * self.cluster[1]
             drv_cfg.gridDimZ = self.grid[2] * self.cluster[2]

-            # Set up cluster attribute
             attr.id = cydriver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
             attr.value.clusterDim.x, attr.value.clusterDim.y, attr.value.clusterDim.z = self.cluster
             self._attrs.push_back(attr)
@@ -142,10 +143,18 @@
         drv_cfg.numAttrs = self._attrs.size()
         drv_cfg.attrs = self._attrs.data()
+        # Cache the result. attrs points into self._attrs, which is stable
+        # as long as _attrs is never resized after this point (guaranteed
+        # because we skip resize(0) on the fast path above).
+        self._cached_drv_cfg = drv_cfg
+        self._cache_valid = True
         return drv_cfg


-# TODO: once all modules are cythonized, this function can be dropped in favor of the cdef method above
+# TODO: once all modules are cythonized, this function can be dropped in favor of the cdef method above.
+# NOTE: unlike the cdef method above, this cpdef wrapper creates Python driver objects on every call
+# and does NOT use the _cache_valid / _cached_drv_cfg cache. The cache is only in the cdef method,
+# which is called from _launcher.pyx and _module.pyx.
 cpdef object _to_native_launch_config(LaunchConfig config):
     """Convert LaunchConfig to native driver CUlaunchConfig.
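The fast path added above is a compute-once cache: it is sound only because every LaunchConfig field is now read-only, so the inputs to the conversion can never change after construction, and because _attrs is never resized again once drv_cfg.attrs points into it. A minimal pure-Python sketch of the same pattern follows; the Config class, its field names, and to_native are illustrative stand-ins, not cuda.core API:

    class Config:
        """Immutable value type with a compute-once conversion cache."""

        __slots__ = ("_grid", "_block", "_cached", "_cache_valid")

        def __init__(self, grid, block):
            self._grid = tuple(grid)
            self._block = tuple(block)
            self._cached = None
            self._cache_valid = False  # mirrors LaunchConfig._cache_valid

        @property
        def grid(self):  # read-only: no setter, so "cfg.grid = ..." raises AttributeError
            return self._grid

        @property
        def block(self):
            return self._block

        def to_native(self):
            if self._cache_valid:  # fast path: the conversion runs at most once
                return self._cached
            # Stand-in for filling a CUlaunchConfig struct.
            self._cached = {"gridDim": self._grid, "blockDim": self._block}
            self._cache_valid = True
            return self._cached

No invalidation hook is needed precisely because the fields are immutable; a mutable config would have to reset _cache_valid in every setter.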
diff --git a/cuda_core/docs/source/release/1.0.1-notes.rst b/cuda_core/docs/source/release/1.0.1-notes.rst
index b3cc3b4496..9566ae5afe 100644
--- a/cuda_core/docs/source/release/1.0.1-notes.rst
+++ b/cuda_core/docs/source/release/1.0.1-notes.rst
@@ -7,6 +7,17 @@
 =================================

+Breaking changes
+----------------
+
+- :class:`LaunchConfig` fields (``grid``, ``block``, ``cluster``,
+  ``shmem_size``, ``is_cooperative``) are now read-only after construction.
+  Assigning to them from Python raises ``AttributeError``. Mutation was
+  previously possible but was never intended, given that :class:`LaunchConfig`
+  is a hashable value type. Code that mutates a config after creation should
+  construct a new :class:`LaunchConfig` instead.
+
+
 Fixes and enhancements
 ----------------------
diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py
index f4858cdaef..e2d5ac7e2d 100644
--- a/cuda_core/tests/test_launcher.py
+++ b/cuda_core/tests/test_launcher.py
@@ -63,6 +63,76 @@ def test_launch_config_shmem_size():
     assert config.shmem_size == 0


+def test_launch_config_fields_are_readonly():
+    config = LaunchConfig(grid=(2, 2, 2), block=(4, 4, 4), shmem_size=256, is_cooperative=False)
+    typed_values = {
+        "grid": (1, 1, 1),
+        "block": (1, 1, 1),
+        "cluster": (1, 1, 1),
+        "shmem_size": 0,
+        "is_cooperative": False,
+    }
+    for field, value in typed_values.items():
+        with pytest.raises(AttributeError):
+            setattr(config, field, value)
+
+
+def test_launch_config_native_conversion_stable(init_cuda):
+    """The cpdef _to_native_launch_config wrapper returns consistent values across calls."""
+    from cuda.core._launch_config import _to_native_launch_config
+
+    config = LaunchConfig(grid=(4, 1, 1), block=(32, 1, 1))
+    first = _to_native_launch_config(config)
+    second = _to_native_launch_config(config)
+    assert first.gridDimX == second.gridDimX == 4
+    assert first.blockDimX == second.blockDimX == 32
+    assert first.sharedMemBytes == second.sharedMemBytes == 0
+    assert first.numAttrs == second.numAttrs == 0
+
+
+def test_launch_config_native_conversion_stable_cooperative(init_cuda):
+    """The cpdef _to_native_launch_config wrapper returns consistent attrs for cooperative configs."""
+    from cuda.core._launch_config import _to_native_launch_config
+
+    try:
+        config = LaunchConfig(grid=1, block=1, is_cooperative=True)
+    except CUDAError:
+        pytest.skip("Device does not support cooperative launches")
+    first = _to_native_launch_config(config)
+    second = _to_native_launch_config(config)
+    assert first.numAttrs == second.numAttrs == 1
+
+
+def test_launch_config_native_conversion_stable_cluster(init_cuda):
+    """The cpdef _to_native_launch_config wrapper returns consistent values for cluster configs."""
+    from cuda.core._launch_config import _to_native_launch_config
+
+    try:
+        config = LaunchConfig(grid=2, cluster=2, block=32)
+    except CUDAError:
+        pytest.skip("Device does not support thread block clusters")
+    first = _to_native_launch_config(config)
+    second = _to_native_launch_config(config)
+    assert first.gridDimX == second.gridDimX == 4  # 2 clusters * 2 blocks/cluster
+    assert first.numAttrs == second.numAttrs == 1  # cluster dimension attribute
+
+
+def test_launch_config_cdef_cache_populated_by_launch(init_cuda):
+    """The cdef _to_native_launch_config cache (_cache_valid) is set after launch() and persists."""
+    code = 'extern "C" __global__ void noop() {}'
+    program = Program(code, SourceCodeType.CXX)
+    ker = program.compile(ObjectCodeFormatType.CUBIN).get_kernel("noop")
+    stream = Device().create_stream()
+
+    config = LaunchConfig(grid=1, block=1)
+    assert not config._cache_valid
+    launch(stream, config, ker)
+    assert config._cache_valid
+    # Second launch reuses the cache (fast path); _cache_valid stays True
+    launch(stream, config, ker)
+    assert config._cache_valid
+
+
 def test_launch_config_cluster_grid_conversion(init_cuda):
     """Test that LaunchConfig preserves original grid values and conversion happens in native config."""
     try:
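For downstream code, the read-only fields documented in the release notes above mean a config is rebuilt rather than patched in place. A hedged sketch of the migration, assuming LaunchConfig is importable from cuda.core as laid out in this tree and that cluster defaults to None when not given:

    from cuda.core import LaunchConfig

    config = LaunchConfig(grid=(4, 1, 1), block=(32, 1, 1), shmem_size=0)

    # Previously a legal in-place tweak; now raises AttributeError:
    #   config.shmem_size = 1024

    # Rebuild instead, copying the fields that stay the same:
    config = LaunchConfig(
        grid=config.grid,
        block=config.block,
        cluster=config.cluster,
        shmem_size=1024,
        is_cooperative=config.is_cooperative,
    )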