From a1b9c090710a8e654b8714e6b118bdbc839cbae4 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 12 May 2026 13:44:51 -0400 Subject: [PATCH 1/6] Add infrastructure for type-checking cuda_core --- .pre-commit-config.yaml | 23 +++++++++++++++++++++-- .spdx-ignore | 3 +++ cuda_core/MANIFEST.in | 3 ++- cuda_core/cuda/core/py.typed | 0 cuda_core/pyproject.toml | 12 +++++++++++- 5 files changed, 37 insertions(+), 4 deletions(-) create mode 100644 cuda_core/cuda/core/py.typed diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 859e298bc49..d3a6af13f53 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,8 +19,9 @@ repos: hooks: - id: ruff-check args: [--fix, --show-fixes] - exclude: ^cuda_bindings/cuda/bindings/_internal/_fast_enum\.py$ + exclude: (^cuda_bindings/cuda/bindings/_internal/_fast_enum\.py$)|(.*\.pyi$) - id: ruff-format + exclude: .*\.pyi$ - repo: local hooks: @@ -42,6 +43,15 @@ repos: language: system files: '^.*/docs/source/.*\.md$' + - id: stubgen-pyx-cuda-core + name: Generate .pyi stubs for cuda_core + entry: stubgen-pyx cuda_core/cuda --continue-on-error --include-private + language: python + files: ^cuda_core/cuda/.*\.(pyx|pxd)$ + pass_filenames: false + additional_dependencies: + - stubgen-pyx==0.2.6 + # Standard hooks - repo: https://github.com/pre-commit/pre-commit-hooks rev: "3e8a8703264a2f4a69428a0aa4dcb512790b2c8c" # frozen: v6.0.0 @@ -56,7 +66,7 @@ repos: - id: check-yaml - id: debug-statements - id: end-of-file-fixer - exclude: &gen_exclude '^(?:cuda_python/README\.md|cuda_bindings/cuda/bindings/.*\.in?|cuda_bindings/docs/source/module/.*\.rst?)$' + exclude: &gen_exclude '^(?:cuda_python/README\.md|cuda_bindings/cuda/bindings/.*\.in?|cuda_bindings/docs/source/module/.*\.rst?|.*\.pyi)$' - id: mixed-line-ending - id: trailing-whitespace exclude: | @@ -79,9 +89,18 @@ repos: rev: 8e5c80792e2ec0c87804d8ef915bf35e2caea6da # frozen: v1.20.0 hooks: - id: mypy + alias: mypy-pathfinder name: mypy-pathfinder 
files: ^cuda_pathfinder/cuda/.*\.py$ # Exclude tests directory args: [--config-file=cuda_pathfinder/pyproject.toml] + - id: mypy + alias: mypy-cuda-core + name: mypy-cuda-core + files: ^cuda_core/cuda/.*\.(py|pyi)$ + pass_filenames: false + args: [--config-file=cuda_core/pyproject.toml, cuda_core/cuda/core] + additional_dependencies: + - numpy - repo: https://github.com/rhysd/actionlint rev: "914e7df21a07ef503a81201c76d2b11c789d3fca" # frozen: v1.7.12 diff --git a/.spdx-ignore b/.spdx-ignore index 866b2274e06..3e2cca9446d 100644 --- a/.spdx-ignore +++ b/.spdx-ignore @@ -13,4 +13,7 @@ cuda_core/cuda/core/_include/dlpack.h cuda_core/cuda/core/_include/aoti_shim.h cuda_core/cuda/core/_include/aoti_shim.def +# Generated by stubgen-pyx; regenerated on every commit so a header would be lost +cuda_core/cuda/**/*.pyi + qa/ctk-next.drawio.svg diff --git a/cuda_core/MANIFEST.in b/cuda_core/MANIFEST.in index f476ae8ef2c..9e86f0a33bb 100644 --- a/cuda_core/MANIFEST.in +++ b/cuda_core/MANIFEST.in @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -recursive-include cuda/core *.pyx *.pxd *.pxi +recursive-include cuda/core *.pyx *.pxd *.pxi *.pyi recursive-include cuda/core/_cpp *.cpp *.hpp recursive-include cuda/core/_include *.h *.hpp +include cuda/core/py.typed diff --git a/cuda_core/cuda/core/py.typed b/cuda_core/cuda/core/py.typed new file mode 100644 index 00000000000..e69de29bb2d diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index 9c2d36ea144..10bc6dabb8b 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -94,7 +94,7 @@ include = ["cuda.core*"] include-package-data = false [tool.setuptools.package-data] -"*" = ["*.pxd"] +"*" = ["*.pxd", "*.pyi", "py.typed"] "cuda.core._include" = ["*.h", "*.hpp"] "cuda.core._cpp" = ["*.hpp"] @@ -108,6 +108,16 @@ version_file = "cuda/core/_version.py" tag_regex = "^cuda-core-(?P<version>v\\d+\\.\\d+\\.\\d+(?:[ab]\\d+)?)" git_describe_command = ["git", "describe", "--dirty", "--tags", "--long", "--match",
"cuda-core-v*[0-9]*"] +[tool.mypy] +# Best to use minimum supported version here, so we don't accidentally use newer +# type features. +python_version = "3.10" +explicit_package_bases = true +namespace_packages = true +mypy_path = "cuda_core" +ignore_missing_imports = true +implicit_reexport = true + [tool.cibuildwheel] skip = "*-musllinux_*" build-verbosity = 1 From 317c54d81c2f67e51c119c65c24d1ffef2bd1e63 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 12 May 2026 13:45:26 -0400 Subject: [PATCH 2/6] Update all type annotations so they pass type-checking --- cuda_core/cuda/core/_device.pyx | 12 +- cuda_core/cuda/core/_dlpack.pyx | 14 +- cuda_core/cuda/core/_event.pyx | 18 +- cuda_core/cuda/core/_launcher.pyx | 5 + cuda_core/cuda/core/_layout.pyx | 7 +- cuda_core/cuda/core/_linker.pyx | 16 +- cuda_core/cuda/core/_memory/_buffer.pyx | 18 +- .../core/_memory/_device_memory_resource.pyx | 5 + .../core/_memory/_graph_memory_resource.pyx | 8 +- cuda_core/cuda/core/_memory/_legacy.py | 11 +- cuda_core/cuda/core/_memory/_memory_pool.pyx | 9 +- .../cuda/core/_memory/_peer_access_utils.pyx | 12 +- .../core/_memory/_virtual_memory_resource.py | 45 +++-- cuda_core/cuda/core/_module.pyx | 6 +- cuda_core/cuda/core/_program.pyx | 6 +- cuda_core/cuda/core/_resource_handles.pyx | 173 ++++++++++-------- cuda_core/cuda/core/_stream.pyx | 29 ++- cuda_core/cuda/core/_utils/cuda_utils.pyx | 21 ++- .../core/_utils/enum_explanations_helpers.py | 3 +- cuda_core/cuda/core/checkpoint.py | 12 +- .../cuda/core/graph/_adjacency_set_proxy.pyx | 7 +- cuda_core/cuda/core/graph/_graph_builder.pyx | 4 + .../cuda/core/graph/_graph_definition.pyx | 55 ++++-- cuda_core/cuda/core/graph/_graph_node.pyx | 13 +- cuda_core/cuda/core/graph/_subclasses.pyx | 10 +- cuda_core/cuda/core/system/__init__.py | 12 +- cuda_core/cuda/core/system/_device.pyx | 5 +- cuda_core/cuda/core/system/_nvml_context.pyx | 4 +- cuda_core/cuda/core/typing.py | 27 ++- .../cuda/core/utils/_program_cache/_keys.py | 4 +- 
30 files changed, 367 insertions(+), 204 deletions(-) diff --git a/cuda_core/cuda/core/_device.pyx b/cuda_core/cuda/core/_device.pyx index 67255506a2d..ecc361ef264 100644 --- a/cuda_core/cuda/core/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -39,6 +39,12 @@ from cuda.core._utils.cuda_utils import ( ) from cuda.core._stream cimport default_stream +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import cuda.core.system # no-cython-lint + from cuda.core.graph import GraphBuilder + # TODO: I prefer to type these as "cdef object" and avoid accessing them from within Python, # but it seems it is very convenient to expose them for testing purposes... _tls = threading.local() @@ -1208,7 +1214,7 @@ class Device: def __reduce__(self): return Device, (self.device_id,) - def set_current(self, ctx: Context = None) -> Context | None: + def set_current(self, ctx: Context | None = None) -> Context | None: """Set device to be used for GPU executions. Initializes CUDA and sets the calling thread to a valid CUDA @@ -1274,7 +1280,7 @@ class Device: self._has_inited = True self._context = Context._from_handle(Context, h_context, self._device_id) # Store owning context - def create_context(self, options: ContextOptions = None) -> Context: + def create_context(self, options: ContextOptions | None = None) -> Context: """Create a new :obj:`~_context.Context` object. Note @@ -1433,7 +1439,7 @@ class Device: self._check_context_initialized() handle_return(runtime.cudaDeviceSynchronize()) - def create_graph_builder(self) -> "GraphBuilder": + def create_graph_builder(self) -> GraphBuilder: """Create a new :obj:`~graph.GraphBuilder` object. Returns diff --git a/cuda_core/cuda/core/_dlpack.pyx b/cuda_core/cuda/core/_dlpack.pyx index 371ced011bb..460c2cb184c 100644 --- a/cuda_core/cuda/core/_dlpack.pyx +++ b/cuda_core/cuda/core/_dlpack.pyx @@ -1,7 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 + from enum import IntEnum @@ -165,8 +166,11 @@ cpdef object make_py_capsule(object buf, bint versioned): return ret +# Values are fixed by the DLPack spec; see _include/dlpack.h. They are +# hard-coded here (rather than referencing the cdef extern names) so that the +# generated .pyi stub doesn't reference Cython-only identifiers. class DLDeviceType(IntEnum): - kDLCPU = _kDLCPU - kDLCUDA = _kDLCUDA - kDLCUDAHost = _kDLCUDAHost - kDLCUDAManaged = _kDLCUDAManaged + kDLCPU = 1 + kDLCUDA = 2 + kDLCUDAHost = 3 + kDLCUDAManaged = 13 diff --git a/cuda_core/cuda/core/_event.pyx b/cuda_core/cuda/core/_event.pyx index 3f5fb7ace26..5f113365a9b 100644 --- a/cuda_core/cuda/core/_event.pyx +++ b/cuda_core/cuda/core/_event.pyx @@ -31,12 +31,17 @@ from cuda.core._utils.cuda_utils cimport ( import cython from dataclasses import dataclass import multiprocessing +from typing import TYPE_CHECKING from cuda.core._utils.cuda_utils import ( CUDAError, check_multiprocessing_start_method, ) +if TYPE_CHECKING: + import cuda.bindings.driver # no-cython-lint + from cuda.core._device import Device + @dataclass cdef class EventOptions: @@ -149,12 +154,6 @@ cdef class Event: """ self._h_event.reset() - def __isub__(self, other): - return NotImplemented - - def __rsub__(self, other): - return NotImplemented - def __sub__(self, other: Event): # return self - other (in milliseconds) cdef float timing @@ -330,9 +329,12 @@ cdef class IPCEventDescriptor: self._is_blocking_sync = is_blocking_sync return self - def __eq__(self, IPCEventDescriptor rhs): + def __eq__(self, rhs) -> bool: # No need to check self._is_blocking_sync. 
- return self._reserved == rhs._reserved + if not isinstance(rhs, IPCEventDescriptor): + return NotImplemented + cdef IPCEventDescriptor _rhs = rhs + return self._reserved == _rhs._reserved def __reduce__(self): return IPCEventDescriptor._init, (self._reserved, self._is_blocking_sync) diff --git a/cuda_core/cuda/core/_launcher.pyx b/cuda_core/cuda/core/_launcher.pyx index e6a07ad28e6..f7f1b74a4b1 100644 --- a/cuda_core/cuda/core/_launcher.pyx +++ b/cuda_core/cuda/core/_launcher.pyx @@ -18,6 +18,11 @@ from cuda.core._utils.cuda_utils cimport ( from cuda.core._module import Kernel from cuda.core._stream import Stream from math import prod +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from cuda.core.graph import GraphBuilder + from cuda.core.typing import IsStreamType def launch(stream: Stream | GraphBuilder | IsStreamType, config: LaunchConfig, kernel: Kernel, *kernel_args): diff --git a/cuda_core/cuda/core/_layout.pyx b/cuda_core/cuda/core/_layout.pyx index 3e2580d11d1..56f914baa0e 100644 --- a/cuda_core/cuda/core/_layout.pyx +++ b/cuda_core/cuda/core/_layout.pyx @@ -176,8 +176,11 @@ cdef class _StridedLayout: f"_StridedLayout(shape={self.shape}, strides={self.strides}, itemsize={self.itemsize}, _slice_offset={self.slice_offset})" ) - def __eq__(self : _StridedLayout, other : _StridedLayout) -> bool: - return self.itemsize == other.itemsize and self.slice_offset == other.slice_offset and _base_layout_equal(self.base, other.base) + def __eq__(self, other) -> bool: + if not isinstance(other, _StridedLayout): + return NotImplemented + cdef _StridedLayout _other = <_StridedLayout>other + return self.itemsize == _other.itemsize and self.slice_offset == _other.slice_offset and _base_layout_equal(self.base, _other.base) @property def ndim(self : _StridedLayout): diff --git a/cuda_core/cuda/core/_linker.pyx b/cuda_core/cuda/core/_linker.pyx index 8f513ce1217..3138c3ad0f7 100644 --- a/cuda_core/cuda/core/_linker.pyx +++ b/cuda_core/cuda/core/_linker.pyx @@ -26,7 
+26,7 @@ from cuda.core._utils.cuda_utils cimport HANDLE_RETURN, HANDLE_RETURN_NVJITLINK import sys from dataclasses import dataclass -from typing import Union +from typing import TYPE_CHECKING, Union from warnings import warn from cuda.pathfinder._optional_cuda_import import _optional_cuda_import @@ -39,7 +39,17 @@ from cuda.core._utils.cuda_utils import ( driver, is_sequence, ) -from cuda.core.typing import CompilerBackendType +from cuda.core.typing import CompilerBackendType, ObjectCodeFormatType + +if TYPE_CHECKING: + import cuda.bindings.driver # no-cython-lint + import cuda.bindings.nvjitlink # no-cython-lint + +# Module-level annotations to ensure stubgen-pyx keeps the above imports in +# the generated `.pyi` so that the LinkerHandleT forward references resolve. +# These names are not assigned, so they only affect __annotations__. +_keep_driver_in_stub: "cuda.bindings.driver.CUlinkState" +_keep_nvjitlink_in_stub: "cuda.bindings.nvjitlink.nvJitLinkHandle" ctypedef const char* const_char_ptr ctypedef void* void_ptr @@ -68,7 +78,7 @@ cdef class Linker: Options for the linker. If not provided, default options will be used. """ - def __init__(self, *object_codes: ObjectCode, options: "LinkerOptions" = None): + def __init__(self, *object_codes: ObjectCode, options: LinkerOptions | None = None): Linker_init(self, object_codes, options) def link(self, target_type: ObjectCodeFormatType | str) -> ObjectCode: diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 5d3bdbb873c..fee0b0aaaae 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -28,16 +28,22 @@ from cuda.core._stream cimport Stream, Stream_accept, default_stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN, _parse_fill_value import sys -from typing import TypeVar +from typing import TYPE_CHECKING +# ByteString was deprecated in favor of BufferProtocol in Python 3.12. 
+# When Python 3.12 is our minimum version, we can update this. +# mypy needs /something/ at the top-level, so we set that and then +# override rather than putting both branches in an if/else. from collections.abc import ByteString as BufferProtocol if sys.version_info >= (3, 12): from collections.abc import Buffer as BufferProtocol -else: - BufferProtocol = object from cuda.core._dlpack import classify_dl_device, make_py_capsule from cuda.core._device import Device +if TYPE_CHECKING: + from cuda.core.graph import GraphBuilder + # ============================================================================= # MR deallocation callback (invoked from C++ shared_ptr deleter) @@ -218,7 +224,7 @@ cdef class Buffer: self.close() return False - def copy_to(self, dst: Buffer = None, *, stream: Stream | GraphBuilder) -> Buffer: + def copy_to(self, dst: Buffer | None = None, *, stream: Stream | GraphBuilder) -> Buffer: """Copy from this buffer to the dst buffer asynchronously on the given stream. Copies the data from this buffer to the provided dst buffer. @@ -330,7 +336,7 @@ cdef class Buffer: max_version: tuple[int, int] | None = None, dl_device: tuple[int, int] | None = None, copy: bool | None = None, - ) -> TypeVar("PyCapsule"): + ): # Note: we ignore the stream argument entirely (as if it is -1). # It is the user's responsibility to maintain stream order. if dl_device is not None: @@ -369,7 +375,7 @@ cdef class Buffer: return self._mem_attrs.device_id @property - def handle(self) -> DevicePointerType: + def handle(self) -> int: """Return the buffer handle object. ..
caution:: diff --git a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx index b7b8b247a92..f85b794965f 100644 --- a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx @@ -26,6 +26,11 @@ import uuid from cuda.core._memory._peer_access_utils import PeerAccessibleBySetProxy, replace_peer_accessible_by from cuda.core._utils.cuda_utils import check_multiprocessing_start_method +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from cuda.core._device import Device + __all__ = ['DeviceMemoryResource', 'DeviceMemoryResourceOptions'] diff --git a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx index 8fdc324dc59..5f240ff60c4 100644 --- a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx @@ -18,6 +18,12 @@ from cuda.core._stream cimport Stream_accept, Stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from functools import cache +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from cuda.core._device import Device + from cuda.core.graph import GraphBuilder + from cuda.core.typing import DevicePointerType __all__ = ['GraphMemoryResource'] @@ -111,7 +117,7 @@ cdef class cyGraphMemoryResource(MemoryResource): cdef Stream s = Stream_accept(stream) return GMR_allocate(self, size, s) - def deallocate(self, ptr: "DevicePointerType", size_t size, *, stream: Stream | GraphBuilder): + def deallocate(self, ptr: DevicePointerType, size_t size, *, stream: Stream | GraphBuilder): """ Deallocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`. 
""" diff --git a/cuda_core/cuda/core/_memory/_legacy.py b/cuda_core/cuda/core/_memory/_legacy.py index 510974364da..62b7df12692 100644 --- a/cuda_core/cuda/core/_memory/_legacy.py +++ b/cuda_core/cuda/core/_memory/_legacy.py @@ -7,8 +7,9 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from cuda.core._memory._buffer import DevicePointerType from cuda.core._stream import Stream + from cuda.core.graph import GraphBuilder + from cuda.core.typing import DevicePointerType from cuda.core._memory._buffer import Buffer, MemoryResource from cuda.core._utils.cuda_utils import ( @@ -28,7 +29,7 @@ class LegacyPinnedMemoryResource(MemoryResource): # TODO: support creating this MR with flags that are later passed to cuMemHostAlloc? - def allocate(self, size, *, stream: Stream | None = None) -> Buffer: + def allocate(self, size, *, stream: Stream | GraphBuilder | None = None) -> Buffer: """Allocate a buffer of the requested size. ``cuMemAllocHost`` is synchronous, so this resource ignores any @@ -59,7 +60,7 @@ def allocate(self, size, *, stream: Stream | None = None) -> Buffer: ptr = 0 return Buffer._init(ptr, size, self) - def deallocate(self, ptr: DevicePointerType, size, *, stream: Stream | None = None): + def deallocate(self, ptr: DevicePointerType, size, *, stream: Stream | GraphBuilder | None = None): """Deallocate a buffer previously allocated by this resource. Parameters @@ -105,7 +106,7 @@ def __init__(self, device_id): self._device_id = Device(device_id).device_id - def allocate(self, size, *, stream: Stream | None = None) -> Buffer: + def allocate(self, size, *, stream: Stream | GraphBuilder | None = None) -> Buffer: # cuMemAlloc is synchronous; stream is accepted (and validated) # for interface conformance but not used. 
from cuda.core._stream import Stream_accept @@ -119,7 +120,7 @@ def allocate(self, size, *, stream: Stream | None = None) -> Buffer: ptr = 0 return Buffer._init(ptr, size, self) - def deallocate(self, ptr, size, *, stream: Stream | None = None): + def deallocate(self, ptr, size, *, stream: Stream | GraphBuilder | None = None): from cuda.core._stream import Stream_accept if stream is not None: diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pyx b/cuda_core/cuda/core/_memory/_memory_pool.pyx index 4da5e26ea92..5114d79d0d1 100644 --- a/cuda_core/cuda/core/_memory/_memory_pool.pyx +++ b/cuda_core/cuda/core/_memory/_memory_pool.pyx @@ -26,6 +26,13 @@ from cuda.core._utils.cuda_utils cimport ( HANDLE_RETURN, ) +import uuid +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from cuda.core.graph import GraphBuilder + from cuda.core.typing import DevicePointerType + cdef class _MemPoolAttributes: """Provides access to memory pool attributes.""" @@ -145,7 +152,7 @@ cdef class _MemPool(MemoryResource): cdef Stream s = Stream_accept(stream) return _MP_allocate(self, size, s) - def deallocate(self, ptr: "DevicePointerType", size_t size, *, stream: Stream | GraphBuilder): + def deallocate(self, ptr: DevicePointerType, size_t size, *, stream: Stream | GraphBuilder): """Deallocate a buffer previously allocated by this resource. 
Parameters diff --git a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx index 8086aaff170..1e04a7482fc 100644 --- a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx +++ b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx @@ -4,9 +4,9 @@ from __future__ import annotations -from collections.abc import Callable, Iterable, MutableSet +from collections.abc import Callable, Iterable, MutableSet, Set as AbstractSet from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from cuda.bindings cimport cydriver from cuda.core._memory._device_memory_resource cimport DeviceMemoryResource @@ -336,22 +336,22 @@ class PeerAccessibleBySetProxy(MutableSet): if to_add or to_remove: self._apply(to_add, to_remove) - def __ior__(self, other): + def __ior__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: # type: ignore[override,misc] self.update(other) return self - def __iand__(self, other): + def __iand__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: self.intersection_update(other) return self - def __isub__(self, other): + def __isub__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: if other is self: self.clear() else: self.difference_update(other) return self - def __ixor__(self, other): + def __ixor__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: # type: ignore[override,misc] self.symmetric_difference_update(other) return self diff --git a/cuda_core/cuda/core/_memory/_virtual_memory_resource.py b/cuda_core/cuda/core/_memory/_virtual_memory_resource.py index 78a35e850fb..a1171191687 100644 --- a/cuda_core/cuda/core/_memory/_virtual_memory_resource.py +++ b/cuda_core/cuda/core/_memory/_virtual_memory_resource.py @@ -9,6 +9,7 @@ if TYPE_CHECKING: from cuda.core._stream import Stream + from cuda.core.graph import GraphBuilder from cuda.core._device import Device from cuda.core._memory._buffer import Buffer, MemoryResource @@ 
-22,6 +23,7 @@ ) from cuda.core._utils.version import binding_version from cuda.core.typing import ( + DevicePointerType, VirtualMemoryAccessType, VirtualMemoryAllocationType, VirtualMemoryGranularityType, @@ -107,28 +109,28 @@ class VirtualMemoryResourceOptions: _allocation_type[VirtualMemoryAllocationType.MANAGED] = _t.CU_MEM_ALLOCATION_TYPE_MANAGED @staticmethod - def _access_to_flags(spec: str): + def _access_to_flags(spec: VirtualMemoryAccessType | None): flags = VirtualMemoryResourceOptions._access_flags.get(spec) if flags is None: raise ValueError(f"Unknown access spec: {spec!r}") return flags @staticmethod - def _allocation_type_to_driver(spec: str): + def _allocation_type_to_driver(spec: VirtualMemoryAllocationType): alloc_type = VirtualMemoryResourceOptions._allocation_type.get(spec) if alloc_type is None: raise ValueError(f"Unsupported allocation_type: {spec!r}") return alloc_type @staticmethod - def _location_type_to_driver(spec: str): + def _location_type_to_driver(spec: VirtualMemoryLocationType): loc_type = VirtualMemoryResourceOptions._location_type.get(spec) if loc_type is None: raise ValueError(f"Unsupported location_type: {spec!r}") return loc_type @staticmethod - def _handle_type_to_driver(spec: str): + def _handle_type_to_driver(spec: VirtualMemoryHandleType | None): if spec == "win32": raise NotImplementedError("win32 is currently not supported, please reach out to the CUDA Python team") handle_type = VirtualMemoryResourceOptions._handle_types.get(spec) @@ -137,7 +139,7 @@ def _handle_type_to_driver(spec: str): return handle_type @staticmethod - def _granularity_to_driver(spec: str): + def _granularity_to_driver(spec: VirtualMemoryGranularityType): granularity = VirtualMemoryResourceOptions._granularity.get(spec) if granularity is None: raise ValueError(f"Unsupported granularity: {spec!r}") @@ -152,7 +154,7 @@ class VirtualMemoryResource(MemoryResource): device_id : Device | int Device for which a memory resource is constructed. 
- config : VirtualMemoryResourceOptions + config : VirtualMemoryResourceOptions, optional A configuration object for the VirtualMemoryResource @@ -163,8 +165,8 @@ class VirtualMemoryResource(MemoryResource): in cuda.core should already meet the common needs. """ - def __init__(self, device_id: Device | int, config: VirtualMemoryResourceOptions = None): - self.device = Device(device_id) + def __init__(self, device_id: Device | int, config: VirtualMemoryResourceOptions | None = None): + self.device: Device | None = Device(device_id) self.config = check_or_create_options( VirtualMemoryResourceOptions, config, "VirtualMemoryResource options", keep_none=False ) @@ -193,7 +195,9 @@ def _align_up(size: int, gran: int) -> int: """ return (size + gran - 1) & ~(gran - 1) - def modify_allocation(self, buf: Buffer, new_size: int, config: VirtualMemoryResourceOptions = None) -> Buffer: + def modify_allocation( + self, buf: Buffer, new_size: int, config: VirtualMemoryResourceOptions | None = None + ) -> Buffer: """ Grow an existing allocation using CUDA VMM, with a configurable policy. @@ -224,6 +228,10 @@ def modify_allocation(self, buf: Buffer, new_size: int, config: VirtualMemoryRes prop = driver.CUmemAllocationProp() prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(self.config.allocation_type) prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(self.config.location_type) + # Caller must not invoke modify_allocation on a host-located resource; + # we rely on the dataclass invariant that self.device is non-None for + # device-located resources (it's only None when location is host). 
+ assert self.device is not None, "modify_allocation requires a device-located resource" prop.location.id = self.device.device_id prop.allocFlags.gpuDirectRDMACapable = 1 if self.config.gpu_direct_rdma else 0 prop.requestedHandleTypes = VirtualMemoryResourceOptions._handle_type_to_driver(self.config.handle_type) @@ -335,7 +343,9 @@ def _grow_allocation_fast_path( trans.commit() # Update the buffer size (pointer stays the same) - buf._size = new_size + # TODO: #2049 This is a real bug, accessing _size which doesn't exist. + # Fix bug and remove the "type: ignore[attr-defined]" comment. + buf._size = new_size # type: ignore[attr-defined] return buf def _grow_allocation_slow_path( @@ -474,7 +484,7 @@ def _build_access_descriptors(self, prop: driver.CUmemAllocationProp) -> list: return descs - def allocate(self, size: int, *, stream: Stream | None = None) -> Buffer: + def allocate(self, size: int, *, stream: Stream | GraphBuilder | None = None) -> Buffer: """ Allocate a buffer of the given size using CUDA virtual memory. 
@@ -513,7 +523,7 @@ def allocate(self, size: int, *, stream: Stream | None = None) -> Buffer: prop = driver.CUmemAllocationProp() prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(config.allocation_type) prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(config.location_type) - prop.location.id = self.device.device_id if config.location_type == "device" else -1 + prop.location.id = self.device.device_id if self.device is not None else -1 prop.allocFlags.gpuDirectRDMACapable = 1 if config.gpu_direct_rdma else 0 prop.requestedHandleTypes = VirtualMemoryResourceOptions._handle_type_to_driver(config.handle_type) prop.win32HandleMetaData = 0 @@ -559,13 +569,13 @@ def allocate(self, size: int, *, stream: Stream | None = None) -> Buffer: buf = Buffer.from_handle(ptr=ptr, size=aligned_size, mr=self) return buf - def deallocate(self, ptr: int, size: int, *, stream: Stream | None = None) -> None: + def deallocate(self, ptr: DevicePointerType, size: int, *, stream: Stream | GraphBuilder | None = None) -> None: """ Deallocate memory on the device using CUDA VMM APIs. Parameters ---------- - ptr : int + ptr : DevicePointerType The pointer to the memory to deallocate. size : int The size in bytes of the memory to deallocate. @@ -573,6 +583,11 @@ def deallocate(self, ptr: int, size: int, *, stream: Stream | None = None) -> No Keyword-only. Unused because virtual memory operations are synchronous. """ + if ptr is None: + ptr = 0 + else: + ptr = int(ptr) + if stream is not None: from cuda.core._stream import Stream_accept @@ -608,7 +623,7 @@ def device_id(self) -> int: Returns: int: CUDA device ID. 
-1 if the memory resource allocates host memory """ - return self.device.device_id if self.config.location_type == "device" else -1 + return self.device.device_id if self.device is not None else -1 def __repr__(self) -> str: """ diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index 96ac65effc3..c9849443e68 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -233,7 +233,9 @@ cdef class KernelAttributes: ) -MaxPotentialBlockSizeOccupancyResult = namedtuple("MaxPotential", ("min_grid_size", "max_block_size")) +MaxPotentialBlockSizeOccupancyResult = namedtuple( + "MaxPotentialBlockSizeOccupancyResult", ("min_grid_size", "max_block_size") +) cdef class KernelOccupancy: @@ -520,7 +522,7 @@ cdef class Kernel: return self.handle @staticmethod - def from_handle(handle, mod: ObjectCode = None) -> Kernel: + def from_handle(handle, mod: ObjectCode | None = None) -> Kernel: """Creates a new :obj:`Kernel` object from a kernel handle. Parameters diff --git a/cuda_core/cuda/core/_program.pyx b/cuda_core/cuda/core/_program.pyx index 2ef38775d1a..72099bceced 100644 --- a/cuda_core/cuda/core/_program.pyx +++ b/cuda_core/cuda/core/_program.pyx @@ -11,8 +11,12 @@ from __future__ import annotations from dataclasses import dataclass import threading +from typing import TYPE_CHECKING from warnings import warn +if TYPE_CHECKING: + from cuda.core.utils._program_cache import ProgramCacheResource # no-cython-lint + from cuda.bindings import nvrtc from cuda.pathfinder._optional_cuda_import import _optional_cuda_import @@ -90,7 +94,7 @@ cdef class Program: name_expressions: tuple | list = (), logs=None, *, - cache: "ProgramCacheResource | None" = None, + cache: ProgramCacheResource | None = None, ) -> ObjectCode: """Compile the program to the specified target type. 
diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx index a1dc05464ac..8a414956efc 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -18,23 +18,6 @@ from cuda.bindings cimport cynvrtc from cuda.bindings cimport cynvvm from cuda.bindings cimport cynvjitlink -from ._resource_handles cimport ( - ContextHandle, - GreenCtxHandle, - StreamHandle, - EventHandle, - MemoryPoolHandle, - DevicePtrHandle, - LibraryHandle, - KernelHandle, - GraphHandle, - GraphicsResourceHandle, - NvrtcProgramHandle, - NvvmProgramHandle, - NvJitLinkHandle, - CuLinkHandle, -) - import cuda.bindings.cydriver as cydriver import cuda.bindings.cynvrtc as cynvrtc import cuda.bindings.cynvvm as cynvvm @@ -313,64 +296,86 @@ cdef void* _get_optional_driver_fn(str name): return NULL return PyCapsule_GetPointer(capsule, PyCapsule_GetName(capsule)) -# Context -p_cuDevicePrimaryCtxRetain = _get_driver_fn("cuDevicePrimaryCtxRetain") -p_cuDevicePrimaryCtxRelease = _get_driver_fn("cuDevicePrimaryCtxRelease") -p_cuCtxGetCurrent = _get_driver_fn("cuCtxGetCurrent") -p_cuGreenCtxCreate = _get_optional_driver_fn("cuGreenCtxCreate") -p_cuGreenCtxDestroy = _get_optional_driver_fn("cuGreenCtxDestroy") -p_cuCtxFromGreenCtx = _get_optional_driver_fn("cuCtxFromGreenCtx") -p_cuDevResourceGenerateDesc = _get_optional_driver_fn("cuDevResourceGenerateDesc") -p_cuGreenCtxStreamCreate = _get_optional_driver_fn("cuGreenCtxStreamCreate") - -# Stream -p_cuStreamCreateWithPriority = _get_driver_fn("cuStreamCreateWithPriority") -p_cuStreamDestroy = _get_driver_fn("cuStreamDestroy") - -# Event -p_cuEventCreate = _get_driver_fn("cuEventCreate") -p_cuEventDestroy = _get_driver_fn("cuEventDestroy") -p_cuIpcOpenEventHandle = _get_driver_fn("cuIpcOpenEventHandle") - -# Device -p_cuDeviceGetCount = _get_driver_fn("cuDeviceGetCount") - -# Memory pool -p_cuMemPoolSetAccess = _get_driver_fn("cuMemPoolSetAccess") -p_cuMemPoolDestroy = 
_get_driver_fn("cuMemPoolDestroy") -p_cuMemPoolCreate = _get_driver_fn("cuMemPoolCreate") -p_cuDeviceGetMemPool = _get_driver_fn("cuDeviceGetMemPool") -p_cuMemPoolImportFromShareableHandle = _get_driver_fn("cuMemPoolImportFromShareableHandle") - -# Memory allocation -p_cuMemAllocFromPoolAsync = _get_driver_fn("cuMemAllocFromPoolAsync") -p_cuMemAllocAsync = _get_driver_fn("cuMemAllocAsync") -p_cuMemAlloc = _get_driver_fn("cuMemAlloc") -p_cuMemAllocHost = _get_driver_fn("cuMemAllocHost") - -# Memory deallocation -p_cuMemFreeAsync = _get_driver_fn("cuMemFreeAsync") -p_cuMemFree = _get_driver_fn("cuMemFree") -p_cuMemFreeHost = _get_driver_fn("cuMemFreeHost") - -# IPC -p_cuMemPoolImportPointer = _get_driver_fn("cuMemPoolImportPointer") - -# Library -p_cuLibraryLoadFromFile = _get_driver_fn("cuLibraryLoadFromFile") -p_cuLibraryLoadData = _get_driver_fn("cuLibraryLoadData") -p_cuLibraryUnload = _get_driver_fn("cuLibraryUnload") -p_cuLibraryGetKernel = _get_driver_fn("cuLibraryGetKernel") - -# Graph -p_cuGraphDestroy = _get_driver_fn("cuGraphDestroy") - -# Linker -p_cuLinkDestroy = _get_driver_fn("cuLinkDestroy") - -# Graphics interop -p_cuGraphicsUnmapResources = _get_driver_fn("cuGraphicsUnmapResources") -p_cuGraphicsUnregisterResource = _get_driver_fn("cuGraphicsUnregisterResource") + +cdef void _init_driver_fn_pointers() noexcept: + global p_cuDevicePrimaryCtxRetain, p_cuDevicePrimaryCtxRelease, p_cuCtxGetCurrent + global p_cuGreenCtxCreate, p_cuGreenCtxDestroy, p_cuCtxFromGreenCtx + global p_cuDevResourceGenerateDesc, p_cuGreenCtxStreamCreate + global p_cuStreamCreateWithPriority, p_cuStreamDestroy + global p_cuEventCreate, p_cuEventDestroy, p_cuIpcOpenEventHandle + global p_cuDeviceGetCount + global p_cuMemPoolSetAccess, p_cuMemPoolDestroy, p_cuMemPoolCreate + global p_cuDeviceGetMemPool, p_cuMemPoolImportFromShareableHandle + global p_cuMemAllocFromPoolAsync, p_cuMemAllocAsync, p_cuMemAlloc, p_cuMemAllocHost + global p_cuMemFreeAsync, p_cuMemFree, p_cuMemFreeHost + 
global p_cuMemPoolImportPointer + global p_cuLibraryLoadFromFile, p_cuLibraryLoadData, p_cuLibraryUnload, p_cuLibraryGetKernel + global p_cuGraphDestroy + global p_cuLinkDestroy + global p_cuGraphicsUnmapResources, p_cuGraphicsUnregisterResource + + # Context + p_cuDevicePrimaryCtxRetain = _get_driver_fn("cuDevicePrimaryCtxRetain") + p_cuDevicePrimaryCtxRelease = _get_driver_fn("cuDevicePrimaryCtxRelease") + p_cuCtxGetCurrent = _get_driver_fn("cuCtxGetCurrent") + p_cuGreenCtxCreate = _get_optional_driver_fn("cuGreenCtxCreate") + p_cuGreenCtxDestroy = _get_optional_driver_fn("cuGreenCtxDestroy") + p_cuCtxFromGreenCtx = _get_optional_driver_fn("cuCtxFromGreenCtx") + p_cuDevResourceGenerateDesc = _get_optional_driver_fn("cuDevResourceGenerateDesc") + p_cuGreenCtxStreamCreate = _get_optional_driver_fn("cuGreenCtxStreamCreate") + + # Stream + p_cuStreamCreateWithPriority = _get_driver_fn("cuStreamCreateWithPriority") + p_cuStreamDestroy = _get_driver_fn("cuStreamDestroy") + + # Event + p_cuEventCreate = _get_driver_fn("cuEventCreate") + p_cuEventDestroy = _get_driver_fn("cuEventDestroy") + p_cuIpcOpenEventHandle = _get_driver_fn("cuIpcOpenEventHandle") + + # Device + p_cuDeviceGetCount = _get_driver_fn("cuDeviceGetCount") + + # Memory pool + p_cuMemPoolSetAccess = _get_driver_fn("cuMemPoolSetAccess") + p_cuMemPoolDestroy = _get_driver_fn("cuMemPoolDestroy") + p_cuMemPoolCreate = _get_driver_fn("cuMemPoolCreate") + p_cuDeviceGetMemPool = _get_driver_fn("cuDeviceGetMemPool") + p_cuMemPoolImportFromShareableHandle = _get_driver_fn("cuMemPoolImportFromShareableHandle") + + # Memory allocation + p_cuMemAllocFromPoolAsync = _get_driver_fn("cuMemAllocFromPoolAsync") + p_cuMemAllocAsync = _get_driver_fn("cuMemAllocAsync") + p_cuMemAlloc = _get_driver_fn("cuMemAlloc") + p_cuMemAllocHost = _get_driver_fn("cuMemAllocHost") + + # Memory deallocation + p_cuMemFreeAsync = _get_driver_fn("cuMemFreeAsync") + p_cuMemFree = _get_driver_fn("cuMemFree") + p_cuMemFreeHost = 
_get_driver_fn("cuMemFreeHost") + + # IPC + p_cuMemPoolImportPointer = _get_driver_fn("cuMemPoolImportPointer") + + # Library + p_cuLibraryLoadFromFile = _get_driver_fn("cuLibraryLoadFromFile") + p_cuLibraryLoadData = _get_driver_fn("cuLibraryLoadData") + p_cuLibraryUnload = _get_driver_fn("cuLibraryUnload") + p_cuLibraryGetKernel = _get_driver_fn("cuLibraryGetKernel") + + # Graph + p_cuGraphDestroy = _get_driver_fn("cuGraphDestroy") + + # Linker + p_cuLinkDestroy = _get_driver_fn("cuLinkDestroy") + + # Graphics interop + p_cuGraphicsUnmapResources = _get_driver_fn("cuGraphicsUnmapResources") + p_cuGraphicsUnregisterResource = _get_driver_fn("cuGraphicsUnregisterResource") + + +_init_driver_fn_pointers() + # ============================================================================= # NVRTC function pointer initialization @@ -380,7 +385,11 @@ cdef void* _get_nvrtc_fn(str name): capsule = cynvrtc.__pyx_capi__[name] return PyCapsule_GetPointer(capsule, PyCapsule_GetName(capsule)) -p_nvrtcDestroyProgram = _get_nvrtc_fn("nvrtcDestroyProgram") +cdef void _init_nvrtc_fn_pointers() noexcept: + global p_nvrtcDestroyProgram + p_nvrtcDestroyProgram = _get_nvrtc_fn("nvrtcDestroyProgram") + +_init_nvrtc_fn_pointers() # ============================================================================= # NVVM function pointer initialization @@ -393,7 +402,11 @@ cdef void* _get_nvvm_fn(str name): capsule = cynvvm.__pyx_capi__[name] return PyCapsule_GetPointer(capsule, PyCapsule_GetName(capsule)) -p_nvvmDestroyProgram = _get_nvvm_fn("nvvmDestroyProgram") +cdef void _init_nvvm_fn_pointers() noexcept: + global p_nvvmDestroyProgram + p_nvvmDestroyProgram = _get_nvvm_fn("nvvmDestroyProgram") + +_init_nvvm_fn_pointers() # ============================================================================= # nvJitLink function pointer initialization @@ -406,4 +419,8 @@ cdef void* _get_nvjitlink_fn(str name): capsule = cynvjitlink.__pyx_capi__[name] return PyCapsule_GetPointer(capsule, 
PyCapsule_GetName(capsule)) -p_nvJitLinkDestroy = _get_nvjitlink_fn("nvJitLinkDestroy") +cdef void _init_nvjitlink_fn_pointers() noexcept: + global p_nvJitLinkDestroy + p_nvJitLinkDestroy = _get_nvjitlink_fn("nvJitLinkDestroy") + +_init_nvjitlink_fn_pointers() diff --git a/cuda_core/cuda/core/_stream.pyx b/cuda_core/cuda/core/_stream.pyx index f487a0a53e5..57cd575d65f 100644 --- a/cuda_core/cuda/core/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -18,11 +18,12 @@ from cuda.core._utils.cuda_utils cimport ( import cython import warnings from dataclasses import dataclass -from typing import Protocol +from typing import Protocol, TYPE_CHECKING from cuda.core._context cimport Context from cuda.core._device_resources cimport DeviceResources from cuda.core._event import Event, EventOptions + from cuda.core._resource_handles cimport ( ContextHandle, EventHandle, @@ -41,7 +42,10 @@ from cuda.core._resource_handles cimport ( as_py, ) - +if TYPE_CHECKING: + import cuda.bindings.driver # no-cython-lint + from cuda.core._device import Device + from cuda.core.graph import GraphBuilder @dataclass cdef class StreamOptions: @@ -116,8 +120,8 @@ cdef class Stream: return Stream._from_handle(cls, get_per_thread_stream()) @classmethod - def _init(cls, obj: IsStreamType | None = None, options=None, device_id: int = None, - ctx: Context = None): + def _init(cls, obj: IsStreamType | None = None, options=None, device_id: int | None = None, + ctx: Context | None = None): cdef StreamHandle h_stream cdef cydriver.CUstream borrowed cdef ContextHandle h_context @@ -249,7 +253,7 @@ cdef class Stream: with nogil: HANDLE_RETURN(cydriver.cuStreamSynchronize(as_cu(self._h_stream))) - def record(self, event: Event = None, options: EventOptions = None) -> Event: + def record(self, event: Event | None = None, options: EventOptions | None = None) -> Event: """Record an event onto the stream. 
Creates an :obj:`~_event.Event` object (or reuses the given one) by @@ -397,7 +401,7 @@ cdef class Stream: return Stream._init(obj=_stream_holder()) - def create_graph_builder(self) -> "GraphBuilder": + def create_graph_builder(self) -> GraphBuilder: """Create a new :obj:`~graph.GraphBuilder` object. The new graph builder will be associated with this stream. @@ -413,13 +417,8 @@ cdef class Stream: return GraphBuilder._init(stream=self, is_stream_owner=False) -# c-only python objects, not public -cdef Stream C_LEGACY_DEFAULT_STREAM = Stream._legacy_default() -cdef Stream C_PER_THREAD_DEFAULT_STREAM = Stream._per_thread_default() - -# standard python objects, public -LEGACY_DEFAULT_STREAM = C_LEGACY_DEFAULT_STREAM -PER_THREAD_DEFAULT_STREAM = C_PER_THREAD_DEFAULT_STREAM +LEGACY_DEFAULT_STREAM: Stream = Stream._legacy_default() +PER_THREAD_DEFAULT_STREAM: Stream = Stream._per_thread_default() cpdef Stream default_stream(): @@ -441,9 +440,9 @@ cpdef Stream default_stream(): # value is non-zero, including for weird stuff like 123foo if use_ptds: - return C_PER_THREAD_DEFAULT_STREAM + return PER_THREAD_DEFAULT_STREAM else: - return C_LEGACY_DEFAULT_STREAM + return LEGACY_DEFAULT_STREAM cdef inline int Stream_ensure_ctx(Stream self) except?-1 nogil: diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pyx b/cuda_core/cuda/core/_utils/cuda_utils.pyx index 1bcfa524884..36abb9689c4 100644 --- a/cuda_core/cuda/core/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/_utils/cuda_utils.pyx @@ -12,12 +12,21 @@ from collections.abc import Sequence from contextlib import ExitStack from typing import Callable -try: - from cuda.bindings import driver, nvrtc, runtime -except ImportError: - from cuda import cuda as driver - from cuda import cudart as runtime - from cuda import nvrtc +# TODO: Are we sure we don't need this fallback anymore? + +# (Previously wrapped in try/except ImportError for the legacy +# `from cuda import cuda as driver` etc. import path.) 
+# `as X` form is the PEP 484 explicit re-export marker, which type checkers
+# need in order to treat these names as part of the public API of this module.
+from cuda.bindings import driver as driver, nvrtc as nvrtc, runtime as runtime
+
+# Module-level annotations that reference `driver`, `nvrtc`, and `runtime` so
+# that stubgen-pyx keeps these imports in the generated `.pyi` (it would
+# otherwise trim them as unused). These names are not assigned, so they only
+# affect __annotations__ and have no runtime cost.
+_keep_driver_in_stub: 'driver.CUresult'
+_keep_nvrtc_in_stub: 'nvrtc.nvrtcResult'
+_keep_runtime_in_stub: 'runtime.cudaError_t'
 
 from cuda.bindings.nvvm import nvvmError
 from cuda.bindings.nvjitlink import nvJitLinkError
diff --git a/cuda_core/cuda/core/_utils/enum_explanations_helpers.py b/cuda_core/cuda/core/_utils/enum_explanations_helpers.py
index c7927e71e42..b9a33a197e4 100644
--- a/cuda_core/cuda/core/_utils/enum_explanations_helpers.py
+++ b/cuda_core/cuda/core/_utils/enum_explanations_helpers.py
@@ -38,7 +38,8 @@ def _binding_version() -> tuple[int, int, int]:
         parts = importlib.metadata.version("cuda-bindings").split(".")[:3]
     except importlib.metadata.PackageNotFoundError:
         return (0, 0, 0)  # For very old versions of cuda-python
-    return tuple(int(v) for v in parts)
+    parts_int = ([int(v) for v in parts] + [0, 0, 0])[:3]
+    return (parts_int[0], parts_int[1], parts_int[2])
 
 
 def _binding_version_has_usable_enum_docstrings(version: tuple[int, int, int]) -> bool:
diff --git a/cuda_core/cuda/core/checkpoint.py b/cuda_core/cuda/core/checkpoint.py
index 7f811013d19..70545c95a1e 100644
--- a/cuda_core/cuda/core/checkpoint.py
+++ b/cuda_core/cuda/core/checkpoint.py
@@ -6,17 +6,15 @@
 from collections.abc import Mapping as _Mapping
 from typing import Any as _Any
 
+# TODO: Are we sure we don't need this fallback anymore?
+# (Previously wrapped in try/except ImportError for the legacy
+# `from cuda import cuda as _driver` import path.)
+from cuda.bindings import driver as _driver from cuda.core._utils.cuda_utils import handle_return as _handle_cuda_return from cuda.core._utils.version import binding_version as _binding_version from cuda.core._utils.version import driver_version as _driver_version from cuda.core.typing import ProcessStateType as _ProcessStateType -try: - from cuda.bindings import driver as _driver -except ImportError: - from cuda import cuda as _driver - - _PROCESS_STATE_NAME_ATTRS: tuple[tuple[str, _ProcessStateType], ...] = ( ("CU_PROCESS_STATE_RUNNING", "running"), ("CU_PROCESS_STATE_LOCKED", "locked"), @@ -218,7 +216,7 @@ def _make_restore_args(driver, gpu_mapping: _Mapping[_Any, _Any] | None): pairs = [] for old_uuid, new_uuid in gpu_mapping.items(): pair = driver.CUcheckpointGpuPair() - buffers = [] + buffers: list = [] # holds ctypes string-buffer keepalives for the call below pair.oldUuid = _as_cuuuid(driver, old_uuid, buffers) pair.newUuid = _as_cuuuid(driver, new_uuid, buffers) pairs.append(pair) diff --git a/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyx b/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyx index b3a12774dda..8875284f8fa 100644 --- a/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyx +++ b/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyx @@ -15,7 +15,8 @@ from cuda.core._resource_handles cimport ( graph_node_get_graph, ) from cuda.core._utils.cuda_utils cimport HANDLE_RETURN -from collections.abc import MutableSet +from collections.abc import MutableSet, Set as AbstractSet +from typing import Any # ---- Python MutableSet wrapper ---------------------------------------------- @@ -70,7 +71,7 @@ class AdjacencySetProxy(MutableSet): if members: (<_AdjacencySetCore>self._core).remove_edges(members) - def __isub__(self, it): + def __isub__(self, it: AbstractSet[Any]) -> "AdjacencySetProxy": """Remove edges to all nodes in *it* in a single driver call.""" if it is self: self.clear() @@ -98,7 +99,7 @@ class AdjacencySetProxy(MutableSet): if new: 
(<_AdjacencySetCore>self._core).add_edges(new) - def __ior__(self, it): + def __ior__(self, it: AbstractSet[Any]) -> "AdjacencySetProxy": # type: ignore[override,misc] """Add edges to all nodes in *it* in a single driver call.""" self.update(it) return self diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyx b/cuda_core/cuda/core/graph/_graph_builder.pyx index b745598abab..6961ad20d80 100644 --- a/cuda_core/cuda/core/graph/_graph_builder.pyx +++ b/cuda_core/cuda/core/graph/_graph_builder.pyx @@ -4,6 +4,7 @@ import weakref from dataclasses import dataclass +from typing import TYPE_CHECKING from libc.stdint cimport intptr_t @@ -22,6 +23,9 @@ from cuda.core._utils.cuda_utils import ( handle_return, ) +if TYPE_CHECKING: + from cuda.core.graph._graph_definition import GraphDefinition + __all__ = ['Graph', 'GraphBuilder', 'GraphCompleteOptions', 'GraphDebugPrintOptions'] diff --git a/cuda_core/cuda/core/graph/_graph_definition.pyx b/cuda_core/cuda/core/graph/_graph_definition.pyx index 413a17368d8..4ce5dfa266d 100644 --- a/cuda_core/cuda/core/graph/_graph_definition.pyx +++ b/cuda_core/cuda/core/graph/_graph_definition.pyx @@ -23,10 +23,31 @@ from cuda.core._resource_handles cimport ( ) from cuda.core._utils.cuda_utils cimport HANDLE_RETURN +from typing import TYPE_CHECKING + from cuda.core._utils.cuda_utils import driver from cuda.core.typing import GraphMemoryType +if TYPE_CHECKING: + from cuda.core._device import Device + from cuda.core.graph._subclasses import ( + AllocNode, + ChildGraphNode, + EmptyNode, + EventRecordNode, + EventWaitNode, + FreeNode, + HostCallbackNode, + IfElseNode, + IfNode, + KernelNode, + MemcpyNode, + MemsetNode, + SwitchNode, + WhileNode, + ) + __all__ = ['GraphCondition', 'GraphDefinition'] @@ -103,43 +124,43 @@ cdef class GraphDefinition: return hash(as_intptr(self._h_graph)) @property - def _entry(self) -> "GraphNode": + def _entry(self) -> GraphNode: """Return the internal entry-point GraphNode (no dependencies).""" cdef GraphNode n 
= GraphNode.__new__(GraphNode) n._h_node = create_graph_node_handle(NULL, self._h_graph) return n - def allocate(self, size_t size, *, device: "Device" | int | None = None, + def allocate(self, size_t size, *, device: Device | int | None = None, memory_type: GraphMemoryType = GraphMemoryType.DEVICE, - peer_access: list["Device" | int] | None = None) -> "AllocNode": + peer_access: list[Device | int] | None = None) -> AllocNode: """Add an entry-point memory allocation node (no dependencies). See :meth:`GraphNode.allocate` for full documentation. """ return self._entry.allocate(size, device=device, memory_type=memory_type, peer_access=peer_access) - def deallocate(self, dptr) -> "FreeNode": + def deallocate(self, dptr) -> FreeNode: """Add an entry-point memory free node (no dependencies). See :meth:`GraphNode.deallocate` for full documentation. """ return self._entry.deallocate(dptr) - def memset(self, dst, value, size_t width, size_t height=1, size_t pitch=0) -> "MemsetNode": + def memset(self, dst, value, size_t width, size_t height=1, size_t pitch=0) -> MemsetNode: """Add an entry-point memset node (no dependencies). See :meth:`GraphNode.memset` for full documentation. """ return self._entry.memset(dst, value, width, height, pitch) - def launch(self, config, kernel, *args) -> "KernelNode": + def launch(self, config, kernel, *args) -> KernelNode: """Add an entry-point kernel launch node (no dependencies). See :meth:`GraphNode.launch` for full documentation. """ return self._entry.launch(config, kernel, *args) - def empty(self) -> "EmptyNode": + def empty(self) -> EmptyNode: """Add an entry-point empty node (no dependencies). Returns @@ -149,7 +170,7 @@ cdef class GraphDefinition: """ return self._entry.join() - def join(self, *nodes) -> "EmptyNode": + def join(self, *nodes) -> EmptyNode: """Create an empty node that depends on all given nodes. 
Parameters @@ -164,35 +185,35 @@ cdef class GraphDefinition: """ return self._entry.join(*nodes) - def memcpy(self, dst, src, size_t size) -> "MemcpyNode": + def memcpy(self, dst, src, size_t size) -> MemcpyNode: """Add an entry-point memcpy node (no dependencies). See :meth:`GraphNode.memcpy` for full documentation. """ return self._entry.memcpy(dst, src, size) - def embed(self, child: GraphDefinition) -> "ChildGraphNode": + def embed(self, child: GraphDefinition) -> ChildGraphNode: """Add an entry-point child graph node (no dependencies). See :meth:`GraphNode.embed` for full documentation. """ return self._entry.embed(child) - def record(self, event) -> "EventRecordNode": + def record(self, event) -> EventRecordNode: """Add an entry-point event record node (no dependencies). See :meth:`GraphNode.record` for full documentation. """ return self._entry.record(event) - def wait(self, event) -> "EventWaitNode": + def wait(self, event) -> EventWaitNode: """Add an entry-point event wait node (no dependencies). See :meth:`GraphNode.wait` for full documentation. """ return self._entry.wait(event) - def callback(self, fn, *, user_data=None) -> "HostCallbackNode": + def callback(self, fn, *, user_data=None) -> HostCallbackNode: """Add an entry-point host callback node (no dependencies). See :meth:`GraphNode.callback` for full documentation. @@ -233,28 +254,28 @@ cdef class GraphDefinition: return GraphCondition._from_handle(c_handle) - def if_then(self, condition: GraphCondition) -> "IfNode": + def if_then(self, condition: GraphCondition) -> IfNode: """Add an entry-point if-conditional node (no dependencies). See :meth:`GraphNode.if_then` for full documentation. """ return self._entry.if_then(condition) - def if_else(self, condition: GraphCondition) -> "IfElseNode": + def if_else(self, condition: GraphCondition) -> IfElseNode: """Add an entry-point if-else conditional node (no dependencies). See :meth:`GraphNode.if_else` for full documentation. 
""" return self._entry.if_else(condition) - def while_loop(self, condition: GraphCondition) -> "WhileNode": + def while_loop(self, condition: GraphCondition) -> WhileNode: """Add an entry-point while-loop conditional node (no dependencies). See :meth:`GraphNode.while_loop` for full documentation. """ return self._entry.while_loop(condition) - def switch(self, condition: GraphCondition, unsigned int count) -> "SwitchNode": + def switch(self, condition: GraphCondition, unsigned int count) -> SwitchNode: """Add an entry-point switch conditional node (no dependencies). See :meth:`GraphNode.switch` for full documentation. diff --git a/cuda_core/cuda/core/graph/_graph_node.pyx b/cuda_core/cuda/core/graph/_graph_node.pyx index a5577d134de..ae48a4f0bb4 100644 --- a/cuda_core/cuda/core/graph/_graph_node.pyx +++ b/cuda_core/cuda/core/graph/_graph_node.pyx @@ -6,6 +6,8 @@ from __future__ import annotations +from typing import TYPE_CHECKING + from cpython.ref cimport Py_INCREF from libc.stddef cimport size_t @@ -65,10 +67,13 @@ from cuda.core.graph._adjacency_set_proxy import AdjacencySetProxy from cuda.core._utils.cuda_utils import driver from cuda.core.typing import GraphMemoryType +if TYPE_CHECKING: + from cuda.core._device import Device + __all__ = ['GraphNode'] # See _cpp/REGISTRY_DESIGN.md (Level 2: Resource Handle -> Python Object) -_node_registry = weakref.WeakValueDictionary() +_node_registry: weakref.WeakValueDictionary[int, GraphNode] = weakref.WeakValueDictionary() cdef inline GraphNode _registered(GraphNode n): @@ -126,7 +131,7 @@ cdef class GraphNode: return driver.CUgraphNodeType(node_type) @property - def graph(self) -> "GraphDefinition": + def graph(self) -> GraphDefinition: """Return the GraphDefinition this node belongs to.""" return GraphDefinition._from_handle(graph_node_get_graph(self._h_node)) @@ -219,9 +224,9 @@ cdef class GraphNode: """ return GN_join(self, nodes) - def allocate(self, size_t size, *, device: "Device" | int | None = None, + def 
allocate(self, size_t size, *, device: Device | int | None = None, memory_type: GraphMemoryType = GraphMemoryType.DEVICE, - peer_access: list["Device" | int] | None = None) -> AllocNode: + peer_access: list[Device | int] | None = None) -> AllocNode: """Add a memory allocation node depending on this node. Parameters diff --git a/cuda_core/cuda/core/graph/_subclasses.pyx b/cuda_core/cuda/core/graph/_subclasses.pyx index 3550e993fe1..eb23c6bcc57 100644 --- a/cuda_core/cuda/core/graph/_subclasses.pyx +++ b/cuda_core/cuda/core/graph/_subclasses.pyx @@ -478,7 +478,7 @@ cdef class ChildGraphNode(GraphNode): f" child=0x{as_intptr(self._h_child_graph):x}>") @property - def child_graph(self) -> "GraphDefinition": + def child_graph(self) -> GraphDefinition: """The embedded graph definition (non-owning wrapper).""" return GraphDefinition._from_handle(self._h_child_graph) @@ -722,7 +722,7 @@ cdef class IfNode(ConditionalNode): f" condition=0x{self._condition._c_handle:x}>") @property - def then(self) -> "GraphDefinition": + def then(self) -> GraphDefinition: """The 'then' branch graph.""" return self._branches[0] @@ -735,12 +735,12 @@ cdef class IfElseNode(ConditionalNode): f" condition=0x{self._condition._c_handle:x}>") @property - def then(self) -> "GraphDefinition": + def then(self) -> GraphDefinition: """The ``then`` branch graph (executed when condition is non-zero).""" return self._branches[0] @property - def else_(self) -> "GraphDefinition": + def else_(self) -> GraphDefinition: """The ``else`` branch graph (executed when condition is zero).""" return self._branches[1] @@ -753,7 +753,7 @@ cdef class WhileNode(ConditionalNode): f" condition=0x{self._condition._c_handle:x}>") @property - def body(self) -> "GraphDefinition": + def body(self) -> GraphDefinition: """The loop body graph.""" return self._branches[0] diff --git a/cuda_core/cuda/core/system/__init__.py b/cuda_core/cuda/core/system/__init__.py index c662ee97754..685519f9b80 100644 --- 
a/cuda_core/cuda/core/system/__init__.py +++ b/cuda_core/cuda/core/system/__init__.py @@ -8,6 +8,7 @@ # contexts created, so that a user can use NVML to explore things about their # system without loading CUDA. +from typing import TYPE_CHECKING __all__ = [ "CUDA_BINDINGS_NVML_IS_COMPATIBLE", @@ -22,7 +23,16 @@ from ._system import * -if CUDA_BINDINGS_NVML_IS_COMPATIBLE: +# The TYPE_CHECKING branch is split out from the runtime branch so that +# stubgen-pyx, which only recognizes the literal `if TYPE_CHECKING:` form, +# preserves these imports in the generated .pyi. When +# CUDA_BINDINGS_NVML_IS_COMPATIBLE is no longer necessary, this complexity can +# be removed. +if TYPE_CHECKING: + from ._device import * + from ._system_events import * + from .exceptions import * +elif CUDA_BINDINGS_NVML_IS_COMPATIBLE: from ._device import * from ._device import __all__ as _device_all from ._system_events import * diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 9c8224e54aa..0da8b190caf 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -6,7 +6,7 @@ from libc.stdint cimport intptr_t, uint64_t from libc.math cimport ceil from multiprocessing import cpu_count -from typing import Iterable +from typing import Iterable, TYPE_CHECKING import warnings from cuda.bindings import nvml @@ -34,6 +34,9 @@ from cuda.core.system.typing import ( ThermalTarget, ) +if TYPE_CHECKING: + import cuda.core # no-cython-lint + cdef object _pstate_to_int(object pstate): if pstate == nvml.Pstates.PSTATE_UNKNOWN: diff --git a/cuda_core/cuda/core/system/_nvml_context.pyx b/cuda_core/cuda/core/system/_nvml_context.pyx index 25445805642..910284809d2 100644 --- a/cuda_core/cuda/core/system/_nvml_context.pyx +++ b/cuda_core/cuda/core/system/_nvml_context.pyx @@ -9,10 +9,10 @@ from cuda.bindings import nvml from cuda.core.system import exceptions -_NVML_STATE = _NVMLState.UNINITIALIZED +cdef _NVMLState _NVML_STATE = 
_NVMLState.UNINITIALIZED -_NVML_OWNER_PID = 0 +cdef int _NVML_OWNER_PID = 0 _lock = threading.Lock() diff --git a/cuda_core/cuda/core/typing.py b/cuda_core/cuda/core/typing.py index 1a6d377579d..5f633afeb6a 100644 --- a/cuda_core/cuda/core/typing.py +++ b/cuda_core/cuda/core/typing.py @@ -4,11 +4,28 @@ """Public type aliases, protocols, and enumerations used in cuda.core API signatures.""" -try: - from enum import StrEnum -except ImportError: - from backports.strenum import StrEnum +import sys +from typing import TYPE_CHECKING from typing import Literal as _Literal +from typing import TypeAlias as _TypeAlias + +if TYPE_CHECKING: + # `backports.strenum` ships no type stubs and typeshed conditionally gates + # `enum.StrEnum` behind `sys.version_info >= (3, 11)`. Declaring a minimal + # local shape here (mirroring typeshed's 3.11 StrEnum) lets mypy at + # `python_version = "3.10"` infer subclass members as `Literal[Foo.MEMBER]` + # rather than bare `str`. + from enum import Enum + + class StrEnum(str, Enum): + _value_: str + + +if not TYPE_CHECKING: + if sys.version_info >= (3, 11): + from enum import StrEnum + else: + from backports.strenum import StrEnum from cuda.core._context import DeviceResourcesType from cuda.core._stream import IsStreamType @@ -36,7 +53,7 @@ # A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting # :attr:`Buffer.handle`. 
-DevicePointerType = driver.CUdeviceptr | int | None +DevicePointerType: _TypeAlias = driver.CUdeviceptr | int | None ProcessStateType = _Literal["running", "locked", "checkpointed", "failed"] diff --git a/cuda_core/cuda/core/utils/_program_cache/_keys.py b/cuda_core/cuda/core/utils/_program_cache/_keys.py index fbb5ef3f890..273ffd33316 100644 --- a/cuda_core/cuda/core/utils/_program_cache/_keys.py +++ b/cuda_core/cuda/core/utils/_program_cache/_keys.py @@ -197,7 +197,9 @@ def _linker_backend_and_version(use_driver: bool) -> tuple[str, str]: return ("driver", str(_driver_version())) nvjitlink = sys.modules.get("cuda.bindings.nvjitlink") if nvjitlink is None: - from cuda.bindings import nvjitlink + from cuda.bindings import nvjitlink as _nvjitlink + + nvjitlink = _nvjitlink return ("nvJitLink", str(nvjitlink.version())) From a2a17d12e58c94f0d149643d16baac8c0697cea3 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 12 May 2026 13:45:47 -0400 Subject: [PATCH 3/6] Add new .pyi files --- cuda_core/cuda/core/_context.pyi | 86 + cuda_core/cuda/core/_device.pyi | 913 ++++++++ cuda_core/cuda/core/_device_resources.pyi | 147 ++ cuda_core/cuda/core/_dlpack.pyi | 24 + cuda_core/cuda/core/_event.pyi | 179 ++ cuda_core/cuda/core/_graphics.pyi | 224 ++ cuda_core/cuda/core/_kernel_arg_handler.pyi | 16 + cuda_core/cuda/core/_launch_config.pyi | 80 + cuda_core/cuda/core/_launcher.pyi | 30 + cuda_core/cuda/core/_layout.pyi | 581 +++++ cuda_core/cuda/core/_linker.pyi | 249 +++ cuda_core/cuda/core/_memory/_buffer.pyi | 292 +++ .../core/_memory/_device_memory_resource.pyi | 225 ++ .../core/_memory/_graph_memory_resource.pyi | 119 ++ cuda_core/cuda/core/_memory/_ipc.pyi | 86 + .../core/_memory/_managed_memory_resource.pyi | 108 + cuda_core/cuda/core/_memory/_memory_pool.pyi | 127 ++ .../cuda/core/_memory/_peer_access_utils.pyi | 138 ++ .../core/_memory/_pinned_memory_resource.pyi | 148 ++ cuda_core/cuda/core/_memoryview.pyi | 305 +++ cuda_core/cuda/core/_module.pyi | 489 +++++ 
cuda_core/cuda/core/_program.pyi | 440 ++++ cuda_core/cuda/core/_resource_handles.pyi | 22 + cuda_core/cuda/core/_stream.pyi | 229 ++ cuda_core/cuda/core/_tensor_bridge.pyi | 82 + cuda_core/cuda/core/_tensor_map.pyi | 335 +++ cuda_core/cuda/core/_utils/cuda_utils.pyi | 144 ++ cuda_core/cuda/core/_utils/version.pyi | 14 + .../cuda/core/graph/_adjacency_set_proxy.pyi | 59 + cuda_core/cuda/core/graph/_graph_builder.pyi | 461 ++++ .../cuda/core/graph/_graph_definition.pyi | 238 +++ cuda_core/cuda/core/graph/_graph_node.pyi | 376 ++++ cuda_core/cuda/core/graph/_subclasses.pyi | 339 +++ cuda_core/cuda/core/graph/_utils.pyi | 3 + cuda_core/cuda/core/system/_device.pyi | 1900 +++++++++++++++++ cuda_core/cuda/core/system/_nvml_context.pyi | 33 + cuda_core/cuda/core/system/_system.pyi | 75 + cuda_core/cuda/core/system/_system_events.pyi | 133 ++ 38 files changed, 9449 insertions(+) create mode 100644 cuda_core/cuda/core/_context.pyi create mode 100644 cuda_core/cuda/core/_device.pyi create mode 100644 cuda_core/cuda/core/_device_resources.pyi create mode 100644 cuda_core/cuda/core/_dlpack.pyi create mode 100644 cuda_core/cuda/core/_event.pyi create mode 100644 cuda_core/cuda/core/_graphics.pyi create mode 100644 cuda_core/cuda/core/_kernel_arg_handler.pyi create mode 100644 cuda_core/cuda/core/_launch_config.pyi create mode 100644 cuda_core/cuda/core/_launcher.pyi create mode 100644 cuda_core/cuda/core/_layout.pyi create mode 100644 cuda_core/cuda/core/_linker.pyi create mode 100644 cuda_core/cuda/core/_memory/_buffer.pyi create mode 100644 cuda_core/cuda/core/_memory/_device_memory_resource.pyi create mode 100644 cuda_core/cuda/core/_memory/_graph_memory_resource.pyi create mode 100644 cuda_core/cuda/core/_memory/_ipc.pyi create mode 100644 cuda_core/cuda/core/_memory/_managed_memory_resource.pyi create mode 100644 cuda_core/cuda/core/_memory/_memory_pool.pyi create mode 100644 cuda_core/cuda/core/_memory/_peer_access_utils.pyi create mode 100644 
cuda_core/cuda/core/_memory/_pinned_memory_resource.pyi create mode 100644 cuda_core/cuda/core/_memoryview.pyi create mode 100644 cuda_core/cuda/core/_module.pyi create mode 100644 cuda_core/cuda/core/_program.pyi create mode 100644 cuda_core/cuda/core/_resource_handles.pyi create mode 100644 cuda_core/cuda/core/_stream.pyi create mode 100644 cuda_core/cuda/core/_tensor_bridge.pyi create mode 100644 cuda_core/cuda/core/_tensor_map.pyi create mode 100644 cuda_core/cuda/core/_utils/cuda_utils.pyi create mode 100644 cuda_core/cuda/core/_utils/version.pyi create mode 100644 cuda_core/cuda/core/graph/_adjacency_set_proxy.pyi create mode 100644 cuda_core/cuda/core/graph/_graph_builder.pyi create mode 100644 cuda_core/cuda/core/graph/_graph_definition.pyi create mode 100644 cuda_core/cuda/core/graph/_graph_node.pyi create mode 100644 cuda_core/cuda/core/graph/_subclasses.pyi create mode 100644 cuda_core/cuda/core/graph/_utils.pyi create mode 100644 cuda_core/cuda/core/system/_device.pyi create mode 100644 cuda_core/cuda/core/system/_nvml_context.pyi create mode 100644 cuda_core/cuda/core/system/_system.pyi create mode 100644 cuda_core/cuda/core/system/_system_events.pyi diff --git a/cuda_core/cuda/core/_context.pyi b/cuda_core/cuda/core/_context.pyi new file mode 100644 index 00000000000..cd52a055bfe --- /dev/null +++ b/cuda_core/cuda/core/_context.pyi @@ -0,0 +1,86 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_context.pyx + +from __future__ import annotations + +from collections.abc import Sequence +from dataclasses import dataclass + +from cuda.core._device_resources import (DeviceResources, SMResource, + WorkqueueResource) +from cuda.core._stream import StreamOptions + + +class Context: + """CUDA context wrapper. + + Context objects represent CUDA contexts and cannot be instantiated directly. + Use Device or Stream APIs to obtain context objects. 
+ """ + + def close(self): + """Release this context wrapper's underlying CUDA handles.""" + + def __init__(self, *args, **kwargs): + ... + + @property + def handle(self): + """Return the underlying CUcontext handle.""" + + @property + def _handle(self): + ... + + @property + def is_green(self) -> bool: + """True if this context was created from device resources.""" + + @property + def resources(self) -> DeviceResources: + """Query the hardware resources provisioned for this context. + + For green contexts, returns the resources this context was created + with (SM partition, workqueue config). For primary contexts, returns + the full device resources. + + Raises :class:`RuntimeError` if the context has been closed. + """ + + def create_stream(self, options: StreamOptions | None=None): + """Create a new stream bound to this green context. + + This method is only available on green contexts. For primary + contexts, use :meth:`Device.create_stream` instead. + + Parameters + ---------- + options : :obj:`~_stream.StreamOptions`, optional + Customizable dataclass for stream creation options. + + Returns + ------- + :obj:`~_stream.Stream` + Newly created stream object. + """ + + def __eq__(self, other): + ... + + def __hash__(self) -> int: + ... + + def __repr__(self) -> str: + ... + +@dataclass +class ContextOptions: + """Options for context creation. + + Attributes + ---------- + resources : :obj:`~cuda.core.typing.DeviceResourcesType` + Device resources used to create a green context. 
+ """ + resources: DeviceResourcesType +__all__ = ['Context', 'ContextOptions'] +DeviceResourcesType = Sequence[SMResource | WorkqueueResource] \ No newline at end of file diff --git a/cuda_core/cuda/core/_device.pyi b/cuda_core/cuda/core/_device.pyi new file mode 100644 index 00000000000..dcbe2694e3f --- /dev/null +++ b/cuda_core/cuda/core/_device.pyi @@ -0,0 +1,913 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_device.pyx + +from __future__ import annotations + +import threading + +import cuda.core.system +from cuda.core._context import Context, ContextOptions +from cuda.core._device_resources import DeviceResources +from cuda.core._event import Event, EventOptions +from cuda.core._memory._buffer import Buffer, MemoryResource +from cuda.core._stream import IsStreamType, Stream, StreamOptions +from cuda.core._utils.cuda_utils import ComputeCapability +from cuda.core.graph import GraphBuilder + + +class DeviceProperties: + """ + A class to query various attributes of a CUDA device. + + Attributes are read-only and provide information about the device. + """ + + def __init__(self, *args, **kwargs): + ... + + @classmethod + def _init(cls, handle): + ... 
+ + @property + def max_threads_per_block(self) -> int: + """int: Maximum number of threads per block.""" + + @property + def max_block_dim_x(self) -> int: + """int: Maximum block dimension X.""" + + @property + def max_block_dim_y(self) -> int: + """int: Maximum block dimension Y.""" + + @property + def max_block_dim_z(self) -> int: + """int: Maximum block dimension Z.""" + + @property + def max_grid_dim_x(self) -> int: + """int: Maximum grid dimension X.""" + + @property + def max_grid_dim_y(self) -> int: + """int: Maximum grid dimension Y.""" + + @property + def max_grid_dim_z(self) -> int: + """int: Maximum grid dimension Z.""" + + @property + def max_shared_memory_per_block(self) -> int: + """int: Maximum shared memory available per block in bytes.""" + + @property + def total_constant_memory(self) -> int: + """int: Memory available on device for constant variables in a CUDA C kernel in bytes.""" + + @property + def warp_size(self) -> int: + """int: Warp size in threads.""" + + @property + def max_pitch(self) -> int: + """int: Maximum pitch in bytes allowed by memory copies.""" + + @property + def maximum_texture1d_width(self) -> int: + """int: Maximum 1D texture width.""" + + @property + def maximum_texture1d_linear_width(self) -> int: + """int: Maximum width for a 1D texture bound to linear memory.""" + + @property + def maximum_texture1d_mipmapped_width(self) -> int: + """int: Maximum mipmapped 1D texture width.""" + + @property + def maximum_texture2d_width(self) -> int: + """int: Maximum 2D texture width.""" + + @property + def maximum_texture2d_height(self) -> int: + """int: Maximum 2D texture height.""" + + @property + def maximum_texture2d_linear_width(self) -> int: + """int: Maximum width for a 2D texture bound to linear memory.""" + + @property + def maximum_texture2d_linear_height(self) -> int: + """int: Maximum height for a 2D texture bound to linear memory.""" + + @property + def maximum_texture2d_linear_pitch(self) -> int: + """int: Maximum pitch 
in bytes for a 2D texture bound to linear memory.""" + + @property + def maximum_texture2d_mipmapped_width(self) -> int: + """int: Maximum mipmapped 2D texture width.""" + + @property + def maximum_texture2d_mipmapped_height(self) -> int: + """int: Maximum mipmapped 2D texture height.""" + + @property + def maximum_texture3d_width(self) -> int: + """int: Maximum 3D texture width.""" + + @property + def maximum_texture3d_height(self) -> int: + """int: Maximum 3D texture height.""" + + @property + def maximum_texture3d_depth(self) -> int: + """int: Maximum 3D texture depth.""" + + @property + def maximum_texture3d_width_alternate(self) -> int: + """int: Alternate maximum 3D texture width, 0 if no alternate maximum 3D texture size is supported.""" + + @property + def maximum_texture3d_height_alternate(self) -> int: + """int: Alternate maximum 3D texture height, 0 if no alternate maximum 3D texture size is supported.""" + + @property + def maximum_texture3d_depth_alternate(self) -> int: + """int: Alternate maximum 3D texture depth, 0 if no alternate maximum 3D texture size is supported.""" + + @property + def maximum_texturecubemap_width(self) -> int: + """int: Maximum cubemap texture width or height.""" + + @property + def maximum_texture1d_layered_width(self) -> int: + """int: Maximum 1D layered texture width.""" + + @property + def maximum_texture1d_layered_layers(self) -> int: + """int: Maximum layers in a 1D layered texture.""" + + @property + def maximum_texture2d_layered_width(self) -> int: + """int: Maximum 2D layered texture width.""" + + @property + def maximum_texture2d_layered_height(self) -> int: + """int: Maximum 2D layered texture height.""" + + @property + def maximum_texture2d_layered_layers(self) -> int: + """int: Maximum layers in a 2D layered texture.""" + + @property + def maximum_texturecubemap_layered_width(self) -> int: + """int: Maximum cubemap layered texture width or height.""" + + @property + def maximum_texturecubemap_layered_layers(self) 
-> int: + """int: Maximum layers in a cubemap layered texture.""" + + @property + def maximum_surface1d_width(self) -> int: + """int: Maximum 1D surface width.""" + + @property + def maximum_surface2d_width(self) -> int: + """int: Maximum 2D surface width.""" + + @property + def maximum_surface2d_height(self) -> int: + """int: Maximum 2D surface height.""" + + @property + def maximum_surface3d_width(self) -> int: + """int: Maximum 3D surface width.""" + + @property + def maximum_surface3d_height(self) -> int: + """int: Maximum 3D surface height.""" + + @property + def maximum_surface3d_depth(self) -> int: + """int: Maximum 3D surface depth.""" + + @property + def maximum_surface1d_layered_width(self) -> int: + """int: Maximum 1D layered surface width.""" + + @property + def maximum_surface1d_layered_layers(self) -> int: + """int: Maximum layers in a 1D layered surface.""" + + @property + def maximum_surface2d_layered_width(self) -> int: + """int: Maximum 2D layered surface width.""" + + @property + def maximum_surface2d_layered_height(self) -> int: + """int: Maximum 2D layered surface height.""" + + @property + def maximum_surface2d_layered_layers(self) -> int: + """int: Maximum layers in a 2D layered surface.""" + + @property + def maximum_surfacecubemap_width(self) -> int: + """int: Maximum cubemap surface width.""" + + @property + def maximum_surfacecubemap_layered_width(self) -> int: + """int: Maximum cubemap layered surface width.""" + + @property + def maximum_surfacecubemap_layered_layers(self) -> int: + """int: Maximum layers in a cubemap layered surface.""" + + @property + def max_registers_per_block(self) -> int: + """int: Maximum number of 32-bit registers available to a thread block.""" + + @property + def clock_rate(self) -> int: + """int: Typical clock frequency in kilohertz.""" + + @property + def texture_alignment(self) -> int: + """int: Alignment requirement for textures.""" + + @property + def texture_pitch_alignment(self) -> int: + """int: Pitch 
alignment requirement for textures.""" + + @property + def gpu_overlap(self) -> bool: + """bool: Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use :attr:`~DeviceProperties.async_engine_count` instead.""" + + @property + def multiprocessor_count(self) -> int: + """int: Number of multiprocessors on device.""" + + @property + def kernel_exec_timeout(self) -> bool: + """bool: Specifies whether there is a run time limit on kernels.""" + + @property + def integrated(self) -> bool: + """bool: Device is integrated with host memory.""" + + @property + def can_map_host_memory(self) -> bool: + """bool: Device can map host memory into CUDA address space.""" + + @property + def compute_mode(self) -> int: + """int: Compute mode (See CUcomputemode for details).""" + + @property + def concurrent_kernels(self) -> bool: + """bool: Device can possibly execute multiple kernels concurrently.""" + + @property + def ecc_enabled(self) -> bool: + """bool: Device has ECC support enabled.""" + + @property + def pci_bus_id(self) -> int: + """int: PCI bus ID of the device.""" + + @property + def pci_device_id(self) -> int: + """int: PCI device ID of the device.""" + + @property + def pci_domain_id(self) -> int: + """int: PCI domain ID of the device.""" + + @property + def tcc_driver(self) -> bool: + """bool: Device is using TCC driver model.""" + + @property + def memory_clock_rate(self) -> int: + """int: Peak memory clock frequency in kilohertz.""" + + @property + def global_memory_bus_width(self) -> int: + """int: Global memory bus width in bits.""" + + @property + def l2_cache_size(self) -> int: + """int: Size of L2 cache in bytes.""" + + @property + def max_threads_per_multiprocessor(self) -> int: + """int: Maximum resident threads per multiprocessor.""" + + @property + def unified_addressing(self) -> bool: + """bool: Device shares a unified address space with the host.""" + + @property + def compute_capability_major(self) -> int: + """int: Major compute 
capability version number.""" + + @property + def compute_capability_minor(self) -> int: + """int: Minor compute capability version number.""" + + @property + def global_l1_cache_supported(self) -> bool: + """bool: Device supports caching globals in L1.""" + + @property + def local_l1_cache_supported(self) -> bool: + """bool: Device supports caching locals in L1.""" + + @property + def max_shared_memory_per_multiprocessor(self) -> int: + """int: Maximum shared memory available per multiprocessor in bytes.""" + + @property + def max_registers_per_multiprocessor(self) -> int: + """int: Maximum number of 32-bit registers available per multiprocessor.""" + + @property + def managed_memory(self) -> bool: + """bool: Device can allocate managed memory on this system.""" + + @property + def multi_gpu_board(self) -> bool: + """bool: Device is on a multi-GPU board.""" + + @property + def multi_gpu_board_group_id(self) -> int: + """int: Unique id for a group of devices on the same multi-GPU board.""" + + @property + def host_native_atomic_supported(self) -> bool: + """bool: Link between the device and the host supports all native atomic operations.""" + + @property + def single_to_double_precision_perf_ratio(self) -> int: + """int: Ratio of single precision performance (in floating-point operations per second) to double precision performance.""" + + @property + def pageable_memory_access(self) -> bool: + """bool: Device supports coherently accessing pageable memory without calling cudaHostRegister on it.""" + + @property + def concurrent_managed_access(self) -> bool: + """bool: Device can coherently access managed memory concurrently with the CPU.""" + + @property + def compute_preemption_supported(self) -> bool: + """bool: Device supports compute preemption.""" + + @property + def can_use_host_pointer_for_registered_mem(self) -> bool: + """bool: Device can access host registered memory at the same virtual address as the CPU.""" + + @property + def cooperative_launch(self) -> 
bool: + """bool: Device supports launching cooperative kernels via cuLaunchCooperativeKernel.""" + + @property + def max_shared_memory_per_block_optin(self) -> int: + """int: Maximum optin shared memory per block.""" + + @property + def pageable_memory_access_uses_host_page_tables(self) -> bool: + """bool: Device accesses pageable memory via the host's page tables.""" + + @property + def direct_managed_mem_access_from_host(self) -> bool: + """bool: The host can directly access managed memory on the device without migration.""" + + @property + def virtual_memory_management_supported(self) -> bool: + """bool: Device supports virtual memory management APIs like cuMemAddressReserve, cuMemCreate, cuMemMap and related APIs.""" + + @property + def handle_type_posix_file_descriptor_supported(self) -> bool: + """bool: Device supports exporting memory to a posix file descriptor with cuMemExportToShareableHandle, if requested via cuMemCreate.""" + + @property + def handle_type_win32_handle_supported(self) -> bool: + """bool: Device supports exporting memory to a Win32 NT handle with cuMemExportToShareableHandle, if requested via cuMemCreate.""" + + @property + def handle_type_win32_kmt_handle_supported(self) -> bool: + """bool: Device supports exporting memory to a Win32 KMT handle with cuMemExportToShareableHandle, if requested via cuMemCreate.""" + + @property + def max_blocks_per_multiprocessor(self) -> int: + """int: Maximum number of blocks per multiprocessor.""" + + @property + def generic_compression_supported(self) -> bool: + """bool: Device supports compression of memory.""" + + @property + def max_persisting_l2_cache_size(self) -> int: + """int: Maximum L2 persisting lines capacity setting in bytes.""" + + @property + def max_access_policy_window_size(self) -> int: + """int: Maximum value of CUaccessPolicyWindow.num_bytes.""" + + @property + def gpu_direct_rdma_with_cuda_vmm_supported(self) -> bool: + """bool: Device supports specifying the GPUDirect RDMA flag with 
cuMemCreate.""" + + @property + def reserved_shared_memory_per_block(self) -> int: + """int: Shared memory reserved by CUDA driver per block in bytes.""" + + @property + def sparse_cuda_array_supported(self) -> bool: + """bool: Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays.""" + + @property + def read_only_host_register_supported(self) -> bool: + """bool: True if device supports using the cuMemHostRegister flag CU_MEMHOSTREGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU, False if not.""" + + @property + def memory_pools_supported(self) -> bool: + """bool: Device supports using the cuMemAllocAsync and cuMemPool family of APIs.""" + + @property + def gpu_direct_rdma_supported(self) -> bool: + """bool: Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information).""" + + @property + def gpu_direct_rdma_flush_writes_options(self) -> int: + """int: The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the CUflushGPUDirectRDMAWritesOptions enum.""" + + @property + def gpu_direct_rdma_writes_ordering(self) -> int: + """int: GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. 
See CUGPUDirectRDMAWritesOrdering for the numerical values returned here.""" + + @property + def mempool_supported_handle_types(self) -> int: + """int: Handle types supported with mempool based IPC.""" + + @property + def deferred_mapping_cuda_array_supported(self) -> bool: + """bool: Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays.""" + + @property + def numa_config(self) -> int: + """int: NUMA configuration of a device: value is of type CUdeviceNumaConfig enum.""" + + @property + def numa_id(self) -> int: + """int: NUMA node ID of the GPU memory.""" + + @property + def multicast_supported(self) -> bool: + """bool: Device supports switch multicast and reduction operations.""" + + @property + def surface_alignment(self) -> int: + """int: Surface alignment requirement in bytes.""" + + @property + def async_engine_count(self) -> int: + """int: Number of asynchronous engines.""" + + @property + def can_tex2d_gather(self) -> bool: + """bool: True if device supports 2D texture gather operations, False if not.""" + + @property + def maximum_texture2d_gather_width(self) -> int: + """int: Maximum 2D texture gather width.""" + + @property + def maximum_texture2d_gather_height(self) -> int: + """int: Maximum 2D texture gather height.""" + + @property + def stream_priorities_supported(self) -> bool: + """bool: True if device supports stream priorities, False if not.""" + + @property + def can_flush_remote_writes(self) -> bool: + """bool: The CU_STREAM_WAIT_VALUE_FLUSH flag and the CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. 
See Stream Memory Operations for additional details.""" + + @property + def host_register_supported(self) -> bool: + """bool: Device supports host memory registration via cudaHostRegister.""" + + @property + def timeline_semaphore_interop_supported(self) -> bool: + """bool: External timeline semaphore interop is supported on the device.""" + + @property + def cluster_launch(self) -> bool: + """bool: Indicates device supports cluster launch.""" + + @property + def can_use_64_bit_stream_mem_ops(self) -> bool: + """bool: 64-bit operations are supported in cuStreamBatchMemOp and related MemOp APIs.""" + + @property + def can_use_stream_wait_value_nor(self) -> bool: + """bool: CU_STREAM_WAIT_VALUE_NOR is supported by MemOp APIs.""" + + @property + def dma_buf_supported(self) -> bool: + """bool: Device supports buffer sharing with dma_buf mechanism.""" + + @property + def ipc_event_supported(self) -> bool: + """bool: Device supports IPC Events.""" + + @property + def mem_sync_domain_count(self) -> int: + """int: Number of memory domains the device supports.""" + + @property + def tensor_map_access_supported(self) -> bool: + """bool: Device supports accessing memory using Tensor Map.""" + + @property + def handle_type_fabric_supported(self) -> bool: + """bool: Device supports exporting memory to a fabric handle with cuMemExportToShareableHandle() or requested with cuMemCreate().""" + + @property + def unified_function_pointers(self) -> bool: + """bool: Device supports unified function pointers.""" + + @property + def mps_enabled(self) -> bool: + """bool: Indicates if contexts created on this device will be shared via MPS.""" + + @property + def host_numa_id(self) -> int: + """int: NUMA ID of the host node closest to the device. 
Returns -1 when system does not support NUMA.""" + + @property + def d3d12_cig_supported(self) -> bool: + """bool: Device supports CIG with D3D12.""" + + @property + def mem_decompress_algorithm_mask(self) -> int: + """int: The returned value shall be interpreted as a bitmask, where the individual bits are described by the CUmemDecompressAlgorithm enum.""" + + @property + def mem_decompress_maximum_length(self) -> int: + """int: The returned value is the maximum length in bytes of a single decompress operation that is allowed.""" + + @property + def vulkan_cig_supported(self) -> bool: + """bool: Device supports CIG with Vulkan.""" + + @property + def gpu_pci_device_id(self) -> int: + """int: The combined 16-bit PCI device ID and 16-bit PCI vendor ID. + + Returns 0 if the driver does not support this query. + """ + + @property + def gpu_pci_subsystem_id(self) -> int: + """int: The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem vendor ID. + + Returns 0 if the driver does not support this query. 
+ """ + + @property + def host_numa_virtual_memory_management_supported(self) -> bool: + """bool: Device supports HOST_NUMA location with the virtual memory management APIs like cuMemCreate, cuMemMap and related APIs.""" + + @property + def host_numa_memory_pools_supported(self) -> bool: + """bool: Device supports HOST_NUMA location with the cuMemAllocAsync and cuMemPool family of APIs.""" + + @property + def host_numa_multinode_ipc_supported(self) -> bool: + """bool: Device supports HOST_NUMA location IPC between nodes in a multi-node system.""" + + @property + def host_memory_pools_supported(self) -> bool: + """bool: Device supports HOST location with the cuMemAllocAsync and cuMemPool family of APIs.""" + + @property + def host_virtual_memory_management_supported(self) -> bool: + """bool: Device supports HOST location with the virtual memory management APIs like cuMemCreate, cuMemMap and related APIs.""" + + @property + def host_alloc_dma_buf_supported(self) -> bool: + """bool: Device supports page-locked host memory buffer sharing with dma_buf mechanism.""" + + @property + def only_partial_host_native_atomic_supported(self) -> bool: + """bool: Link between the device and the host supports only some native atomic operations.""" + +class Device: + """Represent a GPU and act as an entry point for cuda.core features. + + This is a singleton object that helps ensure interoperability + across multiple libraries imported in the process to both see + and use the same GPU device. + + While acting as the entry point, many other CUDA resources can be + allocated such as streams and buffers. Any :obj:`~_context.Context` dependent + resource created through this device, will continue to refer to + this device's context. + + Newly returned :obj:`~_device.Device` objects are thread-local singletons + for a specified device. + + Note + ---- + Will not initialize the GPU. 
+
+    Parameters
+    ----------
+    device_id : int, optional
+        Device ordinal to return a :obj:`~_device.Device` object for.
+        Default value of `None` returns the currently used device.
+
+    """
+    __slots__ = ('_device_id', '_memory_resource', '_has_inited', '_properties', '_resources', '_uuid', '_context', '__weakref__')
+
+    def __new__(cls, device_id: Device | int | None=None):
+        ...
+
+    def _check_context_initialized(self):
+        ...
+
+    @classmethod
+    def get_all_devices(cls):
+        """
+        Query the available device instances.
+
+        Returns
+        -------
+        tuple of Device
+            A tuple containing instances of available devices.
+        """
+
+    def to_system_device(self) -> 'cuda.core.system.Device':
+        """
+        Get the corresponding :class:`cuda.core.system.Device` (which is used
+        for NVIDIA Management Library (NVML) access) for this
+        :class:`cuda.core.Device` (which is used for CUDA access).
+
+        The devices are mapped to one another by their UUID.
+
+        Returns
+        -------
+        cuda.core.system.Device
+            The corresponding system-level device instance used for NVML access.
+        """
+
+    @property
+    def device_id(self) -> int:
+        """Return device ordinal."""
+
+    @property
+    def pci_bus_id(self) -> str:
+        """Return a PCI Bus Id string for this device."""
+
+    def can_access_peer(self, peer: Device | int) -> bool:
+        """Check if this device can access memory from the specified peer device.
+
+        Queries whether peer-to-peer memory access is supported between this
+        device and the specified peer device.
+
+        Parameters
+        ----------
+        peer : Device | int
+            The peer device to check accessibility to. Can be a :obj:`~_device.Device` object or device ID.
+        """
+
+    @property
+    def uuid(self) -> str:
+        """Return a UUID for the device.
+
+        Returns 16-octets identifying the device. If the device is in
+        MIG mode, returns its MIG UUID which uniquely identifies the
+        subscribed MIG compute instance.
+
+        Note
+        ----
+        MIG UUID is only returned when device is in MIG mode and the
+        driver is older than CUDA 11.4.
+ + The UUID is cached after first access to avoid repeated CUDA API calls. + + """ + + @property + def name(self) -> str: + """Return the device name.""" + + @property + def properties(self) -> DeviceProperties: + """Return a :obj:`~_device.DeviceProperties` class with information about the device.""" + + @property + def resources(self) -> DeviceResources: + """Return the hardware resource query namespace for this device.""" + + @property + def compute_capability(self) -> ComputeCapability: + """Return a named tuple with 2 fields: major and minor.""" + + @property + def arch(self) -> str: + """Return compute capability as a string (e.g., '75' for CC 7.5).""" + + @property + def context(self) -> Context: + """Return the :obj:`~_context.Context` associated with this device. + + Note + ---- + Device must be initialized. + + """ + + @property + def memory_resource(self) -> MemoryResource: + """Return :obj:`~_memory.MemoryResource` associated with this device.""" + + @memory_resource.setter + def memory_resource(self, mr): + ... + + @property + def default_stream(self) -> Stream: + """Return default CUDA :obj:`~_stream.Stream` associated with this device. + + The type of default stream returned depends on if the environment + variable CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM is set. + + If set, returns a per-thread default stream. Otherwise returns + the legacy stream. + + """ + + def __int__(self): + """Return device_id.""" + + def __repr__(self): + ... + + def __hash__(self) -> int: + ... + + def __eq__(self, other) -> bool: + ... + + def __reduce__(self): + ... + + def set_current(self, ctx: Context | None=None) -> Context | None: + """Set device to be used for GPU executions. + + Initializes CUDA and sets the calling thread to a valid CUDA + context. By default the primary context is used, but optional `ctx` + parameter can be used to explicitly supply a :obj:`~_context.Context` object. 
+ + Providing a `ctx` causes the previous set context to be popped and returned. + + Parameters + ---------- + ctx : :obj:`~_context.Context`, optional + Optional context to push onto this device's current thread stack. + + Returns + ------- + :obj:`~_context.Context`, optional + Popped context. + + Examples + -------- + Acts as an entry point of this object. Users always start a code by + calling this method, e.g. + + >>> from cuda.core import Device + >>> dev0 = Device(0) + >>> dev0.set_current() + >>> # ... do work on device 0 ... + + """ + + def create_context(self, options: ContextOptions | None=None) -> Context: + """Create a new :obj:`~_context.Context` object. + + Note + ---- + The newly created context will not be set as current. + + Parameters + ---------- + options : :obj:`~_context.ContextOptions`, optional + Customizable dataclass for context creation options. + + Returns + ------- + :obj:`~_context.Context` + Newly created context object. + + """ + + def create_stream(self, obj: IsStreamType | None=None, options: StreamOptions | None=None) -> Stream: + """Create a :obj:`~_stream.Stream` object. + + New stream objects can be created in two different ways: + + 1) Create a new CUDA stream with customizable ``options``. + 2) Wrap an existing foreign `obj` supporting the ``__cuda_stream__`` protocol. + + Option (2) internally holds a reference to the foreign object + such that the lifetime is managed. + + Note + ---- + Device must be initialized. + + Parameters + ---------- + obj : :obj:`~_stream.IsStreamType`, optional + Any object supporting the ``__cuda_stream__`` protocol. + options : :obj:`~_stream.StreamOptions`, optional + Customizable dataclass for stream creation options. + + Returns + ------- + :obj:`~_stream.Stream` + Newly created stream object. + + """ + + def create_event(self, options: EventOptions | None=None) -> Event: + """Create an :obj:`~_event.Event` object without recording it to a :obj:`~_stream.Stream`. 
+ + Note + ---- + Device must be initialized. + + Parameters + ---------- + options : :obj:`EventOptions`, optional + Customizable dataclass for event creation options. + + Returns + ------- + :obj:`~_event.Event` + Newly created event object. + + """ + + def allocate(self, size, *, stream: Stream | GraphBuilder) -> Buffer: + """Allocate device memory from a specified stream. + + Allocates device memory of `size` bytes on the specified `stream` + using the memory resource currently associated with this Device. + + Note + ---- + Device must be initialized. + + Parameters + ---------- + size : int + Number of bytes to allocate. + stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder` + Keyword-only. The stream establishing the stream ordering semantic. + Must be passed explicitly; pass ``self.default_stream`` to use + the default stream. + + Returns + ------- + :obj:`~_memory.Buffer` + Newly created buffer object. + + """ + + def sync(self): + """Synchronize the device. + + Note + ---- + Device must be initialized. + + """ + + def create_graph_builder(self) -> GraphBuilder: + """Create a new :obj:`~graph.GraphBuilder` object. + + Returns + ------- + :obj:`~graph.GraphBuilder` + Newly created graph builder object. + + """ +_tls = threading.local() +_lock = threading.Lock() \ No newline at end of file diff --git a/cuda_core/cuda/core/_device_resources.pyi b/cuda_core/cuda/core/_device_resources.pyi new file mode 100644 index 00000000000..0e9846d8a42 --- /dev/null +++ b/cuda_core/cuda/core/_device_resources.pyi @@ -0,0 +1,147 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_device_resources.pyx + +from __future__ import annotations + +from collections.abc import Sequence as SequenceABC +from dataclasses import dataclass + + +@dataclass +class SMResourceOptions: + """Customizable :obj:`SMResource.split` options. + + Each field accepts a scalar (for a single group) or a ``Sequence`` + (for multiple groups). 
``count`` drives the number of groups; other + ``Sequence`` fields must match its length. + + Attributes + ---------- + count : int or Sequence[int], optional + Requested SM count per group. ``None`` means discovery mode + (auto-detect). (Default to ``None``) + coscheduled_sm_count : int or Sequence[int], optional + Minimum number of SMs guaranteed to be co-scheduled in each + group. (Default to ``None``) + preferred_coscheduled_sm_count : int or Sequence[int], optional + Preferred co-scheduled SM count; the driver tries to satisfy + this but may fall back to ``coscheduled_sm_count``. + (Default to ``None``) + backfill : bool or Sequence[bool], optional + If ``True``, allow the driver to relax the co-scheduling + constraint when assigning SMs. This enables requesting + arbitrary aligned SM counts that the driver would otherwise + reject due to hardware topology constraints. + (Default to ``False``) + """ + count: int | SequenceABC | None = None + coscheduled_sm_count: int | SequenceABC | None = None + preferred_coscheduled_sm_count: int | SequenceABC | None = None + backfill: bool | SequenceABC = False + +@dataclass +class WorkqueueResourceOptions: + """Customizable :obj:`WorkqueueResource.configure` options. + + Attributes + ---------- + sharing_scope : str, optional + Workqueue sharing scope. Accepted values: ``"device_ctx"`` + or ``"green_ctx_balanced"``. (Default to ``None``) + """ + sharing_scope: str | None = None + +class SMResource: + """Represent an SM (streaming multiprocessor) resource partition. + + Instances are returned by :obj:`DeviceResources.sm` or + :meth:`SMResource.split` and cannot be instantiated directly. + """ + + def __init__(self, *args, **kwargs): + ... 
+ + @property + def handle(self) -> int: + """Return the address of the underlying ``CUdevResource`` struct.""" + + @property + def sm_count(self) -> int: + """Total SMs available in this resource.""" + + @property + def min_partition_size(self) -> int: + """Minimum SM count required to create a partition.""" + + @property + def coscheduled_alignment(self) -> int: + """Number of SMs guaranteed to be co-scheduled.""" + + @property + def flags(self) -> int: + """Raw flags from the underlying SM resource.""" + + def split(self, options, *, dry_run: bool=False): + """Split this SM resource into groups and a remainder. + + Parameters + ---------- + options : :obj:`SMResourceOptions` + Split configuration (count, co-scheduling constraints). + dry_run : bool, optional + If ``True``, return filled-in metadata without creating + usable resource objects. (Default to ``False``) + + Returns + ------- + tuple[list[:obj:`SMResource`], :obj:`SMResource`] + ``(groups, remainder)`` where each group holds a disjoint + SM partition and *remainder* holds any unassigned SMs. + """ + +class WorkqueueResource: + """Represent a workqueue resource for a device or green context. + + Merges ``CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG`` and + ``CU_DEV_RESOURCE_TYPE_WORKQUEUE`` under one user-facing type. + Instances are returned by :obj:`DeviceResources.workqueue` and + cannot be instantiated directly. + """ + + def __init__(self, *args, **kwargs): + ... + + @property + def handle(self) -> int: + """Return the address of the underlying config ``CUdevResource`` struct.""" + + def configure(self, options): + """Configure the workqueue resource in place. + + Parameters + ---------- + options : :obj:`WorkqueueResourceOptions` + Configuration options (sharing scope, etc.). + """ + +class DeviceResources: + """Namespace for hardware resource queries. + + When obtained via :obj:`Device.resources`, queries return full device + resources. 
When obtained via :obj:`Context.resources` or + :obj:`Stream.resources`, queries return the resources provisioned for + that context. + + This class cannot be instantiated directly. + """ + + def __init__(self, *args, **kwargs): + ... + + @property + def sm(self) -> SMResource: + """Return the :obj:`SMResource` for this device or context.""" + + @property + def workqueue(self) -> WorkqueueResource: + """Return the :obj:`WorkqueueResource` for this device or context.""" +__all__ = ['DeviceResources', 'SMResource', 'SMResourceOptions', 'WorkqueueResource', 'WorkqueueResourceOptions'] \ No newline at end of file diff --git a/cuda_core/cuda/core/_dlpack.pyi b/cuda_core/cuda/core/_dlpack.pyi new file mode 100644 index 00000000000..e140050eff7 --- /dev/null +++ b/cuda_core/cuda/core/_dlpack.pyi @@ -0,0 +1,24 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_dlpack.pyx + +from __future__ import annotations + +from enum import IntEnum + +_DLDeviceType = int +DLDataTypeCode = int + +class DLDeviceType(IntEnum): + kDLCPU = 1 + kDLCUDA = 2 + kDLCUDAHost = 3 + kDLCUDAManaged = 13 + +def make_py_capsule(buf: object, versioned: bool) -> object: + ... + +def classify_dl_device(buf) -> tuple[int, int]: + """Classify a buffer into a DLPack (device_type, device_id) pair. + + ``buf`` must expose ``is_device_accessible``, ``is_host_accessible``, + ``is_managed``, and ``device_id`` attributes. 
+
+    """
\ No newline at end of file
diff --git a/cuda_core/cuda/core/_event.pyi b/cuda_core/cuda/core/_event.pyi
new file mode 100644
index 00000000000..995e5c2650e
--- /dev/null
+++ b/cuda_core/cuda/core/_event.pyi
@@ -0,0 +1,179 @@
+# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_event.pyx
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import cuda.bindings.driver
+import cython
+from cuda.core._context import Context
+from cuda.core._device import Device
+
+
+@dataclass
+class EventOptions:
+    """Customizable :obj:`~_event.Event` options.
+
+    Attributes
+    ----------
+    timing_enabled : bool, optional
+        Event will record timing data. (Default to False)
+    blocking_sync : bool, optional
+        If True, the event uses blocking synchronization: a CPU
+        thread that calls :meth:`Event.sync` blocks (yields) until
+        the event has completed. Otherwise (the default), the CPU
+        thread busy-waits until the event has completed.
+        (Default to False)
+    ipc_enabled : bool, optional
+        Event will be suitable for interprocess use.
+        Note that timing_enabled must be False. (Default to False)
+
+    """
+    timing_enabled: bool | None = False
+    blocking_sync: bool | None = False
+    ipc_enabled: bool | None = False
+
+class Event:
+    """Represent a record at a specific point of execution within a CUDA stream.
+
+    Applications can asynchronously record events at any point in
+    the program. An event keeps a record of all previous work within
+    the last recorded stream.
+
+    Events can be used to monitor device's progress, query completion
+    of work up to event's record, help establish dependencies
+    between GPU work submissions, and record the elapsed time (in milliseconds)
+    on GPU:
+
+    .. code-block:: python
+
+        # To create events and record the timing:
+        s = Device().create_stream()
+        e1 = Device().create_event({"timing_enabled": True})
+        e2 = Device().create_event({"timing_enabled": True})
+        s.record(e1)
+        # ... run some GPU work ...
+ s.record(e2) + e2.sync() + print(f"time = {e2 - e1} milliseconds") + + Directly creating an :obj:`~_event.Event` is not supported due to ambiguity, + and they should instead be created through a :obj:`~_stream.Stream` object. + + """ + + def close(self): + """Destroy the event. + + Releases the event handle. The underlying CUDA event is destroyed + when the last reference is released. + """ + + def __init__(self, *args, **kwargs): + ... + + def __sub__(self, other: Event): + ... + + def __hash__(self) -> int: + ... + + def __eq__(self, other) -> bool: + ... + + def __repr__(self) -> str: + ... + + @property + def ipc_descriptor(self) -> IPCEventDescriptor: + """Descriptor for sharing this event with other processes.""" + + @classmethod + def from_ipc_descriptor(cls, ipc_descriptor: IPCEventDescriptor) -> Event: + """Import an event that was exported from another process. + + Parameters + ---------- + ipc_descriptor : :obj:`~_memory._ipc.IPCEventDescriptor` + The IPC descriptor obtained from :attr:`~Event.ipc_descriptor` in + another process. + + Returns + ------- + :obj:`~_event.Event` + A new event backed by the imported IPC handle. + + """ + + @property + def is_ipc_enabled(self) -> bool: + """Return True if the event can be shared across process boundaries, otherwise False.""" + + @property + def is_timing_enabled(self) -> bool: + """Return True if the event records timing data, otherwise False.""" + + @property + def is_blocking_sync(self) -> bool: + """Return True if the event uses blocking synchronization (the CPU + thread blocks on :meth:`sync` instead of busy-waiting), otherwise False. + """ + + def sync(self): + """Synchronize until the event completes. + + If the event was created with ``blocking_sync=True``, the + calling CPU thread blocks (yields) until the event has been + completed by the device. Otherwise (the default) the CPU + thread busy-waits until the event has completed. 
+ + """ + + @property + def is_done(self) -> bool: + """Return True if all captured works have been completed, otherwise False.""" + + @property + def handle(self) -> cuda.bindings.driver.CUevent: + """Return the underlying CUevent object. + + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int(Event.handle)``. + """ + + @property + def device(self) -> Device: + """Return the :obj:`~_device.Device` singleton associated with this event. + + Note + ---- + The current context on the device may differ from this + event's context. This case occurs when a different CUDA + context is set current after a event is created. + + """ + + @property + def context(self) -> Context: + """Return the :obj:`~_context.Context` associated with this event.""" + +class IPCEventDescriptor: + """Serializable object describing an event that can be shared between processes.""" + + def __init__(self, *arg, **kwargs): + ... + + @staticmethod + def _init(reserved: bytes, is_blocking_sync: cython.bint): + ... + + def __eq__(self, rhs) -> bool: + ... + + def __reduce__(self): + ... + +def _reduce_event(event): + ... \ No newline at end of file diff --git a/cuda_core/cuda/core/_graphics.pyi b/cuda_core/cuda/core/_graphics.pyi new file mode 100644 index 00000000000..6d8c39594da --- /dev/null +++ b/cuda_core/cuda/core/_graphics.pyi @@ -0,0 +1,224 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_graphics.pyx + +from __future__ import annotations + +from cuda.bindings import cydriver +from cuda.core._memory._buffer import Buffer +from cuda.core._stream import Stream + + +class GraphicsResource: + """RAII wrapper for a CUDA graphics resource (``CUgraphicsResource``). + + A :class:`GraphicsResource` represents an OpenGL buffer or image that has + been registered for access by CUDA. This enables zero-copy sharing of GPU + data between CUDA compute kernels and graphics renderers. 
+ + Mapping the resource returns a :class:`~cuda.core.Buffer` whose lifetime + controls when the graphics resource is unmapped. This keeps stream-ordered + cleanup tied to the mapped pointer itself rather than to mutable state on + the :class:`GraphicsResource` object. + + The resource is automatically unregistered when :meth:`close` is called or + when the object is garbage collected. + + :class:`GraphicsResource` objects should not be instantiated directly. + Use the factory classmethods :meth:`from_gl_buffer` or :meth:`from_gl_image`. + + Examples + -------- + Register an OpenGL VBO, map it to get a buffer, and write to it from CUDA: + + .. code-block:: python + + resource = GraphicsResource.from_gl_buffer(vbo) + + with resource.map(stream=s) as buf: + view = StridedMemoryView.from_buffer(buf, shape=(256,), dtype=np.float32) + # view.ptr is a CUDA device pointer into the GL buffer + + Or scope registration separately from mapping: + + .. code-block:: python + + with GraphicsResource.from_gl_buffer(vbo) as resource: + with resource.map(stream=s) as buf: + # ... launch kernels using buf.handle, buf.size ... + pass + """ + + def close(self, stream=None): + """Unregister this graphics resource from CUDA. + + If the resource is currently mapped, it is unmapped first. After + closing, the resource cannot be used again. + + Parameters + ---------- + stream : :class:`~cuda.core.Stream`, optional + Optional override for the stream used to close the currently + mapped buffer, if one exists. + """ + + def __init__(self): + ... + + @classmethod + def from_gl_buffer(cls, gl_buffer: int, *, flags=None, stream=None) -> GraphicsResource: + """Register an OpenGL buffer object for CUDA access. + + Parameters + ---------- + gl_buffer : int + The OpenGL buffer name (``GLuint``) to register. + flags : str or sequence of str, optional + Registration flags specifying intended usage. 
Accepted values: + ``"none"``, ``"read_only"``, ``"write_discard"``, + ``"surface_load_store"``, ``"texture_gather"``. + Multiple flags can be combined by passing a sequence + (e.g., ``("surface_load_store", "read_only")``). + Defaults to ``None`` (no flags). + stream : :class:`~cuda.core.Stream`, optional + If provided, the resource can be used directly as a context manager + and it will be mapped on entry:: + + with GraphicsResource.from_gl_buffer(vbo, stream=s) as buf: + view = StridedMemoryView.from_buffer(buf, shape=(256,), dtype=np.float32) + + If omitted, the returned resource can still be used as a context + manager to scope registration and automatic cleanup:: + + with GraphicsResource.from_gl_buffer(vbo) as resource: + with resource.map(stream=s) as buf: + ... + + Returns + ------- + GraphicsResource + A new graphics resource wrapping the registered GL buffer. + The returned resource can be used as a context manager. If + *stream* was given, entering maps the resource and yields a + :class:`~cuda.core.Buffer`; otherwise entering yields the + :class:`GraphicsResource` itself and closes it on exit. + + Raises + ------ + CUDAError + If the registration fails (e.g., no current GL context, invalid + buffer name, or operating system error). + ValueError + If an unknown flag string is provided. + """ + + @classmethod + def from_gl_image(cls, image: int, target: int, *, flags=None) -> GraphicsResource: + """Register an OpenGL texture or renderbuffer for CUDA access. + + Parameters + ---------- + image : int + The OpenGL texture or renderbuffer name (``GLuint``) to register. + target : int + The OpenGL target type (e.g., ``GL_TEXTURE_2D``). + flags : str or sequence of str, optional + Registration flags specifying intended usage. Accepted values: + ``"none"``, ``"read_only"``, ``"write_discard"``, + ``"surface_load_store"``, ``"texture_gather"``. + Multiple flags can be combined by passing a sequence + (e.g., ``("surface_load_store", "read_only")``). 
+ Defaults to ``None`` (no flags). + + Returns + ------- + GraphicsResource + A new graphics resource wrapping the registered GL image. + + Raises + ------ + CUDAError + If the registration fails. + ValueError + If an unknown flag string is provided. + """ + + def _get_mapped_buffer(self): + ... + + def map(self, *, stream: Stream) -> Buffer: + """Map this graphics resource for CUDA access. + + After mapping, a CUDA device pointer into the underlying graphics + memory is available as a :class:`~cuda.core.Buffer`. + + Can be used as a context manager for automatic unmapping:: + + with resource.map(stream=s) as buf: + # use buf.handle, buf.size, etc. + # automatically unmapped here + + Parameters + ---------- + stream : :class:`~cuda.core.Stream` + Keyword-only. The CUDA stream on which to perform the mapping. + Must be passed explicitly; pass ``device.default_stream`` to use + the default stream. + + Returns + ------- + Buffer + A buffer whose lifetime controls when the graphics resource is + unmapped. + + Raises + ------ + RuntimeError + If the resource is already mapped or has been closed. + CUDAError + If the mapping fails. + """ + + def unmap(self, *, stream: Stream | None=None): + """Unmap this graphics resource, releasing it back to the graphics API. + + After unmapping, the :class:`~cuda.core.Buffer` previously returned + by :meth:`map` must not be used. + + Parameters + ---------- + stream : :class:`~cuda.core.Stream`, optional + If provided, overrides the stream that will be used when the + mapped buffer is closed. Otherwise the mapping stream is reused. + + Raises + ------ + RuntimeError + If the resource is not currently mapped or has been closed. + CUDAError + If the unmapping fails. + """ + + def __enter__(self): + ... + + def __exit__(self, exc_type, exc_val, exc_tb): + ... 
+ + @property + def is_mapped(self) -> bool: + """Whether the resource is currently mapped for CUDA access.""" + + @property + def handle(self) -> int: + """The raw ``CUgraphicsResource`` handle as a Python int.""" + + @property + def resource_handle(self) -> int: + """Alias for :attr:`handle`.""" + + def __repr__(self): + ... +__all__ = ['GraphicsResource'] +_REGISTER_FLAGS = {'none': cydriver.CU_GRAPHICS_REGISTER_FLAGS_NONE, 'read_only': cydriver.CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY, 'write_discard': cydriver.CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD, 'surface_load_store': cydriver.CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST, 'texture_gather': cydriver.CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER} + +def _parse_register_flags(flags): + ... \ No newline at end of file diff --git a/cuda_core/cuda/core/_kernel_arg_handler.pyi b/cuda_core/cuda/core/_kernel_arg_handler.pyi new file mode 100644 index 00000000000..d66a5465840 --- /dev/null +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyi @@ -0,0 +1,16 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_kernel_arg_handler.pyx + +from __future__ import annotations + +from libcpp.complex import complex as cpp_complex + + +class ParamHolder: + + def __init__(self, kernel_args): + ... + + def __dealloc__(self): + ... +cpp_single_complex = cpp_complex.complex +cpp_double_complex = cpp_complex.complex \ No newline at end of file diff --git a/cuda_core/cuda/core/_launch_config.pyi b/cuda_core/cuda/core/_launch_config.pyi new file mode 100644 index 00000000000..b31731af4cb --- /dev/null +++ b/cuda_core/cuda/core/_launch_config.pyi @@ -0,0 +1,80 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_launch_config.pyx + +from __future__ import annotations + + +class LaunchConfig: + """Customizable launch options. + + Note + ---- + When cluster is specified, the grid parameter represents the number of + clusters (not blocks). 
The hierarchy is: grid (clusters) -> cluster (blocks) ->
+    block (threads). Each dimension in grid specifies clusters in the grid, each dimension in
+    cluster specifies blocks per cluster, and each dimension in block specifies
+    threads per block.
+
+    Attributes
+    ----------
+    grid : Union[tuple, int]
+        Collection of threads that will execute a kernel function. When cluster
+        is not specified, this represents the number of blocks, otherwise
+        this represents the number of clusters.
+    cluster : Union[tuple, int]
+        Group of blocks (Thread Block Cluster) that will execute on the same
+        GPU Processing Cluster (GPC). Blocks within a cluster have access to
+        distributed shared memory and can be explicitly synchronized.
+    block : Union[tuple, int]
+        Group of threads (Thread Block) that will execute on the same
+        streaming multiprocessor (SM). Threads within a thread block have
+        access to shared memory and can be explicitly synchronized.
+    shmem_size : int, optional
+        Dynamic shared-memory size per thread block in bytes.
+        (Default to size 0)
+    is_cooperative : bool, optional
+        Whether this config can be used to launch a cooperative kernel.
+    """
+
+    def __init__(self, grid=None, cluster=None, block=None, shmem_size=None, is_cooperative=False):
+        """Initialize LaunchConfig with validation.
+
+        Parameters
+        ----------
+        grid : Union[tuple, int], optional
+            Grid dimensions (number of blocks or clusters if cluster is specified)
+        cluster : Union[tuple, int], optional
+            Cluster dimensions (Thread Block Cluster)
+        block : Union[tuple, int], optional
+            Block dimensions (threads per block)
+        shmem_size : int, optional
+            Dynamic shared memory size in bytes (default: 0)
+        is_cooperative : bool, optional
+            Whether to launch as cooperative kernel (default: False)
+        """
+
+    def _identity(self):
+        ...
+
+    def __repr__(self):
+        """Return string representation of LaunchConfig."""
+
+    def __eq__(self, other) -> bool:
+        ...
+
+    def __hash__(self) -> int:
+        ...
+_LAUNCH_CONFIG_ATTRS = ('grid', 'cluster', 'block', 'shmem_size', 'is_cooperative') + +def _to_native_launch_config(config: LaunchConfig) -> object: + """Convert LaunchConfig to native driver CUlaunchConfig. + + Parameters + ---------- + config : LaunchConfig + High-level launch configuration + + Returns + ------- + driver.CUlaunchConfig + Native CUDA driver launch configuration + """ \ No newline at end of file diff --git a/cuda_core/cuda/core/_launcher.pyi b/cuda_core/cuda/core/_launcher.pyi new file mode 100644 index 00000000000..ec8c927500a --- /dev/null +++ b/cuda_core/cuda/core/_launcher.pyi @@ -0,0 +1,30 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_launcher.pyx + +from __future__ import annotations + +from cuda.core._launch_config import LaunchConfig +from cuda.core._module import Kernel +from cuda.core._stream import Stream +from cuda.core.graph import GraphBuilder +from cuda.core.typing import IsStreamType + + +def launch(stream: Stream | GraphBuilder | IsStreamType, config: LaunchConfig, kernel: Kernel, *kernel_args): + """Launches a :obj:`~_module.Kernel` + object with launch-time configuration. + + Parameters + ---------- + stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder` + The stream establishing the stream ordering semantic of a + launch. + config : :obj:`LaunchConfig` + Launch configurations inline with options provided by + :obj:`~_launcher.LaunchConfig` dataclass. + kernel : :obj:`~_module.Kernel` + Kernel to launch. + *kernel_args : Any + Variable length argument list that is provided to the + launching kernel. 
+ + """ \ No newline at end of file diff --git a/cuda_core/cuda/core/_layout.pyi b/cuda_core/cuda/core/_layout.pyi new file mode 100644 index 00000000000..024c4368ccf --- /dev/null +++ b/cuda_core/cuda/core/_layout.pyi @@ -0,0 +1,581 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_layout.pyx + +from __future__ import annotations + +import cython +from libcpp import vector + +OrderFlag = int +Property = int + +@cython.final +class _StridedLayout: + """ + A class describing the layout of a multi-dimensional tensor + with a shape, strides and itemsize. + + Parameters + ---------- + shape : tuple + A tuple of non-negative integers. + strides : tuple, optional + If provided, must be a tuple of integers of the same length as ``shape``. + Otherwise, the strides are assumed to be implicitly C-contiguous and the resulting + layout's :attr:`strides` will be None. + itemsize : int + The number of bytes per single element (dtype size). + divide_strides : bool, optional + If True, the provided :attr:`strides` will be divided by the :attr:`itemsize`. + + + See also :meth:`dense`. + + + Attributes + ---------- + itemsize : int + The number of bytes per single element (dtype size). + slice_offset : int + The offset (as a number of elements, not bytes) of the element at + index ``(0,) * ndim``. See also :attr:`slice_offset_in_bytes`. + """ + + def __init__(self: _StridedLayout, shape: tuple[int, ...], strides: tuple[int, ...] | None, itemsize: int, divide_strides: bool=False) -> None: + ... + + @classmethod + def dense(cls, shape: tuple[int], itemsize: int, stride_order: str | tuple[int]='C') -> _StridedLayout: + """ + Creates a new _StridedLayout instance with dense strides. + + Parameters + ---------- + shape : tuple + A tuple of non-negative integers. + itemsize : int + The number of bytes per single element of the tensor. 
+ stride_order : str or tuple, optional + The order of the strides: + * 'C' (default) - the strides are computed in C-order (increasing from the right to the left) + * 'F' - the strides are computed in F-order (increasing from the left to the right) + * A tuple - it must be a permutation of ``tuple(range(len(shape)))``. + The last element of the tuple is the axis with stride 1. + + See also :attr:`stride_order`. + + + .. highlight:: python + .. code-block:: python + + assert _StridedLayout.dense((5, 3, 7), 1, "C") == _StridedLayout((5, 3, 7), (21, 7, 1), 1) + assert _StridedLayout.dense((5, 3, 7), 1, "F") == _StridedLayout((5, 3, 7), (1, 5, 15), 1) + assert _StridedLayout.dense((5, 3, 7), 1, (2, 0, 1)) == _StridedLayout((5, 3, 7), (3, 1, 15), 1) + + """ + + @classmethod + def dense_like(cls, other: _StridedLayout, stride_order: str | tuple[int]='K') -> _StridedLayout: + """ + Creates a _StridedLayout with the same :attr:`shape` and :attr:`itemsize` as the other layout, + but with contiguous strides in the specified order and no slice offset. + + See also :attr:`is_dense`. + + Parameters + ---------- + other : _StridedLayout + The _StridedLayout to copy the :attr:`shape` and :attr:`itemsize` from. + stride_order : str or tuple, optional + The order of the strides: + * 'K' (default) - keeps the order of the strides as in the ``other`` layout. + * 'C' - the strides are computed in C-order (increasing from the right to the left) + * 'F' - the strides are computed in F-order (increasing from the left to the right) + * A tuple - it must be a permutation of ``tuple(range(len(shape)))``. + The last element of the tuple is the axis with stride 1. + + See also :attr:`stride_order`. + + + .. highlight:: python + .. 
code-block:: python + + layout = _StridedLayout.dense((5, 3, 7), 1).permuted((2, 0, 1)) + assert layout == _StridedLayout((7, 5, 3), (1, 21, 7), 1) + + # dense_like with the default "K" stride_order + # keeps the same order of strides as in the original layout + assert _StridedLayout.dense_like(layout) == layout + # "C", "F" recompute the strides accordingly + assert _StridedLayout.dense_like(layout, "C") == _StridedLayout((7, 5, 3), (15, 3, 1), 1) + assert _StridedLayout.dense_like(layout, "F") == _StridedLayout((7, 5, 3), (1, 7, 35), 1) + """ + + def __repr__(self: _StridedLayout) -> str: + ... + + def __eq__(self, other) -> bool: + ... + + @property + def ndim(self: _StridedLayout): + """ + The number of dimensions (length of the shape tuple). + + :type: int + """ + + @property + def shape(self: _StridedLayout): + """ + Shape of the tensor. + + :type: tuple[int] + """ + + @property + def strides(self: _StridedLayout): + """ + Strides of the tensor (in **counts**, not bytes). + If _StridedLayout was created with strides=None, the + returned value is None and layout is implicitly C-contiguous. + + :type: tuple[int] | None + """ + + @property + def strides_in_bytes(self: _StridedLayout): + """ + Strides of the tensor (in bytes). + + :type: tuple[int] | None + """ + + @property + def stride_order(self: _StridedLayout): + """ + A permutation of ``tuple(range(ndim))`` describing the + relative order of the strides. + + .. highlight:: python + .. code-block:: python + + # C-contiguous layout + assert _StridedLayout.dense((5, 3, 7), 1).stride_order == (0, 1, 2) + # F-contiguous layout + assert _StridedLayout.dense((5, 3, 7), 1, stride_order="F").stride_order == (2, 1, 0) + # Permuted layout + assert _StridedLayout.dense((5, 3, 7), 1, stride_order=(2, 0, 1)).stride_order == (2, 0, 1) + + :type: tuple[int] + """ + + @property + def volume(self: _StridedLayout): + """ + The number of elements in the tensor, i.e. the product of the shape tuple. 
+ + :type: int + """ + + @property + def is_unique(self: _StridedLayout): + """ + If True, each element of a tensor with this layout is mapped to + a unique memory offset. + + All contiguous layouts are unique and so are layouts that can be created + by permuting, slicing, flattening, squeezing, repacking, or reshaping + a contiguous layout. + Conversely, broadcast layouts (layouts with a 0 stride + for some extent greater than 1) are not unique. + + For layouts resulting from manual stride manipulations + (such as with ``numpy.lib.stride_tricks``), the check + may inaccurately report False, as the exact uniqueness + check may be expensive. + + :type: bool + """ + + @property + def is_contiguous_c(self: _StridedLayout): + """ + True iff the layout is contiguous in C-order, i.e. + the rightmost stride is 1 and each subsequent + stride to the left is the product of the + extent and the stride to the right. + + .. highlight:: python + .. code-block:: python + + layout = _StridedLayout.dense((2, 5, 3), 1, "C") + assert layout == _StridedLayout((2, 5, 3), (15, 3, 1), 1) + assert layout.is_contiguous_c + + See also :attr:`is_contiguous_any`. + + :type: bool + """ + + @property + def is_contiguous_f(self: _StridedLayout): + """ + True iff the layout is contiguous in F-order, i.e. + the leftmost stride is 1 and each subsequent + stride to the right is the product of the + stride and extent to the left. + + .. highlight:: python + .. code-block:: python + + layout = _StridedLayout.dense((2, 5, 3), 1, "F") + assert layout == _StridedLayout((2, 5, 3), (1, 2, 10), 1) + assert layout.is_contiguous_f + + See also :attr:`is_contiguous_any`. + + :type: bool + """ + + @property + def is_contiguous_any(self: _StridedLayout): + """ + True iff the layout is contiguous in some axis order, i.e. + there exists a permutation of axes such that the layout + is C-contiguous. 
+ + In a contiguous layout, the strides are non-negative and + the mapping of elements to the memory offset range + ``[min_offset, max_offset]`` is 1-to-1. + + .. highlight:: python + .. code-block:: python + + # dense defaults to C-contiguous + layout = _StridedLayout.dense((5, 3, 7), 1) + assert layout.is_contiguous_c and not layout.is_contiguous_f + assert layout.is_contiguous_any + + # reversing the order of axes gives F-contiguous layout + permuted = layout.permuted((2, 1, 0)) + assert not permuted.is_contiguous_c and permuted.is_contiguous_f + assert permuted.is_contiguous_any + + # neither C- nor F-order but still contiguous + permuted = layout.permuted((2, 0, 1)) + assert not permuted.is_contiguous_c and not permuted.is_contiguous_f + assert permuted.is_contiguous_any + + # slicing the right-most extent creates a gap in the + # offset_bounds range that is not reachable with any + # element in the sliced layout + sliced = layout[:, :, :-1] + assert not sliced.is_contiguous_c and not sliced.is_contiguous_f + assert not sliced.is_contiguous_any + + :type: bool + """ + + @property + def is_dense(self: _StridedLayout): + """ + A dense layout is contiguous (:attr:`is_contiguous_any` is True) + and has no slice offset (:attr:`slice_offset_in_bytes` is 0). + + In a dense layout, elements are mapped 1-to-1 to the ``[0, volume - 1]`` + memory offset range. + + :type: bool + """ + + @property + def offset_bounds(self: _StridedLayout): + """ + The memory offset range ``[min_offset, max_offset]`` (in element counts, not bytes) + that elements of a tensor with this layout are mapped to. + + If the layout is empty (i.e. ``volume == 0``), the returned tuple is ``(0, -1)``. + Otherwise, ``min_offset <= max_offset`` and all elements of the tensor with + this layout are mapped within the ``[min_offset, max_offset]`` range. + + .. highlight:: python + .. 
code-block:: python + + # Possible implementation of the offset_bounds + def offset_bounds(layout : _StridedLayout): + if layout.volume == 0: + return 0, -1 + ndim = layout.ndim + shape = layout.shape + strides = layout.strides + idx_min = [shape[i] - 1 if strides[i] < 0 else 0 for i in range(ndim)] + idx_max = [shape[i] - 1 if strides[i] > 0 else 0 for i in range(ndim)] + min_offset = sum(strides[i] * idx_min[i] for i in range(ndim)) + layout.slice_offset + max_offset = sum(strides[i] * idx_max[i] for i in range(ndim)) + layout.slice_offset + return min_offset, max_offset + + :type: tuple[int, int] + """ + + @property + def min_offset(self: _StridedLayout): + """ + See :attr:`offset_bounds` for details. + + :type: int + """ + + @property + def max_offset(self: _StridedLayout): + """ + See :attr:`offset_bounds` for details. + + :type: int + """ + + @property + def slice_offset_in_bytes(self: _StridedLayout): + """ + The memory offset (as a number of bytes) of the element at index ``(0,) * ndim``. + Equal to :attr:`itemsize` ``*`` :attr:`slice_offset`. + + .. note:: + The only way for the index ``(0,) * ndim`` to be mapped to a non-zero offset + is slicing with :meth:`sliced` method (or ``[]`` operator). + + :type: int + """ + + def required_size_in_bytes(self: _StridedLayout) -> int: + """ + The memory allocation size (in bytes) needed so that + all elements of a tensor with this layout can be mapped + within the allocated memory range. + + The function raises an error if ``min_offset < 0``. + Otherwise, the returned value is equal to + ``(max_offset + 1) * itemsize``. + + .. hint:: + For dense layouts, the function always succeeds and the + ``(max_offset + 1) * itemsize`` is equal to the ``volume * itemsize``. + + .. highlight:: python + .. 
code-block:: python + + # Allocating memory on a device to copy a host tensor + def device_tensor_like(a : numpy.ndarray, device : ccx.Device) -> StridedMemoryView: + a_view = StridedMemoryView(a, -1) + # get the original layout of ``a`` and convert it to a dense layout + # to avoid overallocating memory (e.g. if the ``a`` was sliced) + layout = a_view._layout.to_dense() + # get the required size in bytes to fit the tensor + required_size = layout.required_size_in_bytes() + # allocate the memory on the device + device.set_current() + mem = device.allocate(required_size, stream=device.default_stream) + # create a view on the newly allocated device memory + b_view = StridedMemoryView.from_buffer(mem, layout, a_view.dtype) + return b_view + """ + + def flattened_axis_mask(self: _StridedLayout) -> axes_mask_t: + """ + A mask describing which axes of this layout are mergeable + using the :meth:`flattened` method. + """ + + def to_dense(self: _StridedLayout, stride_order: object='K') -> _StridedLayout: + """ + Returns a dense layout with the same shape and itemsize, + but with dense strides in the specified order. + + See :meth:`dense_like` method documentation for details. + """ + + def reshaped(self: _StridedLayout, shape: tuple[int]) -> _StridedLayout: + """ + Returns a layout with the new shape, if the new shape is compatible + with the current layout. + + The new shape is compatible if: + * the new and old shapes have the same volume + * the old strides can be split or flattened to match the new shape, + assuming indices are iterated in C-order + + A single extent in the ``shape`` tuple can be set to -1 to indicate + it should be inferred from the old volume and the other extents. + + .. highlight:: python + .. 
code-block:: python + + layout = _StridedLayout.dense((5, 3, 4), 1) + assert layout.reshaped((20, 3)) == _StridedLayout.dense((20, 3), 1) + assert layout.reshaped((4, -1)) == _StridedLayout.dense((4, 15), 1) + assert layout.permuted((2, 0, 1)).reshaped((4, 15,)) == _StridedLayout((4, 15), (1, 4), 1) + # layout.permuted((2, 0, 1)).reshaped((20, 3)) -> error + """ + + def permuted(self: _StridedLayout, axis_order: tuple[int]) -> _StridedLayout: + """ + Returns a new layout where the shape and strides tuples are permuted + according to the specified permutation of axes. + """ + + def flattened(self: _StridedLayout, start_axis: int=0, end_axis: int=-1, mask: int | None=None) -> _StridedLayout: + """ + Merges consecutive extents into a single extent (equal to the product of merged extents) + if the corresponding strides can be replaced with a single stride + (assuming indices are iterated in C-order, i.e. the rightmost + axis is incremented first). + + .. highlight:: python + .. code-block:: python + + # the two extents can be merged into a single extent + # because layout.strides[0] == layout.strides[1] * layout.shape[1] + layout = _StridedLayout((3, 2), (2, 1), 1) + assert layout.flattened() == _StridedLayout((6,), (1,), 1) + + # the two extents cannot be merged into a single extent + # because layout.strides[0] != layout.strides[1] * layout.shape[1] + layout = _StridedLayout((3, 2), (1, 3), 1) + assert layout.flattened() == layout + + If ``start_axis`` and ``end_axis`` are provided, only the axes in the + inclusive range ``[start_axis, end_axis]`` are considered for flattening. + + Alternatively, a mask specifying which axes to consider can be provided. + A mask of mergeable extents can be obtained using the :meth:`flattened_axis_mask` method. + Masks for layouts with the same number of dimensions can be combined + using the logical ``&`` (bitwise AND) operator. + + .. highlight:: python + .. 
code-block:: python + + layout = _StridedLayout.dense((4, 5, 3), 4) + layout2 = _StridedLayout((4, 5, 3), (1, 12, 4), 4) + # Even though the two layouts have the same shape initially, + # their shapes differ after flattening. + assert layout.flattened() == _StridedLayout((60,), (1,), 4) + assert layout2.flattened() == _StridedLayout((4, 15), (1, 4), 4) + # With the mask, only extents that are mergeable in both layouts are flattened + # and the resulting shape is the same for both layouts. + mask = layout.flattened_axis_mask() & layout2.flattened_axis_mask() + assert layout.flattened(mask=mask) == _StridedLayout((4, 15), (15, 1), 4) + assert layout2.flattened(mask=mask) == _StridedLayout((4, 15), (1, 4), 4) + """ + + def squeezed(self: _StridedLayout) -> _StridedLayout: + """ + Returns a new layout where all the singleton dimensions (extents equal to 1) + are removed. Additionally, if the layout volume is 0, + the returned layout will be reduced to a 1-dim layout + with shape (0,) and strides (0,). + """ + + def unsqueezed(self: _StridedLayout, axis: int | tuple[int]) -> _StridedLayout: + """ + Returns a new layout where the specified axis or axes are added as singleton extents. + The ``axis`` can be either a single integer in range ``[0, ndim]`` + or a tuple of unique integers in range ``[0, ndim + len(axis) - 1]``. + """ + + def broadcast_to(self: _StridedLayout, shape: tuple[int]) -> _StridedLayout: + """ + Returns a layout with the new shape, if the old shape can be + broadcast to the new one. + + The shapes are compatible if: + * the new shape has the same or greater number of dimensions + * starting from the right, each extent in the old shape must be 1 or + equal to the corresponding extent in the new shape. + + Strides of the added or modified extents are set to 0, the remaining ones are unchanged. + If the shapes are not compatible, a ValueError is raised. 
+ """ + + def repacked(self: _StridedLayout, itemsize: int, data_ptr: int=0, axis: int=-1, keep_dim: bool=True) -> _StridedLayout: + """ + Converts the layout to match the specified itemsize. + If ``new_itemsize < itemsize``, each element of the tensor is **unpacked** into multiple elements, + i.e. the extent at ``axis`` increases by the factor ``itemsize // new_itemsize``. + If ``new_itemsize > itemsize``, the consecutive elements in the tensor are **packed** into a single element, + i.e. the extent at ``axis`` decreases by the factor ``new_itemsize // itemsize``. + In either case, the ``volume * itemsize`` of the layout remains the same. + + The conversion is subject to the following constraints: + * The extent at ``axis`` must be a positive integer. + * The stride at ``axis`` must be 1. + + Moreover, if the ``new_itemsize > itemsize``: + * The extent at ``axis`` must be divisible by ``new_itemsize // itemsize``. + * All other strides must be divisible by ``new_itemsize // itemsize``. + * The ``slice_offset`` must be divisible by ``new_itemsize // itemsize``. + * If ``data_ptr`` is provided, it must be aligned to the new itemsize. + + The maximum itemsize that satisfies all the constraints + can be obtained using the :meth:`max_compatible_itemsize` method. + + If the ``keep_dim`` is False and the extent at ``axis`` would be reduced to 1, + it is omitted from the returned layout. + + .. highlight:: python + .. code-block:: python + + # Repacking the layout with itemsize = 4 bytes as 2, 8, and 16 sized layouts. + layout = _StridedLayout.dense((5, 4), 4) + assert layout.repacked(2) == _StridedLayout.dense((5, 8), 2) + assert layout.repacked(8) == _StridedLayout.dense((5, 2), 8) + assert layout.repacked(16) == _StridedLayout.dense((5, 1), 16) + assert layout.repacked(16, keep_dim=False) == _StridedLayout.dense((5,), 16) + + + .. highlight:: python + .. code-block:: python + + # Viewing (5, 6) float array as (5, 3) complex64 array. 
+ a = numpy.ones((5, 6), dtype=numpy.float32) + float_view = StridedMemoryView(a, -1) + layout = float_view._layout + assert layout.shape == (5, 6) + assert layout.itemsize == 4 + complex_view = float_view.view(layout.repacked(8), numpy.complex64) + assert complex_view._layout.shape == (5, 3) + assert complex_view._layout.itemsize == 8 + b = numpy.from_dlpack(complex_view) + assert b.shape == (5, 3) + """ + + def max_compatible_itemsize(self: _StridedLayout, max_itemsize: int=16, data_ptr: int=0, axis: int=-1) -> int: + """ + Returns the maximum itemsize (but no greater than ``max_itemsize``) that can be used + with the :meth:`repacked` method for the current layout. + """ + + def sliced(self: _StridedLayout, slices: int | slice | tuple[int | slice]) -> _StridedLayout: + """ + Returns a sliced layout. + The ``slices`` parameter can be a single integer, a single :py:class:`slice` object + or a tuple of integers/slices. + + .. hint:: + For convenience, instead of calling this method directly, please rely + on the :py:meth:`~object.__getitem__` operator (i.e. bracket syntax), e.g.: + ``layout[:, start:end:step]``. + + .. note:: + Slicing is purely a layout transformation and does not involve + any data access. + + """ + + def __getitem__(self: _StridedLayout, slices: int | slice | tuple[int | slice]) -> _StridedLayout: + ... +extent_t = int +stride_t = int +axis_t = int +axes_mask_t = int +property_mask_t = int +extents_strides_t = vector.vector +axis_vec_t = vector.vector \ No newline at end of file diff --git a/cuda_core/cuda/core/_linker.pyi b/cuda_core/cuda/core/_linker.pyi new file mode 100644 index 00000000000..32af4e3867d --- /dev/null +++ b/cuda_core/cuda/core/_linker.pyi @@ -0,0 +1,249 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_linker.pyx + +"""Linking machinery for combining object codes. 
+ +This module provides :class:`Linker` for linking one or more +:class:`~cuda.core.ObjectCode` objects, with :class:`LinkerOptions` for +configuration. +""" +from __future__ import annotations + +from dataclasses import dataclass +from typing import Union + +import cuda.bindings.driver +import cuda.bindings.nvjitlink +from cuda.core._module import ObjectCode +from cuda.core.typing import CompilerBackendType, ObjectCodeFormatType + + +class Linker: + """Represent a linking machinery to link one or more object codes into + :class:`~cuda.core.ObjectCode`. + + This object provides a unified interface to multiple underlying + linker libraries (such as nvJitLink or cuLink* from the CUDA driver). + + Parameters + ---------- + object_codes : :class:`~cuda.core.ObjectCode` + One or more ObjectCode objects to be linked. + options : :class:`LinkerOptions`, optional + Options for the linker. If not provided, default options will be used. + """ + + def __init__(self, options: LinkerOptions | None=None, *object_codes: ObjectCode): + ... + + def link(self, target_type: ObjectCodeFormatType | str) -> ObjectCode: + """Link the provided object codes into a single output of the specified target type. + + Parameters + ---------- + target_type : ObjectCodeFormatType | str + The type of the target output. Must be either "cubin" or "ptx". + + Returns + ------- + :class:`~cuda.core.ObjectCode` + The linked object code of the specified target type. + + .. note:: + + Ensure that input object codes were compiled with appropriate + flags for linking (e.g., relocatable device code enabled). + """ + + def get_error_log(self) -> str: + """Get the error log generated by the linker. + + Returns + ------- + str + The error log. + """ + + def get_info_log(self) -> str: + """Get the info log generated by the linker. + + Returns + ------- + str + The info log. 
+ """ + + def close(self): + """Destroy this linker.""" + + @property + def handle(self) -> LinkerHandleT: + """Return the underlying handle object. + + .. note:: + + The type of the returned object depends on the backend. + + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int(Linker.handle)``. + """ + + @classmethod + def which_backend(cls) -> CompilerBackendType: + """Return which linking backend will be used. + + Returns :attr:`~CompilerBackendType.NVJITLINK` when the nvJitLink + library is available and meets the minimum version requirement, + otherwise :attr:`~CompilerBackendType.DRIVER`. + + .. note:: + + Prefer letting :class:`Linker` decide. Query ``which_backend()`` + only when you need to dispatch based on input format (for + example: choose PTX vs. LTOIR before constructing a + ``Linker``). The returned value names an implementation + detail whose support matrix may shift across CTK releases. + """ + +@dataclass +class LinkerOptions: + """Customizable options for configuring :class:`Linker`. + + Since the linker may choose to use nvJitLink or the driver APIs as the linking backend, + not all options are applicable. When the system's installed nvJitLink is too old (<12.3), + or not installed, the driver APIs (cuLink) will be used instead. + + Attributes + ---------- + name : str, optional + Name of the linker. If the linking succeeds, the name is passed down to the generated :class:`ObjectCode`. + arch : str, optional + Pass the SM architecture value, such as ``sm_`` (for generating CUBIN) or + ``compute_`` (for generating PTX). If not provided, the current device's architecture + will be used. + max_register_count : int, optional + Maximum register count. + time : bool, optional + Print timing information to the info log. + Default: False. + verbose : bool, optional + Print verbose messages to the info log. + Default: False. 
+ link_time_optimization : bool, optional + Perform link time optimization. + Default: False. + ptx : bool, optional + Emit PTX after linking instead of CUBIN; only supported with ``link_time_optimization=True``. + Default: False. + optimization_level : int, optional + Set optimization level. Only 0 and 3 are accepted. + debug : bool, optional + Generate debug information. + Default: False. + lineinfo : bool, optional + Generate line information. + Default: False. + ftz : bool, optional + Flush denormal values to zero. + Default: False. + prec_div : bool, optional + Use precise division. + Default: True. + prec_sqrt : bool, optional + Use precise square root. + Default: True. + fma : bool, optional + Use fast multiply-add. + Default: True. + kernels_used : [str | tuple[str] | list[str]], optional + Pass a kernel or sequence of kernels that are used; any not in the list can be removed. + variables_used : [str | tuple[str] | list[str]], optional + Pass a variable or sequence of variables that are used; any not in the list can be removed. + optimize_unused_variables : bool, optional + Assume that if a variable is not referenced in device code, it can be removed. + Default: False. + ptxas_options : [str | tuple[str] | list[str]], optional + Pass options to PTXAS. + split_compile : int, optional + Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split + compilation (default). + Default: 1. + split_compile_extended : int, optional + A more aggressive form of split compilation available in LTO mode only. Accepts a maximum thread count value. + Use 0 to use all available processors. Value of 1 disables extended split compilation (default). Note: This + option can potentially impact performance of the compiled binary. + Default: 1. + no_cache : bool, optional + Do not cache the intermediate steps of nvJitLink. + Default: False. 
+ """ + name: str | None = '' + arch: str | None = None + max_register_count: int | None = None + time: bool | None = None + verbose: bool | None = None + link_time_optimization: bool | None = None + ptx: bool | None = None + optimization_level: int | None = None + debug: bool | None = None + lineinfo: bool | None = None + ftz: bool | None = None + prec_div: bool | None = None + prec_sqrt: bool | None = None + fma: bool | None = None + kernels_used: str | tuple[str] | list[str] | None = None + variables_used: str | tuple[str] | list[str] | None = None + optimize_unused_variables: bool | None = None + ptxas_options: str | tuple[str] | list[str] | None = None + split_compile: int | None = None + split_compile_extended: int | None = None + no_cache: bool | None = None + + def __post_init__(self): + ... + + def _prepare_nvjitlink_options(self, as_bytes: bool=False) -> list[bytes] | list[str]: + ... + + def _prepare_driver_options(self) -> tuple[list, list]: + ... + + def as_bytes(self, backend: str='nvjitlink') -> list[bytes]: + """Convert linker options to bytes format for the nvjitlink backend. + + Parameters + ---------- + backend : str, optional + The linker backend. Only "nvjitlink" is supported. Default is "nvjitlink". + + Returns + ------- + list[bytes] + List of option strings encoded as bytes. + + Raises + ------ + ValueError + If an unsupported backend is specified. + RuntimeError + If nvJitLink backend is not available. + """ +_keep_driver_in_stub: 'cuda.bindings.driver.CUlinkState' +_keep_nvjitlink_in_stub: 'cuda.bindings.nvjitlink.nvJitLinkHandle' +__all__ = ['Linker', 'LinkerOptions'] +LinkerHandleT = Union['cuda.bindings.nvjitlink.nvJitLinkHandle', 'cuda.bindings.driver.CUlinkState'] +_driver = None +_inited = False +_use_nvjitlink_backend = None +_nvjitlink_input_types = None +_driver_input_types = None + +def _nvjitlink_has_version_symbol(nvjitlink) -> bool: + ... 
+ +def _decide_nvjitlink_or_driver() -> bool: + """Return True if falling back to the cuLink* driver APIs.""" + +def _lazy_init(): + ... \ No newline at end of file diff --git a/cuda_core/cuda/core/_memory/_buffer.pyi b/cuda_core/cuda/core/_memory/_buffer.pyi new file mode 100644 index 00000000000..b2e9e3e5ec9 --- /dev/null +++ b/cuda_core/cuda/core/_memory/_buffer.pyi @@ -0,0 +1,292 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_memory/_buffer.pyx + +from __future__ import annotations + +from collections.abc import ByteString as BufferProtocol + +from cuda.core._memory._device_memory_resource import DeviceMemoryResource +from cuda.core._memory._ipc import IPCBufferDescriptor +from cuda.core._memory._pinned_memory_resource import PinnedMemoryResource +from cuda.core._stream import Stream +from cuda.core.graph import GraphBuilder +from cuda.core.typing import DevicePointerType + + +class Buffer: + """Represent a handle to allocated memory. + + This generic object provides a unified representation for how + different memory resources are to give access to their memory + allocations. + + Support for data interchange mechanisms are provided by DLPack. + """ + + def __cinit__(self): + ... + + def _clear(self): + ... + + def __init__(self, *args, **kwargs): + ... + + @classmethod + def _init(cls, ptr: DevicePointerType, size: int, mr: MemoryResource | None=None, ipc_descriptor: IPCBufferDescriptor | None=None, owner: object | None=None): + """Create a Buffer from a raw pointer. + + When ``mr`` is provided, the buffer takes ownership: ``mr.deallocate()`` + is called when the buffer is closed or garbage collected. When ``owner`` + is provided, the owner is kept alive but no deallocation is performed. + """ + + @staticmethod + def _reduce_helper(mr, ipc_descriptor): + ... + + def __reduce__(self): + ... 
+ + @staticmethod + def from_handle(ptr: DevicePointerType, size: int, mr: MemoryResource | None=None, owner: object | None=None) -> Buffer: + """Create a new :class:`Buffer` object from a pointer. + + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerType` + Allocated buffer handle object + size : int + Memory size of the buffer + mr : :obj:`~_memory.MemoryResource`, optional + Memory resource associated with the buffer. When provided, + :meth:`MemoryResource.deallocate` is called when the buffer is + closed or garbage collected. + owner : object, optional + An object holding external allocation that the ``ptr`` points to. + The reference is kept as long as the buffer is alive. + The ``owner`` and ``mr`` cannot be specified together. + + Note + ---- + When neither ``mr`` nor ``owner`` is specified, this creates a + non-owning reference. The pointer will NOT be freed when the + :class:`Buffer` is closed or garbage collected. + """ + + @classmethod + def from_ipc_descriptor(cls, mr: DeviceMemoryResource | PinnedMemoryResource, ipc_descriptor: IPCBufferDescriptor, *, stream: Stream) -> Buffer: + """Import a buffer that was exported from another process. + + Parameters + ---------- + mr : :obj:`~_memory.DeviceMemoryResource` | :obj:`~_memory.PinnedMemoryResource` + The IPC-enabled memory resource matching the exporting process. + ipc_descriptor : :obj:`~_memory.IPCBufferDescriptor` + The descriptor exported from another process. + stream : :obj:`~_stream.Stream` + Keyword-only. The stream used for asynchronous deallocation when + the buffer is closed or garbage collected. + """ + + @property + def ipc_descriptor(self) -> IPCBufferDescriptor: + """Descriptor for sharing this buffer with other processes.""" + + def close(self, stream: Stream | GraphBuilder | None=None): + """Deallocate this buffer asynchronously on the given stream. + + This buffer is released back to its memory resource + asynchronously on the given stream. 
+ + Parameters + ---------- + stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder`, optional + The stream object to use for asynchronous deallocation. If None, + the deallocation stream stored in the handle is used. + """ + + def __enter__(self): + ... + + def __exit__(self, exc_type, exc_val, exc_tb): + ... + + def copy_to(self, dst: Buffer | None=None, *, stream: Stream | GraphBuilder) -> Buffer: + """Copy from this buffer to the dst buffer asynchronously on the given stream. + + Copies the data from this buffer to the provided dst buffer. + If the dst buffer is not provided, then a new buffer is first + allocated using the associated memory resource before the copy. + + Parameters + ---------- + dst : :obj:`~_memory.Buffer`, optional + Destination buffer to copy data to. If not provided, a new buffer + is allocated using this buffer's memory resource. + stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder` + Keyword argument specifying the stream for the + asynchronous copy + + """ + + def copy_from(self, src: Buffer, *, stream: Stream | GraphBuilder): + """Copy from the src buffer to this buffer asynchronously on the given stream. + + Parameters + ---------- + src : :obj:`~_memory.Buffer` + Source buffer to copy data from + stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder` + Keyword argument specifying the stream for the + asynchronous copy + + """ + + def fill(self, value: int | BufferProtocol, *, stream: Stream | GraphBuilder): + """Fill this buffer with a repeating byte pattern. + + Parameters + ---------- + value : int | :obj:`collections.abc.Buffer` + - int: Must be in range [0, 256). Converted to 1 byte. + - :obj:`collections.abc.Buffer`: Must be 1, 2, or 4 bytes. + stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder` + Stream for the asynchronous fill operation. + + Raises + ------ + TypeError + If value is not an int and does not support the buffer protocol. + ValueError + If value byte length is not 1, 2, or 4. 
+ If buffer size is not divisible by value byte length. + OverflowError + If int value is outside [0, 256). + + """ + + def __dlpack__(self, *, stream: int | None=None, max_version: tuple[int, int] | None=None, dl_device: tuple[int, int] | None=None, copy: bool | None=None): + ... + + def __dlpack_device__(self) -> tuple[int, int]: + ... + + def __buffer__(self, flags: int, /) -> memoryview: + ... + + def __release_buffer__(self, buffer: memoryview, /): + ... + + @property + def device_id(self) -> int: + """Return the device ordinal of this buffer.""" + + @property + def handle(self) -> int: + """Return the buffer handle object. + + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int(Buffer.handle)``. + """ + + def __eq__(self, other) -> bool: + ... + + def __hash__(self) -> int: + ... + + def __repr__(self) -> str: + ... + + @property + def is_device_accessible(self) -> bool: + """Return True if this buffer can be accessed by the GPU, otherwise False.""" + + @property + def is_host_accessible(self) -> bool: + """Return True if this buffer can be accessed by the CPU, otherwise False.""" + + @property + def is_managed(self) -> bool: + """Return True if this buffer is CUDA managed (unified) memory, otherwise False.""" + + @property + def is_mapped(self) -> bool: + """Return True if this buffer is mapped into the process via IPC.""" + + @property + def memory_resource(self) -> MemoryResource: + """Return the memory resource associated with this buffer.""" + + @property + def size(self) -> int: + """Return the memory size of this buffer.""" + + @property + def owner(self) -> object: + """Return the object holding external allocation.""" + +class MemoryResource: + """Abstract base class for memory resources that manage allocation and + deallocation of buffers. 
+ + Subclasses must implement methods for allocation and deallocation, as well + as properties associated with this memory resource from which all allocated + buffers will inherit. (Since all :class:`Buffer` instances allocated and + returned by the :meth:`allocate` method would hold a reference to self, the + buffer properties are retrieved simply by looking up the underlying memory + resource's respective property.) + """ + + def allocate(self, size: int, *, stream: Stream | GraphBuilder) -> Buffer: + """Allocate a buffer of the requested size. + + Parameters + ---------- + size : int + The size of the buffer to allocate, in bytes. + stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder` + Keyword-only. The stream on which to perform the allocation + asynchronously. Must be passed explicitly; pass + ``device.default_stream`` to use the default stream. + + Returns + ------- + Buffer + The allocated buffer object, which can be used for device or host operations + depending on the resource's properties. + """ + + def deallocate(self, ptr: DevicePointerType, size: int, *, stream: Stream | GraphBuilder): + """Deallocate a buffer previously allocated by this resource. + + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerType` + The pointer or handle to the buffer to deallocate. + size : int + The size of the buffer to deallocate, in bytes. + stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder` + Keyword-only. The stream on which to perform the deallocation + asynchronously. Must be passed explicitly; pass + ``device.default_stream`` to use the default stream. 
+ """ + + @property + def is_device_accessible(self) -> bool: + """Whether buffers allocated by this resource are device-accessible.""" + + @property + def is_host_accessible(self) -> bool: + """Whether buffers allocated by this resource are host-accessible.""" + + @property + def is_managed(self) -> bool: + """Whether buffers allocated by this resource are CUDA managed (unified) memory.""" + + @property + def device_id(self) -> int: + """Device ID associated with this memory resource, or -1 if not applicable.""" +__all__ = ['Buffer', 'MemoryResource'] \ No newline at end of file diff --git a/cuda_core/cuda/core/_memory/_device_memory_resource.pyi b/cuda_core/cuda/core/_memory/_device_memory_resource.pyi new file mode 100644 index 00000000000..7e2204cf1ee --- /dev/null +++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pyi @@ -0,0 +1,225 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_memory/_device_memory_resource.pyx + +from __future__ import annotations + +import uuid +from dataclasses import dataclass + +from cuda.core._device import Device +from cuda.core._memory._ipc import IPCAllocationHandle +from cuda.core._memory._memory_pool import _MemPool + + +@dataclass +class DeviceMemoryResourceOptions: + """Customizable :obj:`~_memory.DeviceMemoryResource` options. + + Attributes + ---------- + ipc_enabled : bool, optional + Specifies whether to create an IPC-enabled memory pool. When set to + True, the memory pool and its allocations can be shared with other + processes. (Default to False) + + max_size : int, optional + Maximum pool size. When set to 0, defaults to a system-dependent value. + (Default to 0) + """ + ipc_enabled: bool = False + max_size: int = 0 + +class DeviceMemoryResource(_MemPool): + """ + A device memory resource managing a stream-ordered memory pool. + + Parameters + ---------- + device_id : Device | int + Device or Device ordinal for which a memory resource is constructed. 
+ + options : DeviceMemoryResourceOptions + Memory resource creation options. + + If set to `None`, the memory resource uses the driver's current + stream-ordered memory pool for the specified `device_id`. If no memory + pool is set as current, the driver's default memory pool for the device + is used. + + If not set to `None`, a new memory pool is created, which is owned by + the memory resource. + + When using an existing (current or default) memory pool, the returned + device memory resource does not own the pool (`is_handle_owned` is + `False`), and closing the resource has no effect. + + Notes + ----- + To create an IPC-Enabled memory resource (MR) that is capable of sharing + allocations between processes, specify ``ipc_enabled=True`` in the initializer + option. Sharing an allocation is a two-step procedure that involves + mapping a memory resource and then mapping buffers owned by that resource. + These steps can be accomplished in several ways. + + An IPC-enabled memory resource can allocate memory buffers but cannot + receive shared buffers. Mapping an MR to another process creates a "mapped + memory resource" (MMR). An MMR cannot allocate memory buffers and can only + receive shared buffers. MRs and MMRs are both of type + :class:`DeviceMemoryResource` and can be distinguished via + :attr:`DeviceMemoryResource.is_mapped`. + + An MR is shared via an allocation handle accessed through the + :attr:`DeviceMemoryResource.allocation_handle` property. The allocation + handle has a platform-specific interpretation; however, memory IPC is + currently only supported for Linux, and in that case allocation handles + are file descriptors. After sending an allocation handle to another + process, it can be used to create an MMR by invoking + :meth:`DeviceMemoryResource.from_allocation_handle`. + + Buffers can be shared as serializable descriptors accessed through the + :attr:`Buffer.ipc_descriptor` property. 
In a receiving process, a shared + buffer is created by invoking :meth:`Buffer.from_ipc_descriptor` with an + MMR and buffer descriptor, where the MMR corresponds to the MR that + created the described buffer. + + To help manage the association between memory resources and buffers, a + registry is provided. Every MR has a unique identifier (UUID). MMRs can be + registered by calling :meth:`DeviceMemoryResource.register` with the UUID + of the corresponding MR. Registered MMRs can be looked up via + :meth:`DeviceMemoryResource.from_registry`. When registering MMRs in this + way, the use of buffer descriptors can be avoided. Instead, buffer objects + can themselves be serialized and transferred directly. Serialization embeds + the UUID, which is used to locate the correct MMR during reconstruction. + + IPC-enabled memory resources interoperate with the :mod:`multiprocessing` + module to provide a simplified interface. This approach can avoid direct + use of allocation handles, buffer descriptors, MMRs, and the registry. When + using :mod:`multiprocessing` to spawn processes or send objects through + communication channels such as :class:`multiprocessing.Queue`, + :class:`multiprocessing.Pipe`, or :class:`multiprocessing.Connection`, + :class:`Buffer` objects may be sent directly, and in such cases the process + for creating MMRs and mapping buffers will be handled automatically. + + For greater efficiency when transferring many buffers, one may also send + MRs and buffers separately. When an MR is sent via :mod:`multiprocessing`, + an MMR is created and registered in the receiving process. Subsequently, + buffers may be serialized and transferred using ordinary :mod:`pickle` + methods. The reconstruction procedure uses the registry to find the + associated MMR. + """ + + def __cinit__(self, *args, **kwargs): + ... + + def __init__(self, device_id: Device | int, options=None): + ... + + def __reduce__(self): + ... 
+ + @staticmethod + def from_registry(uuid: uuid.UUID) -> DeviceMemoryResource: + """ + Obtain a registered mapped memory resource. + + Raises + ------ + RuntimeError + If no mapped memory resource is found in the registry. + """ + + def register(self, uuid: uuid.UUID) -> DeviceMemoryResource: + """ + Register a mapped memory resource. + + Returns + ------- + The registered mapped memory resource. If one was previously registered + with the given key, it is returned. + """ + + @classmethod + def from_allocation_handle(cls, device_id: Device | int, alloc_handle: int | IPCAllocationHandle) -> DeviceMemoryResource: + """Create a device memory resource from an allocation handle. + + Construct a new `DeviceMemoryResource` instance that imports a memory + pool from a shareable handle. The memory pool is marked as owned, and + the resource is associated with the specified `device_id`. + + Parameters + ---------- + device_id : int | Device + The ID of the device or a Device object for which the memory + resource is created. + + alloc_handle : int | IPCAllocationHandle + The shareable handle of the device memory resource to import. If an + integer is supplied, it must represent a valid platform-specific + handle. It is the caller's responsibility to close that handle. + + Returns + ------- + A new device memory resource instance with the imported handle. + """ + + @property + def allocation_handle(self) -> IPCAllocationHandle: + """Shareable handle for this memory pool (requires IPC). + + The handle can be used to share the memory pool with other processes. + The handle is cached in this `MemoryResource` and owned by it. + """ + + @property + def device_id(self) -> int: + """The associated device ordinal.""" + + @property + def peer_accessible_by(self): + """ + Get or set the devices that can access allocations from this memory + pool. Access can be modified at any time and affects all allocations + from this memory pool. 
+ + Returns a set-like proxy of :obj:`~_device.Device` objects that manages + peer access. Inputs are accepted as either :obj:`~_device.Device` + objects or device-ordinal :class:`int` values. + + Examples + -------- + >>> dmr = DeviceMemoryResource(0) + >>> dmr.peer_accessible_by = {1} # grant access to device 1 + >>> assert 1 in dmr.peer_accessible_by + >>> dmr.peer_accessible_by.add(2) # update access to include device 2 + >>> dmr.peer_accessible_by = [] # revoke peer access + """ + + @peer_accessible_by.setter + def peer_accessible_by(self, devices): + ... + + @property + def is_device_accessible(self) -> bool: + """Return True. This memory resource provides device-accessible buffers.""" + + @property + def is_host_accessible(self) -> bool: + """Return False. This memory resource does not provide host-accessible buffers.""" +__all__ = ['DeviceMemoryResource', 'DeviceMemoryResourceOptions'] + +def DMR_mempool_get_access(dmr: DeviceMemoryResource, device_id: int): + """ + Probes peer access from the given device using cuMemPoolGetAccess. + + Parameters + ---------- + device_id : int or Device + The device to query access for. + + Returns + ------- + str + Access permissions: "rw" for read-write, "r" for read-only, "" for no access. + """ + +def _deep_reduce_device_memory_resource(mr): + ... 
\ No newline at end of file diff --git a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyi b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyi new file mode 100644 index 00000000000..09c5a98185f --- /dev/null +++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyi @@ -0,0 +1,119 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_memory/_graph_memory_resource.pyx + +from __future__ import annotations + +from functools import cache + +from cuda.core._device import Device +from cuda.core._memory._buffer import Buffer, MemoryResource +from cuda.core._stream import Stream +from cuda.core.graph import GraphBuilder +from cuda.core.typing import DevicePointerType + + +class GraphMemoryResourceAttributes: + + def __init__(self, *args, **kwargs): + ... + + @classmethod + def _init(cls, device_id: int): + ... + + def __repr__(self): + ... + + @property + def reserved_mem_current(self): + """Current amount of backing memory allocated.""" + + @property + def reserved_mem_high(self): + """ + High watermark of backing memory allocated. It can be set to zero to + reset it to the current usage. + """ + + @reserved_mem_high.setter + def reserved_mem_high(self, value: int): + ... + + @property + def used_mem_current(self): + """Current amount of memory in use.""" + + @property + def used_mem_high(self): + """ + High watermark of memory in use. It can be set to zero to reset it to + the current usage. + """ + + @used_mem_high.setter + def used_mem_high(self, value: int): + ... + +class cyGraphMemoryResource(MemoryResource): + + def __cinit__(self, device_id: int): + ... + + def allocate(self, size: int, *, stream: Stream | GraphBuilder) -> Buffer: + """ + Allocate a buffer of the requested size. See documentation for :obj:`~_memory.MemoryResource`. + """ + + def deallocate(self, ptr: DevicePointerType, size: int, *, stream: Stream | GraphBuilder): + """ + Deallocate a buffer of the requested size. 
See documentation for :obj:`~_memory.MemoryResource`. + """ + + def close(self): + """No operation (provided for compatibility).""" + + def trim(self): + """Free unused memory that was cached on the specified device for use with graphs back to the OS.""" + + @property + def attributes(self) -> GraphMemoryResourceAttributes: + """Asynchronous allocation attributes related to graphs.""" + + @property + def device_id(self) -> int: + """The associated device ordinal.""" + + @property + def is_device_accessible(self) -> bool: + """Return True. This memory resource provides device-accessible buffers.""" + + @property + def is_host_accessible(self) -> bool: + """Return False. This memory resource does not provide host-accessible buffers.""" + +class GraphMemoryResource(cyGraphMemoryResource): + """ + A memory resource for memory related to graphs. + + The only supported operations are allocation, deallocation, and a limited + set of status queries. + + This memory resource should be used when building graphs. Using this when + graphs capture is not enabled will result in a runtime error. + + Conversely, allocating memory from a `DeviceMemoryResource` when graph + capturing is enabled results in a runtime error. + + Parameters + ---------- + device_id: int | Device + Device or Device ordinal for which a graph memory resource is obtained. + """ + + def __new__(cls, device_id: int | Device): + ... + + @classmethod + @cache + def _create(cls, device_id: int): + ... 
+__all__ = ['GraphMemoryResource'] \ No newline at end of file diff --git a/cuda_core/cuda/core/_memory/_ipc.pyi b/cuda_core/cuda/core/_memory/_ipc.pyi new file mode 100644 index 00000000000..ebeeaa0fd1f --- /dev/null +++ b/cuda_core/cuda/core/_memory/_ipc.pyi @@ -0,0 +1,86 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_memory/_ipc.pyx + +from __future__ import annotations + +import uuid + + +class IPCDataForBuffer: + """Data members related to sharing memory buffers via IPC.""" + + def __cinit__(self, ipc_descriptor: IPCBufferDescriptor, is_mapped: bool): + ... + + @property + def ipc_descriptor(self): + ... + + @property + def is_mapped(self): + ... + +class IPCDataForMR: + """Data members related to sharing memory resources via IPC.""" + + def __cinit__(self, alloc_handle: IPCAllocationHandle, is_mapped: bool): + ... + + @property + def alloc_handle(self): + ... + + @property + def is_mapped(self): + ... + + @property + def uuid(self): + ... + +class IPCBufferDescriptor: + """Serializable object describing a buffer that can be shared between processes.""" + + def __init__(self, *arg, **kwargs): + ... + + @staticmethod + def _init(reserved: bytes, size: int): + ... + + def __reduce__(self): + ... + + @property + def size(self): + ... + +class IPCAllocationHandle: + """Shareable handle to an IPC-enabled device memory pool.""" + + def close(self): + """Close the handle.""" + + def __init__(self, *arg, **kwargs): + ... + + @classmethod + def _init(cls, handle: int, uuid): + ... + + def __int__(self) -> int: + ... + + @property + def handle(self) -> int: + ... + + @property + def uuid(self) -> uuid.UUID: + ... +__all__ = ['IPCBufferDescriptor', 'IPCAllocationHandle'] + +def _reduce_allocation_handle(alloc_handle): + ... + +def _reconstruct_allocation_handle(cls, df, uuid): + ... 
\ No newline at end of file diff --git a/cuda_core/cuda/core/_memory/_managed_memory_resource.pyi b/cuda_core/cuda/core/_memory/_managed_memory_resource.pyi new file mode 100644 index 00000000000..134da7e517b --- /dev/null +++ b/cuda_core/cuda/core/_memory/_managed_memory_resource.pyi @@ -0,0 +1,108 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_memory/_managed_memory_resource.pyx + +from __future__ import annotations + +from dataclasses import dataclass + +from cuda.core._memory._memory_pool import _MemPool +from cuda.core.typing import ManagedMemoryLocationType + + +@dataclass +class ManagedMemoryResourceOptions: + """Customizable :obj:`~_memory.ManagedMemoryResource` options. + + Attributes + ---------- + preferred_location : int | None, optional + A location identifier (device ordinal or NUMA node ID) whose + meaning depends on ``preferred_location_type``. + (Default to ``None``) + + preferred_location_type : ManagedMemoryLocationType | str | None, optional + Controls how ``preferred_location`` is interpreted. + + When set to ``None`` (the default), legacy behavior is used: + ``preferred_location`` is interpreted as a device ordinal, + ``-1`` for host, or ``None`` for no preference. + + When set explicitly, the type determines both the kind of + preferred location and the valid values for + ``preferred_location``: + + - ``"device"``: prefer a specific GPU. ``preferred_location`` + must be a device ordinal (``>= 0``). + - ``"host"``: prefer host memory (OS-managed NUMA placement). + ``preferred_location`` must be ``None``. + - ``"host_numa"``: prefer a specific host NUMA node. + ``preferred_location`` must be a NUMA node ID (``>= 0``), + or ``None`` to derive the NUMA node from the current CUDA + device's ``host_numa_id`` attribute (requires an active + CUDA context). 
+ + (Default to ``None``) + """ + preferred_location: int | None = None + preferred_location_type: ManagedMemoryLocationType | str | None = None + +class ManagedMemoryResource(_MemPool): + """ + A managed memory resource managing a stream-ordered memory pool. + + Managed memory is accessible from both the host and device, with automatic + migration between them as needed. + + Parameters + ---------- + options : ManagedMemoryResourceOptions + Memory resource creation options. + + If set to `None`, the memory resource uses the driver's current + stream-ordered memory pool. If no memory pool is set as current, + the driver's default memory pool is used. + + If not set to `None`, a new memory pool is created, which is owned by + the memory resource. + + When using an existing (current or default) memory pool, the returned + managed memory resource does not own the pool (`is_handle_owned` is + `False`), and closing the resource has no effect. + + Notes + ----- + IPC (Inter-Process Communication) is not currently supported for managed + memory pools. + """ + + def __init__(self, options=None): + ... + + @property + def device_id(self) -> int: + """The preferred device ordinal, or -1 if the preferred location is not a device.""" + + @property + def preferred_location(self) -> tuple[ManagedMemoryLocationType, int | None] | None: + """The preferred location for managed memory allocations. + + Returns ``None`` if no preferred location is set (driver decides), + or a tuple ``(type, id)`` where *type* is one of ``"device"``, + ``"host"``, or ``"host_numa"``, and *id* is the device ordinal, + ``None`` (for ``"host"``), or the NUMA node ID, respectively. + """ + + @property + def is_device_accessible(self) -> bool: + """Return True. This memory resource provides device-accessible buffers.""" + + @property + def is_host_accessible(self) -> bool: + """Return True. 
This memory resource provides host-accessible buffers.""" + + @property + def is_managed(self) -> bool: + """Return True. This memory resource provides managed (unified) memory buffers.""" +__all__ = ['ManagedMemoryResource', 'ManagedMemoryResourceOptions'] + +def reset_concurrent_access_warning(): + """Reset the concurrent access warning flag for testing purposes.""" \ No newline at end of file diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pyi b/cuda_core/cuda/core/_memory/_memory_pool.pyi new file mode 100644 index 00000000000..20434e0c52f --- /dev/null +++ b/cuda_core/cuda/core/_memory/_memory_pool.pyi @@ -0,0 +1,127 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_memory/_memory_pool.pyx + +from __future__ import annotations + +import uuid + +from cuda.core._memory._buffer import Buffer, MemoryResource +from cuda.core._stream import Stream +from cuda.core.graph import GraphBuilder +from cuda.core.typing import DevicePointerType + + +class _MemPoolAttributes: + """Provides access to memory pool attributes.""" + + def __init__(self, *args, **kwargs): + ... + + def __repr__(self): + ... 
+ + @property + def reuse_follow_event_dependencies(self): + """Allow memory to be reused when there are event dependencies between streams.""" + + @property + def reuse_allow_opportunistic(self): + """Allow reuse of completed frees without dependencies.""" + + @property + def reuse_allow_internal_dependencies(self): + """Allow insertion of new stream dependencies for memory reuse.""" + + @property + def release_threshold(self): + """Amount of reserved memory to hold before OS release.""" + + @property + def reserved_mem_current(self): + """Current amount of backing memory allocated.""" + + @property + def reserved_mem_high(self): + """High watermark of backing memory allocated.""" + + @property + def used_mem_current(self): + """Current amount of memory in use.""" + + @property + def used_mem_high(self): + """High watermark of memory in use.""" + +class _MemPool(MemoryResource): + + def __cinit__(self): + ... + + def close(self): + """ + Close the memory resource and destroy the associated memory pool + if owned. + """ + + def allocate(self, size: int, *, stream: Stream | GraphBuilder) -> Buffer: + """Allocate a buffer of the requested size. + + Parameters + ---------- + size : int + The size of the buffer to allocate, in bytes. + stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder` + Keyword-only. The stream on which to perform the allocation + asynchronously. Must be passed explicitly; pass + ``device.default_stream`` to use the default stream. + + Returns + ------- + Buffer + The allocated buffer object, which is accessible on the device that this memory + resource was created for. + """ + + def deallocate(self, ptr: DevicePointerType, size: int, *, stream: Stream | GraphBuilder): + """Deallocate a buffer previously allocated by this resource. + + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerType` + The pointer or handle to the buffer to deallocate. + size : int + The size of the buffer to deallocate, in bytes. 
+ stream : :obj:`~_stream.Stream` | :obj:`~graph.GraphBuilder` + Keyword-only. The stream on which to perform the deallocation + asynchronously. Must be passed explicitly; pass + ``device.default_stream`` to use the default stream. + """ + + @property + def attributes(self) -> _MemPoolAttributes: + """Memory pool attributes.""" + + @property + def handle(self) -> object: + """Handle to the underlying memory pool.""" + + @property + def is_handle_owned(self) -> bool: + """Whether the memory resource handle is owned. If False, ``close`` has no effect.""" + + @property + def is_ipc_enabled(self) -> bool: + """Whether this memory resource has IPC enabled.""" + + @property + def is_mapped(self) -> bool: + """ + Whether this is a mapping of an IPC-enabled memory resource from + another process. If True, allocation is not permitted. + """ + + @property + def uuid(self) -> uuid.UUID | None: + """ + A universally unique identifier for this memory resource. Meaningful + only for IPC-enabled memory resources. + """ \ No newline at end of file diff --git a/cuda_core/cuda/core/_memory/_peer_access_utils.pyi b/cuda_core/cuda/core/_memory/_peer_access_utils.pyi new file mode 100644 index 00000000000..95162a395e4 --- /dev/null +++ b/cuda_core/cuda/core/_memory/_peer_access_utils.pyi @@ -0,0 +1,138 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_memory/_peer_access_utils.pyx + +from __future__ import annotations + +from collections.abc import Callable, Iterable, MutableSet +from collections.abc import Set as AbstractSet +from dataclasses import dataclass +from typing import Any + +from cuda.core._memory._device_memory_resource import DeviceMemoryResource + + +@dataclass(frozen=True) +class PeerAccessPlan: + """Normalized peer-access target state and the driver updates it requires.""" + target_ids: tuple[int, ...] + to_add: tuple[int, ...] + to_remove: tuple[int, ...] 
+ +class PeerAccessibleBySetProxy(MutableSet): + """Live driver-backed view of the peer devices granted access to a memory pool. + + Reads (``__contains__``, ``__iter__``, ``len(...)``) call ``cuMemPoolGetAccess``; + writes (``add``, ``discard``, and bulk ops) call ``cuMemPoolSetAccess``. There + is no in-memory mirror, so the view always reflects the current driver state + and stays consistent across multiple wrappers around the same pool. + + Iteration yields :class:`~cuda.core.Device` objects. ``add``, ``discard``, and + ``__contains__`` accept either a :class:`~cuda.core.Device` or a device-ordinal + ``int``; the owner device is silently ignored when supplied. + + All bulk operations (``update``, ``|=``, ``&=``, ``-=``, ``^=``, ``clear``) + issue exactly one ``cuMemPoolSetAccess`` call. This matters: peer-access + transitions can take seconds per pool because every existing memory mapping + is updated, so coalescing into a single driver call lets the toolkit handle + the mappings in parallel. + """ + __slots__ = ('_mr',) + + def __init__(self, mr): + ... + + @classmethod + def _from_iterable(cls, it): + ... + + def __contains__(self, value) -> bool: + ... + + def __iter__(self): + ... + + def __len__(self) -> int: + ... 
+ + def add(self, value) -> None: + """Grant peer access from ``value`` to allocations in this pool.""" + + def discard(self, value) -> None: + """Revoke peer access from ``value`` to allocations in this pool.""" + + def clear(self) -> None: + """Revoke all peer access in a single driver call.""" + + def update(self, *others) -> None: + """Grant peer access to every device in ``others`` in one driver call.""" + + def difference_update(self, *others) -> None: + """Revoke peer access for every device in ``others`` in one driver call.""" + + def intersection_update(self, *others) -> None: + """Restrict peer access to the intersection in a single driver call.""" + + def symmetric_difference_update(self, other) -> None: + """Toggle peer access for every device in ``other`` in one driver call.""" + + def __ior__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: # type: ignore[override,misc] + ... + + def __iand__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: + ... + + def __isub__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: + ... + + def __ixor__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: # type: ignore[override,misc] + ... + + def __repr__(self) -> str: + ... + + def _apply(self, additions, removals) -> None: + """Compute the diff and issue a single ``cuMemPoolSetAccess``. + + ``additions`` and ``removals`` are user-supplied (``Device | int``); + only the owner device is filtered out. Adds are validated through + :meth:`Device.can_access_peer` via :func:`plan_peer_access_update`; + removals bypass that check (revoking is always permitted). + """ + +def replace_peer_accessible_by(mr: DeviceMemoryResource, devices): + """Replace the full peer-access set in a single batched driver call. + + Backs the ``mr.peer_accessible_by = [...]`` setter. 
Uses the same planner + as the proxy's bulk ops; the only difference is that adds and removes are + derived from the symmetric difference between current driver state and the + requested target set. + """ + +def normalize_peer_access_targets(owner_device_id: int, requested_devices: Iterable[object], *, resolve_device_id: Callable[[object], int]) -> tuple[int, ...]: + """Return sorted, unique peer device IDs, excluding the owner device.""" + +def plan_peer_access_update(owner_device_id: int, current_peer_ids: Iterable[int], requested_devices: Iterable[object], *, resolve_device_id: Callable[[object], int], can_access_peer: Callable[[int], bool]) -> PeerAccessPlan: + """Compute the peer-access target state and add/remove deltas.""" + +def _resolve_peer_device_id(value): + """Coerce ``Device | int`` into a device-ordinal int.""" + +def _set_pool_access(mr, to_add: tuple, to_remove: tuple): + """Issue one ``cuMemPoolSetAccess`` for the given add/remove deltas. + + The thin Python-callable layer that wraps the actual driver call: building + the ``CUmemAccessDesc`` array and invoking ``cuMemPoolSetAccess`` happens + in here. Tests monkeypatch this on the module to spy on real driver work + without intercepting earlier no-op paths. + + Preconditions: ``len(to_add) + len(to_remove) > 0`` (the caller is + responsible for skipping empty diffs). + """ + +def _apply_peer_access_diff(mr, to_add, to_remove): + """Apply a peer-access diff in at most one driver call. + + Every write path on :class:`PeerAccessibleBySetProxy` and the + ``peer_accessible_by`` setter routes through this function. Empty diffs + short-circuit here so the driver-level helper :func:`_set_pool_access` is + only invoked when there is actual work for ``cuMemPoolSetAccess`` to do. 
+ """ \ No newline at end of file diff --git a/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyi b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyi new file mode 100644 index 00000000000..03731e0fd19 --- /dev/null +++ b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyi @@ -0,0 +1,148 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx + +from __future__ import annotations + +import uuid +from dataclasses import dataclass + +from cuda.core._memory._ipc import IPCAllocationHandle +from cuda.core._memory._memory_pool import _MemPool + + +@dataclass +class PinnedMemoryResourceOptions: + """Customizable :obj:`~_memory.PinnedMemoryResource` options. + + Attributes + ---------- + ipc_enabled : bool, optional + Specifies whether to create an IPC-enabled memory pool. When set to + True, the memory pool and its allocations can be shared with other + processes. (Default to False) + + max_size : int, optional + Maximum pool size. When set to 0, defaults to a system-dependent value. + (Default to 0) + + numa_id : int or None, optional + Host NUMA node ID for pool placement. When set to None (the default), + the behavior depends on ``ipc_enabled``: + + - ``ipc_enabled=False``: OS-managed placement (location type HOST). + - ``ipc_enabled=True``: automatically derived from the current CUDA + device's ``host_numa_id`` attribute, requiring an active CUDA + context. + + When set to a non-negative integer, that NUMA node is used explicitly + regardless of ``ipc_enabled`` (location type HOST_NUMA). + """ + ipc_enabled: bool = False + max_size: int = 0 + numa_id: int | None = None + +class PinnedMemoryResource(_MemPool): + """ + A host-pinned memory resource managing a stream-ordered memory pool. + + Parameters + ---------- + options : PinnedMemoryResourceOptions + Memory resource creation options. + + If set to `None`, the memory resource uses the driver's current + stream-ordered memory pool. 
If no memory + pool is set as current, the driver's default memory pool + is used. + + If not set to `None`, a new memory pool is created, which is owned by + the memory resource. + + When using an existing (current or default) memory pool, the returned + host-pinned memory resource does not own the pool (`is_handle_owned` is + `False`), and closing the resource has no effect. + + Notes + ----- + To create an IPC-Enabled memory resource (MR) that is capable of sharing + allocations between processes, specify ``ipc_enabled=True`` in the initializer + option. When IPC is enabled and ``numa_id`` is not specified, the NUMA node + is automatically derived from the current CUDA device's ``host_numa_id`` + attribute, which requires an active CUDA context. If ``numa_id`` is + explicitly set, that value is used regardless of ``ipc_enabled``. + + See :class:`DeviceMemoryResource` for more details on IPC usage patterns. + """ + + def __init__(self, options=None): + ... + + def __reduce__(self): + ... + + @staticmethod + def from_registry(uuid: uuid.UUID) -> PinnedMemoryResource: + """ + Obtain a registered mapped memory resource. + + Raises + ------ + RuntimeError + If no mapped memory resource is found in the registry. + """ + + def register(self, uuid: uuid.UUID) -> PinnedMemoryResource: + """ + Register a mapped memory resource. + + Returns + ------- + The registered mapped memory resource. If one was previously registered + with the given key, it is returned. + """ + + @classmethod + def from_allocation_handle(cls, alloc_handle: int | IPCAllocationHandle) -> PinnedMemoryResource: + """Create a host-pinned memory resource from an allocation handle. + + Construct a new `PinnedMemoryResource` instance that imports a memory + pool from a shareable handle. The memory pool is marked as owned. + + Parameters + ---------- + alloc_handle : int | IPCAllocationHandle + The shareable handle of the host-pinned memory resource to import. 
If an + integer is supplied, it must represent a valid platform-specific + handle. It is the caller's responsibility to close that handle. + + Returns + ------- + A new host-pinned memory resource instance with the imported handle. + """ + + @property + def allocation_handle(self) -> IPCAllocationHandle: + """Shareable handle for this memory pool (requires IPC). + + The handle can be used to share the memory pool with other processes. + The handle is cached in this `MemoryResource` and owned by it. + """ + + @property + def device_id(self) -> int: + """Return -1. Pinned memory is host memory and is not associated with a specific device.""" + + @property + def numa_id(self) -> int: + """The host NUMA node ID used for pool placement, or -1 for OS-managed placement.""" + + @property + def is_device_accessible(self) -> bool: + """Return True. This memory resource provides device-accessible buffers.""" + + @property + def is_host_accessible(self) -> bool: + """Return True. This memory resource provides host-accessible buffers.""" +__all__ = ['PinnedMemoryResource', 'PinnedMemoryResourceOptions'] + +def _deep_reduce_pinned_memory_resource(mr): + ... \ No newline at end of file diff --git a/cuda_core/cuda/core/_memoryview.pyi b/cuda_core/cuda/core/_memoryview.pyi new file mode 100644 index 00000000000..c686a16a8be --- /dev/null +++ b/cuda_core/cuda/core/_memoryview.pyi @@ -0,0 +1,305 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_memoryview.pyx + +from __future__ import annotations + +import functools + +import numpy +from cuda.core._layout import _StridedLayout +from cuda.core._memory import Buffer +from cuda.core._stream import Stream + +from ._dlpack import * + + +class StridedMemoryView: + """A class holding metadata of a strided dense array/tensor. + + A :obj:`StridedMemoryView` instance can be created in three ways: + + 1. Using the :obj:`args_viewable_as_strided_memory` decorator (recommended) + 2. 
Explicit construction relying on DLPack or CUDA Array Interface, see below. + 3. From :obj:`~_memory.Buffer` and shape and size tuples (see + :meth:`from_buffer` classmethod) + + ``StridedMemoryView(obj, stream_ptr)`` can be used to create a view from + objects supporting either DLPack (up to v1.0) or CUDA Array Interface + (CAI) v3. When wrapping an arbitrary object it will try the DLPack protocol + first, then the CAI protocol. A :obj:`BufferError` is raised if neither is + supported. + + Since either way would take a consumer stream, for DLPack it is passed to + ``obj.__dlpack__()`` as-is (except for :obj:`None`, see below); for CAI, a + stream order will be established between the consumer stream and the + producer stream (from ``obj.__cuda_array_interface__()["stream"]``), as if + ``cudaStreamWaitEvent`` is called by this method. + + To opt-out of the stream ordering operation in either DLPack or CAI, + please pass ``stream_ptr=-1``. Note that this deviates (on purpose) + from the semantics of ``obj.__dlpack__(stream=None, ...)`` since ``cuda.core`` + does not encourage using the (legacy) default/null stream, but is + consistent with the CAI's semantics. For DLPack, ``stream=-1`` will be + internally passed to ``obj.__dlpack__()`` instead. + + Parameters + ---------- + obj : Any + Any objects that supports either DLPack (up to v1.0) or CUDA Array + Interface (v3). + stream_ptr: int + The pointer address (as Python `int`) to the **consumer** stream. + Stream ordering will be properly established unless ``-1`` is passed. + + + Attributes + ----------- + ptr : int + Pointer to the tensor buffer (as a Python `int`). + device_id : int + The device ID for where the tensor is located. It is -1 for CPU tensors + (meaning those only accessible from the host). + is_device_accessible : bool + Whether the tensor data can be accessed on the GPU. + readonly: bool + Whether the tensor data can be modified in place. 
+ exporting_obj : Any + A reference to the original tensor object that is being viewed. + If the view is created with :meth:`from_buffer`, + it will be the Buffer instance passed to the method. + + """ + + def __init__(self, obj: object=None, stream_ptr: int | None=None) -> None: + ... + + @classmethod + def from_dlpack(cls, obj: object, stream_ptr: int | None=None) -> StridedMemoryView: + """Create a view from an object supporting the `DLPack `_ protocol. + + Parameters + ---------- + obj : object + An object implementing the `DLPack `_ protocol + (via ``__dlpack__``). + stream_ptr : int, optional + Stream pointer for synchronization. If ``None``, no synchronization is performed. + """ + + @classmethod + def from_cuda_array_interface(cls, obj: object, stream_ptr: int | None=None) -> StridedMemoryView: + """Create a view from an object supporting the `__cuda_array_interface__ `_ protocol. + + Parameters + ---------- + obj : object + An object implementing the `__cuda_array_interface__ `_ protocol. + stream_ptr : int, optional + Stream pointer for synchronization. If ``None``, no synchronization is performed. + """ + + @classmethod + def from_array_interface(cls, obj: object) -> StridedMemoryView: + """Create a view from an object supporting the `__array_interface__ `_ protocol. + + Parameters + ---------- + obj : object + An object implementing the `__array_interface__ `_ protocol (e.g., a numpy array). + """ + + @classmethod + def from_any_interface(cls, obj: object, stream_ptr: int | None=None) -> StridedMemoryView: + """Create a view by automatically selecting the best available protocol. + + Tries `DLPack `_ first, then falls back to + `__cuda_array_interface__ `_. + ``torch.Tensor`` objects are transparently handled via a fast AOTI path + regardless of which protocol is selected. + + Parameters + ---------- + obj : object + An object implementing `DLPack `_ or + `__cuda_array_interface__ `_. + stream_ptr : int, optional + Stream pointer for synchronization. 
If ``None``, no synchronization is performed.
+        """
+
+    @classmethod
+    def from_buffer(cls, buffer: Buffer, shape: tuple[int, ...], strides: tuple[int, ...] | None=None, *, itemsize: int | None=None, dtype: numpy.dtype | None=None, is_readonly: bool=False) -> StridedMemoryView:
+        """
+        Creates a :obj:`StridedMemoryView` instance from a :obj:`~_memory.Buffer` and shape and strides tuples.
+        The Buffer can be either allocation coming from a :obj:`MemoryResource` or an external allocation
+        wrapped in a :obj:`~_memory.Buffer` object with ``Buffer.from_handle(ptr, size, owner=...)``.
+
+        .. caution::
+            When creating a :obj:`StridedMemoryView` from a :obj:`~_memory.Buffer`,
+            no synchronization is performed. It is the user's responsibility to ensure
+            the data in ``buffer`` is properly synchronized when consuming the view.
+
+        Parameters
+        ----------
+        buffer : :obj:`~_memory.Buffer`
+            The buffer to create the view from.
+        shape : :obj:`tuple`
+            The shape of the view, i.e. the number of elements in each
+            dimension.
+        strides : :obj:`tuple`
+            The strides of the elements in the buffer (in **counts**, not
+            bytes).
+        dtype : :obj:`numpy.dtype`
+            Optional dtype.
+            If specified, the dtype's itemsize must match the layout's itemsize.
+        is_readonly : bool, optional
+            Whether to mark the view as readonly.
+        """
+
+    def __dealloc__(self):
+        ...
+
+    def view(self, layout: _StridedLayout | None=None, dtype: numpy.dtype | None=None) -> StridedMemoryView:
+        """
+        Creates a new view with adjusted layout and dtype.
+        Same as calling :meth:`from_buffer` with the current buffer.
+        """
+
+    def as_tensor_map(self, box_dim=None, *, options=None, element_strides=None, data_type=None, interleave=None, swizzle=None, l2_promotion=None, oob_fill=None):
+        """Create a tiled :obj:`TensorMapDescriptor` from this view.
+
+        This is the public entry point for creating tiled tensor map
+        descriptors in ``cuda.core``.
Pass either ``box_dim`` and the + individual keyword arguments directly, or provide bundled tiled + options via ``options=``. + """ + + def copy_from(self, other: StridedMemoryView, stream: Stream, allocator=None, blocking: bool | None=None): + """ + Copies the data from the other view into this view. + + The copy can be performed between following memory spaces: + host-to-device, device-to-host, device-to-device (on the same device). + + Parameters + ---------- + other : StridedMemoryView + The view to copy data from. + stream : Stream | None, optional + The stream to schedule the copy on. + allocator : MemoryResource | None, optional + If temporary buffers are needed, the specified memory resources + will be used to allocate the memory. If not specified, default + resources will be used. + blocking : bool | None, optional + Whether the call should block until the copy is complete. + * ``True``: the ``stream`` is synchronized with the host at the end of the call, + blocking until the copy is complete. + * ``False``: if possible, the call returns immediately once the copy is scheduled. + However, in some cases of host-to-device or device-to-host copies, the call may + still synchronize with the host if necessary. + * ``None`` (default): + * for device-to-device, it defaults to ``False`` (non-blocking), + * for host-to-device or device-to-host, it defaults to ``True`` (blocking). + """ + + def copy_to(self, other: StridedMemoryView, stream: Stream | None=None, allocator=None, blocking: bool | None=None): + """ + Copies the data from this view into the ``other`` view. + + For details, see :meth:`copy_from`. + """ + + def __dlpack__(self, *, stream: int | None=None, max_version: tuple[int, int] | None=None, dl_device: tuple[int, int] | None=None, copy: bool | None=None): + ... + + def __dlpack_device__(self) -> tuple[int, int]: + ... + + @property + def _layout(self) -> _StridedLayout: + """ + The layout of the tensor. 
For StridedMemoryView created from DLPack or CAI, + the layout is inferred from the tensor object's metadata. + """ + + @property + def size(self) -> int: + ... + + @property + def shape(self) -> tuple[int, ...]: + """ + Shape of the tensor. + """ + + @property + def strides(self) -> tuple[int, ...] | None: + """ + Strides of the tensor (in **counts**, not bytes). + """ + + @property + def dtype(self) -> numpy.dtype | None: + """ + Data type of the tensor. + + Supports standard NumPy dtypes as well as narrow data types (e.g., ``bfloat16``) + when the optional `ml_dtypes `_ package is + installed. If ``ml_dtypes`` is not available and such a tensor is encountered, + a :obj:`NotImplementedError` will be raised. + """ + + def __repr__(self): + ... + +class _StridedMemoryViewProxy: + + def view(self, stream_ptr=None) -> StridedMemoryView: + ... + + def __init__(self, obj): + ... +_SMV_DLPACK_EXCHANGE_API_CAPSULE = ... + +def view_as_cai(obj, stream_ptr, view=None) -> StridedMemoryView: + ... + +def view_as_array_interface(obj, view=None) -> StridedMemoryView: + ... + +@functools.lru_cache +def _typestr2dtype(typestr: str): + ... + +@functools.lru_cache +def _typestr2itemsize(typestr: str): + ... + +def args_viewable_as_strided_memory(arg_indices: tuple): + """ + Decorator to create proxy objects to :obj:`StridedMemoryView` for the + specified positional arguments. + + This allows array/tensor attributes to be accessed inside the function + implementation, while keeping the function body array-library-agnostic (if + desired). + + Inside the decorated function, the specified arguments become instances + of an (undocumented) proxy type, regardless of its original source. A + :obj:`StridedMemoryView` instance can be obtained by passing the (consumer) + stream pointer (as a Python `int`) to the proxies's ``view()`` method. For + example: + + .. 
code-block:: python + + @args_viewable_as_strided_memory((1,)) + def my_func(arg0, arg1, arg2, stream: Stream): + # arg1 can be any object supporting DLPack or CUDA Array Interface + view = arg1.view(stream.handle) + assert isinstance(view, StridedMemoryView) + ... + + Parameters + ---------- + arg_indices : tuple + The indices of the target positional arguments. + """ \ No newline at end of file diff --git a/cuda_core/cuda/core/_module.pyi b/cuda_core/cuda/core/_module.pyi new file mode 100644 index 00000000000..f6c6e341d8a --- /dev/null +++ b/cuda_core/cuda/core/_module.pyi @@ -0,0 +1,489 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_module.pyx + +from __future__ import annotations + +from collections import namedtuple + +from cuda.core._launch_config import LaunchConfig +from cuda.core._stream import Stream +from cuda.core._utils.cuda_utils import driver + + +class KernelAttributes: + """Read-only view of a kernel's per-device attributes. + + The default view returned by :attr:`Kernel.attributes` is bound to + the current device, resolved at attribute-access time. Use + ``kernel.attributes[device]`` to obtain a view bound to a specific + device (an :class:`int` device ordinal or :class:`Device`). Per-device + views share the underlying cache so a value queried through one view + is visible through the others. + """ + + def __init__(self, *args, **kwargs): + ... + + def __getitem__(self, device) -> KernelAttributes: + """Return a view of these attributes bound to a specific device. + + Parameters + ---------- + device : Device or int + The device whose attributes to query. Accepts a :class:`Device` + or a device ordinal (:class:`int`). + + Returns + ------- + KernelAttributes + A view bound to ``device`` that shares the underlying cache + with this view. + """ + + @property + def max_threads_per_block(self) -> int: + """int : The maximum number of threads per block. 
+ This attribute is read-only.""" + + @property + def shared_size_bytes(self) -> int: + """int : The size in bytes of statically-allocated shared memory required by this function. + This attribute is read-only.""" + + @property + def const_size_bytes(self) -> int: + """int : The size in bytes of user-allocated constant memory required by this function. + This attribute is read-only.""" + + @property + def local_size_bytes(self) -> int: + """int : The size in bytes of local memory used by each thread of this function. + This attribute is read-only.""" + + @property + def num_regs(self) -> int: + """int : The number of registers used by each thread of this function. + This attribute is read-only.""" + + @property + def ptx_version(self) -> int: + """int : The PTX virtual architecture version for which the function was compiled. + This attribute is read-only.""" + + @property + def binary_version(self) -> int: + """int : The binary architecture version for which the function was compiled. + This attribute is read-only.""" + + @property + def cache_mode_ca(self) -> bool: + """bool : Whether the function has been compiled with user specified option "-Xptxas --dlcm=ca" set. + This attribute is read-only.""" + + @property + def max_dynamic_shared_size_bytes(self) -> int: + """int : The maximum size in bytes of dynamically-allocated shared memory that can be used + by this function.""" + + @property + def preferred_shared_memory_carveout(self) -> int: + """int : The shared memory carveout preference, in percent of the total shared memory.""" + + @property + def cluster_size_must_be_set(self) -> bool: + """bool : The kernel must launch with a valid cluster size specified. 
+ This attribute is read-only.""" + + @property + def required_cluster_width(self) -> int: + """int : The required cluster width in blocks.""" + + @property + def required_cluster_height(self) -> int: + """int : The required cluster height in blocks.""" + + @property + def required_cluster_depth(self) -> int: + """int : The required cluster depth in blocks.""" + + @property + def non_portable_cluster_size_allowed(self) -> bool: + """bool : Whether the function can be launched with non-portable cluster size.""" + + @property + def cluster_scheduling_policy_preference(self) -> int: + """int : The block scheduling policy of a function.""" + +class KernelOccupancy: + """This class offers methods to query occupancy metrics that help determine optimal + launch parameters such as block size, grid size, and shared memory usage. + """ + + def __init__(self, *args, **kwargs): + ... + + def max_active_blocks_per_multiprocessor(self, block_size: int, dynamic_shared_memory_size: int) -> int: + """Occupancy of the kernel. + + Returns the maximum number of active blocks per multiprocessor for this kernel. + + Parameters + ---------- + block_size: int + Block size parameter used to launch this kernel. + dynamic_shared_memory_size: int + The amount of dynamic shared memory in bytes needed by block. + Use `0` if block does not need shared memory. + + Returns + ------- + int + The maximum number of active blocks per multiprocessor. + + Note + ---- + The fraction of the product of maximum number of active blocks per multiprocessor + and the block size to the maximum number of threads per multiprocessor is known as + theoretical multiprocessor utilization (occupancy). + + """ + + def max_potential_block_size(self, dynamic_shared_memory_needed: int | driver.CUoccupancyB2DSize, block_size_limit: int) -> MaxPotentialBlockSizeOccupancyResult: + """MaxPotentialBlockSizeOccupancyResult: Suggested launch configuration for reasonable occupancy. 
+
+        Returns the minimum grid size needed to achieve the maximum occupancy and
+        the maximum block size that can achieve the maximum occupancy.
+
+        Parameters
+        ----------
+        dynamic_shared_memory_needed: Union[int, driver.CUoccupancyB2DSize]
+            The amount of dynamic shared memory in bytes needed by block.
+            Use `0` if block does not need shared memory. Use C-callable
+            represented by :obj:`~driver.CUoccupancyB2DSize` to encode
+            amount of needed dynamic shared memory which varies depending
+            on the block size.
+        block_size_limit: int
+            Known upper limit on the kernel block size. Use `0` to indicate
+            the maximum block size permitted by the device / kernel instead.
+
+        Returns
+        -------
+        :obj:`~MaxPotentialBlockSizeOccupancyResult`
+            An object with `min_grid_size` and `max_block_size` attributes encoding
+            the suggested launch configuration.
+
+        Note
+        ----
+        Please be advised that use of C-callable that requires Python Global
+        Interpreter Lock may lead to deadlocks.
+
+        """
+
+    def available_dynamic_shared_memory_per_block(self, num_blocks_per_multiprocessor: int, block_size: int) -> int:
+        """Dynamic shared memory available per block for given launch configuration.
+
+        The amount of dynamic shared memory per block, in bytes, for given kernel launch configuration.
+
+        Parameters
+        ----------
+        num_blocks_per_multiprocessor: int
+            Number of blocks to be concurrently executing on a multiprocessor.
+        block_size: int
+            Block size parameter used to launch this kernel.
+
+        Returns
+        -------
+        int
+            Dynamic shared memory available per block for given launch configuration.
+        """
+
+    def max_potential_cluster_size(self, config: LaunchConfig, *, stream: Stream) -> int:
+        """Maximum potential cluster size.
+
+        The maximum potential cluster size for this kernel and given launch configuration.
+
+        Parameters
+        ----------
+        config: :obj:`~_launch_config.LaunchConfig`
+            Kernel launch configuration. Cluster dimensions in the configuration are ignored.
+        stream: :obj:`~Stream`
+            Keyword-only. The stream on which this kernel is to be launched.
+            Must be passed explicitly; pass ``device.default_stream`` to
+            use the default stream.
+
+        Returns
+        -------
+        int
+            The maximum cluster size that can be launched for this kernel and launch configuration.
+        """
+
+    def max_active_clusters(self, config: LaunchConfig, *, stream: Stream) -> int:
+        """Maximum number of active clusters on the target device.
+
+        The maximum number of clusters that could concurrently execute on the target device.
+
+        Parameters
+        ----------
+        config: :obj:`~_launch_config.LaunchConfig`
+            Kernel launch configuration.
+        stream: :obj:`~Stream`
+            Keyword-only. The stream on which this kernel is to be launched.
+            Must be passed explicitly; pass ``device.default_stream`` to
+            use the default stream.
+
+        Returns
+        -------
+        int
+            The maximum number of clusters that could co-exist on the target device.
+        """
+
+class Kernel:
+    """Represent a compiled kernel that had been loaded onto the device.
+
+    Kernel instances can be executed when passed directly into the
+    :func:`~launch` function.
+
+    Directly creating a :obj:`~_module.Kernel` is not supported, and they
+    should instead be created through a :obj:`~_module.ObjectCode` object.
+
+    """
+
+    def __init__(self, *args, **kwargs):
+        ...
+
+    @property
+    def attributes(self) -> KernelAttributes:
+        """Get the read-only attributes of this kernel."""
+
+    @property
+    def num_arguments(self) -> int:
+        """int : The number of arguments of this function"""
+
+    @property
+    def arguments_info(self) -> list[ParamInfo]:
+        """list[ParamInfo]: (offset, size) for each argument of this function"""
+
+    @property
+    def occupancy(self) -> KernelOccupancy:
+        """Get the occupancy information for launching this kernel."""
+
+    @property
+    def handle(self):
+        """Return the underlying kernel handle object.
+
+        .. caution::
+
+            This handle is a Python object.
To get the memory address of the underlying C + handle, call ``int(Kernel.handle)``. + """ + + @property + def _handle(self): + ... + + @staticmethod + def from_handle(handle, mod: ObjectCode | None=None) -> Kernel: + """Creates a new :obj:`Kernel` object from a kernel handle. + + Parameters + ---------- + handle : int + Kernel handle representing the address of a foreign + kernel object (CUkernel). + mod : :obj:`ObjectCode`, optional + The ObjectCode object associated with this kernel. Provides + library lifetime for foreign kernels not created by + cuda.core. + """ + + def __eq__(self, other) -> bool: + ... + + def __hash__(self) -> int: + ... + + def __repr__(self) -> str: + ... + +class ObjectCode: + """Represent a compiled program to be loaded onto the device. + + This object provides a unified interface for different types of + compiled programs that will be loaded onto the device. + + Note + ---- + This class has no default constructor. If you already have a cubin that you would + like to load, use the :meth:`from_cubin` alternative constructor. Constructing directly + from all other possible code types should be avoided in favor of compilation through + :class:`~cuda.core.Program` + """ + + def __init__(self, *args, **kwargs): + ... + + @classmethod + def _init(cls, module, code_type, *, name: str='', symbol_mapping: dict | None=None): + ... + + @staticmethod + def _reduce_helper(module, code_type, name, symbol_mapping): + ... + + def __reduce__(self): + ... + + @staticmethod + def from_cubin(module: bytes | str, *, name: str='', symbol_mapping: dict | None=None) -> ObjectCode: + """Create an :class:`ObjectCode` instance from an existing cubin. + + Parameters + ---------- + module : Union[bytes, str] + Either a bytes object containing the in-memory cubin to load, or + a file path string pointing to the on-disk cubin to load. + name : Optional[str] + A human-readable identifier representing this code object. 
+ symbol_mapping : Optional[dict] + A dictionary specifying how the unmangled symbol names (as keys) + should be mapped to the mangled names before trying to retrieve + them (default to no mappings). + """ + + @staticmethod + def from_ptx(module: bytes | str, *, name: str='', symbol_mapping: dict | None=None) -> ObjectCode: + """Create an :class:`ObjectCode` instance from an existing PTX. + + Parameters + ---------- + module : Union[bytes, str] + Either a bytes object containing the in-memory ptx code to load, or + a file path string pointing to the on-disk ptx file to load. + name : Optional[str] + A human-readable identifier representing this code object. + symbol_mapping : Optional[dict] + A dictionary specifying how the unmangled symbol names (as keys) + should be mapped to the mangled names before trying to retrieve + them (default to no mappings). + """ + + @staticmethod + def from_ltoir(module: bytes | str, *, name: str='', symbol_mapping: dict | None=None) -> ObjectCode: + """Create an :class:`ObjectCode` instance from an existing LTOIR. + + Parameters + ---------- + module : Union[bytes, str] + Either a bytes object containing the in-memory ltoir code to load, or + a file path string pointing to the on-disk ltoir file to load. + name : Optional[str] + A human-readable identifier representing this code object. + symbol_mapping : Optional[dict] + A dictionary specifying how the unmangled symbol names (as keys) + should be mapped to the mangled names before trying to retrieve + them (default to no mappings). + """ + + @staticmethod + def from_fatbin(module: bytes | str, *, name: str='', symbol_mapping: dict | None=None) -> ObjectCode: + """Create an :class:`ObjectCode` instance from an existing fatbin. + + Parameters + ---------- + module : Union[bytes, str] + Either a bytes object containing the in-memory fatbin to load, or + a file path string pointing to the on-disk fatbin to load. 
+ name : Optional[str] + A human-readable identifier representing this code object. + symbol_mapping : Optional[dict] + A dictionary specifying how the unmangled symbol names (as keys) + should be mapped to the mangled names before trying to retrieve + them (default to no mappings). + """ + + @staticmethod + def from_object(module: bytes | str, *, name: str='', symbol_mapping: dict | None=None) -> ObjectCode: + """Create an :class:`ObjectCode` instance from an existing object code. + + Parameters + ---------- + module : Union[bytes, str] + Either a bytes object containing the in-memory object code to load, or + a file path string pointing to the on-disk object code to load. + name : Optional[str] + A human-readable identifier representing this code object. + symbol_mapping : Optional[dict] + A dictionary specifying how the unmangled symbol names (as keys) + should be mapped to the mangled names before trying to retrieve + them (default to no mappings). + """ + + @staticmethod + def from_library(module: bytes | str, *, name: str='', symbol_mapping: dict | None=None) -> ObjectCode: + """Create an :class:`ObjectCode` instance from an existing library. + + Parameters + ---------- + module : Union[bytes, str] + Either a bytes object containing the in-memory library to load, or + a file path string pointing to the on-disk library to load. + name : Optional[str] + A human-readable identifier representing this code object. + symbol_mapping : Optional[dict] + A dictionary specifying how the unmangled symbol names (as keys) + should be mapped to the mangled names before trying to retrieve + them (default to no mappings). + """ + + def get_kernel(self, name) -> Kernel: + """Return the :obj:`~_module.Kernel` of a specified name from this object code. + + Parameters + ---------- + name : str | bytes + Name of the kernel to retrieve. + + Returns + ------- + :obj:`~_module.Kernel` + Newly created kernel object. 
+ + """ + + @property + def code(self) -> CodeTypeT: + """Return the underlying code object.""" + + @property + def name(self) -> str: + """Return a human-readable name of this code object.""" + + @property + def code_type(self) -> str: + """Return the type of the underlying code object.""" + + @property + def symbol_mapping(self) -> dict: + """Return a copy of the symbol mapping dictionary.""" + + @property + def handle(self): + """Return the underlying handle object. + + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int(ObjectCode.handle)``. + """ + + def __eq__(self, other) -> bool: + ... + + def __hash__(self) -> int: + ... + + def __repr__(self) -> str: + ... +__all__ = ['Kernel', 'ObjectCode'] +MaxPotentialBlockSizeOccupancyResult = namedtuple('MaxPotentialBlockSizeOccupancyResult', ('min_grid_size', 'max_block_size')) +ParamInfo = namedtuple('ParamInfo', ['offset', 'size']) +CodeTypeT = bytes | bytearray | str \ No newline at end of file diff --git a/cuda_core/cuda/core/_program.pyi b/cuda_core/cuda/core/_program.pyi new file mode 100644 index 00000000000..62b2e8650ef --- /dev/null +++ b/cuda_core/cuda/core/_program.pyi @@ -0,0 +1,440 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_program.pyx + +"""Compilation machinery for CUDA programs. + +This module provides :class:`Program` for compiling source code into +:class:`~cuda.core.ObjectCode`, with :class:`ProgramOptions` for configuration. 
+""" +from __future__ import annotations + +from dataclasses import dataclass + +from cuda.bindings import nvrtc +from cuda.core._linker import LinkerHandleT +from cuda.core._module import ObjectCode +from cuda.core.typing import (CompilerBackendType, ObjectCodeFormatType, + PCHStatusType, SourceCodeType) +from cuda.core.utils._program_cache import ProgramCacheResource + + +class Program: + """Represent a compilation machinery to process programs into + :class:`~cuda.core.ObjectCode`. + + This object provides a unified interface to multiple underlying + compiler libraries. Compilation support is enabled for a wide + range of code types and compilation types. + + Parameters + ---------- + code : str | bytes | bytearray + The source code to compile. For C++ and PTX, must be a string. + For NVVM IR, can be str, bytes, or bytearray. + code_type : SourceCodeType | str + The type of source code. Must be one of ``"c++"``, ``"ptx"``, or ``"nvvm"``. + options : :class:`ProgramOptions`, optional + Options to customize the compilation process. + """ + + def __init__(self, code: str | bytes | bytearray, code_type: SourceCodeType | str, options: ProgramOptions | None=None): + ... + + def close(self): + """Destroy this program.""" + + def compile(self, target_type: ObjectCodeFormatType | str, name_expressions: tuple | list=..., logs=None, *, cache: ProgramCacheResource | None=None) -> ObjectCode: + """Compile the program to the specified target type. + + Parameters + ---------- + target_type : ObjectCodeFormatType | str + The compilation target. Must be one of ``"ptx"``, ``"cubin"``, or ``"ltoir"``. + name_expressions : tuple | list, optional + Sequence of name expressions to make accessible in the compiled code. + Used for template instantiation and similar cases. + logs : object, optional + Object with a ``write`` method to receive compilation logs. 
+ On a cache hit no compilation runs and ``logs`` receives + nothing -- callers that rely on log output to confirm a + compile happened should compile without ``cache=``. + cache : :class:`~cuda.core.utils.ProgramCacheResource`, optional + If provided, the compiled binary is looked up in ``cache`` via a + key derived from the program's code, options, and ``target_type``. + On a hit the cached bytes are wrapped in a fresh + :class:`~cuda.core.ObjectCode` (with the same ``target_type`` + and ``ProgramOptions.name``) and returned without re-compiling; + on a miss the compile output is stored as raw bytes (the cache + extracts ``bytes(object_code.code)``). Passing a non-empty + ``name_expressions`` together with ``cache=`` raises + ``ValueError``: NVRTC populates + ``ObjectCode.symbol_mapping`` at compile time and that mapping + is not carried in the binary the cache stores, so cache hits + would silently miss ``get_kernel(name_expression)`` lookups. + Options that require an ``extra_digest`` (``include_path``, + ``pre_include``, ``pch``, ``use_pch``, ``pch_dir``, NVVM + ``use_libdevice=True``, or NVRTC ``options.name`` with a + directory component) raise ``ValueError`` via + :func:`~cuda.core.utils.make_program_cache_key`; for those + compiles, use the manual ``make_program_cache_key(...)`` + pattern directly. + + ``cache=`` is independent of ``ProgramOptions.no_cache``: the + former controls this program-level cache (compiled-output + reuse across calls), while ``no_cache`` is forwarded to the + Linker to disable its in-process JIT cache for cuLink/nvJitLink. + Setting ``options.no_cache=True`` does not bypass ``cache=``, + and vice-versa. + + Returns + ------- + :class:`~cuda.core.ObjectCode` + The compiled object code. + """ + + @property + def pch_status(self) -> PCHStatusType | None: + """PCH creation outcome from the most recent :meth:`compile` call. + + Possible values: + + * ``"created"`` — PCH file was written successfully. 
+ * ``"not_attempted"`` — PCH creation was not attempted (e.g. the + compiler decided not to, or automatic PCH processing skipped it). + * ``"failed"`` — an error prevented PCH creation. + * ``None`` — PCH was not requested, the program has not been + compiled yet, the backend is not NVRTC (e.g. PTX or NVVM), + or the NVRTC bindings are too old to report status. + + When ``create_pch`` is set in :class:`ProgramOptions` and the PCH + heap is too small, :meth:`compile` automatically resizes the heap + and retries, so ``"created"`` should be the common outcome. + + .. note:: + + PCH is only supported for ``code_type="c++"`` programs that + use the NVRTC backend. For PTX and NVVM programs this property + always returns ``None``. + """ + + @property + def backend(self) -> CompilerBackendType: + """Return this Program instance's underlying :class:`CompilerBackendType`.""" + + @property + def handle(self) -> ProgramHandleT: + """Return the underlying handle object. + + .. note:: + + The type of the returned object depends on the backend. + + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int(Program.handle)``. + """ + + def __repr__(self) -> str: + ... + +@dataclass +class ProgramOptions: + """Customizable options for configuring :class:`Program`. + + Attributes + ---------- + name : str, optional + Name of the program. If the compilation succeeds, the name is passed down to the generated :class:`ObjectCode`. + arch : str, optional + Pass the SM architecture value, such as ``sm_`` (for generating CUBIN) or + ``compute_`` (for generating PTX). If not provided, the current device's architecture + will be used. + relocatable_device_code : bool, optional + Enable (disable) the generation of relocatable device code. + Default: False + extensible_whole_program : bool, optional + Do extensible whole program compilation of device code. + Default: False + debug : bool, optional + Generate debug information. 
If --dopt is not specified, then turns off all optimizations. + Default: False + lineinfo: bool, optional + Generate line-number information. + Default: False + device_code_optimize : bool, optional + Enable device code optimization. When specified along with '-G', enables limited debug information generation + for optimized device code. + Default: None + ptxas_options : Union[str, list[str]], optional + Specify one or more options directly to ptxas, the PTX optimizing assembler. Options should be strings. + For example ["-v", "-O2"]. + Default: None + max_register_count : int, optional + Specify the maximum amount of registers that GPU functions can use. + Default: None + ftz : bool, optional + When performing single-precision floating-point operations, flush denormal values to zero or preserve denormal + values. + Default: False + prec_sqrt : bool, optional + For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation. + Default: True + prec_div : bool, optional + For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster + approximation. + Default: True + fma : bool, optional + Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point + multiply-add operations. + Default: True + use_fast_math : bool, optional + Make use of fast math operations. + Default: False + extra_device_vectorization : bool, optional + Enables more aggressive device code vectorization in the NVVM optimizer. + Default: False + link_time_optimization : bool, optional + Generate intermediate code for later link-time optimization. + Default: False + gen_opt_lto : bool, optional + Run the optimizer passes before generating the LTO IR. + Default: False + define_macro : Union[str, tuple[str, str], list[Union[str, tuple[str, str]]]], optional + Predefine a macro. 
Can be either a string, in which case that macro will be set to 1, a 2 element tuple of + strings, in which case the first element is defined as the second, or a list of strings or tuples. + Default: None + undefine_macro : Union[str, list[str]], optional + Cancel any previous definition of a macro, or list of macros. + Default: None + include_path : Union[str, list[str]], optional + Add the directory or directories to the list of directories to be searched for headers. + Default: None + pre_include : Union[str, list[str]], optional + Preinclude one or more headers during preprocessing. Can be either a string or a list of strings. + Default: None + no_source_include : bool, optional + Disable the default behavior of adding the directory of each input source to the include path. + Default: False + std : str, optional + Set language dialect to C++03, C++11, C++14, C++17 or C++20. + Default: c++17 + builtin_move_forward : bool, optional + Provide builtin definitions of std::move and std::forward. + Default: True + builtin_initializer_list : bool, optional + Provide builtin definitions of std::initializer_list class and member functions. + Default: True + disable_warnings : bool, optional + Inhibit all warning messages. + Default: False + restrict : bool, optional + Programmer assertion that all kernel pointer parameters are restrict pointers. + Default: False + device_as_default_execution_space : bool, optional + Treat entities with no execution space annotation as __device__ entities. + Default: False + device_int128 : bool, optional + Allow the __int128 type in device code. + Default: False + optimization_info : str, optional + Provide optimization reports for the specified kind of optimization. + Default: None + no_display_error_number : bool, optional + Disable the display of a diagnostic number for warning messages. 
+ Default: False + diag_error : Union[int, list[int]], optional + Emit error for a specified diagnostic message number or comma-separated list of numbers. + Default: None + diag_suppress : Union[int, list[int]], optional + Suppress a specified diagnostic message number or comma-separated list of numbers. + Default: None + diag_warn : Union[int, list[int]], optional + Emit warning for a specified diagnostic message number or comma-separated list of numbers. + Default: None + brief_diagnostics : bool, optional + Disable or enable showing source line and column info in a diagnostic. + Default: False + time : str, optional + Generate a CSV table with the time taken by each compilation phase. + Default: None + split_compile : int, optional + Perform compiler optimizations in parallel. + Default: 1 + fdevice_syntax_only : bool, optional + Ends device compilation after front-end syntax checking. + Default: False + minimal : bool, optional + Omit certain language features to reduce compile time for small programs. + Default: False + no_cache : bool, optional + Disable compiler caching. + Default: False + fdevice_time_trace : str, optional + Generate time trace JSON for profiling compilation (NVRTC only). + Default: None + device_float128 : bool, optional + Allow __float128 type in device code (NVRTC only). + Default: False + frandom_seed : str, optional + Set random seed for randomized optimizations (NVRTC only). + Default: None + ofast_compile : str, optional + Fast compilation mode: "0", "min", "mid", or "max" (NVRTC only). + Default: None + pch : bool, optional + Use default precompiled header (NVRTC only, CUDA 12.8+). + Default: False + create_pch : str, optional + Create precompiled header file (NVRTC only, CUDA 12.8+). + Default: None + use_pch : str, optional + Use specific precompiled header file (NVRTC only, CUDA 12.8+). + Default: None + pch_dir : str, optional + PCH directory location (NVRTC only, CUDA 12.8+). 
+ Default: None + pch_verbose : bool, optional + Verbose PCH output (NVRTC only, CUDA 12.8+). + Default: False + pch_messages : bool, optional + Control PCH diagnostic messages (NVRTC only, CUDA 12.8+). + Default: False + instantiate_templates_in_pch : bool, optional + Control template instantiation in PCH (NVRTC only, CUDA 12.8+). + Default: False + extra_sources : list of 2-tuples or tuple of 2-tuples, optional + Additional NVVM IR modules to compile together with the main program, specified as + ``((name1, source1), (name2, source2), ...)``. Each name is a string identifier used + in diagnostic messages. Each source can be a string (textual LLVM IR) or bytes/bytearray + (LLVM bitcode). Only supported for the NVVM backend. + Default: None + use_libdevice : bool, optional + Load NVIDIA's `libdevice `_ + math builtins library. Only supported for the NVVM backend. + Default: False + """ + name: str | None = 'default_program' + arch: str | None = None + relocatable_device_code: bool | None = None + extensible_whole_program: bool | None = None + debug: bool | None = None + lineinfo: bool | None = None + device_code_optimize: bool | None = None + ptxas_options: str | list[str] | tuple[str] | None = None + max_register_count: int | None = None + ftz: bool | None = None + prec_sqrt: bool | None = None + prec_div: bool | None = None + fma: bool | None = None + use_fast_math: bool | None = None + extra_device_vectorization: bool | None = None + link_time_optimization: bool | None = None + gen_opt_lto: bool | None = None + define_macro: str | tuple[str, str] | list[str | tuple[str, str]] | tuple[str | tuple[str, str], ...] 
| None = None + undefine_macro: str | list[str] | tuple[str] | None = None + include_path: str | list[str] | tuple[str] | None = None + pre_include: str | list[str] | tuple[str] | None = None + no_source_include: bool | None = None + std: str | None = None + builtin_move_forward: bool | None = None + builtin_initializer_list: bool | None = None + disable_warnings: bool | None = None + restrict: bool | None = None + device_as_default_execution_space: bool | None = None + device_int128: bool | None = None + optimization_info: str | None = None + no_display_error_number: bool | None = None + diag_error: int | list[int] | tuple[int] | None = None + diag_suppress: int | list[int] | tuple[int] | None = None + diag_warn: int | list[int] | tuple[int] | None = None + brief_diagnostics: bool | None = None + time: str | None = None + split_compile: int | None = None + fdevice_syntax_only: bool | None = None + minimal: bool | None = None + no_cache: bool | None = None + fdevice_time_trace: str | None = None + device_float128: bool | None = None + frandom_seed: str | None = None + ofast_compile: str | None = None + pch: bool | None = None + create_pch: str | None = None + use_pch: str | None = None + pch_dir: str | None = None + pch_verbose: bool | None = None + pch_messages: bool | None = None + instantiate_templates_in_pch: bool | None = None + extra_sources: list[tuple[str, str | bytes | bytearray]] | tuple[tuple[str, str | bytes | bytearray], ...] | None = None + use_libdevice: bool | None = None + numba_debug: bool | None = None + + def __post_init__(self): + ... + + def _prepare_nvrtc_options(self) -> list[bytes]: + ... + + def _prepare_nvvm_options(self, as_bytes: bool=True) -> list[bytes] | list[str]: + ... + + def as_bytes(self, backend: CompilerBackendType | str, target_type: ObjectCodeFormatType | str | None=None) -> list[bytes]: + """Convert program options to bytes format for the specified backend. 
+ + This method transforms the program options into a format suitable for the + specified compiler backend. Different backends may use different option names + and formats even for the same conceptual options. + + Parameters + ---------- + backend : CompilerBackendType | str + The compiler backend to prepare options for. Must be either "nvrtc" or "nvvm". + target_type : ObjectCodeFormatType | str, optional + The compilation target type (e.g., "ptx", "cubin", "ltoir"). Some backends + require additional options based on the target type. + + Returns + ------- + list[bytes] + List of option strings encoded as bytes. + + Raises + ------ + ValueError + If an unknown backend is specified. + CUDAError + If an option incompatible with the specified backend is set. + + Examples + -------- + >>> options = ProgramOptions(arch="sm_80", debug=True) + >>> nvrtc_options = options.as_bytes("nvrtc") + """ + + def __repr__(self): + ... + + def _prepare_extra_sources_bytes(self) -> list[tuple[bytes, bytes]] | None: + """Convert extra_sources to bytes format for NVVM.""" +__all__ = ['Program', 'ProgramOptions'] +ProgramHandleT = nvrtc.nvrtcProgram | int | LinkerHandleT +_nvvm_module = None +_nvvm_import_attempted = False + +def _can_load_generated_ptx() -> bool: + """Check if the driver can load PTX generated by the current NVRTC version.""" + +def _program_compile_uncached(program, target_type, name_expressions, logs): + """Run ``Program_compile`` without the cache wrapper. + + Module-level Python function so tests can monkeypatch it from + ``cuda.core._program`` to avoid invoking NVRTC when exercising the cache + wrapper in :meth:`Program.compile`. ``Program`` itself is a ``cdef class`` + and its methods cannot be reassigned from Python, so the seam must live + outside the class. 
+ """ + +def _get_nvvm_module(): + """Get the NVVM module, importing it lazily with availability checks.""" + +def _find_libdevice_path(): + """Find libdevice*.bc for NVVM compilation using cuda.pathfinder.""" \ No newline at end of file diff --git a/cuda_core/cuda/core/_resource_handles.pyi b/cuda_core/cuda/core/_resource_handles.pyi new file mode 100644 index 00000000000..490073c9fd1 --- /dev/null +++ b/cuda_core/cuda/core/_resource_handles.pyi @@ -0,0 +1,22 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_resource_handles.pyx + +from __future__ import annotations + +from libcpp.memory import shared_ptr + +ContextHandle = shared_ptr +GreenCtxHandle = shared_ptr +StreamHandle = shared_ptr +EventHandle = shared_ptr +MemoryPoolHandle = shared_ptr +DevicePtrHandle = shared_ptr +LibraryHandle = shared_ptr +KernelHandle = shared_ptr +GraphHandle = shared_ptr +GraphNodeHandle = shared_ptr +GraphicsResourceHandle = shared_ptr +NvrtcProgramHandle = shared_ptr +NvvmProgramHandle = shared_ptr +NvJitLinkHandle = shared_ptr +CuLinkHandle = shared_ptr +FileDescriptorHandle = shared_ptr \ No newline at end of file diff --git a/cuda_core/cuda/core/_stream.pyi b/cuda_core/cuda/core/_stream.pyi new file mode 100644 index 00000000000..5651f4ad4e1 --- /dev/null +++ b/cuda_core/cuda/core/_stream.pyi @@ -0,0 +1,229 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_stream.pyx + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Protocol + +import cuda.bindings.driver +import cython +from cuda.core._context import Context +from cuda.core._device import Device +from cuda.core._event import Event, EventOptions +from cuda.core.graph import GraphBuilder + + +@dataclass +class StreamOptions: + """Customizable :obj:`~_stream.Stream` options. + + Attributes + ---------- + nonblocking : bool, optional + Stream does not synchronize with the NULL stream. 
(Default to True)
+    priority : int, optional
+        Stream priority where lower number represents a
+        higher priority. (Default to lowest priority)
+
+    """
+    nonblocking: cython.bint = True
+    priority: int | None = None
+
+class IsStreamType(Protocol):
+
+    def __cuda_stream__(self) -> tuple[int, int]:
+        """
+        For any Python object that is meant to be interpreted as a CUDA stream, the intent
+        can be communicated by implementing this protocol that returns a 2-tuple: The protocol
+        version number (currently ``0``) and the address of ``cudaStream_t``. Both values
+        should be Python `int`.
+        """
+
+class Stream:
+    """Represent a queue of GPU operations that are executed in a specific order.
+
+    Applications use streams to control the order of execution for
+    GPU work. Work within a single stream is executed sequentially.
+    Whereas work across multiple streams can be further controlled
+    using stream priorities and :obj:`~_event.Event` management.
+
+    Advanced users can utilize default streams to enforce complex
+    implicit synchronization behaviors.
+
+    Directly creating a :obj:`~_stream.Stream` is not supported due to ambiguity.
+    New streams should instead be created through a :obj:`~_device.Device`
+    object, or created directly through using an existing handle
+    using Stream.from_handle().
+    """
+
+    def close(self):
+        """Destroy the stream.
+
+        Releases the stream handle. For owned streams, this destroys the
+        underlying CUDA stream. For borrowed streams, this releases the
+        reference and allows the Python owner to be GC'd.
+        """
+
+    def __init__(self, *args, **kwargs):
+        ...
+
+    @classmethod
+    def _legacy_default(cls):
+        """Return the legacy default stream (supports subclassing)."""
+
+    @classmethod
+    def _per_thread_default(cls):
+        """Return the per-thread default stream (supports subclassing)."""
+
+    @classmethod
+    def _init(cls, obj: IsStreamType | None=None, options=None, device_id: int | None=None, ctx: Context | None=None):
+        ...
+ + def __cuda_stream__(self) -> tuple[int, int]: + """Return an instance of a __cuda_stream__ protocol.""" + + def __hash__(self) -> int: + ... + + def __eq__(self, other) -> bool: + ... + + def __repr__(self) -> str: + ... + + @property + def handle(self) -> cuda.bindings.driver.CUstream: + """Return the underlying ``CUstream`` object. + + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int(Stream.handle)``. + """ + + @property + def is_nonblocking(self) -> bool: + """Return True if this is a nonblocking stream, otherwise False.""" + + @property + def priority(self) -> int: + """Return the stream priority.""" + + def sync(self): + """Synchronize the stream.""" + + def record(self, event: Event | None=None, options: EventOptions | None=None) -> Event: + """Record an event onto the stream. + + Creates an :obj:`~_event.Event` object (or reuses the given one) by + recording on the stream. + + Parameters + ---------- + event : :obj:`~_event.Event`, optional + Optional event object to be reused for recording. + options : :obj:`EventOptions`, optional + Customizable dataclass for event creation options. + + Returns + ------- + :obj:`~_event.Event` + Newly created event object. + + """ + + def wait(self, event_or_stream: Event | Stream): + """Wait for a CUDA event or a CUDA stream. + + Waiting for an event or a stream establishes a stream order. + + If a :obj:`~_stream.Stream` is provided, then wait until the stream's + work is completed. This is done by recording a new :obj:`~_event.Event` + on the stream and then waiting on it. + + Parameters + ---------- + event_or_stream : :obj:`~_event.Event` | :obj:`~_stream.Stream` + The event or stream to wait for. Objects supporting the + ``__cuda_stream__`` protocol are also accepted and treated as + streams. + + """ + + @property + def device(self) -> Device: + """Return the :obj:`~_device.Device` singleton associated with this stream. 
+ + Note + ---- + The current context on the device may differ from this + stream's context. This case occurs when a different CUDA + context is set current after a stream is created. + + """ + + @property + def context(self) -> Context: + """Return the :obj:`~_context.Context` associated with this stream.""" + + @property + def resources(self): + """Query the hardware resources provisioned for this stream's context. + + For streams created from a green context, returns the resources + that context was provisioned with. For streams on the primary + context, returns the full device resources. + """ + + @staticmethod + def from_handle(handle: int) -> Stream: + """Create a new :obj:`~_stream.Stream` object from a foreign stream handle. + + Uses a cudaStream_t pointer address represented as a Python int + to create a new :obj:`~_stream.Stream` object. + + Note + ---- + Stream lifetime is not managed, foreign object must remain + alive while this stream is active. + + Parameters + ---------- + handle : int + Stream handle representing the address of a foreign + stream object. + + Returns + ------- + :obj:`~_stream.Stream` + Newly created stream object. + + """ + + def create_graph_builder(self) -> GraphBuilder: + """Create a new :obj:`~graph.GraphBuilder` object. + + The new graph builder will be associated with this stream. + + Returns + ------- + :obj:`~graph.GraphBuilder` + Newly created graph builder object. + + """ +LEGACY_DEFAULT_STREAM: Stream = Stream._legacy_default() +PER_THREAD_DEFAULT_STREAM: Stream = Stream._per_thread_default() + +def default_stream() -> Stream: + """Return the default CUDA :obj:`~_stream.Stream`. + + The type of default stream returned depends on if the environment + variable CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM is set. + + If set, returns a per-thread default stream. Otherwise returns + the legacy stream. + + """ + +def Stream_accept(arg, allow_stream_protocol: bool=False) -> Stream: + ... 
\ No newline at end of file diff --git a/cuda_core/cuda/core/_tensor_bridge.pyi b/cuda_core/cuda/core/_tensor_bridge.pyi new file mode 100644 index 00000000000..d2d9182eeb9 --- /dev/null +++ b/cuda_core/cuda/core/_tensor_bridge.pyi @@ -0,0 +1,82 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_tensor_bridge.pyx + +"""Tensor bridge: extract PyTorch tensor metadata via the AOTI stable C ABI. + +PyTorch is NOT required at build time. At runtime the AOTI symbols are +resolved from ``torch._C`` (which is loaded with ``RTLD_GLOBAL``). + +The ``pyobj_to_aten_handle`` trick exploits the internal layout of +``THPVariable`` (PyTorch's Python tensor wrapper). + +In PyTorch 2.10+ ``cdata`` is ``at::Tensor`` directly:: + + struct THPVariable { + PyObject_HEAD + at::Tensor cdata; // <-- &cdata is usable as AtenTensorHandle + ... + }; + +In PyTorch 2.3–2.9 ``cdata`` was ``c10::MaybeOwned``, +whose first member is ``bool isBorrowed_`` (padded to 8 bytes), +followed by the ``at::Tensor`` union member:: + + struct THPVariable { + PyObject_HEAD + c10::MaybeOwned cdata; + // MaybeOwned layout: { bool isBorrowed_ (8 bytes); at::Tensor own_; } + ... + }; + +In both cases the address of the ``at::Tensor`` inside ``cdata`` is +accepted by the AOTI stable C ABI functions as an ``AtenTensorHandle``. +The extra 8-byte skip for the ``isBorrowed_`` member is determined +at runtime from the PyTorch version (see ``_get_cdata_extra_offset``). + +Offsetting past ``PyObject_HEAD`` gives us the handle +without any Python attribute access or method calls (~14 ns for all +7 metadata queries). + +Credit: Emilio Castillo (ecastillo@nvidia.com) – original tensor-bridge POC. + +.. note:: + + This module must NOT be imported at ``cuda.core`` load time. It is + loaded lazily (by ``_memoryview.pyx``) only when the user actually + passes a ``torch.Tensor``. 
The caller must ensure that + ``torch._C`` has been re-opened with ``RTLD_GLOBAL`` *before* + importing this module so that the AOTI symbols are visible. +""" +from __future__ import annotations + +AOTITorchError = int + +def sync_torch_stream(device_index: int, consumer_s: int) -> int: + """Establish stream ordering between PyTorch's current CUDA stream + and the given consumer stream. + + Records an event on PyTorch's current stream (the producer) and makes + the consumer stream wait on it. This is a no-op if both streams are + the same. + """ + +def resolve_aoti_dtype(dtype_code: int): + """Python-callable wrapper around _get_aoti_dtype (for lazy resolution).""" + +def view_as_torch_tensor(obj: object, stream_ptr: object, view=None): + """Create/populate a :class:`StridedMemoryView` from a ``torch.Tensor``. + + This is a fast path that avoids DLPack/CAI protocol overhead by + reading tensor metadata directly through the AOTI stable C ABI. + + Parameters + ---------- + obj : torch.Tensor + The source tensor. + stream_ptr : int or None + Consumer stream pointer. When not ``-1``, stream ordering is + established between PyTorch's current CUDA stream (the producer) + and the consumer stream, matching the DLPack contract. + view : StridedMemoryView, optional + If provided, populate this existing view in-place. Otherwise a + new instance is created. + """ \ No newline at end of file diff --git a/cuda_core/cuda/core/_tensor_map.pyi b/cuda_core/cuda/core/_tensor_map.pyi new file mode 100644 index 00000000000..f3071760834 --- /dev/null +++ b/cuda_core/cuda/core/_tensor_map.pyi @@ -0,0 +1,335 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_tensor_map.pyx + +from __future__ import annotations + +from dataclasses import dataclass + +import numpy +from cuda.bindings import cydriver + + +class TensorMapDataType: + """Data types for tensor map descriptors. + + These correspond to the ``CUtensorMapDataType`` driver enum values. 
+ """ + UINT8 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT8 + UINT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT16 + UINT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT32 + INT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_INT32 + UINT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT64 + INT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_INT64 + FLOAT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT16 + FLOAT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32 + FLOAT64 = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT64 + BFLOAT16 = cydriver.CU_TENSOR_MAP_DATA_TYPE_BFLOAT16 + FLOAT32_FTZ = cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ + TFLOAT32 = cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32 + TFLOAT32_FTZ = cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ + +class TensorMapInterleave: + """Interleave layout for tensor map descriptors. + + These correspond to the ``CUtensorMapInterleave`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_INTERLEAVE_NONE + INTERLEAVE_16B = cydriver.CU_TENSOR_MAP_INTERLEAVE_16B + INTERLEAVE_32B = cydriver.CU_TENSOR_MAP_INTERLEAVE_32B + +class TensorMapSwizzle: + """Swizzle mode for tensor map descriptors. + + These correspond to the ``CUtensorMapSwizzle`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_SWIZZLE_NONE + SWIZZLE_32B = cydriver.CU_TENSOR_MAP_SWIZZLE_32B + SWIZZLE_64B = cydriver.CU_TENSOR_MAP_SWIZZLE_64B + SWIZZLE_128B = cydriver.CU_TENSOR_MAP_SWIZZLE_128B + +class TensorMapL2Promotion: + """L2 promotion mode for tensor map descriptors. + + These correspond to the ``CUtensorMapL2promotion`` driver enum values. + """ + NONE = cydriver.CU_TENSOR_MAP_L2_PROMOTION_NONE + L2_64B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_64B + L2_128B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_128B + L2_256B = cydriver.CU_TENSOR_MAP_L2_PROMOTION_L2_256B + +class TensorMapOOBFill: + """Out-of-bounds fill mode for tensor map descriptors. + + These correspond to the ``CUtensorMapFloatOOBfill`` driver enum values. 
+ """ + NONE = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE + NAN_REQUEST_ZERO_FMA = cydriver.CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA + +class TensorMapIm2ColWideMode: + """Im2col wide mode for tensor map descriptors. + + This enum is always defined for API stability, but the + :meth:`TensorMapDescriptor._from_im2col_wide` factory requires a CUDA 13+ + build and will raise otherwise. + """ + W = 0 + W128 = 1 + +@dataclass +class TensorMapDescriptorOptions: + """Options for :meth:`cuda.core.StridedMemoryView.as_tensor_map`. + + Attributes + ---------- + box_dim : tuple[int, ...] + Tile size for each tensor dimension, expressed in elements. + element_strides : tuple[int, ...], optional + Per-dimension element traversal strides. + data_type : object, optional + Explicit dtype override. Prefer NumPy or ``ml_dtypes`` dtype objects; + :class:`TensorMapDataType` remains accepted for compatibility. + interleave : TensorMapInterleave, optional + Interleave layout. Default ``NONE``. + swizzle : TensorMapSwizzle, optional + Swizzle mode. Default ``NONE``. + l2_promotion : TensorMapL2Promotion, optional + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill, optional + Out-of-bounds fill mode. Default ``NONE``. + """ + box_dim: tuple[int, ...] + element_strides: tuple[int, ...] | None = None + data_type: object = None + interleave: TensorMapInterleave = TensorMapInterleave.NONE + swizzle: TensorMapSwizzle = TensorMapSwizzle.NONE + l2_promotion: TensorMapL2Promotion = TensorMapL2Promotion.NONE + oob_fill: TensorMapOOBFill = TensorMapOOBFill.NONE + + def __post_init__(self): + ... + +class TensorMapDescriptor: + """Describes a TMA (Tensor Memory Accelerator) tensor map for Hopper+ GPUs. + + A ``TensorMapDescriptor`` wraps the opaque 128-byte ``CUtensorMap`` struct + used by the hardware TMA unit for efficient bulk data movement between + global and shared memory. 
+ + Public tiled descriptors are created via + :meth:`cuda.core.StridedMemoryView.as_tensor_map`. Specialized + ``_from_*`` helpers remain private while this API surface settles, and + descriptors can be passed directly to :func:`~cuda.core.launch` as a + kernel argument. + """ + + def __init__(self): + ... + + @property + def device(self): + """Return the :obj:`~cuda.core.Device` associated with this descriptor.""" + + @classmethod + def _from_tiled(cls, view, box_dim=None, *, options=None, element_strides=None, data_type=None, interleave=..., swizzle=..., l2_promotion=..., oob_fill=...): + """Create a tiled TMA descriptor from a validated view. + + Parameters + ---------- + view : StridedMemoryView + A device-accessible view with a 16-byte-aligned pointer. + box_dim : tuple of int, optional + The size of each tile dimension (in elements). Must have the + same rank as the tensor and each value must be in [1, 256]. + Specified in the same (row-major) order as the tensor shape. + Required unless ``options`` is provided. + options : TensorMapDescriptorOptions or mapping, optional + Bundled tiled-descriptor options. When provided, do not also pass + ``box_dim`` or the individual option kwargs. + element_strides : tuple of int, optional + Per-dimension element traversal strides. Default is all 1s. + Specified in the same (row-major) order as the tensor shape. + data_type : dtype-like or TensorMapDataType, optional + Explicit dtype override. If ``None``, inferred from the tensor's + dtype. Prefer NumPy or ``ml_dtypes`` dtype objects; the enum is + accepted for compatibility. + interleave : TensorMapInterleave + Interleave layout. Default ``NONE``. + swizzle : TensorMapSwizzle + Swizzle mode. Default ``NONE``. + l2_promotion : TensorMapL2Promotion + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill + Out-of-bounds fill mode. Default ``NONE``. 
+ + Returns + ------- + TensorMapDescriptor + + Raises + ------ + ValueError + If the tensor rank is outside [1, 5], the pointer is not + 16-byte aligned, or dimension/stride constraints are violated. + """ + + @classmethod + def _from_im2col(cls, view, pixel_box_lower_corner, pixel_box_upper_corner, channels_per_pixel, pixels_per_column, *, element_strides=None, data_type=None, interleave=..., swizzle=..., l2_promotion=..., oob_fill=...): + """Create an im2col TMA descriptor from a validated view. + + Im2col layout is used for convolution-style data access patterns. + + Parameters + ---------- + view : StridedMemoryView + A device-accessible view with a 16-byte-aligned pointer. + pixel_box_lower_corner : tuple of int + Lower corner of the pixel bounding box for each spatial + dimension (rank - 2 elements). Specified in row-major order + matching the tensor's spatial dimensions. + pixel_box_upper_corner : tuple of int + Upper corner of the pixel bounding box for each spatial + dimension (rank - 2 elements). Specified in row-major order + matching the tensor's spatial dimensions. + channels_per_pixel : int + Number of channels per pixel. + pixels_per_column : int + Number of pixels per column. + element_strides : tuple of int, optional + Per-dimension element traversal strides. Default is all 1s. + data_type : dtype-like or TensorMapDataType, optional + Explicit dtype override. If ``None``, inferred from the tensor's + dtype. Prefer NumPy or ``ml_dtypes`` dtype objects; the enum is + accepted for compatibility. + interleave : TensorMapInterleave + Interleave layout. Default ``NONE``. + swizzle : TensorMapSwizzle + Swizzle mode. Default ``NONE``. + l2_promotion : TensorMapL2Promotion + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill + Out-of-bounds fill mode. Default ``NONE``. 
+ + Returns + ------- + TensorMapDescriptor + + Raises + ------ + ValueError + If the tensor rank is outside [3, 5], the pointer is not + 16-byte aligned, or other constraints are violated. + """ + + @classmethod + def _from_im2col_wide(cls, view, pixel_box_lower_corner_width, pixel_box_upper_corner_width, channels_per_pixel, pixels_per_column, *, element_strides=None, data_type=None, interleave=..., mode=..., swizzle=..., l2_promotion=..., oob_fill=...): + """Create an im2col-wide TMA descriptor from a validated view. + + Im2col-wide layout loads elements exclusively along the W (width) + dimension. This variant is supported on compute capability 10.0+ + (Blackwell and later). + + Parameters + ---------- + view : StridedMemoryView + A device-accessible view with a 16-byte-aligned pointer. + pixel_box_lower_corner_width : int + Lower corner of the pixel bounding box along the W dimension. + pixel_box_upper_corner_width : int + Upper corner of the pixel bounding box along the W dimension. + channels_per_pixel : int + Number of channels per pixel. + pixels_per_column : int + Number of pixels per column. + element_strides : tuple of int, optional + Per-dimension element traversal strides. Default is all 1s. + data_type : dtype-like or TensorMapDataType, optional + Explicit dtype override. If ``None``, inferred from the tensor's + dtype. Prefer NumPy or ``ml_dtypes`` dtype objects; the enum is + accepted for compatibility. + interleave : TensorMapInterleave + Interleave layout. Default ``NONE``. + mode : TensorMapIm2ColWideMode + Im2col wide mode. Default ``W``. + swizzle : TensorMapSwizzle + Swizzle mode. Default ``SWIZZLE_128B``. + l2_promotion : TensorMapL2Promotion + L2 promotion mode. Default ``NONE``. + oob_fill : TensorMapOOBFill + Out-of-bounds fill mode. Default ``NONE``. + + Returns + ------- + TensorMapDescriptor + + Raises + ------ + ValueError + If the tensor rank is outside [3, 5], the pointer is not + 16-byte aligned, or other constraints are violated. 
+ """ + + def replace_address(self, tensor): + """Replace the global memory address in this tensor map descriptor. + + This is useful when the tensor data has been reallocated but the + shape, strides, and other parameters remain the same. + + Parameters + ---------- + tensor : object + Any object supporting DLPack or ``__cuda_array_interface__``, + or a :obj:`~cuda.core.StridedMemoryView`. Must refer to + device-accessible memory with a 16-byte-aligned pointer. + """ + + def __repr__(self): + ... +_TMA_DT_UINT8 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT8) +_TMA_DT_UINT16 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT16) +_TMA_DT_UINT32 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT32) +_TMA_DT_INT32 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_INT32) +_TMA_DT_UINT64 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_UINT64) +_TMA_DT_INT64 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_INT64) +_TMA_DT_FLOAT16 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT16) +_TMA_DT_FLOAT32 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32) +_TMA_DT_FLOAT64 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT64) +_TMA_DT_BFLOAT16 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_BFLOAT16) +_TMA_DT_FLOAT32_FTZ = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ) +_TMA_DT_TFLOAT32 = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32) +_TMA_DT_TFLOAT32_FTZ = int(cydriver.CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ) +_NUMPY_DTYPE_TO_TMA = {numpy.dtype(numpy.uint8): _TMA_DT_UINT8, numpy.dtype(numpy.uint16): _TMA_DT_UINT16, numpy.dtype(numpy.uint32): _TMA_DT_UINT32, numpy.dtype(numpy.int32): _TMA_DT_INT32, numpy.dtype(numpy.uint64): _TMA_DT_UINT64, numpy.dtype(numpy.int64): _TMA_DT_INT64, numpy.dtype(numpy.float16): _TMA_DT_FLOAT16, numpy.dtype(numpy.float32): _TMA_DT_FLOAT32, numpy.dtype(numpy.float64): _TMA_DT_FLOAT64} +_TMA_DATA_TYPE_SIZE = {_TMA_DT_UINT8: 1, _TMA_DT_UINT16: 2, _TMA_DT_UINT32: 4, _TMA_DT_INT32: 4, _TMA_DT_UINT64: 8, _TMA_DT_INT64: 8, _TMA_DT_FLOAT16: 2, _TMA_DT_FLOAT32: 4, _TMA_DT_FLOAT64: 8, _TMA_DT_BFLOAT16: 2, _TMA_DT_FLOAT32_FTZ: 
4, _TMA_DT_TFLOAT32: 4, _TMA_DT_TFLOAT32_FTZ: 4} + +def _normalize_tensor_map_data_type(data_type): + ... + +def _normalize_tensor_map_sequence(name, values): + ... + +def _require_tensor_map_enum(name, value, enum_type): + ... + +def _coerce_tensor_map_descriptor_options(box_dim, options, *, element_strides, data_type, interleave, swizzle, l2_promotion, oob_fill): + ... + +def _resolve_data_type(view, data_type): + """Resolve the TMA data type from an explicit value or the view's dtype.""" + +def _get_validated_view(tensor): + """Obtain a device-accessible StridedMemoryView with a 16-byte-aligned pointer.""" + +def _require_view_device(view, expected_device_id, operation): + """Ensure device-local tensors match the current CUDA device. + + DLPack reports host/managed CUDA memory as ``kDLCUDAHost`` / + ``kDLCUDAManaged`` with ``device_id=0`` regardless of the current device, + so only true ``kDLCUDA`` tensors are rejected by device-id mismatch. + """ + +def _compute_byte_strides(shape, strides, elem_size): + """Compute byte strides from element strides or C-contiguous fallback. + + Returns a tuple of byte strides in row-major order. + """ + +def _validate_element_strides(element_strides, rank): + """Validate or default element_strides to all-ones.""" \ No newline at end of file diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pyi b/cuda_core/cuda/core/_utils/cuda_utils.pyi new file mode 100644 index 00000000000..13c43e594ca --- /dev/null +++ b/cuda_core/cuda/core/_utils/cuda_utils.pyi @@ -0,0 +1,144 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_utils/cuda_utils.pyx + +from __future__ import annotations + +from collections import namedtuple +from typing import Callable + +from cuda.bindings import cydriver +from cuda.bindings import driver as driver +from cuda.bindings import nvrtc as nvrtc +from cuda.bindings import runtime as runtime + + +class CUDAError(Exception): + ... + +class NVRTCError(CUDAError): + ... 
+ +class Transaction: + """ + A context manager for transactional operations with undo capability. + + The Transaction class allows you to register undo actions (callbacks) that will be executed + if the transaction is not committed before exiting the context. This is useful for managing + resources or operations that need to be rolled back in case of errors or early exits. + + Usage: + with Transaction() as txn: + txn.append(some_cleanup_function, arg1, arg2) + # ... perform operations ... + txn.commit() # Disarm undo actions; nothing will be rolled back on exit + + Methods: + append(fn, *args, **kwargs): Register an undo action to be called on rollback. + commit(): Disarm all undo actions; nothing will be rolled back on exit. + """ + + def __init__(self): + ... + + def __enter__(self): + ... + + def __exit__(self, exc_type, exc, tb): + ... + + def append(self, fn, /, *args, **kwargs): + """ + Register an undo action (runs if the with-block exits without commit()). + Values are bound now via partial so late mutations don't bite you. + """ + + def commit(self): + """ + Disarm all undo actions. After this, exiting the with-block does nothing. + """ +_keep_driver_in_stub: 'driver.CUresult' +_keep_nvrtc_in_stub: 'nvrtc.nvrtcResult' +_keep_runtime_in_stub: 'runtime.cudaError_t' +ComputeCapability = namedtuple('ComputeCapability', ('major', 'minor')) +_fork_warning_checked = False + +def _check_driver_error(error: cydriver.CUresult) -> int: + ... + +def _check_runtime_error(error) -> int: + ... + +def _check_nvrtc_error(error, handle=None) -> int: + ... + +def check_or_create_options(cls: type, options, options_description: str='', keep_none: bool=False): + """ + Create the specified options dataclass from a dictionary of options or None. + """ + +def _parse_fill_value(value) -> tuple: + """Parse a fill/memset value into (raw_value, element_size). + + Parameters + ---------- + value : int or buffer-protocol object + - int: Must be in range [0, 256). 
Treated as 1-byte fill. + - bytes or buffer-protocol: Must be 1, 2, or 4 bytes. + + Returns + ------- + tuple of (int, int) + (raw_value, element_size) where element_size is 1, 2, or 4. + + Raises + ------ + OverflowError + If int value is outside [0, 256). + TypeError + If value is not an int and does not support the buffer protocol. + ValueError + If value byte length is not 1, 2, or 4. + """ + +def cast_to_3_tuple(label, cfg): + ... + +def handle_return(result: tuple, handle=None): + ... + +def _handle_boolean_option(option: bool) -> str: + """ + Convert a boolean option to a string representation. + """ + +def precondition(checker: Callable[..., None], what: str='') -> Callable: + """ + A decorator that adds checks to ensure any preconditions are met. + + Args: + checker: The function to call to check whether the preconditions are met. It has + the same signature as the wrapped function with the addition of the keyword argument `what`. + what: A string that is passed in to `checker` to provide context information. + + Returns: + Callable: A decorator that creates the wrapping. + """ + +def is_sequence(obj): + """ + Check if the given object is a sequence (list or tuple). + """ + +def is_nested_sequence(obj): + """ + Check if the given object is a nested sequence (list or tuple with atleast one list or tuple element). + """ + +def reset_fork_warning(): + """Reset the fork warning check flag for testing purposes. + + This function is intended for use in tests to allow multiple test runs + to check the warning behavior. 
+ """ + +def check_multiprocessing_start_method(): + """Check if multiprocessing start method is 'fork' and warn if so.""" \ No newline at end of file diff --git a/cuda_core/cuda/core/_utils/version.pyi b/cuda_core/cuda/core/_utils/version.pyi new file mode 100644 index 00000000000..bb7f0129917 --- /dev/null +++ b/cuda_core/cuda/core/_utils/version.pyi @@ -0,0 +1,14 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/_utils/version.pyx + +from __future__ import annotations + +import functools + + +@functools.cache +def binding_version() -> tuple[int, int, int]: + """Return the cuda-bindings version as a (major, minor, patch) triple.""" + +@functools.cache +def driver_version() -> tuple[int, int, int]: + """Return the CUDA driver version as a (major, minor, patch) triple.""" \ No newline at end of file diff --git a/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyi b/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyi new file mode 100644 index 00000000000..287ed9e300a --- /dev/null +++ b/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyi @@ -0,0 +1,59 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/graph/_adjacency_set_proxy.pyx + +"""Mutable-set proxy for graph node predecessors and successors.""" +from __future__ import annotations + +from collections.abc import MutableSet +from collections.abc import Set as AbstractSet +from typing import Any + +from cuda.core.graph._graph_node import GraphNode + + +class AdjacencySetProxy(MutableSet): + """Mutable set proxy for a node's predecessors or successors. Mutations + write through to the underlying CUDA graph.""" + __slots__ = ('_core',) + + def __init__(self, node, is_fwd: bool): + ... + + @classmethod + def _from_iterable(cls, it): + ... + + def __contains__(self, x): + ... + + def __iter__(self): + ... + + def __len__(self): + ... + + def add(self, value): + ... + + def discard(self, value): + ... 
+ + def clear(self): + """Remove all edges in a single driver call.""" + + def __isub__(self, it: AbstractSet[Any]) -> 'AdjacencySetProxy': + """Remove edges to all nodes in *it* in a single driver call.""" + + def update(self, *others): + """Add edges to multiple nodes at once.""" + + def __ior__(self, it: AbstractSet[Any]) -> 'AdjacencySetProxy': # type: ignore[override,misc] + """Add edges to all nodes in *it* in a single driver call.""" + + def __repr__(self): + ... + +class _AdjacencySetCore: + """Cythonized core implementing AdjacencySetProxy""" + + def __init__(self, node: GraphNode, is_fwd: bool): + ... \ No newline at end of file diff --git a/cuda_core/cuda/core/graph/_graph_builder.pyi b/cuda_core/cuda/core/graph/_graph_builder.pyi new file mode 100644 index 00000000000..83395a76db5 --- /dev/null +++ b/cuda_core/cuda/core/graph/_graph_builder.pyi @@ -0,0 +1,461 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/graph/_graph_builder.pyx + +from __future__ import annotations + +from dataclasses import dataclass + +from cuda.core._stream import Stream +from cuda.core._utils.cuda_utils import driver +from cuda.core.graph._graph_definition import GraphCondition, GraphDefinition + + +@dataclass +class GraphDebugPrintOptions: + """Options for debug_dot_print(). 
+ + Attributes + ---------- + verbose : bool + Output all debug data as if every debug flag is enabled (Default to False) + runtime_types : bool + Use CUDA Runtime structures for output (Default to False) + kernel_node_params : bool + Adds kernel parameter values to output (Default to False) + memcpy_node_params : bool + Adds memcpy parameter values to output (Default to False) + memset_node_params : bool + Adds memset parameter values to output (Default to False) + host_node_params : bool + Adds host parameter values to output (Default to False) + event_node_params : bool + Adds event parameter values to output (Default to False) + ext_semas_signal_node_params : bool + Adds external semaphore signal parameter values to output (Default to False) + ext_semas_wait_node_params : bool + Adds external semaphore wait parameter values to output (Default to False) + kernel_node_attributes : bool + Adds kernel node attributes to output (Default to False) + handles : bool + Adds node handles and every kernel function handle to output (Default to False) + mem_alloc_node_params : bool + Adds memory alloc parameter values to output (Default to False) + mem_free_node_params : bool + Adds memory free parameter values to output (Default to False) + batch_mem_op_node_params : bool + Adds batch mem op parameter values to output (Default to False) + extra_topo_info : bool + Adds edge numbering information (Default to False) + conditional_node_params : bool + Adds conditional node parameter values to output (Default to False) + + """ + verbose: bool = False + runtime_types: bool = False + kernel_node_params: bool = False + memcpy_node_params: bool = False + memset_node_params: bool = False + host_node_params: bool = False + event_node_params: bool = False + ext_semas_signal_node_params: bool = False + ext_semas_wait_node_params: bool = False + kernel_node_attributes: bool = False + handles: bool = False + mem_alloc_node_params: bool = False + mem_free_node_params: bool = False + 
batch_mem_op_node_params: bool = False + extra_topo_info: bool = False + conditional_node_params: bool = False + + def _to_flags(self) -> int: + """Convert options to CUDA driver API flags (internal use).""" + +@dataclass +class GraphCompleteOptions: + """Options for graph instantiation. + + Attributes + ---------- + auto_free_on_launch : bool, optional + Automatically free memory allocated in a graph before relaunching. (Default to False) + upload_stream : Stream, optional + Stream to use to automatically upload the graph after completion. (Default to None) + device_launch : bool, optional + Configure the graph to be launchable from the device. This flag can only + be used on platforms which support unified addressing. This flag cannot be + used in conjunction with auto_free_on_launch. (Default to False) + use_node_priority : bool, optional + Run the graph using the per-node priority attributes rather than the + priority of the stream it is launched into. (Default to False) + + """ + auto_free_on_launch: bool = False + upload_stream: Stream | None = None + device_launch: bool = False + use_node_priority: bool = False + +class GraphBuilder: + """A graph under construction by stream capture. + + A graph groups a set of CUDA kernels and other CUDA operations together and executes + them with a specified dependency tree. It speeds up the workflow by combining the + driver activities associated with CUDA kernel launches and CUDA API calls. + + Directly creating a :obj:`~graph.GraphBuilder` is not supported due + to ambiguity. New graph builders should instead be created through a + :obj:`~_device.Device`, or a :obj:`~_stream.stream` object. + + """ + + class _MembersNeededForFinalize: + __slots__ = ('conditional_graph', 'graph', 'is_join_required', 'is_stream_owner', 'stream') + + def __init__(self, graph_builder_obj, stream_obj, is_stream_owner, conditional_graph, is_join_required): + ... + + def close(self): + ... 
+ __slots__ = ('__weakref__', '_building_ended', '_mnff') + + def __init__(self): + ... + + @classmethod + def _init(cls, stream, is_stream_owner, conditional_graph=None, is_join_required=False): + ... + + @property + def stream(self) -> Stream: + """Returns the stream associated with the graph builder.""" + + @property + def is_join_required(self) -> bool: + """Returns True if this graph builder must be joined before building is ended.""" + + def begin_building(self, mode='relaxed') -> GraphBuilder: + """Begins the building process. + + Build `mode` for controlling interaction with other API calls must be one of the following: + + - `global` : Prohibit potentially unsafe operations across all streams in the process. + - `thread_local` : Prohibit potentially unsafe operations in streams created by the current thread. + - `relaxed` : The local thread is not prohibited from potentially unsafe operations. + + Parameters + ---------- + mode : str, optional + Build mode to control the interaction with other API calls that are porentially unsafe. + Default set to use relaxed. + + """ + + @property + def is_building(self) -> bool: + """Returns True if the graph builder is currently building.""" + + def end_building(self) -> GraphBuilder: + """Ends the building process.""" + + def complete(self, options: GraphCompleteOptions | None=None) -> 'Graph': + """Completes the graph builder and returns the built :obj:`~graph.Graph` object. + + Parameters + ---------- + options : :obj:`~graph.GraphCompleteOptions`, optional + Customizable dataclass for the graph builder completion options. + + Returns + ------- + graph : :obj:`~graph.Graph` + The newly built graph. + + """ + + def debug_dot_print(self, path, options: GraphDebugPrintOptions | None=None): + """Generates a DOT debug file for the graph builder. 
+ + Parameters + ---------- + path : str + File path to use for writting debug DOT output + options : :obj:`~graph.GraphDebugPrintOptions`, optional + Customizable dataclass for the debug print options. + + """ + + def split(self, count: int) -> tuple[GraphBuilder, ...]: + """Splits the original graph builder into multiple graph builders. + + The new builders inherit work dependencies from the original builder. + The original builder is reused for the split and is returned first in the tuple. + + Parameters + ---------- + count : int + The number of graph builders to split the graph builder into. + + Returns + ------- + graph_builders : tuple[:obj:`~graph.GraphBuilder`, ...] + A tuple of split graph builders. The first graph builder in the tuple + is always the original graph builder. + + """ + + @staticmethod + def join(*graph_builders) -> GraphBuilder: + """Joins multiple graph builders into a single graph builder. + + The returned builder inherits work dependencies from the provided builders. + + Parameters + ---------- + *graph_builders : :obj:`~graph.GraphBuilder` + The graph builders to join. + + Returns + ------- + graph_builder : :obj:`~graph.GraphBuilder` + The newly joined graph builder. + + """ + + def __cuda_stream__(self) -> tuple[int, int]: + """Return an instance of a __cuda_stream__ protocol.""" + + def _get_conditional_context(self) -> driver.CUcontext: + ... + + def create_condition(self, default_value=None) -> GraphCondition: + """Create a condition variable for use with conditional nodes. + + The returned :class:`GraphCondition` object is passed to conditional-node + builder methods (:meth:`if_then`, :meth:`if_else`, :meth:`while_loop`, + :meth:`switch`). Its value is controlled at runtime by device code via + ``cudaGraphSetConditional``. + + Parameters + ---------- + default_value : int, optional + The default value to assign to the condition. If None, no + default is assigned. 
+ + Returns + ------- + GraphCondition + A condition variable for controlling conditional execution. + """ + + def _cond_with_params(self, node_params) -> tuple: + ... + + def if_then(self, condition: GraphCondition) -> GraphBuilder: + """Adds an if condition branch and returns a new graph builder for it. + + The resulting if graph will only execute the branch if the + condition evaluates to true at runtime. + + The new builder inherits work dependencies from the original builder. + + Parameters + ---------- + condition : :class:`~graph.GraphCondition` + The condition variable from :meth:`create_condition` controlling + whether the branch executes. + + Returns + ------- + graph_builder : :obj:`~graph.GraphBuilder` + The newly created conditional graph builder. + + """ + + def if_else(self, condition: GraphCondition) -> tuple[GraphBuilder, GraphBuilder]: + """Adds an if-else condition branch and returns new graph builders for both branches. + + The resulting if graph will execute the branch if the condition + evaluates to true at runtime, otherwise the else branch will execute. + + The new builders inherit work dependencies from the original builder. + + Parameters + ---------- + condition : :class:`~graph.GraphCondition` + The condition variable from :meth:`create_condition` controlling + which branch executes. + + Returns + ------- + graph_builders : tuple[:obj:`~graph.GraphBuilder`, :obj:`~graph.GraphBuilder`] + A tuple of two new graph builders, one for the if branch and one for the else branch. + + """ + + def switch(self, condition: GraphCondition, count: int) -> tuple[GraphBuilder, ...]: + """Adds a switch condition branch and returns new graph builders for all cases. + + The resulting switch graph will execute the branch whose case index + matches the value of the condition at runtime. If no match is found, no + branch will be executed. + + The new builders inherit work dependencies from the original builder. 
+ + Parameters + ---------- + condition : :class:`~graph.GraphCondition` + The condition variable from :meth:`create_condition` selecting + which case executes. + count : int + The number of cases to add to the switch conditional. + + Returns + ------- + graph_builders : tuple[:obj:`~graph.GraphBuilder`, ...] + A tuple of new graph builders, one for each branch. + + """ + + def while_loop(self, condition: GraphCondition) -> GraphBuilder: + """Adds a while loop and returns a new graph builder for it. + + The resulting while loop graph will execute the branch repeatedly at runtime + until the condition evaluates to false. + + The new builder inherits work dependencies from the original builder. + + Parameters + ---------- + condition : :class:`~graph.GraphCondition` + The condition variable from :meth:`create_condition` controlling + loop continuation. + + Returns + ------- + graph_builder : :obj:`~graph.GraphBuilder` + The newly created while loop graph builder. + + """ + + def close(self): + """Destroy the graph builder. + + Closes the associated stream if we own it. Borrowed stream + object will instead have their references released. + + """ + + def embed(self, child: GraphBuilder): + """Embed a previously-built :obj:`~graph.GraphBuilder` as a child node. + + Parameters + ---------- + child : :obj:`~graph.GraphBuilder` + The child graph builder. Must have finished building. + """ + + def callback(self, fn, *, user_data=None): + """Add a host callback to the graph during stream capture. + + The callback runs on the host CPU when the graph reaches this point + in execution. Two modes are supported: + + - **Python callable**: Pass any callable. The GIL is acquired + automatically. The callable must take no arguments; use closures + or ``functools.partial`` to bind state. + - **ctypes function pointer**: Pass a ``ctypes.CFUNCTYPE`` instance. + The function receives a single ``void*`` argument (the + ``user_data``). 
The caller must keep the ctypes wrapper alive + for the lifetime of the graph. + + .. warning:: + + Callbacks must not call CUDA API functions. Doing so may + deadlock or corrupt driver state. + + Parameters + ---------- + fn : callable or ctypes function pointer + The callback function. + user_data : int or bytes-like, optional + Only for ctypes function pointers. If ``int``, passed as a raw + pointer (caller manages lifetime). If bytes-like, the data is + copied and its lifetime is tied to the graph. + """ + +class Graph: + """An executable graph. + + A graph groups a set of CUDA kernels and other CUDA operations together and executes + them with a specified dependency tree. It speeds up the workflow by combining the + driver activities associated with CUDA kernel launches and CUDA API calls. + + Graphs must be built using a :obj:`~graph.GraphBuilder` object. + + """ + + class _MembersNeededForFinalize: + __slots__ = 'graph' + + def __init__(self, graph_obj, graph): + ... + + def close(self): + ... + __slots__ = ('__weakref__', '_mnff') + + def __init__(self): + ... + + @classmethod + def _init(cls, graph): + ... + + def close(self): + """Destroy the graph.""" + + @property + def handle(self) -> driver.CUgraphExec: + """Return the underlying ``CUgraphExec`` object. + + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int()`` on the returned object. + + """ + + def update(self, source: 'GraphBuilder | GraphDefinition') -> None: + """Update the graph using a new graph definition. + + The topology of the provided source must be identical to this graph. + + Parameters + ---------- + source : :obj:`~graph.GraphBuilder` or :obj:`~graph.GraphDefinition` + The graph definition to update from. A GraphBuilder must have + finished building. + + """ + + def upload(self, stream: Stream): + """Uploads the graph in a stream. 
+ + Parameters + ---------- + stream : :obj:`~_stream.Stream` + The stream in which to upload the graph + + """ + + def launch(self, stream: Stream): + """Launches the graph in a stream. + + Parameters + ---------- + stream : :obj:`~_stream.Stream` + The stream in which to launch the graph. + + """ +__all__ = ['Graph', 'GraphBuilder', 'GraphCompleteOptions', 'GraphDebugPrintOptions'] + +def _instantiate_graph(h_graph, options: GraphCompleteOptions | None=None) -> 'Graph': + ... \ No newline at end of file diff --git a/cuda_core/cuda/core/graph/_graph_definition.pyi b/cuda_core/cuda/core/graph/_graph_definition.pyi new file mode 100644 index 00000000000..c016671c9af --- /dev/null +++ b/cuda_core/cuda/core/graph/_graph_definition.pyi @@ -0,0 +1,238 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/graph/_graph_definition.pyx + +"""GraphDefinition: explicit CUDA graph definition.""" +from __future__ import annotations + +from cuda.core._device import Device +from cuda.core._utils.cuda_utils import driver +from cuda.core.graph._graph_node import GraphNode +from cuda.core.graph._subclasses import (AllocNode, ChildGraphNode, EmptyNode, + EventRecordNode, EventWaitNode, + FreeNode, HostCallbackNode, + IfElseNode, IfNode, KernelNode, + MemcpyNode, MemsetNode, SwitchNode, + WhileNode) +from cuda.core.typing import GraphMemoryType + + +class GraphCondition: + """A condition variable for conditional graph nodes. + + Created by :meth:`GraphDefinition.create_condition` (or + :meth:`GraphBuilder.create_condition`) and passed to + conditional-node builder methods (:meth:`~GraphDefinition.if_then`, + :meth:`~GraphDefinition.if_else`, :meth:`~GraphDefinition.while_loop`, + :meth:`~GraphDefinition.switch`). The underlying value is set at + runtime by device code via ``cudaGraphSetConditional``. 
+ + A :class:`GraphCondition` may be passed directly as a kernel + argument to ``launch()``: the launcher unwraps it to the underlying + ``CUgraphConditionalHandle`` value so device code can update the + condition. + """ + + def __repr__(self) -> str: + ... + + def __eq__(self, other) -> bool: + ... + + def __hash__(self) -> int: + ... + + @property + def handle(self) -> driver.CUgraphConditionalHandle: + """The raw CUgraphConditionalHandle as an int.""" + +class GraphDefinition: + """A graph definition. + + A GraphDefinition is used to construct a graph explicitly by adding nodes + and specifying dependencies. Once construction is complete, call + instantiate() to obtain an executable Graph. + """ + + def __init__(self): + """Create a new empty graph definition.""" + + def __repr__(self) -> str: + ... + + def __eq__(self, other) -> bool: + ... + + def __hash__(self) -> int: + ... + + @property + def _entry(self) -> GraphNode: + """Return the internal entry-point GraphNode (no dependencies).""" + + def allocate(self, size: int, *, device: Device | int | None=None, memory_type: GraphMemoryType=..., peer_access: list[Device | int] | None=None) -> AllocNode: + """Add an entry-point memory allocation node (no dependencies). + + See :meth:`GraphNode.allocate` for full documentation. + """ + + def deallocate(self, dptr) -> FreeNode: + """Add an entry-point memory free node (no dependencies). + + See :meth:`GraphNode.deallocate` for full documentation. + """ + + def memset(self, dst, value, width: int, height: int=1, pitch: int=0) -> MemsetNode: + """Add an entry-point memset node (no dependencies). + + See :meth:`GraphNode.memset` for full documentation. + """ + + def launch(self, config, kernel, *args) -> KernelNode: + """Add an entry-point kernel launch node (no dependencies). + + See :meth:`GraphNode.launch` for full documentation. + """ + + def empty(self) -> EmptyNode: + """Add an entry-point empty node (no dependencies). 
+ + Returns + ------- + EmptyNode + A new EmptyNode with no dependencies. + """ + + def join(self, *nodes) -> EmptyNode: + """Create an empty node that depends on all given nodes. + + Parameters + ---------- + *nodes : GraphNode + Nodes to merge. + + Returns + ------- + EmptyNode + A new EmptyNode that depends on all input nodes. + """ + + def memcpy(self, dst, src, size: int) -> MemcpyNode: + """Add an entry-point memcpy node (no dependencies). + + See :meth:`GraphNode.memcpy` for full documentation. + """ + + def embed(self, child: GraphDefinition) -> ChildGraphNode: + """Add an entry-point child graph node (no dependencies). + + See :meth:`GraphNode.embed` for full documentation. + """ + + def record(self, event) -> EventRecordNode: + """Add an entry-point event record node (no dependencies). + + See :meth:`GraphNode.record` for full documentation. + """ + + def wait(self, event) -> EventWaitNode: + """Add an entry-point event wait node (no dependencies). + + See :meth:`GraphNode.wait` for full documentation. + """ + + def callback(self, fn, *, user_data=None) -> HostCallbackNode: + """Add an entry-point host callback node (no dependencies). + + See :meth:`GraphNode.callback` for full documentation. + """ + + def create_condition(self, default_value: int | None=None) -> GraphCondition: + """Create a condition variable for use with conditional nodes. + + The returned :class:`GraphCondition` object is passed to conditional-node + builder methods. Its value is controlled at runtime by device code + via ``cudaGraphSetConditional``. + + Parameters + ---------- + default_value : int, optional + The default value to assign to the condition. + If None, no default is assigned. + + Returns + ------- + GraphCondition + A condition variable for controlling conditional execution. + """ + + def if_then(self, condition: GraphCondition) -> IfNode: + """Add an entry-point if-conditional node (no dependencies). + + See :meth:`GraphNode.if_then` for full documentation. 
+ """ + + def if_else(self, condition: GraphCondition) -> IfElseNode: + """Add an entry-point if-else conditional node (no dependencies). + + See :meth:`GraphNode.if_else` for full documentation. + """ + + def while_loop(self, condition: GraphCondition) -> WhileNode: + """Add an entry-point while-loop conditional node (no dependencies). + + See :meth:`GraphNode.while_loop` for full documentation. + """ + + def switch(self, condition: GraphCondition, count: int) -> SwitchNode: + """Add an entry-point switch conditional node (no dependencies). + + See :meth:`GraphNode.switch` for full documentation. + """ + + def instantiate(self, options=None): + """Instantiate the graph definition into an executable Graph. + + Parameters + ---------- + options : :obj:`~graph.GraphCompleteOptions`, optional + Customizable dataclass for graph instantiation options. + + Returns + ------- + Graph + An executable graph that can be launched on a stream. + """ + + def debug_dot_print(self, path: str, options=None) -> None: + """Write a GraphViz DOT representation of the graph to a file. + + Parameters + ---------- + path : str + File path for the DOT output. + options : GraphDebugPrintOptions, optional + Customizable options for the debug print. + """ + + def nodes(self) -> set: + """Return all nodes in the graph. + + Returns + ------- + set of GraphNode + All nodes in the graph. + """ + + def edges(self) -> set: + """Return all edges in the graph as (from_node, to_node) pairs. + + Returns + ------- + set of tuple + Each element is a (from_node, to_node) pair representing + a dependency edge in the graph. 
+ """ + + @property + def handle(self) -> driver.CUgraph: + """Return the underlying driver CUgraph handle.""" +__all__ = ['GraphCondition', 'GraphDefinition'] \ No newline at end of file diff --git a/cuda_core/cuda/core/graph/_graph_node.pyi b/cuda_core/cuda/core/graph/_graph_node.pyi new file mode 100644 index 00000000000..ff8f6a3519d --- /dev/null +++ b/cuda_core/cuda/core/graph/_graph_node.pyi @@ -0,0 +1,376 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/graph/_graph_node.pyx + +"""GraphNode base class — factory, properties, and builder methods.""" +from __future__ import annotations + +import weakref + +from cuda.core._device import Device +from cuda.core._event import Event +from cuda.core._launch_config import LaunchConfig +from cuda.core._module import Kernel +from cuda.core._utils.cuda_utils import driver +from cuda.core.graph._graph_definition import GraphCondition, GraphDefinition +from cuda.core.graph._subclasses import (AllocNode, ChildGraphNode, EmptyNode, + EventRecordNode, EventWaitNode, + FreeNode, HostCallbackNode, + IfElseNode, IfNode, KernelNode, + MemcpyNode, MemsetNode, SwitchNode, + WhileNode) +from cuda.core.typing import GraphMemoryType + + +class GraphNode: + """A node in a graph definition. + + Nodes are created by calling builder methods on GraphDefinition (for + entry-point nodes with no dependencies) or on other Nodes (for + nodes that depend on a predecessor). + """ + + def __repr__(self) -> str: + ... + + def __eq__(self, other) -> bool: + ... + + def __hash__(self) -> int: + ... + + @property + def type(self): + """Return the CUDA graph node type. + + Returns + ------- + CUgraphNodeType or None + The node type enum value, or None for the entry node. + """ + + @property + def graph(self) -> GraphDefinition: + """Return the GraphDefinition this node belongs to.""" + + @property + def handle(self) -> driver.CUgraphNode: + """Return the underlying driver CUgraphNode handle. 
+ + Returns None for the entry node. + """ + + @property + def is_valid(self): + """Whether this node is valid (not destroyed). + + Returns ``False`` after :meth:`destroy` has been called. + """ + + def destroy(self): + """Destroy this node and remove all its edges from the parent graph. + + After this call, :attr:`is_valid` returns ``False`` and the node + cannot be re-added to any graph. Safe to call on an + already-destroyed node (no-op). + """ + + @property + def pred(self): + """A mutable set-like view of this node's predecessors.""" + + @pred.setter + def pred(self, value): + ... + + @property + def succ(self): + """A mutable set-like view of this node's successors.""" + + @succ.setter + def succ(self, value): + ... + + def launch(self, config: LaunchConfig, kernel: Kernel, *args) -> KernelNode: + """Add a kernel launch node depending on this node. + + Parameters + ---------- + config : LaunchConfig + Launch configuration (grid, block, shared memory, etc.) + kernel : Kernel + The kernel to launch. + *args + Kernel arguments. + + Returns + ------- + KernelNode + A new KernelNode representing the kernel launch. + """ + + def join(self, *nodes: GraphNode) -> EmptyNode: + """Create an empty node that depends on this node and all given nodes. + + This is used to synchronize multiple branches of execution. + + Parameters + ---------- + *nodes : GraphNode + Additional nodes to depend on. + + Returns + ------- + EmptyNode + A new EmptyNode that depends on all input nodes. + """ + + def allocate(self, size: int, *, device: Device | int | None=None, memory_type: GraphMemoryType=..., peer_access: list[Device | int] | None=None) -> AllocNode: + """Add a memory allocation node depending on this node. + + Parameters + ---------- + size : int + Number of bytes to allocate. + device : int or Device, optional + The device on which to allocate memory. If None (default), + uses the current CUDA context's device. 
+ memory_type : GraphMemoryType or str, optional + Type of memory to allocate. One of: + + - ``GraphMemoryType.DEVICE`` (default): Pinned device memory, + optimal for GPU kernels. + - ``GraphMemoryType.HOST``: Pinned host memory, accessible from + both host and device. Useful for graphs containing host + callback nodes. Note: may not be supported on all + systems/drivers. + - ``GraphMemoryType.MANAGED``: Managed/unified memory that + automatically migrates between host and device. Useful for + mixed host/device access patterns. + + peer_access : list of int or Device, optional + List of devices that should have read-write access to the + allocated memory. If None (default), only the allocating + device has access. + + Returns + ------- + AllocNode + A new AllocNode representing the allocation. Access the allocated + device pointer via the dptr property. + + Notes + ----- + IPC (inter-process communication) is not supported for graph + memory allocation nodes per CUDA documentation. + """ + + def deallocate(self, dptr: int) -> FreeNode: + """Add a memory free node depending on this node. + + Parameters + ---------- + dptr : int + Device pointer to free (typically from AllocNode.dptr). + + Returns + ------- + FreeNode + A new FreeNode representing the free operation. + """ + + def memset(self, dst: int, value, width: int, height: int=1, pitch: int=0) -> MemsetNode: + """Add a memset node depending on this node. + + Parameters + ---------- + dst : int + Destination device pointer. + value : int or buffer-protocol object + Fill value. int for 1-byte fill (range [0, 256)), + or buffer-protocol object of 1, 2, or 4 bytes. + width : int + Width of the row in elements. + height : int, optional + Number of rows (default 1). + pitch : int, optional + Pitch of destination in bytes (default 0, unused if height is 1). + + Returns + ------- + MemsetNode + A new MemsetNode representing the memset operation. 
+ """ + + def memcpy(self, dst: int, src: int, size: int) -> MemcpyNode: + """Add a memcpy node depending on this node. + + Copies ``size`` bytes from ``src`` to ``dst``. Memory types are + auto-detected via the driver, so both device and pinned host + pointers are supported. + + Parameters + ---------- + dst : int + Destination pointer (device or pinned host). + src : int + Source pointer (device or pinned host). + size : int + Number of bytes to copy. + + Returns + ------- + MemcpyNode + A new MemcpyNode representing the copy operation. + """ + + def embed(self, child: GraphDefinition) -> ChildGraphNode: + """Add a child graph node depending on this node. + + Embeds a clone of the given graph definition as a sub-graph node. + The child graph must not contain allocation, free, or conditional + nodes. + + Parameters + ---------- + child : GraphDefinition + The graph definition to embed (will be cloned). + + Returns + ------- + ChildGraphNode + A new ChildGraphNode representing the embedded sub-graph. + """ + + def record(self, event: Event) -> EventRecordNode: + """Add an event record node depending on this node. + + Parameters + ---------- + event : Event + The event to record. + + Returns + ------- + EventRecordNode + A new EventRecordNode representing the event record operation. + """ + + def wait(self, event: Event) -> EventWaitNode: + """Add an event wait node depending on this node. + + Parameters + ---------- + event : Event + The event to wait for. + + Returns + ------- + EventWaitNode + A new EventWaitNode representing the event wait operation. + """ + + def callback(self, fn, *, user_data=None) -> HostCallbackNode: + """Add a host callback node depending on this node. + + The callback runs on the host CPU when the graph reaches this node. + Two modes are supported: + + - **Python callable**: Pass any callable. The GIL is acquired + automatically. The callable must take no arguments; use closures + or ``functools.partial`` to bind state. 
+ - **ctypes function pointer**: Pass a ``ctypes.CFUNCTYPE`` instance. + The function receives a single ``void*`` argument (the + ``user_data``). The caller must keep the ctypes wrapper alive + for the lifetime of the graph. + + .. warning:: + + Callbacks must not call CUDA API functions. Doing so may + deadlock or corrupt driver state. + + Parameters + ---------- + fn : callable or ctypes function pointer + The callback function. + user_data : int or bytes-like, optional + Only for ctypes function pointers. If ``int``, passed as a raw + pointer (caller manages lifetime). If bytes-like, the data is + copied and its lifetime is tied to the graph. + + Returns + ------- + HostCallbackNode + A new HostCallbackNode representing the callback. + """ + + def if_then(self, condition: GraphCondition) -> IfNode: + """Add an if-conditional node depending on this node. + + The body graph executes only when the condition evaluates to + a non-zero value at runtime. + + Parameters + ---------- + condition : GraphCondition + GraphCondition from :meth:`GraphDefinition.create_condition`. + + Returns + ------- + IfNode + A new IfNode with one branch accessible via ``.then``. + """ + + def if_else(self, condition: GraphCondition) -> IfElseNode: + """Add an if-else conditional node depending on this node. + + Two body graphs: the first executes when the condition is + non-zero, the second when it is zero. + + Parameters + ---------- + condition : GraphCondition + GraphCondition from :meth:`GraphDefinition.create_condition`. + + Returns + ------- + IfElseNode + A new IfElseNode with branches accessible via + ``.then`` and ``.else_``. + """ + + def while_loop(self, condition: GraphCondition) -> WhileNode: + """Add a while-loop conditional node depending on this node. + + The body graph executes repeatedly while the condition + evaluates to a non-zero value. + + Parameters + ---------- + condition : GraphCondition + GraphCondition from :meth:`GraphDefinition.create_condition`. 
+ + Returns + ------- + WhileNode + A new WhileNode with body accessible via ``.body``. + """ + + def switch(self, condition: GraphCondition, count: int) -> SwitchNode: + """Add a switch conditional node depending on this node. + + The condition value selects which branch to execute. If the + value is out of range, no branch executes. + + Parameters + ---------- + condition : GraphCondition + GraphCondition from :meth:`GraphDefinition.create_condition`. + count : int + Number of switch cases (branches). + + Returns + ------- + SwitchNode + A new SwitchNode with branches accessible via ``.branches``. + """ +__all__ = ['GraphNode'] +_node_registry: weakref.WeakValueDictionary[int, GraphNode] = weakref.WeakValueDictionary() \ No newline at end of file diff --git a/cuda_core/cuda/core/graph/_subclasses.pyi b/cuda_core/cuda/core/graph/_subclasses.pyi new file mode 100644 index 00000000000..6f9bb1ae99e --- /dev/null +++ b/cuda_core/cuda/core/graph/_subclasses.pyi @@ -0,0 +1,339 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/graph/_subclasses.pyx + +"""GraphNode subclasses — EmptyNode through SwitchNode.""" +from __future__ import annotations + +from cuda.core._event import Event +from cuda.core._launch_config import LaunchConfig +from cuda.core._module import Kernel +from cuda.core.graph._graph_definition import GraphCondition, GraphDefinition +from cuda.core.graph._graph_node import GraphNode +from cuda.core.typing import GraphConditionalType + + +class EmptyNode(GraphNode): + """An empty (synchronization) node.""" + + def __repr__(self) -> str: + ... + +class KernelNode(GraphNode): + """A kernel launch node. + + Properties + ---------- + grid : tuple of int + Grid dimensions (gridDimX, gridDimY, gridDimZ). + block : tuple of int + Block dimensions (blockDimX, blockDimY, blockDimZ). + shmem_size : int + Dynamic shared memory size in bytes. + kernel : Kernel + The kernel object for this launch node. 
+ config : LaunchConfig + A LaunchConfig reconstructed from this node's parameters. + """ + + def __repr__(self) -> str: + ... + + @property + def grid(self) -> tuple: + """Grid dimensions as a 3-tuple (gridDimX, gridDimY, gridDimZ).""" + + @property + def block(self) -> tuple: + """Block dimensions as a 3-tuple (blockDimX, blockDimY, blockDimZ).""" + + @property + def shmem_size(self) -> int: + """Dynamic shared memory size in bytes.""" + + @property + def kernel(self) -> Kernel: + """The Kernel object for this launch node.""" + + @property + def config(self) -> LaunchConfig: + """A LaunchConfig reconstructed from this node's grid, block, and shmem_size. + + Note: cluster dimensions and is_cooperative are not preserved + by the CUDA driver's kernel node params, so they are not included. + """ + +class AllocNode(GraphNode): + """A memory allocation node. + + Properties + ---------- + dptr : int + The device pointer for the allocation. + bytesize : int + The number of bytes allocated. + device_id : int + The device on which the allocation was made. + memory_type : GraphMemoryType | str + The type of memory allocated. + peer_access : tuple of int + Device IDs that have read-write access to this allocation. + """ + + def __repr__(self) -> str: + ... + + @property + def dptr(self) -> int: + """The device pointer for the allocation.""" + + @property + def bytesize(self) -> int: + """The number of bytes allocated.""" + + @property + def device_id(self) -> int: + """The device on which the allocation was made.""" + + @property + def memory_type(self) -> str: + """The type of memory: ``"device"``, ``"host"``, or ``"managed"``.""" + + @property + def peer_access(self) -> tuple: + """Device IDs with read-write access to this allocation.""" + +class FreeNode(GraphNode): + """A memory deallocation node. + + Properties + ---------- + dptr : int + The device pointer being freed. + """ + + def __repr__(self) -> str: + ... 
+ + @property + def dptr(self) -> int: + """The device pointer being freed.""" + +class MemsetNode(GraphNode): + """A memset node. + + Properties + ---------- + dptr : int + The destination device pointer. + value : int + The fill value. + element_size : int + Element size in bytes (1, 2, or 4). + width : int + Width of the row in elements. + height : int + Number of rows. + pitch : int + Pitch in bytes (unused if height is 1). + """ + + def __repr__(self) -> str: + ... + + @property + def dptr(self) -> int: + """The destination device pointer.""" + + @property + def value(self) -> int: + """The fill value.""" + + @property + def element_size(self) -> int: + """Element size in bytes (1, 2, or 4).""" + + @property + def width(self) -> int: + """Width of the row in elements.""" + + @property + def height(self) -> int: + """Number of rows.""" + + @property + def pitch(self) -> int: + """Pitch in bytes (unused if height is 1).""" + +class MemcpyNode(GraphNode): + """A memcpy node. + + Properties + ---------- + dst : int + The destination pointer. + src : int + The source pointer. + size : int + The number of bytes copied. + """ + + def __repr__(self) -> str: + ... + + @property + def dst(self) -> int: + """The destination pointer.""" + + @property + def src(self) -> int: + """The source pointer.""" + + @property + def size(self) -> int: + """The number of bytes copied.""" + +class ChildGraphNode(GraphNode): + """A child graph node. + + Properties + ---------- + child_graph : GraphDefinition + The embedded graph definition (non-owning wrapper). + """ + + def __repr__(self) -> str: + ... + + @property + def child_graph(self) -> GraphDefinition: + """The embedded graph definition (non-owning wrapper).""" + +class EventRecordNode(GraphNode): + """An event record node. + + Properties + ---------- + event : Event + The event being recorded. + """ + + def __repr__(self) -> str: + ... 
+ + @property + def event(self) -> Event: + """The event being recorded.""" + +class EventWaitNode(GraphNode): + """An event wait node. + + Properties + ---------- + event : Event + The event being waited on. + """ + + def __repr__(self) -> str: + ... + + @property + def event(self) -> Event: + """The event being waited on.""" + +class HostCallbackNode(GraphNode): + """A host callback node. + + Properties + ---------- + callback : callable or None + The Python callable (None for ctypes function pointer callbacks). + """ + + def __repr__(self) -> str: + ... + + @property + def callback(self): + """The Python callable, or None for ctypes function pointer callbacks.""" + +class ConditionalNode(GraphNode): + """Base class for conditional nodes. + + When created via builder methods (if_then, if_else, while_loop, switch), + a specific subclass (IfNode, IfElseNode, WhileNode, SwitchNode) is + returned. When reconstructed from the driver on CUDA 13.2+, the + correct subclass is determined via cuGraphNodeGetParams. On older + drivers, this base class is used as a fallback. + + Properties + ---------- + condition : GraphCondition or None + The condition variable controlling execution (None pre-13.2). + cond_type : str or None + The conditional type ("if", "while", or "switch"; None pre-13.2). + branches : tuple of GraphDefinition + The body graphs for each branch (empty pre-13.2). + """ + + def __repr__(self) -> str: + ... + + @property + def condition(self) -> GraphCondition | None: + """The condition variable controlling execution.""" + + @property + def cond_type(self) -> GraphConditionalType | None: + """The conditional type: GraphConditionalType.IF, .WHILE, or .SWITCH + + Returns None when reconstructed from the driver pre-CUDA 13.2, + as the conditional type cannot be determined. + """ + + @property + def branches(self) -> tuple: + """The body graphs for each branch as a tuple of GraphDefinition. 
+ + Returns an empty tuple when reconstructed from the driver + pre-CUDA 13.2. + """ + +class IfNode(ConditionalNode): + """An if-conditional node.""" + + def __repr__(self) -> str: + ... + + @property + def then(self) -> GraphDefinition: + """The 'then' branch graph.""" + +class IfElseNode(ConditionalNode): + """An if-else conditional node.""" + + def __repr__(self) -> str: + ... + + @property + def then(self) -> GraphDefinition: + """The ``then`` branch graph (executed when condition is non-zero).""" + + @property + def else_(self) -> GraphDefinition: + """The ``else`` branch graph (executed when condition is zero).""" + +class WhileNode(ConditionalNode): + """A while-loop conditional node.""" + + def __repr__(self) -> str: + ... + + @property + def body(self) -> GraphDefinition: + """The loop body graph.""" + +class SwitchNode(ConditionalNode): + """A switch conditional node.""" + + def __repr__(self) -> str: + ... +__all__ = ['AllocNode', 'ChildGraphNode', 'ConditionalNode', 'EmptyNode', 'EventRecordNode', 'EventWaitNode', 'FreeNode', 'HostCallbackNode', 'IfElseNode', 'IfNode', 'KernelNode', 'MemcpyNode', 'MemsetNode', 'SwitchNode', 'WhileNode'] \ No newline at end of file diff --git a/cuda_core/cuda/core/graph/_utils.pyi b/cuda_core/cuda/core/graph/_utils.pyi new file mode 100644 index 00000000000..79072e66ebe --- /dev/null +++ b/cuda_core/cuda/core/graph/_utils.pyi @@ -0,0 +1,3 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/graph/_utils.pyx + +from __future__ import annotations \ No newline at end of file diff --git a/cuda_core/cuda/core/system/_device.pyi b/cuda_core/cuda/core/system/_device.pyi new file mode 100644 index 00000000000..797ca295fbb --- /dev/null +++ b/cuda_core/cuda/core/system/_device.pyi @@ -0,0 +1,1900 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/system/_device.pyx + +from __future__ import annotations + +from typing import Iterable + +import cuda.core +from cuda.bindings import 
nvml +from cuda.core.system.typing import (AddressingMode, AffinityScope, ClockId, + ClocksEventReasons, ClockType, + CoolerControl, CoolerTarget, DeviceArch, + EventType, FanControlPolicy, FieldId, + GpuP2PCapsIndex, GpuP2PStatus, + GpuTopologyLevel, InforomObject, + TemperatureThresholds, ThermalController, + ThermalTarget) + + +class ClockOffsets: + """ + Contains clock offset information. + """ + + def __init__(self, clock_offset: nvml.ClockOffset): + ... + + @property + def clock_offset_mhz(self) -> int: + """ + The current clock offset in MHz. + """ + + @property + def max_offset_mhz(self) -> int: + """ + The maximum clock offset in MHz. + """ + + @property + def min_offset_mhz(self) -> int: + """ + The minimum clock offset in MHz. + """ + +class ClockInfo: + """ + Accesses various clock information about a device. + """ + + def __init__(self, handle, clock_type: ClockType | str): + ... + + def get_current_mhz(self, clock_id: ClockId | str=...) -> int: + """ + Get the current clock speed of a specific clock domain, in MHz. + + For Kepler™ or newer fully supported devices. + + Parameters + ---------- + clock_id: :class:`ClockId` | str + The clock ID to query. Defaults to the current clock value. + + Returns + ------- + int + The clock speed in MHz. + """ + + def get_max_mhz(self) -> int: + """ + Get the maximum clock speed of a specific clock domain, in MHz. + + For Fermi™ or newer fully supported devices. + + Current P0 clocks (reported by :meth:`get_current_mhz` can differ from + max clocks by a few MHz. + + Returns + ------- + int + The maximum clock speed in MHz. + """ + + def get_max_customer_boost_mhz(self) -> int: + """ + Get the maximum customer boost clock speed of a specific clock, in MHz. + + For Pascal™ or newer fully supported devices. + + Returns + ------- + int + The maximum customer boost clock speed in MHz. 
+ """ + + def get_min_max_clock_of_pstate_mhz(self, pstate: int) -> tuple[int, int]: + """ + Get the minimum and maximum clock speeds for this clock domain + at a given performance state (Pstate), in MHz. + + Parameters + ---------- + pstate: int + The performance state to query. Must be an int between 0 and 15, + where 0 is the highest performance state (P0) and 15 is the lowest + (P15). + + Returns + ------- + tuple[int, int] + A tuple containing the minimum and maximum clock speeds in MHz. + """ + + def get_offsets(self, pstate: int) -> ClockOffsets: + """ + Retrieve min, max and current clock offset of some clock domain for a given Pstate. + + For Maxwell™ or newer fully supported devices. + + Parameters + ---------- + pstate: int + The performance state to query. Must be an int between 0 and 15, + where 0 is the highest performance state (P0) and 15 is the lowest + (P15). + + Returns + ------- + :obj:`~_device.ClockOffsets` + An object with the min, max and current clock offset. + """ + +class CoolerInfo: + + def __init__(self, cooler_info: nvml.CoolerInfo): + ... + + @property + def signal_type(self) -> CoolerControl | None: + """ + The cooler's control signal characteristics. + + The possible types are variable and toggle. + """ + + @property + def target(self) -> list[CoolerTarget]: + """ + The target that cooler controls. + + Targets may be GPU, Memory, Power Supply, or all of these. See + :class:`CoolerTarget` for details. + """ + +class DeviceAttributes: + """ + Various device attributes. + """ + + def __init__(self, attributes: nvml.DeviceAttributes): + ... 
+ + @property + def multiprocessor_count(self) -> int: + """ + The streaming multiprocessor count + """ + + @property + def shared_copy_engine_count(self) -> int: + """ + The shared copy engine count + """ + + @property + def shared_decoder_count(self) -> int: + """ + The shared decoder engine count + """ + + @property + def shared_encoder_count(self) -> int: + """ + The shared encoder engine count + """ + + @property + def shared_jpeg_count(self) -> int: + """ + The shared JPEG engine count + """ + + @property + def shared_ofa_count(self) -> int: + """ + The shared optical flow accelerator (OFA) engine count + """ + + @property + def gpu_instance_slice_count(self) -> int: + """ + The GPU instance slice count + """ + + @property + def compute_instance_slice_count(self) -> int: + """ + The compute instance slice count + """ + + @property + def memory_size_mb(self) -> int: + """ + Device memory size in MiB + """ + +class EventData: + """ + Data about a single event. + """ + + def __init__(self, event_data: nvml.EventData): + ... + + @property + def device(self) -> Device: + """ + The device on which the event occurred. + """ + + @property + def event_type(self) -> EventType: + """ + The type of event that was triggered. + """ + + @property + def event_data(self) -> int: + """ + Returns Xid error for the device in the event of + :attr:`~cuda.core.system.EventType.XID_CRITICAL_ERROR`. + + Raises :class:`ValueError` for other event types. + """ + + @property + def gpu_instance_id(self) -> int: + """ + The GPU instance ID for MIG devices. + + Only valid for events of type :attr:`EventType.XID_CRITICAL_ERROR`. + + Raises :class:`ValueError` for other event types. + """ + + @property + def compute_instance_id(self) -> int: + """ + The Compute instance ID for MIG devices. + + Only valid for events of type :attr:`EventType.XID_CRITICAL_ERROR`. + + Raises :class:`ValueError` for other event types. 
+ """ + +class DeviceEvents: + """ + Represents a set of events that can be waited on for a specific device. + """ + + def __init__(self, device_handle: int, events: EventType | str | list[EventType | str]): + ... + + def __dealloc__(self): + ... + + def wait(self, timeout_ms: int=0) -> EventData: + """ + Wait for events in the event set. + + For Fermi™ or newer fully supported devices. + + If some events are ready to be delivered at the time of the call, + function returns immediately. If there are no events ready to be + delivered, function sleeps until event arrives but not longer than + specified timeout. If timeout passes, a + :class:`cuda.core.system.TimeoutError` is raised. This function in + certain conditions can return before specified timeout passes (e.g. when + interrupt arrives). + + On Windows, in case of Xid error, the function returns the most recent + Xid error type seen by the system. If there are multiple Xid errors + generated before ``wait`` is invoked, then the last seen Xid + error type is returned for all Xid error events. + + On Linux, every Xid error event would return the associated event data + and other information if applicable. + + In MIG mode, if device handle is provided, the API reports all the + events for the available instances, only if the caller has appropriate + privileges. In absence of required privileges, only the events which + affect all the instances (i.e. whole device) are reported. + + This API does not currently support per-instance event reporting using + MIG device handles. + + Parameters + ---------- + timeout_ms: int + The timeout in milliseconds. A value of 0 means to wait indefinitely. + + Raises + ------ + :class:`cuda.core.system.TimeoutError` + If the timeout expires before an event is received. + :class:`cuda.core.system.GpuIsLostError` + If the GPU has fallen off the bus or is otherwise inaccessible. + """ + +class FanInfo: + """ + Manages information related to a specific fan on a specific device. 
+ """ + + def __init__(self, handle: int, fan: int): + ... + + @property + def speed(self) -> int: + """ + Get/set the intended operating speed of the device's fan. + + For all discrete products with dedicated fans. + + Note: The reported speed is the intended fan speed. If the fan is + physically blocked and unable to spin, the output will not match the + actual fan speed. + + The fan speed is expressed as a percentage of the product's maximum + noise tolerance fan speed. This value may exceed 100% in certain cases. + """ + + @speed.setter + def speed(self, speed: int): + ... + + @property + def speed_rpm(self) -> int: + """ + The intended operating speed of the device's fan in rotations per minute + (RPM). + + For Maxwell™ or newer fully supported devices. + + For all discrete products with dedicated fans. + + Note: The reported speed is the intended fan speed. If the fan is + physically blocked and unable to spin, the output will not match the + actual fan speed. + """ + + @property + def target_speed(self) -> int: + """ + Retrieves the intended target speed of the device's specified fan. + + For all discrete products with dedicated fans. + + Normally, the driver dynamically adjusts the fan based on + the needs of the GPU. But when users set fan speed using ``speed``, + the driver will attempt to make the fan achieve that setting. + The actual current speed of the fan is reported in ``speed``. + + The fan speed is expressed as a percentage of the product's maximum + noise tolerance fan speed. This value may exceed 100% in certain cases. + """ + + @property + def min_max_speed(self) -> tuple[int, int]: + """ + Retrieves the minimum and maximum fan speed all of the device's fans. + + For all discrete products with dedicated fans. + + Returns + ------- + tuple[int, int] + A tuple of (min_speed, max_speed) + """ + + @property + def control_policy(self) -> FanControlPolicy: + """ + The current fan control policy. + + For Maxwell™ or newer fully supported devices. 
+ + For all CUDA-capable discrete products with fans. + """ + + def set_default_speed(self): + """ + Set the speed of the fan control policy to default. + + For all CUDA-capable discrete products with fans. + """ + +class FieldValue: + """ + Represents the data from a single field value. + + Use :meth:`Device.get_field_values` to get multiple field values at once. + """ + + def __init__(self, field_value: nvml.FieldValue): + ... + + @property + def field_id(self) -> FieldId: + """ + The field ID. + """ + + @property + def scope_id(self) -> int: + """ + The scope ID. + """ + + @property + def timestamp(self) -> int: + """ + The CPU timestamp (in microseconds since 1970) at which the value was + sampled. + """ + + @property + def latency_usec(self) -> int: + """ + How long this field value took to update (in usec) within NVML. This may + be averaged across several fields that are serviced by the same driver + call. + """ + + @property + def value(self) -> int | float: + """ + The field value. + + Raises + ------ + :class:`cuda.core.system.NvmlError` + If there was an error retrieving the field value. + """ + +class FieldValues: + """ + Container of multiple field values. + """ + + def __init__(self, field_values: nvml.FieldValue): + ... + + def __getitem__(self, idx: int) -> FieldValue: + ... + + def __len__(self) -> int: + ... + + def validate(self) -> None: + """ + Validate that there are no issues in any of the contained field values. + + Raises an exception for the first issue found, if any. + + Raises + ------ + :class:`cuda.core.system.NvmlError` + If any of the contained field values has an associated exception. + """ + + def get_all_values(self) -> list[int | float]: + """ + Get all field values as a list. + + This will validate each of the values and include just the core value in + the list. + + Returns + ------- + list[int | float] + List of all field values. 
+ + Raises + ------ + :class:`cuda.core.system.NvmlError` + If any of the contained field values has an associated exception. + """ + +class InforomInfo: + + def __init__(self, device: Device): + ... + + def get_version(self, inforom: InforomObject | str) -> str: + """ + Retrieves the InfoROM version for a given InfoROM object. + + For all products with an InfoROM. + + Fermi™ and higher parts have non-volatile on-board memory for persisting + device info, such as aggregate ECC counts. + + Parameters + ---------- + inforom: :class:`InforomObject` + The InfoROM object to query. + + Returns + ------- + str + The InfoROM version. + """ + + @property + def image_version(self) -> str: + """ + Retrieves the global InfoROM image version. + + For all products with an InfoROM. + + Image version just like VBIOS version uniquely describes the exact + version of the InfoROM flashed on the board in contrast to InfoROM + object version which is only an indicator of supported features. + + Returns + ------- + str + The InfoROM image version. + """ + + @property + def configuration_checksum(self) -> int: + """ + Retrieves the checksum of the configuration stored in the device's InfoROM. + + For all products with an InfoROM. + + Can be used to make sure that two GPUs have the exact same + configuration. Current checksum takes into account configuration stored + in PWR and ECC InfoROM objects. Checksum can change between driver + releases or when user changes configuration (e.g. disable/enable ECC) + + Returns + ------- + int + The InfoROM checksum. + """ + + def validate(self) -> None: + """ + Reads the InfoROM from the flash and verifies the checksums. + + For all products with an InfoROM. + + Raises + ------ + :class:`cuda.core.system.CorruptedInforomError` + If the device's InfoROM is corrupted. 
+ """ + + @property + def bbx_flush_time(self) -> tuple[int, int]: + """ + Retrieves the timestamp and duration of the last flush of the BBX + (blackbox) InfoROM object during the current run. + + For all products with an InfoROM. + + Returns + ------- + tuple[int, int] + - timestamp: The start timestamp of the last BBX flush + - duration_us: The duration (in μs) of the last BBX flush + """ + + @property + def board_part_number(self) -> str: + """ + The device board part number which is programmed into the board's InfoROM. + """ + +class MemoryInfo: + """ + Memory allocation information for a device. + """ + + def __init__(self, memory_info: nvml.Memory_v2): + ... + + @property + def free(self) -> int: + """ + Unallocated device memory (in bytes) + """ + + @property + def total(self) -> int: + """ + Total physical device memory (in bytes) + """ + + @property + def used(self) -> int: + """ + Allocated device memory (in bytes) + """ + + @property + def reserved(self) -> int: + """ + Device memory (in bytes) reserved for system use (driver or firmware) + """ + +class BAR1MemoryInfo(MemoryInfo): + """ + BAR1 Memory allocation information for a device. + """ + + def __init__(self, memory_info: nvml.BAR1Memory): + ... + + @property + def free(self) -> int: + """ + Unallocated BAR1 memory (in bytes) + """ + + @property + def total(self) -> int: + """ + Total BAR1 memory (in bytes) + """ + + @property + def used(self) -> int: + """ + Allocated used memory (in bytes) + """ + +class MigInfo: + + def __init__(self, device: Device): + ... + + @property + def is_mig_device(self) -> bool: + """ + Whether this device is a MIG (Multi-Instance GPU) device. + + A MIG device handle is an NVML abstraction which maps to a MIG compute + instance. These overloaded references can be used (with some + restrictions) interchangeably with a GPU device handle to execute + queries at a per-compute instance granularity. + + For Ampere™ or newer fully supported devices. 
+ """ + + @property + def mode(self) -> bool: + """ + Get current MIG mode for the device. + + For Ampere™ or newer fully supported devices. + + Changing MIG modes may require device unbind or reset. The "pending" MIG + mode refers to the target mode following the next activation trigger. + + Returns + ------- + bool + `True` if current MIG mode is enabled. + """ + + @mode.setter + def mode(self, mode: bool): + """ + Set the MIG mode for the device. + + For Ampere™ or newer fully supported devices. + + Changing MIG modes may require device unbind or reset. The "pending" MIG + mode refers to the target mode following the next activation trigger. + + Parameters + ---------- + mode: bool + `True` to enable MIG mode, `False` to disable MIG mode. + """ + + @property + def pending_mode(self) -> bool: + """ + Get pending MIG mode for the device. + + For Ampere™ or newer fully supported devices. + + Changing MIG modes may require device unbind or reset. The "pending" MIG + mode refers to the target mode following the next activation trigger. + + If the device is not a MIG device, returns `False`. + + Returns + ------- + bool + `True` if pending MIG mode is enabled. + """ + + @property + def device_count(self) -> int: + """ + Get the maximum number of MIG devices that can exist under this device. + + Returns zero if MIG is not supported or enabled. + + For Ampere™ or newer fully supported devices. + + Returns + ------- + int + The number of MIG devices (compute instances) on this GPU. + """ + + @property + def parent(self) -> Device: + """ + For MIG devices, get the parent GPU device. + + For Ampere™ or newer fully supported devices. + + Returns + ------- + Device + The parent GPU device for this MIG device. + """ + + def get_device_by_index(self, index: int) -> Device: + """ + Get MIG device for the given index under its parent device. 
+ + If the compute instance is destroyed either explicitly or by destroying, + resetting or unbinding the parent GPU instance or the GPU device itself + the MIG device handle would remain invalid and must be requested again + using this API. Handles may be reused and their properties can change in + the process. + + For Ampere™ or newer fully supported devices. + + Parameters + ---------- + index: int + The index of the MIG device (compute instance) to retrieve. Must be + between 0 and the value returned by `device_count - 1`. + + Returns + ------- + Device + The MIG device corresponding to the given index. + """ + + def get_all_devices(self) -> Iterable[Device]: + """ + Get all MIG devices under its parent device. + + If the compute instance is destroyed either explicitly or by destroying, + resetting or unbinding the parent GPU instance or the GPU device itself + the MIG device handle would remain invalid and must be requested again + using this API. Handles may be reused and their properties can change in + the process. + + For Ampere™ or newer fully supported devices. + + Returns + ------- + list[Device] + A list of all MIG devices corresponding to this GPU. + """ + +class NvlinkInfo: + """ + Nvlink information for a device. + """ + max_links = nvml.NVLINK_MAX_LINKS + + def __init__(self, device: Device, link: int): + ... + + @property + def version(self) -> tuple[int, int]: + """ + Retrieves the NvLink version for the device and link. + + For all products with NvLink support. + + Returns + ------- + tuple[int, int] + The Nvlink version as a tuple of (major, minor). + """ + + @property + def state(self) -> bool: + """ + Retrieves the state of the device's Nvlink for the device and link specified. + + For Pascal™ or newer fully supported devices. + + For all products with Nvlink support. + + Returns + ------- + bool + `True` if the Nvlink is active. + """ + +class PciInfo: + """ + PCI information about a GPU device. 
+ """ + + def __init__(self, pci_info_ext: nvml.PciInfoExt_v1, handle: int): + ... + + @property + def bus(self) -> int: + """ + The bus on which the device resides, 0 to 255 + """ + + @property + def bus_id(self) -> str: + """ + The tuple domain:bus:device.function PCI identifier string + """ + + @property + def device(self) -> int: + """ + The device's id on the bus, 0 to 31 + """ + + @property + def domain(self) -> int: + """ + The PCI domain on which the device's bus resides, 0 to 0xffffffff + """ + + @property + def vendor_id(self) -> int: + """ + The PCI vendor id of the device + """ + + @property + def device_id(self) -> int: + """ + The PCI device id of the device + """ + + @property + def subsystem_id(self) -> int: + """ + The subsystem device ID + """ + + @property + def base_class(self) -> int: + """ + The 8-bit PCI base class code + """ + + @property + def sub_class(self) -> int: + """ + The 8-bit PCI sub class code + """ + + @property + def link_generation(self) -> int: + """ + Retrieve the maximum PCIe link generation possible with this device and system. + + For Fermi™ or newer fully supported devices. + + For example, for a generation 2 PCIe device attached to a generation 1 + PCIe bus, the max link generation this function will report is + generation 1. + """ + + @property + def max_link_generation(self) -> int: + """ + Retrieve the maximum PCIe link generation supported by this GPU device. + + For Fermi™ or newer fully supported devices. + """ + + @property + def max_link_width(self) -> int: + """ + Retrieve the maximum PCIe link width possible with this device and system. + + For Fermi™ or newer fully supported devices. + + For example, for a device with a 16x PCIe bus width attached to a 8x + PCIe system bus this function will report + a max link width of 8. + """ + + @property + def current_link_generation(self) -> int: + """ + Retrieve the current PCIe link generation. + + For Fermi™ or newer fully supported devices. 
+ """ + + @property + def current_link_width(self) -> int: + """ + Retrieve the current PCIe link width. + + For Fermi™ or newer fully supported devices. + """ + + @property + def rx_throughput(self) -> int: + """ + Retrieve PCIe reception throughput, in KB/s. + + This function is querying a byte counter over a 20ms interval, and thus + is the PCIe throughput over that interval. + + For Maxwell™ or newer fully supported devices. + + This method is not supported in virtual machines running virtual GPU + (vGPU). + """ + + @property + def tx_throughput(self) -> int: + """ + Retrieve PCIe transmission throughput, in KB/s. + + This function is querying a byte counter over a 20ms interval, and thus + is the PCIe throughput over that interval. + + For Maxwell™ or newer fully supported devices. + + This method is not supported in virtual machines running virtual GPU + (vGPU). + """ + + @property + def replay_counter(self) -> int: + """ + Retrieve the PCIe replay counter. + + For Kepler™ or newer fully supported devices. + """ + +class GpuDynamicPstatesUtilization: + + def __init__(self, ptr: int, owner: object): + ... + + @property + def is_present(self) -> bool: + """ + Set if the utilization domain is present on this GPU. + """ + + @property + def percentage(self) -> int: + """ + Percentage of time where the domain is considered busy in the last 1-second interval. + """ + + @property + def inc_threshold(self) -> int: + """ + Utilization threshold that can trigger a perf-increasing P-State change when crossed. + """ + + @property + def dec_threshold(self) -> int: + """ + Utilization threshold that can trigger a perf-decreasing P-State change when crossed. + """ + +class GpuDynamicPstatesInfo: + """ + Handles performance monitor samples from the device. + """ + + def __init__(self, gpu_dynamic_pstates_info: nvml.GpuDynamicPstatesInfo): + ... + + def __len__(self): + ... + + def __getitem__(self, idx: int) -> GpuDynamicPstatesUtilization: + ... 
+ +class ProcessInfo: + """ + Information about running compute processes on the GPU. + """ + + def __init__(self, device: 'Device', process_info: nvml.ProcessInfo): + ... + + @property + def pid(self) -> int: + """ + The PID of the process. + """ + + @property + def used_gpu_memory(self) -> int: + """ + The amount of GPU memory (in bytes) used by the process. + """ + + @property + def gpu_instance_id(self) -> int: + """ + The GPU instance ID for MIG devices. + + Only valid for processes running on MIG devices. + """ + + @property + def compute_instance_id(self) -> int: + """ + The Compute instance ID for MIG devices. + + Only valid for processes running on MIG devices. + """ + +class RepairStatus: + """ + Repair status for TPC/Channel repair. + """ + + def __init__(self, handle: int): + ... + + @property + def channel_repair_pending(self) -> bool: + """ + `True` if a channel repair is pending. + """ + + @property + def tpc_repair_pending(self) -> bool: + """ + `True` if a TPC repair is pending. + """ + +class ThermalSensor: + + def __init__(self, ptr: int, owner: object): + ... + + @property + def controller(self) -> ThermalController: + ... + + @property + def default_min_temp(self) -> int: + ... + + @property + def default_max_temp(self) -> int: + ... + + @property + def current_temp(self) -> int: + ... + + @property + def target(self) -> ThermalTarget: + ... + +class ThermalSettings: + + def __init__(self, thermal_settings: nvml.ThermalSettings): + ... + + def __len__(self): + ... + + def __getitem__(self, idx: int) -> nvml.ThermalSensor: + ... + +class Temperature: + + def __init__(self, handle: int): + ... + + def get_sensor(self) -> int: + """ + Get the temperature reading from a specific sensor on the device, in + degrees Celsius. + + The only sensor currently supported is the GPU temperature sensor. + + Returns + ------- + int + The temperature in degrees Celsius. 
+ """ + + def get_threshold(self, threshold_type: TemperatureThresholds | str) -> int: + """ + Retrieves the temperature threshold for this GPU with the specified + threshold type, in degrees Celsius. + + For Kepler™ or newer fully supported devices. + + See :class:`TemperatureThresholds` for possible threshold types. + + Note: This API is no longer the preferred interface for retrieving the + following temperature thresholds on Ada and later architectures: + ``NVML_TEMPERATURE_THRESHOLD_SHUTDOWN``, + ``NVML_TEMPERATURE_THRESHOLD_SLOWDOWN``, + ``NVML_TEMPERATURE_THRESHOLD_MEM_MAX`` and + ``NVML_TEMPERATURE_THRESHOLD_GPU_MAX``. + + Support for reading these temperature thresholds for Ada and later + architectures would be removed from this API in future releases. Please + use :meth:`get_field_values` with ``NVML_FI_DEV_TEMPERATURE_*`` fields + to retrieve temperature thresholds on these architectures. + """ + + @property + def margin(self) -> int: + """ + The thermal margin temperature (distance to nearest slowdown threshold) for the device. + """ + + def get_thermal_settings(self, sensor_index: ThermalTarget | str) -> ThermalSettings: + """ + Used to execute a list of thermal system instructions. + + Parameters + ---------- + sensor_index: ThermalTarget + The index of the thermal sensor. + + Returns + ------- + :obj:`~_device.ThermalSettings` + The thermal settings for the specified sensor. + """ + +class Utilization: + """ + Utilization rates for a device. + + For devices with compute capability 2.0 or higher. + """ + + def __init__(self, utilization: nvml.Utilization): + ... + + @property + def gpu(self) -> int: + """ + Percent of time over the past sample period during which one or more kernels was executing on the GPU. + """ + + @property + def memory(self) -> int: + """ + Percent of time over the past sample period during which global (device) memory was being read or written. + """ + +class Device: + """ + Representation of a device. 
+ + :class:`cuda.core.system.Device` provides access to various pieces of metadata + about devices and their topology, as provided by the NVIDIA Management + Library (NVML). To use CUDA with a device, use :class:`cuda.core.Device`. + + Creating a device instance causes NVML to initialize the target GPU. + NVML may initialize additional GPUs if the target GPU is an SLI slave. + + Parameters + ---------- + index: int, optional + Integer representing the CUDA device index to get a handle to. Valid + values are between ``0`` and ``cuda.core.system.get_num_devices() - 1``. + + The order in which devices are enumerated has no guarantees of + consistency between reboots. For that reason, it is recommended that + devices are looked up by their PCI ids or UUID. + + uuid: bytes or str, optional + UUID of a CUDA device to get a handle to. + + pci_bus_id: bytes or str, optional + PCI bus ID of a CUDA device to get a handle to. + + Raises + ------ + ValueError + If anything other than a single `index`, `uuid` or `pci_bus_id` are specified. + """ + _handle: int + + def __init__(self, *, index: int | None=None, uuid: bytes | str | None=None, pci_bus_id: bytes | str | None=None): + ... + + @property + def index(self) -> int: + """ + The NVML index of this device. + + Valid indices are derived from the count returned by + :meth:`Device.get_device_count`. For example, if ``get_device_count()`` + returns 2, the valid indices are 0 and 1, corresponding to GPU 0 and GPU + 1. + + The order in which NVML enumerates devices has no guarantees of + consistency between reboots. For that reason, it is recommended that + devices be looked up by their PCI ids or GPU UUID. + + Note: The NVML index may not correlate with other APIs, such as the CUDA + device index. + """ + + @property + def uuid(self) -> str: + """ + Retrieves the globally unique immutable UUID associated with this + device, as a 5 part hexadecimal string, that augments the immutable, + board serial identifier. 
+ + In the upstream NVML C++ API, the UUID includes a ``gpu-`` or ``mig-`` + prefix. If you need a `uuid` without that prefix (for example, to + interact with CUDA), use the `uuid_without_prefix` property. + """ + + @property + def uuid_without_prefix(self) -> str: + """ + Retrieves the globally unique immutable UUID associated with this + device, as a 5 part hexadecimal string, that augments the immutable, + board serial identifier. + + In the upstream NVML C++ API, the UUID includes a ``gpu-`` or ``mig-`` + prefix. This property returns it without the prefix, to match the UUIDs + used in CUDA. If you need the prefix, use the `uuid` property. + """ + + @property + def pci_bus_id(self) -> str: + """ + Retrieves the PCI bus ID of this device. + """ + + @property + def numa_node_id(self) -> int: + """ + The NUMA node of the given GPU device. + + This only applies to platforms where the GPUs are NUMA nodes. + """ + + @property + def arch(self) -> DeviceArch: + """ + :obj:`~DeviceArch` device architecture. + + For example, a Tesla V100 will report ``DeviceArchitecture.name == + "VOLTA"``, and RTX A6000 will report ``DeviceArchitecture.name == + "AMPERE"``. + """ + + @property + def name(self) -> str: + """ + Name of the device, e.g.: `"Tesla V100-SXM2-32GB"` + """ + + @property + def brand(self) -> str: + """ + The brand of the device. + + Returns "Unknown" if the brand is unknown. + """ + + @property + def serial(self) -> str: + """ + Retrieves the globally unique board serial number associated with this + device's board. + + For all products with an InfoROM. + """ + + @property + def module_id(self) -> int: + """ + Get a unique identifier for the device module on the baseboard. + + This API retrieves a unique identifier for each GPU module that exists + on a given baseboard. For non-baseboard products, this ID would always + be 0. + """ + + @property + def minor_number(self) -> int: + """ + The minor number of this device. + + For Linux only. 
+ + The minor number is used by the Linux device driver to identify the + device node in ``/dev/nvidiaX``. + """ + + @property + def is_c2c_enabled(self) -> bool: + """ + Whether the C2C (Chip-to-Chip) mode is enabled for this device. + """ + + @property + def is_persistence_mode_enabled(self) -> bool: + """ + Whether persistence mode is enabled for this device. + + For Linux only. + """ + + @is_persistence_mode_enabled.setter + def is_persistence_mode_enabled(self, enabled: bool) -> None: + ... + + @property + def cuda_compute_capability(self) -> tuple[int, int]: + """ + CUDA compute capability of the device, e.g.: `(7, 0)` for a Tesla V100. + + Returns a tuple `(major, minor)`. + """ + + def to_cuda_device(self) -> 'cuda.core.Device': + """ + Get the corresponding :class:`cuda.core.Device` (which is used for CUDA + access) for this :class:`cuda.core.system.Device` (which is used for + NVIDIA machine library (NVML) access). + + The devices are mapped to one another by their UUID. + + Returns + ------- + cuda.core.Device + The corresponding CUDA device. + """ + + @classmethod + def get_device_count(cls) -> int: + """ + Get the number of available devices. + + Returns + ------- + int + The number of available devices. + """ + + @classmethod + def get_all_devices(cls) -> Iterable[Device]: + """ + Query the available device instances. + + Returns + ------- + Iterator over :obj:`~Device` + An iterator over available devices. + """ + + @property + def addressing_mode(self) -> AddressingMode | None: + """ + Get the :obj:`~AddressingMode` of the device. + """ + + @property + def mig(self) -> MigInfo: + """ + Get :obj:`~MigInfo` accessor for MIG (Multi-Instance GPU) information. + + For Ampere™ or newer fully supported devices. + """ + + @classmethod + def get_all_devices_with_cpu_affinity(cls, cpu_index: int) -> Iterable[Device]: + """ + Retrieve the set of GPUs that have a CPU affinity with the given CPU number. + + Supported on Linux only. 
+ + Parameters + ---------- + cpu_index: int + The CPU index. + + Returns + ------- + Iterator of :obj:`~Device` + An iterator over available devices. + """ + + def get_memory_affinity(self, scope: AffinityScope | str=...) -> list[int]: + """ + Retrieves a list of indices of NUMA nodes or CPU sockets with the ideal + memory affinity for the device. + + For Kepler™ or newer fully supported devices. + + Supported on Linux only. + + If requested scope is not applicable to the target topology, the API + will fall back to reporting the memory affinity for the immediate non-I/O + ancestor of the device. + + Parameters + ---------- + scope: AffinityScope | str, optional + The scope of the affinity query. Must be one of the values of + :class:`AffinityScope`. Default is :attr:`AffinityScope.NODE`. + + Returns + ------- + list[int] + A list of indices of NUMA nodes or CPU sockets with the ideal memory + affinity for the device. + """ + + def get_cpu_affinity(self, scope: AffinityScope | str=...) -> list[int]: + """ + Retrieves a list of indices of NUMA nodes or CPU sockets with the ideal + CPU affinity for the device. + + For Kepler™ or newer fully supported devices. + + Supported on Linux only. + + If requested scope is not applicable to the target topology, the API + will fall back to reporting the memory affinity for the immediate non-I/O + ancestor of the device. + + Parameters + ---------- + scope: AffinityScope | str, optional + The scope of the affinity query. Must be one of the values of + :class:`AffinityScope`. Default is :attr:`AffinityScope.NODE`. + + Returns + ------- + list[int] + A list of indices of NUMA nodes or CPU sockets with the ideal memory + affinity for the device. + """ + + def set_cpu_affinity(self): + """ + Sets the ideal affinity for the calling thread and device. + + For Kepler™ or newer fully supported devices. + + Supported on Linux only. + """ + + def clear_cpu_affinity(self): + """ + Clear all affinity bindings for the calling thread. 
+ + For Kepler™ or newer fully supported devices. + + Supported on Linux only. + """ + + def get_clock(self, clock_type: ClockType | str) -> ClockInfo: + """ + :obj:`~_device.ClockInfo` object to get information about and manage a specific clock on a device. + """ + + @property + def is_auto_boosted_clocks_enabled(self) -> tuple[bool, bool]: + """ + Retrieve the current state of auto boosted clocks on a device. + + For Kepler™ or newer fully supported devices. + + Auto Boosted clocks are enabled by default on some hardware, allowing + the GPU to run at higher clock rates to maximize performance as thermal + limits allow. + + On Pascal™ and newer hardware, Auto Boosted clocks are controlled + through application clocks. Use :meth:`set_application_clocks` and + :meth:`reset_application_clocks` to control Auto Boost behavior. + + Returns + ------- + bool + The current state of Auto Boosted clocks + bool + The default Auto Boosted clocks behavior + + """ + + @property + def current_clock_event_reasons(self) -> list[ClocksEventReasons]: + """ + Retrieves the current :obj:`~ClocksEventReasons`. + + For all fully supported products. + """ + + @property + def supported_clock_event_reasons(self) -> list[ClocksEventReasons]: + """ + Retrieves supported :obj:`~ClocksEventReasons` that can be returned by + :meth:`get_current_clock_event_reasons`. + + For all fully supported products. + + This method is not supported in virtual machines running virtual GPU (vGPU). + """ + + @property + def cooler(self) -> CoolerInfo: + """ + :obj:`~_device.CoolerInfo` object with cooler information for the device. + """ + + @property + def attributes(self) -> DeviceAttributes: + """ + :obj:`~_device.DeviceAttributes` object with various device attributes. + + For Ampere™ or newer fully supported devices. Only available on Linux + systems. + """ + + @property + def is_display_connected(self) -> bool: + """ + The display mode for this device. + + Indicates whether a physical display (e.g. 
monitor) is currently connected to + any of the device's connectors. + """ + + @property + def is_display_active(self) -> bool: + """ + The display active status for this device. + + Indicates whether a display is initialized on the device. For example, + whether X Server is attached to this device and has allocated memory for + the screen. + + Display can be active even when no monitor is physically attached. + """ + + def register_events(self, events: EventType | str | list[EventType | str]) -> DeviceEvents: + """ + Starts recording events on this device. + + For Fermi™ or newer fully supported devices. For Linux only. + + ECC events are available only on ECC-enabled devices (see + :meth:`Device.get_total_ecc_errors`). Power capping events are + available only on Power Management enabled devices (see + :meth:`Device.get_power_management_mode`). + + This call starts recording of events on specific device. All events + that occurred before this call are not recorded. Wait for events using + the :meth:`DeviceEvents.wait` method on the result. + + Examples + -------- + >>> device = Device(index=0) + >>> events = device.register_events([ + ... EventType.XID_CRITICAL_ERROR, + ... ]) + >>> while event := events.wait(timeout_ms=10000): + ... print(f"Event {event.event_type} occurred on device {event.device.uuid}") + + Parameters + ---------- + events: EventType, str, or list of EventType or str + The event type or list of event types to register for this device. + + Returns + ------- + :obj:`~_device.DeviceEvents` + An object representing the registered events. Call + :meth:`~_device.DeviceEvents.wait` on this object to wait for events. + + Raises + ------ + :class:`cuda.core.system.NotSupportedError` + None of the requested event types are registered. + """ + + def get_supported_event_types(self) -> list[EventType]: + """ + Get the list of event types supported by this device. + + For Fermi™ or newer fully supported devices. 
For Linux only (returns an + empty list on Windows). + + Returns + ------- + list[EventType] + The list of supported event types. + """ + + def get_fan(self, fan: int=0) -> FanInfo: + """ + :obj:`~_device.FanInfo` object to get information and manage a specific fan on a device. + """ + + @property + def num_fans(self) -> int: + """ + The number of fans on the device. + """ + + def get_field_values(self, field_ids: list[int | tuple[int, int]]) -> FieldValues: + """ + Get multiple field values from the device. + + Each value specified can raise its own exception. That exception will + be raised when attempting to access the corresponding ``value`` from the + returned :obj:`~_device.FieldValues` container. + + To confirm that there are no exceptions in the entire container, call + :meth:`~_device.FieldValues.validate`. + + Parameters + ---------- + field_ids: list[int | tuple[int, int]] + List of field IDs to query. + + Each item may be either a single value from the :class:`FieldId` + enum, or a pair of (:class:`FieldId`, scope ID). + + Returns + ------- + :obj:`~_device.FieldValues` + Container of field values corresponding to the requested field IDs. + """ + + def clear_field_values(self, field_ids: list[int | tuple[int, int]]) -> None: + """ + Clear multiple field values from the device. + + Parameters + ---------- + field_ids: list[int | tuple[int, int]] + List of field IDs to clear. + + Each item may be either a single value from the :class:`FieldId` + enum, or a pair of (:class:`FieldId`, scope ID). + """ + + @property + def inforom(self) -> InforomInfo: + """ + :obj:`~_device.InforomInfo` object with InfoROM information. + + For all products with an InfoROM. + """ + + @property + def bar1_memory_info(self) -> BAR1MemoryInfo: + """ + :obj:`~_device.BAR1MemoryInfo` object with BAR1 memory information. + + BAR1 is used to map the FB (device memory) so that it can be directly + accessed by the CPU or by 3rd party devices (peer-to-peer on the PCIE + bus). 
+ """ + + @property + def memory_info(self) -> MemoryInfo: + """ + :obj:`~_device.MemoryInfo` object with memory information. + """ + + def get_nvlink(self, link: int) -> NvlinkInfo: + """ + Get :obj:`~NvlinkInfo` about this device. + + For devices with NVLink support. + """ + + @property + def pci_info(self) -> PciInfo: + """ + :obj:`~_device.PciInfo` object with the PCI attributes of this device. + """ + + @property + def performance_state(self) -> int | None: + """ + The current performance state of the device. + + For Fermi™ or newer fully supported devices. + + Returns + ------- + int | None + The current performance state of the device, as an integer between 0 and 15, + where 0 is maximum performance and higher numbers are lower performance. + Returns `None` if the performance state is unknown. + """ + + @property + def dynamic_pstates_info(self) -> GpuDynamicPstatesInfo: + """ + :obj:`~_device.GpuDynamicPstatesInfo` object with performance monitor samples from the associated subdevice. + """ + + @property + def supported_pstates(self) -> list[int]: + """ + Get all supported Performance States (P-States) for the device. + + The returned list contains a contiguous list of valid P-States supported by + the device. + + Return + ------ + list[int] + A list of supported performance state of the device, as an integer + between 0 and 15, where 0 is maximum performance and higher numbers + are lower performance. + """ + + @property + def compute_running_processes(self) -> list[ProcessInfo]: + """ + Get information about processes with a compute context on a device + + For Fermi™ or newer fully supported devices. + + This function returns information only about compute running processes + (e.g. CUDA application which have active context). Any graphics + applications (e.g. using OpenGL, DirectX) won't be listed by this + function. + + Keep in mind that information returned by this call is dynamic and the + number of elements might change in time. 
+ + In MIG mode, if device handle is provided, the API returns aggregate + information, only if the caller has appropriate privileges. Per-instance + information can be queried by using specific MIG device handles. + Querying per-instance information using MIG device handles is not + supported if the device is in vGPU Host virtualization mode. + """ + + @property + def repair_status(self) -> RepairStatus: + """ + :obj:`~_device.RepairStatus` object with TPC/Channel repair status. + + For Ampere™ or newer fully supported devices. + """ + + @property + def temperature(self) -> Temperature: + """ + :obj:`~_device.Temperature` object with temperature information for the device. + """ + + def get_topology_nearest_gpus(self, level: GpuTopologyLevel | str) -> Iterable[Device]: + """ + Retrieve the GPUs that are nearest to this device at a specific interconnectivity level. + + Supported on Linux only. + + Parameters + ---------- + level: :class:`GpuTopologyLevel` + The topology level. + + Returns + ------- + Iterable of :class:`Device` + The nearest devices at the given topology level. + """ + + @property + def utilization(self) -> Utilization: + """ + Retrieves the current :obj:`~Utilization` rates for the device's major + subsystems. + + For Fermi™ or newer fully supported devices. + + Note: During driver initialization when ECC is enabled one can see high + GPU and Memory Utilization readings. This is caused by ECC Memory + Scrubbing mechanism that is performed during driver initialization. + + Note: On MIG-enabled GPUs, querying device utilization rates is not + currently supported. + + Returns + ------- + Utilization + An object containing the current utilization rates for the device. 
+ """ +_CLOCK_ID_MAPPING = {ClockId.CURRENT: nvml.ClockId.CURRENT, ClockId.CUSTOMER_BOOST_MAX: nvml.ClockId.CUSTOMER_BOOST_MAX} +_CLOCKS_EVENT_REASONS_MAPPING = {nvml.ClocksEventReasons.EVENT_REASON_NONE: ClocksEventReasons.NONE, nvml.ClocksEventReasons.EVENT_REASON_GPU_IDLE: ClocksEventReasons.GPU_IDLE, nvml.ClocksEventReasons.EVENT_REASON_APPLICATIONS_CLOCKS_SETTING: ClocksEventReasons.APPLICATIONS_CLOCKS_SETTING, nvml.ClocksEventReasons.EVENT_REASON_SW_POWER_CAP: ClocksEventReasons.SW_POWER_CAP, nvml.ClocksEventReasons.THROTTLE_REASON_HW_SLOWDOWN: ClocksEventReasons.HW_SLOWDOWN, nvml.ClocksEventReasons.EVENT_REASON_SYNC_BOOST: ClocksEventReasons.SYNC_BOOST, nvml.ClocksEventReasons.EVENT_REASON_SW_THERMAL_SLOWDOWN: ClocksEventReasons.SW_THERMAL_SLOWDOWN, nvml.ClocksEventReasons.THROTTLE_REASON_HW_THERMAL_SLOWDOWN: ClocksEventReasons.HW_THERMAL_SLOWDOWN, nvml.ClocksEventReasons.THROTTLE_REASON_HW_POWER_BRAKE_SLOWDOWN: ClocksEventReasons.HW_POWER_BRAKE_SLOWDOWN, nvml.ClocksEventReasons.EVENT_REASON_DISPLAY_CLOCK_SETTING: ClocksEventReasons.DISPLAY_CLOCK_SETTING} +_CLOCK_TYPE_MAPPING = {ClockType.GRAPHICS: nvml.ClockType.CLOCK_GRAPHICS, ClockType.SM: nvml.ClockType.CLOCK_SM, ClockType.MEMORY: nvml.ClockType.CLOCK_MEM, ClockType.VIDEO: nvml.ClockType.CLOCK_VIDEO} +_COOLER_CONTROL_MAPPING = {nvml.CoolerControl.THERMAL_COOLER_SIGNAL_TOGGLE: CoolerControl.TOGGLE, nvml.CoolerControl.THERMAL_COOLER_SIGNAL_VARIABLE: CoolerControl.VARIABLE} +_COOLER_TARGET_MAPPING = {nvml.CoolerTarget.THERMAL_NONE: CoolerTarget.NONE, nvml.CoolerTarget.THERMAL_GPU: CoolerTarget.GPU, nvml.CoolerTarget.THERMAL_MEMORY: CoolerTarget.MEMORY, nvml.CoolerTarget.THERMAL_POWER_SUPPLY: CoolerTarget.POWER_SUPPLY} +_EVENT_TYPE_MAPPING = {nvml.EventType.NONE: EventType.NONE, nvml.EventType.SINGLE_BIT_ECC_ERROR: EventType.SINGLE_BIT_ECC_ERROR, nvml.EventType.DOUBLE_BIT_ECC_ERROR: EventType.DOUBLE_BIT_ECC_ERROR, nvml.EventType.PSTATE: EventType.PSTATE, nvml.EventType.XID_CRITICAL_ERROR: 
EventType.XID_CRITICAL_ERROR, nvml.EventType.CLOCK: EventType.CLOCK, nvml.EventType.POWER_SOURCE_CHANGE: EventType.POWER_SOURCE_CHANGE, nvml.EventType.MIG_CONFIG_CHANGE: EventType.MIG_CONFIG_CHANGE, nvml.EventType.SINGLE_BIT_ECC_ERROR_STORM: EventType.SINGLE_BIT_ECC_ERROR_STORM, nvml.EventType.DRAM_RETIREMENT_EVENT: EventType.DRAM_RETIREMENT_EVENT, nvml.EventType.DRAM_RETIREMENT_FAILURE: EventType.DRAM_RETIREMENT_FAILURE, nvml.EventType.NON_FATAL_POISON_ERROR: EventType.NON_FATAL_POISON_ERROR, nvml.EventType.FATAL_POISON_ERROR: EventType.FATAL_POISON_ERROR, nvml.EventType.GPU_UNAVAILABLE_ERROR: EventType.GPU_UNAVAILABLE_ERROR, nvml.EventType.GPU_RECOVERY_ACTION: EventType.GPU_RECOVERY_ACTION} +_EVENT_TYPE_INV_MAPPING = {v: k for k, v in _EVENT_TYPE_MAPPING.items()} +_FAN_CONTROL_POLICY_MAPPING = {nvml.FanControlPolicy.TEMPERATURE_CONTINUOUS_SW: FanControlPolicy.TEMPERATURE_CONTROLLED, nvml.FanControlPolicy.MANUAL: FanControlPolicy.MANUAL} +_INFOROM_OBJECT_MAPPING = {InforomObject.OEM: nvml.InforomObject.INFOROM_OEM, InforomObject.ECC: nvml.InforomObject.INFOROM_ECC, InforomObject.POWER: nvml.InforomObject.INFOROM_POWER, InforomObject.DEN: nvml.InforomObject.INFOROM_DEN} +_NVLINK_VERSION_MAPPING = {nvml.NvlinkVersion.VERSION_1_0: (1, 0), nvml.NvlinkVersion.VERSION_2_0: (2, 0), nvml.NvlinkVersion.VERSION_2_2: (2, 2), nvml.NvlinkVersion.VERSION_3_0: (3, 0), nvml.NvlinkVersion.VERSION_3_1: (3, 1), nvml.NvlinkVersion.VERSION_4_0: (4, 0), nvml.NvlinkVersion.VERSION_5_0: (5, 0)} +_TEMPERATURE_THRESHOLD_MAPPING = {TemperatureThresholds.SHUTDOWN: nvml.TemperatureThresholds.TEMPERATURE_THRESHOLD_SHUTDOWN, TemperatureThresholds.SLOWDOWN: nvml.TemperatureThresholds.TEMPERATURE_THRESHOLD_SLOWDOWN, TemperatureThresholds.MEM_MAX: nvml.TemperatureThresholds.TEMPERATURE_THRESHOLD_MEM_MAX, TemperatureThresholds.GPU_MAX: nvml.TemperatureThresholds.TEMPERATURE_THRESHOLD_GPU_MAX, TemperatureThresholds.ACOUSTIC_MIN: nvml.TemperatureThresholds.TEMPERATURE_THRESHOLD_ACOUSTIC_MIN, 
TemperatureThresholds.ACOUSTIC_CURR: nvml.TemperatureThresholds.TEMPERATURE_THRESHOLD_ACOUSTIC_CURR, TemperatureThresholds.ACOUSTIC_MAX: nvml.TemperatureThresholds.TEMPERATURE_THRESHOLD_ACOUSTIC_MAX, TemperatureThresholds.GPS_CURR: nvml.TemperatureThresholds.TEMPERATURE_THRESHOLD_GPS_CURR} +_THERMAL_CONTROLLER_MAPPING = {nvml.ThermalController.GPU_INTERNAL: ThermalController.GPU_INTERNAL, nvml.ThermalController.ADM1032: ThermalController.ADM1032, nvml.ThermalController.ADT7461: ThermalController.ADT7461, nvml.ThermalController.MAX6649: ThermalController.MAX6649, nvml.ThermalController.MAX1617: ThermalController.MAX1617, nvml.ThermalController.LM99: ThermalController.LM99, nvml.ThermalController.LM89: ThermalController.LM89, nvml.ThermalController.LM64: ThermalController.LM64, nvml.ThermalController.G781: ThermalController.G781, nvml.ThermalController.ADT7473: ThermalController.ADT7473, nvml.ThermalController.SBMAX6649: ThermalController.SBMAX6649, nvml.ThermalController.VBIOSEVT: ThermalController.VBIOSEVT, nvml.ThermalController.OS: ThermalController.OS, nvml.ThermalController.NVSYSCON_CANOAS: ThermalController.NVSYSCON_CANOAS, nvml.ThermalController.NVSYSCON_E551: ThermalController.NVSYSCON_E551, nvml.ThermalController.MAX6649R: ThermalController.MAX6649R, nvml.ThermalController.ADT7473S: ThermalController.ADT7473S, nvml.ThermalController.UNKNOWN: ThermalController.UNKNOWN} +_THERMAL_TARGET_MAPPING = {nvml.ThermalTarget.NONE: ThermalTarget.NONE, nvml.ThermalTarget.GPU: ThermalTarget.GPU, nvml.ThermalTarget.MEMORY: ThermalTarget.MEMORY, nvml.ThermalTarget.POWER_SUPPLY: ThermalTarget.POWER_SUPPLY, nvml.ThermalTarget.BOARD: ThermalTarget.BOARD, nvml.ThermalTarget.VCD_BOARD: ThermalTarget.VCD_BOARD, nvml.ThermalTarget.VCD_INLET: ThermalTarget.VCD_INLET, nvml.ThermalTarget.VCD_OUTLET: ThermalTarget.VCD_OUTLET, nvml.ThermalTarget.ALL: ThermalTarget.ALL} +_THERMAL_TARGET_INV_MAPPING = {v: k for k, v in _THERMAL_TARGET_MAPPING.items()} +_ADDRESSING_MODE_MAPPING = 
{nvml.DeviceAddressingModeType.DEVICE_ADDRESSING_MODE_HMM: AddressingMode.HMM, nvml.DeviceAddressingModeType.DEVICE_ADDRESSING_MODE_ATS: AddressingMode.ATS} +_AFFINITY_SCOPE_MAPPING = {AffinityScope.NODE: nvml.AffinityScope.NODE, AffinityScope.SOCKET: nvml.AffinityScope.SOCKET} +_BRAND_TYPE_MAPPING = {nvml.BrandType.BRAND_UNKNOWN: 'Unknown', nvml.BrandType.BRAND_QUADRO: 'Quadro', nvml.BrandType.BRAND_TESLA: 'Tesla', nvml.BrandType.BRAND_NVS: 'NVS', nvml.BrandType.BRAND_GRID: 'GRID', nvml.BrandType.BRAND_GEFORCE: 'GeForce', nvml.BrandType.BRAND_TITAN: 'Titan', nvml.BrandType.BRAND_NVIDIA_VAPPS: 'NVIDIA vApps', nvml.BrandType.BRAND_NVIDIA_VPC: 'NVIDIA VPC', nvml.BrandType.BRAND_NVIDIA_VCS: 'NVIDIA VCS', nvml.BrandType.BRAND_NVIDIA_VWS: 'NVIDIA VWS', nvml.BrandType.BRAND_NVIDIA_CLOUD_GAMING: 'NVIDIA Cloud Gaming', nvml.BrandType.BRAND_NVIDIA_VGAMING: 'NVIDIA vGaming', nvml.BrandType.BRAND_QUADRO_RTX: 'Quadro RTX', nvml.BrandType.BRAND_NVIDIA_RTX: 'NVIDIA RTX', nvml.BrandType.BRAND_NVIDIA: 'NVIDIA', nvml.BrandType.BRAND_GEFORCE_RTX: 'GeForce RTX', nvml.BrandType.BRAND_TITAN_RTX: 'Titan RTX'} +_GPU_P2P_CAPS_INDEX_MAPPING = {GpuP2PCapsIndex.READ: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_READ, GpuP2PCapsIndex.WRITE: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_WRITE, GpuP2PCapsIndex.NVLINK: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_NVLINK, GpuP2PCapsIndex.ATOMICS: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_ATOMICS, GpuP2PCapsIndex.PCI: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_PCI, GpuP2PCapsIndex.PROP: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_PROP, GpuP2PCapsIndex.UNKNOWN: nvml.GpuP2PCapsIndex.P2P_CAPS_INDEX_UNKNOWN} +_GPU_P2P_STATUS_MAPPING = {nvml.GpuP2PStatus.P2P_STATUS_OK: GpuP2PStatus.OK, nvml.GpuP2PStatus.P2P_STATUS_CHIPSET_NOT_SUPPORTED: GpuP2PStatus.CHIPSET_NOT_SUPPORTED, nvml.GpuP2PStatus.P2P_STATUS_GPU_NOT_SUPPORTED: GpuP2PStatus.GPU_NOT_SUPPORTED, nvml.GpuP2PStatus.P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED: GpuP2PStatus.IOH_TOPOLOGY_NOT_SUPPORTED, nvml.GpuP2PStatus.P2P_STATUS_DISABLED_BY_REGKEY: 
GpuP2PStatus.DISABLED_BY_REGKEY, nvml.GpuP2PStatus.P2P_STATUS_NOT_SUPPORTED: GpuP2PStatus.NOT_SUPPORTED, nvml.GpuP2PStatus.P2P_STATUS_UNKNOWN: GpuP2PStatus.UNKNOWN} +_GPU_TOPOLOGY_LEVEL_MAPPING = {GpuTopologyLevel.INTERNAL: nvml.GpuTopologyLevel.TOPOLOGY_INTERNAL, GpuTopologyLevel.SINGLE: nvml.GpuTopologyLevel.TOPOLOGY_SINGLE, GpuTopologyLevel.MULTIPLE: nvml.GpuTopologyLevel.TOPOLOGY_MULTIPLE, GpuTopologyLevel.HOSTBRIDGE: nvml.GpuTopologyLevel.TOPOLOGY_HOSTBRIDGE, GpuTopologyLevel.NODE: nvml.GpuTopologyLevel.TOPOLOGY_NODE, GpuTopologyLevel.SYSTEM: nvml.GpuTopologyLevel.TOPOLOGY_SYSTEM} +_GPU_TOPOLOGY_LEVEL_INV_MAPPING = {v: k for k, v in _GPU_TOPOLOGY_LEVEL_MAPPING.items()} +__all__ = ['Device', 'get_p2p_status', 'get_topology_common_ancestor', 'NvlinkInfo'] + +def _unpack_bitmask(arr) -> list: + """ + Unpack a list of integers containing bitmasks. + """ + +def get_topology_common_ancestor(device1: Device, device2: Device) -> GpuTopologyLevel: + """ + Retrieve the common ancestor for two devices. + + For Linux only. + + Parameters + ---------- + device1: :class:`Device` + The first device. + device2: :class:`Device` + The second device. + + Returns + ------- + :class:`GpuTopologyLevel` + The common ancestor level of the two devices. + """ + +def get_p2p_status(device1: Device, device2: Device, index: GpuP2PCapsIndex | str) -> GpuP2PStatus: + """ + Retrieve the P2P status between two devices. + + Parameters + ---------- + device1: :class:`Device` + The first device. + device2: :class:`Device` + The second device. + index: :class:`GpuP2PCapsIndex` | str + The P2P capability index being looked for between ``device1`` and ``device2``. + + Returns + ------- + :class:`GpuP2PStatus` + The P2P status between the two devices. 
+ """ \ No newline at end of file diff --git a/cuda_core/cuda/core/system/_nvml_context.pyi b/cuda_core/cuda/core/system/_nvml_context.pyi new file mode 100644 index 00000000000..a061a9861ba --- /dev/null +++ b/cuda_core/cuda/core/system/_nvml_context.pyi @@ -0,0 +1,33 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/system/_nvml_context.pyx + +from __future__ import annotations + +import threading + +_NVMLState = int +_lock = threading.Lock() + +def _initialize(): + """ + Initializes Nvidia Management Library (NVML), ensuring it only happens once per process. + """ + +def validate(): + """ + Validate NVML state. + + Validate that NVML is initialized, functional and that the system has at + least one GPU available. + + Raises + ------ + nvml.UninitializedError + If NVML hasn't been initialized. + nvml.LibraryNotFoundError + If the NVML library could not be found. + nvml.GpuNotFoundError + If no GPUs are available. + """ + +def _get_nvml_state(): + ... \ No newline at end of file diff --git a/cuda_core/cuda/core/system/_system.pyi b/cuda_core/cuda/core/system/_system.pyi new file mode 100644 index 00000000000..f25ce35be7f --- /dev/null +++ b/cuda_core/cuda/core/system/_system.pyi @@ -0,0 +1,75 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/system/_system.pyx + +from __future__ import annotations + +CUDA_BINDINGS_NVML_IS_COMPATIBLE: bool +__all__ = ['get_driver_branch', 'get_kernel_mode_driver_version', 'get_user_mode_driver_version', 'get_nvml_version', 'get_num_devices', 'get_process_name', 'CUDA_BINDINGS_NVML_IS_COMPATIBLE'] + +def get_user_mode_driver_version() -> tuple[int, ...]: + """ + Get the user-mode (UMD / CUDA) driver version. + + This is the most commonly needed version when checking CUDA driver + compatibility. It works with all ``cuda-bindings`` versions. + + Returns + ------- + version : tuple[int, ...] + A 2-tuple ``(MAJOR, MINOR)``, e.g. ``(13, 0)`` for CUDA 13.0. 
+ """ + +def get_kernel_mode_driver_version() -> tuple[int, ...]: + """ + Get the kernel-mode (KMD / GPU) driver version, e.g. 580.65.06. + + Returns + ------- + version : tuple[int, ...] + Typically a 3-tuple ``(MAJOR, MINOR, PATCH)`` + (2-tuple on WSL), e.g. ``(580, 65, 6)``. + + Raises + ------ + RuntimeError + If the NVML library is not available. + """ + +def get_nvml_version() -> tuple[int, ...]: + """ + The version of the NVML library. + + Returns + ------- + version: tuple[int, ...] + Tuple of integers representing the NVML version components. + """ + +def get_driver_branch() -> str: + """ + Retrieves the driver branch of the NVIDIA driver installed on the system. + + Returns + ------- + branch: str + The driver branch string (e.g., ``"560"``, ``"open"``, etc.). + """ + +def get_num_devices() -> int: + """ + Return the number of devices in the system. + """ + +def get_process_name(pid: int) -> str: + """ + The name of process with given PID. + + Parameters + ---------- + pid: int + The PID of the process for which to get the name. + + Returns + ------- + name: str + The process name. + """ \ No newline at end of file diff --git a/cuda_core/cuda/core/system/_system_events.pyi b/cuda_core/cuda/core/system/_system_events.pyi new file mode 100644 index 00000000000..fdf7217318e --- /dev/null +++ b/cuda_core/cuda/core/system/_system_events.pyi @@ -0,0 +1,133 @@ +# This file was generated by stubgen-pyx v0.2.6 from cuda_core/cuda/core/system/_system_events.pyx + +from __future__ import annotations + +from cuda.bindings import nvml +from cuda.core.system.typing import SystemEventType + +from . import _device + + +class SystemEvent: + """ + Data about a collection of system events. + """ + + def __init__(self, event_data: nvml.SystemEventData_v1): + ... + + @property + def event_type(self) -> SystemEventType: + """ + The :obj:`~SystemEventType` that was triggered. + """ + + @property + def gpu_id(self) -> int: + """ + The GPU ID in PCI ID format. 
+ """ + + @property + def device(self) -> _device.Device: + """ + The :obj:`~_device.Device` associated with this event. + """ + +class SystemEvents: + """ + Data about a collection of system events. + """ + + def __init__(self, event_data: nvml.SystemEventData_v1): + ... + + def __len__(self): + ... + + def __getitem__(self, idx: int) -> SystemEvent: + """ + Get the :obj:`~_system_events.SystemEvent` at the specified index. + """ + +class RegisteredSystemEvents: + """ + Represents a set of events that can be waited on for a specific device. + """ + + def __init__(self, events: SystemEventType | str | list[SystemEventType | str]): + ... + + def __dealloc__(self): + ... + + def wait(self, timeout_ms: int=0, buffer_size: int=1) -> SystemEvents: + """ + Wait for events in the system event set. + + For Fermi™ or newer fully supported devices. + + If some events are ready to be delivered at the time of the call, + function returns immediately. If there are no events ready to be + delivered, function sleeps till event arrives but not longer than + specified timeout. If timeout passes, a + :class:`cuda.core.system.TimeoutError` is raised. This function in + certain conditions can return before specified timeout passes (e.g. when + interrupt arrives) + + Parameters + ---------- + timeout_ms: int + The timeout in milliseconds. A value of 0 means to wait indefinitely. + buffer_size: int + The maximum number of events to retrieve. Must be at least 1. + + Returns + ------- + :obj:`~_system_events.SystemEvents` + A set of events that were received. The number of events returned may + be less than the specified buffer size if fewer events were available. + + Raises + ------ + :class:`cuda.core.system.TimeoutError` + If the timeout expires before an event is received. + :class:`cuda.core.system.GpuIsLostError` + If the GPU has fallen off the bus or is otherwise inaccessible. 
+ """ +_SYSTEM_EVENT_TYPE_MAPPING = {nvml.SystemEventType.GPU_DRIVER_UNBIND: SystemEventType.UNBIND, nvml.SystemEventType.GPU_DRIVER_BIND: SystemEventType.BIND} +_SYSTEM_EVENT_TYPE_INV_MAPPING = {v: k for k, v in _SYSTEM_EVENT_TYPE_MAPPING.items()} +__all__ = ['register_events'] + +def register_events(events: SystemEventType | str | list[SystemEventType | str]) -> RegisteredSystemEvents: + """ + Starts recording of events on test system. + + For Linux only. + + All events that occurred before this call are not recorded. Wait for events + using the :meth:`RegisteredSystemEvents.wait` method on the result. + + Examples + -------- + >>> from cuda.core import system + >>> events = system.register_events([SystemEventType.UNBIND]) + >>> while event := events.wait(timeout_ms=10000): + ... print(f"Event {event.event_type} occurred.") + + Parameters + ---------- + events: SystemEventType, str, or list of SystemEventType or str + The event type or list of event types to register for this device. + + Returns + ------- + :obj:`~_system_events.RegisteredSystemEvents` + An object representing the registered events. Call + :meth:`~_system_events.RegisteredSystemEvents.wait` on this object to wait for events. + + Raises + ------ + :class:`cuda.core.system.NotSupportedError` + None of the requested event types are registered. 
+ """ \ No newline at end of file From 1aeb0f9b8cd6c5769c405a117d909d72d2887cf1 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 12 May 2026 14:21:33 -0400 Subject: [PATCH 4/6] Pin Cython version --- .pre-commit-config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d3a6af13f53..25fb48eca34 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -51,6 +51,7 @@ repos: pass_filenames: false additional_dependencies: - stubgen-pyx==0.2.6 + - Cython==3.2.4 # Standard hooks - repo: https://github.com/pre-commit/pre-commit-hooks From 9ae96668ae8c3b436546715d2fe5cbee05109d04 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 12 May 2026 15:31:15 -0400 Subject: [PATCH 5/6] Address feedback in the PR --- cuda_core/cuda/core/_memory/_peer_access_utils.pyi | 11 +++++------ cuda_core/cuda/core/_memory/_peer_access_utils.pyx | 10 +++++----- .../cuda/core/_memory/_virtual_memory_resource.py | 5 +---- cuda_core/cuda/core/graph/_adjacency_set_proxy.pyi | 6 +++--- cuda_core/cuda/core/graph/_adjacency_set_proxy.pyx | 6 +++--- 5 files changed, 17 insertions(+), 21 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_peer_access_utils.pyi b/cuda_core/cuda/core/_memory/_peer_access_utils.pyi index 95162a395e4..ff73e77efe5 100644 --- a/cuda_core/cuda/core/_memory/_peer_access_utils.pyi +++ b/cuda_core/cuda/core/_memory/_peer_access_utils.pyi @@ -2,8 +2,7 @@ from __future__ import annotations -from collections.abc import Callable, Iterable, MutableSet -from collections.abc import Set as AbstractSet +from collections.abc import Callable, Iterable, MutableSet, Set from dataclasses import dataclass from typing import Any @@ -74,16 +73,16 @@ class PeerAccessibleBySetProxy(MutableSet): def symmetric_difference_update(self, other) -> None: """Toggle peer access for every device in ``other`` in one driver call.""" - def __ior__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: # type: 
ignore[override,misc] + def __ior__(self, other: Set[Any]) -> PeerAccessibleBySetProxy: # type: ignore[override,misc] ... - def __iand__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: + def __iand__(self, other: Set[Any]) -> PeerAccessibleBySetProxy: ... - def __isub__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: + def __isub__(self, other: Set[Any]) -> PeerAccessibleBySetProxy: ... - def __ixor__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: # type: ignore[override,misc] + def __ixor__(self, other: Set[Any]) -> PeerAccessibleBySetProxy: # type: ignore[override,misc] ... def __repr__(self) -> str: diff --git a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx index 1e04a7482fc..711442285c7 100644 --- a/cuda_core/cuda/core/_memory/_peer_access_utils.pyx +++ b/cuda_core/cuda/core/_memory/_peer_access_utils.pyx @@ -4,7 +4,7 @@ from __future__ import annotations -from collections.abc import Callable, Iterable, MutableSet, Set as AbstractSet +from collections.abc import Callable, Iterable, MutableSet, Set from dataclasses import dataclass from typing import TYPE_CHECKING, Any @@ -336,22 +336,22 @@ class PeerAccessibleBySetProxy(MutableSet): if to_add or to_remove: self._apply(to_add, to_remove) - def __ior__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: # type: ignore[override,misc] + def __ior__(self, other: Set[Any]) -> PeerAccessibleBySetProxy: # type: ignore[override,misc] self.update(other) return self - def __iand__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: + def __iand__(self, other: Set[Any]) -> PeerAccessibleBySetProxy: self.intersection_update(other) return self - def __isub__(self, other: AbstractSet[Any]) -> PeerAccessibleBySetProxy: + def __isub__(self, other: Set[Any]) -> PeerAccessibleBySetProxy: if other is self: self.clear() else: self.difference_update(other) return self - def __ixor__(self, other: AbstractSet[Any]) -> 
PeerAccessibleBySetProxy: # type: ignore[override,misc] + def __ixor__(self, other: Set[Any]) -> PeerAccessibleBySetProxy: # type: ignore[override,misc] self.symmetric_difference_update(other) return self diff --git a/cuda_core/cuda/core/_memory/_virtual_memory_resource.py b/cuda_core/cuda/core/_memory/_virtual_memory_resource.py index a1171191687..2f2a25f8e43 100644 --- a/cuda_core/cuda/core/_memory/_virtual_memory_resource.py +++ b/cuda_core/cuda/core/_memory/_virtual_memory_resource.py @@ -583,10 +583,7 @@ def deallocate(self, ptr: DevicePointerType, size: int, *, stream: Stream | Grap Keyword-only. Unused because virtual memory operations are synchronous. """ - if ptr is None: - ptr = 0 - else: - ptr = int(ptr) + ptr = 0 if ptr is None else int(ptr) if stream is not None: from cuda.core._stream import Stream_accept diff --git a/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyi b/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyi index 287ed9e300a..f8b3f416659 100644 --- a/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyi +++ b/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyi @@ -4,7 +4,7 @@ from __future__ import annotations from collections.abc import MutableSet -from collections.abc import Set as AbstractSet +from collections.abc import Set as Set from typing import Any from cuda.core.graph._graph_node import GraphNode @@ -40,13 +40,13 @@ class AdjacencySetProxy(MutableSet): def clear(self): """Remove all edges in a single driver call.""" - def __isub__(self, it: AbstractSet[Any]) -> 'AdjacencySetProxy': + def __isub__(self, it: Set[Any]) -> 'AdjacencySetProxy': """Remove edges to all nodes in *it* in a single driver call.""" def update(self, *others): """Add edges to multiple nodes at once.""" - def __ior__(self, it: AbstractSet[Any]) -> 'AdjacencySetProxy': # type: ignore[override,misc] + def __ior__(self, it: Set[Any]) -> 'AdjacencySetProxy': # type: ignore[override,misc] """Add edges to all nodes in *it* in a single driver call.""" def __repr__(self): 
diff --git a/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyx b/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyx index 8875284f8fa..a841ffce8af 100644 --- a/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyx +++ b/cuda_core/cuda/core/graph/_adjacency_set_proxy.pyx @@ -15,7 +15,7 @@ from cuda.core._resource_handles cimport ( graph_node_get_graph, ) from cuda.core._utils.cuda_utils cimport HANDLE_RETURN -from collections.abc import MutableSet, Set as AbstractSet +from collections.abc import MutableSet, Set as Set from typing import Any @@ -71,7 +71,7 @@ class AdjacencySetProxy(MutableSet): if members: (<_AdjacencySetCore>self._core).remove_edges(members) - def __isub__(self, it: AbstractSet[Any]) -> "AdjacencySetProxy": + def __isub__(self, it: Set[Any]) -> "AdjacencySetProxy": """Remove edges to all nodes in *it* in a single driver call.""" if it is self: self.clear() @@ -99,7 +99,7 @@ class AdjacencySetProxy(MutableSet): if new: (<_AdjacencySetCore>self._core).add_edges(new) - def __ior__(self, it: AbstractSet[Any]) -> "AdjacencySetProxy": # type: ignore[override,misc] + def __ior__(self, it: Set[Any]) -> "AdjacencySetProxy": # type: ignore[override,misc] """Add edges to all nodes in *it* in a single driver call.""" self.update(it) return self From 442d212693c2bf879bc7405637354c99ba769c74 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 12 May 2026 16:34:10 -0400 Subject: [PATCH 6/6] Fix tests --- cuda_core/cuda/core/_event.pyi | 6 ++++++ cuda_core/cuda/core/_event.pyx | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/cuda_core/cuda/core/_event.pyi b/cuda_core/cuda/core/_event.pyi index 995e5c2650e..b8bda3d0ba1 100644 --- a/cuda_core/cuda/core/_event.pyi +++ b/cuda_core/cuda/core/_event.pyi @@ -72,6 +72,12 @@ class Event: def __init__(self, *args, **kwargs): ... + def __isub__(self, other): # type: ignore[misc] + ... + + def __rsub__(self, other): + ... + def __sub__(self, other: Event): ... 
diff --git a/cuda_core/cuda/core/_event.pyx b/cuda_core/cuda/core/_event.pyx index 5f113365a9b..076bcb573c7 100644 --- a/cuda_core/cuda/core/_event.pyx +++ b/cuda_core/cuda/core/_event.pyx @@ -154,6 +154,12 @@ cdef class Event: """ self._h_event.reset() + def __isub__(self, other): # type: ignore[misc] + return NotImplemented + + def __rsub__(self, other): + return NotImplemented + def __sub__(self, other: Event): # return self - other (in milliseconds) cdef float timing